[318408a] | 1 | Submitted By: Ken Moffat <ken at linuxfromscratch dot org>
|
---|
| 2 | Date: 2008-02-19
|
---|
| 3 | Initial Package Version: 2.5.3
|
---|
| 4 | Upstream Status: uncertain
|
---|
| 5 | Origin: from debian.
|
---|
| 6 | Description: Various fixes, particularly speed improvements for UTF-8 locales.
|
---|
| 7 | Also adds a 'standard input' marker into the results for certain obscure uses.
|
---|
| 8 |
|
---|
| 9 | diff -Naur grep-2.5.3.orig/lib/posix/regex.h grep-2.5.3.lfs/lib/posix/regex.h
|
---|
| 10 | --- grep-2.5.3.orig/lib/posix/regex.h 2007-06-28 19:57:18.000000000 +0100
|
---|
| 11 | +++ grep-2.5.3.lfs/lib/posix/regex.h 2008-02-10 18:56:07.000000000 +0000
|
---|
| 12 | @@ -165,6 +165,10 @@
|
---|
| 13 | treated as 'a\{1'. */
|
---|
| 14 | #define RE_INVALID_INTERVAL_ORD (RE_DEBUG << 1)
|
---|
| 15 |
|
---|
| 16 | +/* If this bit is set, then ignore case when matching.
|
---|
| 17 | + If not set, then case is significant. */
|
---|
| 18 | +#define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1)
|
---|
| 19 | +
|
---|
| 20 | /* This global variable defines the particular regexp syntax to use (for
|
---|
| 21 | some interfaces). When a regexp is compiled, the syntax used is
|
---|
| 22 | stored in the pattern buffer, so changing this does not affect
|
---|
| 23 | diff -Naur grep-2.5.3.orig/src/dfa.c grep-2.5.3.lfs/src/dfa.c
|
---|
| 24 | --- grep-2.5.3.orig/src/dfa.c 2007-06-28 19:57:19.000000000 +0100
|
---|
| 25 | +++ grep-2.5.3.lfs/src/dfa.c 2008-02-10 18:55:29.000000000 +0000
|
---|
| 26 | @@ -594,6 +594,17 @@
|
---|
| 27 | /* build character class. */
|
---|
| 28 | {
|
---|
| 29 | wctype_t wt;
|
---|
| 30 | + /* NOTE:
|
---|
| 31 | + * when case_fold is set, the character classes [:upper:] and [:lower:]
|
---|
| 32 | + * must be treated as [:alpha:]; this matches the behaviour of
|
---|
| 33 | + * glibc/posix/regcomp.c:build_charclass().
|
---|
| 34 | + * reported by Bug#276202
|
---|
| 35 | + * - fixed by Fumitoshi UKAI
|
---|
| 36 | + */
|
---|
| 37 | + if (case_fold
|
---|
| 38 | + && (strcmp (str, "upper") == 0 || strcmp (str, "lower") == 0))
|
---|
| 39 | + strcpy (str, "alpha");
|
---|
| 40 | +
|
---|
| 41 | /* Query the character class as wctype_t. */
|
---|
| 42 | wt = wctype (str);
|
---|
| 43 |
|
---|
| 44 | @@ -681,6 +692,29 @@
|
---|
| 45 | REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
|
---|
| 46 | range_ends_al, work_mbc->nranges + 1);
|
---|
| 47 | work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
|
---|
| 48 | + if (case_fold
|
---|
| 49 | + && (iswlower((wint_t)wc) || iswupper((wint_t)wc))
|
---|
| 50 | + && (iswlower((wint_t)wc2) || iswupper((wint_t)wc2))) {
|
---|
| 51 | + wint_t altcase;
|
---|
| 52 | + altcase = wc;
|
---|
| 53 | + if (iswlower((wint_t)wc))
|
---|
| 54 | + altcase = towupper((wint_t)wc);
|
---|
| 55 | + else
|
---|
| 56 | + altcase = towlower((wint_t)wc);
|
---|
| 57 | + REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
|
---|
| 58 | + range_sts_al, work_mbc->nranges + 1);
|
---|
| 59 | + work_mbc->range_sts[work_mbc->nranges] = (wchar_t)altcase;
|
---|
| 60 | +
|
---|
| 61 | + altcase = wc2;
|
---|
| 62 | + if (iswlower((wint_t)wc2))
|
---|
| 63 | + altcase = towupper((wint_t)wc2);
|
---|
| 64 | + else
|
---|
| 65 | + altcase = towlower((wint_t)wc2);
|
---|
| 66 | + REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
|
---|
| 67 | + range_ends_al, work_mbc->nranges + 1);
|
---|
| 68 | + work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)altcase;
|
---|
| 69 | +
|
---|
| 70 | + }
|
---|
| 71 | }
|
---|
| 72 | else if (wc != WEOF)
|
---|
| 73 | /* build normal characters. */
|
---|
| 74 | @@ -688,6 +722,20 @@
|
---|
| 75 | REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
|
---|
| 76 | work_mbc->nchars + 1);
|
---|
| 77 | work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
|
---|
| 78 | + if (case_fold && (iswlower((wint_t) wc) || iswupper((wint_t) wc)))
|
---|
| 79 | + {
|
---|
| 80 | + wint_t altcase;
|
---|
| 81 | +
|
---|
| 82 | + altcase = wc; /* keeps compiler happy */
|
---|
| 83 | + if (iswlower((wint_t) wc))
|
---|
| 84 | + altcase = towupper((wint_t) wc);
|
---|
| 85 | + else if (iswupper((wint_t) wc))
|
---|
| 86 | + altcase = towlower((wint_t) wc);
|
---|
| 87 | +
|
---|
| 88 | + REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
|
---|
| 89 | + work_mbc->nchars + 1);
|
---|
| 90 | + work_mbc->chars[work_mbc->nchars++] = (wchar_t) altcase;
|
---|
| 91 | + }
|
---|
| 92 | }
|
---|
| 93 | }
|
---|
| 94 | while ((wc = wc1) != L']');
|
---|
| 95 | diff -Naur grep-2.5.3.orig/src/grep.c grep-2.5.3.lfs/src/grep.c
|
---|
| 96 | --- grep-2.5.3.orig/src/grep.c 2007-06-28 19:57:19.000000000 +0100
|
---|
| 97 | +++ grep-2.5.3.lfs/src/grep.c 2008-02-10 18:54:53.000000000 +0000
|
---|
| 98 | @@ -274,6 +274,12 @@
|
---|
| 99 | #endif
|
---|
| 100 | ;
|
---|
| 101 |
|
---|
| 102 | +/* Default for `file_list' if no files are given on the command line. */
|
---|
| 103 | +static char *stdin_argv[] =
|
---|
| 104 | +{
|
---|
| 105 | + "-", NULL
|
---|
| 106 | +};
|
---|
| 107 | +
|
---|
| 108 | /* Non-boolean long options that have no corresponding short equivalents. */
|
---|
| 109 | enum
|
---|
| 110 | {
|
---|
| 111 | @@ -534,7 +540,16 @@
|
---|
| 112 | for byte sentinels fore and aft. */
|
---|
| 113 | newalloc = newsize + pagesize + 1;
|
---|
| 114 |
|
---|
| 115 | - newbuf = bufalloc < newalloc ? xmalloc (bufalloc = newalloc) : buffer;
|
---|
| 116 | + newbuf = bufalloc < newalloc ? malloc (bufalloc = newalloc) : buffer;
|
---|
| 117 | + if (newbuf == NULL)
|
---|
| 118 | + {
|
---|
| 119 | + int saved_errno = errno;
|
---|
| 120 | + free (buffer);
|
---|
| 121 | + bufalloc = ALIGN_TO (INITIAL_BUFSIZE, pagesize) + pagesize + 1;
|
---|
| 122 | + buffer = xmalloc (bufalloc);
|
---|
| 123 | + errno = saved_errno;
|
---|
| 124 | + return 0;
|
---|
| 125 | + }
|
---|
| 126 | readbuf = ALIGN_TO (newbuf + 1 + save, pagesize);
|
---|
| 127 | bufbeg = readbuf - save;
|
---|
| 128 | memmove (bufbeg, buffer + saved_offset, save);
|
---|
| 129 | @@ -1825,6 +1840,7 @@
|
---|
| 130 | FILE *fp;
|
---|
| 131 | extern char *optarg;
|
---|
| 132 | extern int optind;
|
---|
| 133 | + char **file_list;
|
---|
| 134 |
|
---|
| 135 | initialize_main (&argc, &argv);
|
---|
| 136 | program_name = argv[0];
|
---|
| 137 | @@ -2244,29 +2260,29 @@
|
---|
| 138 | if (max_count == 0)
|
---|
| 139 | exit (1);
|
---|
| 140 |
|
---|
| 141 | - if (optind < argc)
|
---|
| 142 | + file_list = (optind == argc ? stdin_argv : &argv[optind]);
|
---|
| 143 | +
|
---|
| 144 | + status = 1;
|
---|
| 145 | + while (1)
|
---|
| 146 | {
|
---|
| 147 | - status = 1;
|
---|
| 148 | - do
|
---|
| 149 | + char *file = *file_list++;
|
---|
| 150 | +
|
---|
| 151 | + if (file == NULL)
|
---|
| 152 | + break;
|
---|
| 153 | +
|
---|
| 154 | + if ((included_patterns || excluded_patterns)
|
---|
| 155 | + && !isdir (file))
|
---|
| 156 | {
|
---|
| 157 | - char *file = argv[optind];
|
---|
| 158 | - if ((included_patterns || excluded_patterns)
|
---|
| 159 | - && !isdir (file))
|
---|
| 160 | - {
|
---|
| 161 | - if (included_patterns &&
|
---|
| 162 | - ! excluded_filename (included_patterns, file, 0))
|
---|
| 163 | - continue;
|
---|
| 164 | - if (excluded_patterns &&
|
---|
| 165 | - excluded_filename (excluded_patterns, file, 0))
|
---|
| 166 | - continue;
|
---|
| 167 | - }
|
---|
| 168 | - status &= grepfile (strcmp (file, "-") == 0 ? (char *) NULL : file,
|
---|
| 169 | - &stats_base);
|
---|
| 170 | + if (included_patterns &&
|
---|
| 171 | + ! excluded_filename (included_patterns, file, 0))
|
---|
| 172 | + continue;
|
---|
| 173 | + if (excluded_patterns &&
|
---|
| 174 | + excluded_filename (excluded_patterns, file, 0))
|
---|
| 175 | + continue;
|
---|
| 176 | }
|
---|
| 177 | - while ( ++optind < argc);
|
---|
| 178 | + status &= grepfile (strcmp (file, "-") == 0
|
---|
| 179 | + ? (char *) NULL : file, &stats_base);
|
---|
| 180 | }
|
---|
| 181 | - else
|
---|
| 182 | - status = grepfile ((char *) NULL, &stats_base);
|
---|
| 183 |
|
---|
| 184 | /* We register via atexit() to test stdout. */
|
---|
| 185 | exit (errseen ? 2 : status);
|
---|
| 186 | diff -Naur grep-2.5.3.orig/src/search.c grep-2.5.3.lfs/src/search.c
|
---|
| 187 | --- grep-2.5.3.orig/src/search.c 2007-06-28 19:57:19.000000000 +0100
|
---|
| 188 | +++ grep-2.5.3.lfs/src/search.c 2008-02-10 18:56:18.000000000 +0000
|
---|
| 189 | @@ -18,10 +18,15 @@
|
---|
| 190 |
|
---|
| 191 | /* Written August 1992 by Mike Haertel. */
|
---|
| 192 |
|
---|
| 193 | +#ifndef _GNU_SOURCE
|
---|
| 194 | +# define _GNU_SOURCE 1
|
---|
| 195 | +#endif
|
---|
| 196 | #ifdef HAVE_CONFIG_H
|
---|
| 197 | # include <config.h>
|
---|
| 198 | #endif
|
---|
| 199 |
|
---|
| 200 | +#include <assert.h>
|
---|
| 201 | +
|
---|
| 202 | #include <sys/types.h>
|
---|
| 203 |
|
---|
| 204 | #include "mbsupport.h"
|
---|
| 205 | @@ -43,6 +48,9 @@
|
---|
| 206 | #ifdef HAVE_LIBPCRE
|
---|
| 207 | # include <pcre.h>
|
---|
| 208 | #endif
|
---|
| 209 | +#ifdef HAVE_LANGINFO_CODESET
|
---|
| 210 | +# include <langinfo.h>
|
---|
| 211 | +#endif
|
---|
| 212 |
|
---|
| 213 | #define NCHAR (UCHAR_MAX + 1)
|
---|
| 214 |
|
---|
| 215 | @@ -68,6 +76,19 @@
|
---|
| 216 | error (2, 0, _("memory exhausted"));
|
---|
| 217 | }
|
---|
| 218 |
|
---|
| 219 | +/* UTF-8 encoding allows some optimizations that we can't otherwise
|
---|
| 220 | + assume in a multibyte encoding. */
|
---|
| 221 | +static int using_utf8;
|
---|
| 222 | +
|
---|
| 223 | +void
|
---|
| 224 | +check_utf8 (void)
|
---|
| 225 | +{
|
---|
| 226 | +#ifdef HAVE_LANGINFO_CODESET
|
---|
| 227 | + if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0)
|
---|
| 228 | + using_utf8 = 1;
|
---|
| 229 | +#endif
|
---|
| 230 | +}
|
---|
| 231 | +
|
---|
| 232 | #ifndef FGREP_PROGRAM
|
---|
| 233 | /* DFA compiled regexp. */
|
---|
| 234 | static struct dfa dfa;
|
---|
| 235 | @@ -134,49 +155,6 @@
|
---|
| 236 | }
|
---|
| 237 | #endif /* !FGREP_PROGRAM */
|
---|
| 238 |
|
---|
| 239 | -#ifdef MBS_SUPPORT
|
---|
| 240 | -/* This function allocate the array which correspond to "buf".
|
---|
| 241 | - Then this check multibyte string and mark on the positions which
|
---|
| 242 | - are not single byte character nor the first byte of a multibyte
|
---|
| 243 | - character. Caller must free the array. */
|
---|
| 244 | -static char*
|
---|
| 245 | -check_multibyte_string(char const *buf, size_t size)
|
---|
| 246 | -{
|
---|
| 247 | - char *mb_properties = xmalloc(size);
|
---|
| 248 | - mbstate_t cur_state;
|
---|
| 249 | - wchar_t wc;
|
---|
| 250 | - int i;
|
---|
| 251 | -
|
---|
| 252 | - memset(&cur_state, 0, sizeof(mbstate_t));
|
---|
| 253 | - memset(mb_properties, 0, sizeof(char)*size);
|
---|
| 254 | -
|
---|
| 255 | - for (i = 0; i < size ;)
|
---|
| 256 | - {
|
---|
| 257 | - size_t mbclen;
|
---|
| 258 | - mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state);
|
---|
| 259 | -
|
---|
| 260 | - if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
|
---|
| 261 | - {
|
---|
| 262 | - /* An invalid sequence, or a truncated multibyte character.
|
---|
| 263 | - We treat it as a single byte character. */
|
---|
| 264 | - mbclen = 1;
|
---|
| 265 | - }
|
---|
| 266 | - else if (match_icase)
|
---|
| 267 | - {
|
---|
| 268 | - if (iswupper((wint_t)wc))
|
---|
| 269 | - {
|
---|
| 270 | - wc = towlower((wint_t)wc);
|
---|
| 271 | - wcrtomb(buf + i, wc, &cur_state);
|
---|
| 272 | - }
|
---|
| 273 | - }
|
---|
| 274 | - mb_properties[i] = mbclen;
|
---|
| 275 | - i += mbclen;
|
---|
| 276 | - }
|
---|
| 277 | -
|
---|
| 278 | - return mb_properties;
|
---|
| 279 | -}
|
---|
| 280 | -#endif /* MBS_SUPPORT */
|
---|
| 281 | -
|
---|
| 282 | #if defined(GREP_PROGRAM) || defined(EGREP_PROGRAM)
|
---|
| 283 | #ifdef EGREP_PROGRAM
|
---|
| 284 | COMPILE_FCT(Ecompile)
|
---|
| 285 | @@ -193,10 +171,9 @@
|
---|
| 286 | size_t total = size;
|
---|
| 287 | char const *motif = pattern;
|
---|
| 288 |
|
---|
| 289 | -#if 0
|
---|
| 290 | + check_utf8 ();
|
---|
| 291 | if (match_icase)
|
---|
| 292 | syntax_bits |= RE_ICASE;
|
---|
| 293 | -#endif
|
---|
| 294 | re_set_syntax (syntax_bits);
|
---|
| 295 | dfasyntax (syntax_bits, match_icase, eolbyte);
|
---|
| 296 |
|
---|
| 297 | @@ -301,23 +278,35 @@
|
---|
| 298 | char eol = eolbyte;
|
---|
| 299 | int backref, start, len, best_len;
|
---|
| 300 | struct kwsmatch kwsm;
|
---|
| 301 | + static int use_dfa;
|
---|
| 302 | + static int use_dfa_checked = 0;
|
---|
| 303 | size_t i, ret_val;
|
---|
| 304 | #ifdef MBS_SUPPORT
|
---|
| 305 | - char *mb_properties = NULL;
|
---|
| 306 | - if (MB_CUR_MAX > 1)
|
---|
| 307 | + const char *last_char = NULL;
|
---|
| 308 | + int mb_cur_max = MB_CUR_MAX;
|
---|
| 309 | + mbstate_t mbs;
|
---|
| 310 | + memset (&mbs, '\0', sizeof (mbstate_t));
|
---|
| 311 | +#endif /* MBS_SUPPORT */
|
---|
| 312 | +
|
---|
| 313 | + if (!use_dfa_checked)
|
---|
| 314 | {
|
---|
| 315 | - if (match_icase)
|
---|
| 316 | - {
|
---|
| 317 | - char *case_buf = xmalloc(size);
|
---|
| 318 | - memcpy(case_buf, buf, size);
|
---|
| 319 | - if (start_ptr)
|
---|
| 320 | - start_ptr = case_buf + (start_ptr - buf);
|
---|
| 321 | - buf = case_buf;
|
---|
| 322 | - }
|
---|
| 323 | - if (kwset)
|
---|
| 324 | - mb_properties = check_multibyte_string(buf, size);
|
---|
| 325 | - }
|
---|
| 326 | + char *grep_use_dfa = getenv ("GREP_USE_DFA");
|
---|
| 327 | + if (!grep_use_dfa)
|
---|
| 328 | + {
|
---|
| 329 | +#ifdef MBS_SUPPORT
|
---|
| 330 | + /* Turn off DFA when processing multibyte input. */
|
---|
| 331 | + use_dfa = (MB_CUR_MAX == 1);
|
---|
| 332 | +#else
|
---|
| 333 | + use_dfa = 1;
|
---|
| 334 | #endif /* MBS_SUPPORT */
|
---|
| 335 | + }
|
---|
| 336 | + else
|
---|
| 337 | + {
|
---|
| 338 | + use_dfa = atoi (grep_use_dfa);
|
---|
| 339 | + }
|
---|
| 340 | +
|
---|
| 341 | + use_dfa_checked = 1;
|
---|
| 342 | + }
|
---|
| 343 |
|
---|
| 344 | buflim = buf + size;
|
---|
| 345 |
|
---|
| 346 | @@ -329,40 +318,123 @@
|
---|
| 347 | if (kwset)
|
---|
| 348 | {
|
---|
| 349 | /* Find a possible match using the KWset matcher. */
|
---|
| 350 | - size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
|
---|
| 351 | +#ifdef MBS_SUPPORT
|
---|
| 352 | + size_t bytes_left = 0;
|
---|
| 353 | +#endif /* MBS_SUPPORT */
|
---|
| 354 | + size_t offset;
|
---|
| 355 | +#ifdef MBS_SUPPORT
|
---|
| 356 | + /* kwsexec doesn't work with match_icase and multibyte input. */
|
---|
| 357 | + if (match_icase && mb_cur_max > 1)
|
---|
| 358 | + /* Avoid kwset */
|
---|
| 359 | + offset = 0;
|
---|
| 360 | + else
|
---|
| 361 | +#endif /* MBS_SUPPORT */
|
---|
| 362 | + offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
|
---|
| 363 | if (offset == (size_t) -1)
|
---|
| 364 | - goto failure;
|
---|
| 365 | + return (size_t)-1;
|
---|
| 366 | +#ifdef MBS_SUPPORT
|
---|
| 367 | + if (mb_cur_max > 1 && !using_utf8)
|
---|
| 368 | + {
|
---|
| 369 | + bytes_left = offset;
|
---|
| 370 | + while (bytes_left)
|
---|
| 371 | + {
|
---|
| 372 | + size_t mlen = mbrlen (beg, bytes_left, &mbs);
|
---|
| 373 | +
|
---|
| 374 | + last_char = beg;
|
---|
| 375 | + if (mlen == (size_t) -1 || mlen == 0)
|
---|
| 376 | + {
|
---|
| 377 | + /* Incomplete character: treat as single-byte. */
|
---|
| 378 | + memset (&mbs, '\0', sizeof (mbstate_t));
|
---|
| 379 | + beg++;
|
---|
| 380 | + bytes_left--;
|
---|
| 381 | + continue;
|
---|
| 382 | + }
|
---|
| 383 | +
|
---|
| 384 | + if (mlen == (size_t) -2)
|
---|
| 385 | + /* Offset points inside multibyte character:
|
---|
| 386 | + * no good. */
|
---|
| 387 | + break;
|
---|
| 388 | +
|
---|
| 389 | + beg += mlen;
|
---|
| 390 | + bytes_left -= mlen;
|
---|
| 391 | + }
|
---|
| 392 | + }
|
---|
| 393 | + else
|
---|
| 394 | +#endif /* MBS_SUPPORT */
|
---|
| 395 | beg += offset;
|
---|
| 396 | /* Narrow down to the line containing the candidate, and
|
---|
| 397 | run it through DFA. */
|
---|
| 398 | end = memchr(beg, eol, buflim - beg);
|
---|
| 399 | end++;
|
---|
| 400 | #ifdef MBS_SUPPORT
|
---|
| 401 | - if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
|
---|
| 402 | + if (mb_cur_max > 1 && bytes_left)
|
---|
| 403 | continue;
|
---|
| 404 | #endif
|
---|
| 405 | while (beg > buf && beg[-1] != eol)
|
---|
| 406 | --beg;
|
---|
| 407 | - if (kwsm.index < kwset_exact_matches)
|
---|
| 408 | + if (
|
---|
| 409 | +#ifdef MBS_SUPPORT
|
---|
| 410 | + !(match_icase && mb_cur_max > 1) &&
|
---|
| 411 | +#endif /* MBS_SUPPORT */
|
---|
| 412 | + (kwsm.index < kwset_exact_matches))
|
---|
| 413 | goto success;
|
---|
| 414 | - if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
|
---|
| 415 | + if (use_dfa &&
|
---|
| 416 | + dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
|
---|
| 417 | continue;
|
---|
| 418 | }
|
---|
| 419 | else
|
---|
| 420 | {
|
---|
| 421 | /* No good fixed strings; start with DFA. */
|
---|
| 422 | - size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
|
---|
| 423 | +#ifdef MBS_SUPPORT
|
---|
| 424 | + size_t bytes_left = 0;
|
---|
| 425 | +#endif /* MBS_SUPPORT */
|
---|
| 426 | + size_t offset = 0;
|
---|
| 427 | + if (use_dfa)
|
---|
| 428 | + offset = dfaexec (&dfa, beg, buflim - beg, &backref);
|
---|
| 429 | if (offset == (size_t) -1)
|
---|
| 430 | break;
|
---|
| 431 | /* Narrow down to the line we've found. */
|
---|
| 432 | +#ifdef MBS_SUPPORT
|
---|
| 433 | + if (mb_cur_max > 1 && !using_utf8)
|
---|
| 434 | + {
|
---|
| 435 | + bytes_left = offset;
|
---|
| 436 | + while (bytes_left)
|
---|
| 437 | + {
|
---|
| 438 | + size_t mlen = mbrlen (beg, bytes_left, &mbs);
|
---|
| 439 | +
|
---|
| 440 | + last_char = beg;
|
---|
| 441 | + if (mlen == (size_t) -1 || mlen == 0)
|
---|
| 442 | + {
|
---|
| 443 | + /* Incomplete character: treat as single-byte. */
|
---|
| 444 | + memset (&mbs, '\0', sizeof (mbstate_t));
|
---|
| 445 | + beg++;
|
---|
| 446 | + bytes_left--;
|
---|
| 447 | + continue;
|
---|
| 448 | + }
|
---|
| 449 | +
|
---|
| 450 | + if (mlen == (size_t) -2)
|
---|
| 451 | + /* Offset points inside multibyte character:
|
---|
| 452 | + * no good. */
|
---|
| 453 | + break;
|
---|
| 454 | +
|
---|
| 455 | + beg += mlen;
|
---|
| 456 | + bytes_left -= mlen;
|
---|
| 457 | + }
|
---|
| 458 | + }
|
---|
| 459 | + else
|
---|
| 460 | +#endif /* MBS_SUPPORT */
|
---|
| 461 | beg += offset;
|
---|
| 462 | end = memchr (beg, eol, buflim - beg);
|
---|
| 463 | end++;
|
---|
| 464 | +#ifdef MBS_SUPPORT
|
---|
| 465 | + if (mb_cur_max > 1 && bytes_left)
|
---|
| 466 | + continue;
|
---|
| 467 | +#endif /* MBS_SUPPORT */
|
---|
| 468 | while (beg > buf && beg[-1] != eol)
|
---|
| 469 | --beg;
|
---|
| 470 | }
|
---|
| 471 | /* Successful, no backreferences encountered! */
|
---|
| 472 | - if (!backref)
|
---|
| 473 | + if (use_dfa && !backref)
|
---|
| 474 | goto success;
|
---|
| 475 | }
|
---|
| 476 | else
|
---|
| 477 | @@ -408,10 +480,84 @@
|
---|
| 478 | if (match_words)
|
---|
| 479 | while (match <= best_match)
|
---|
| 480 | {
|
---|
| 481 | - if ((match == buf || !WCHAR ((unsigned char) match[-1]))
|
---|
| 482 | - && (len == end - beg - 1
|
---|
| 483 | - || !WCHAR ((unsigned char) match[len])))
|
---|
| 484 | - goto assess_pattern_match;
|
---|
| 485 | + int lword_match = 0;
|
---|
| 486 | + if (match == buf)
|
---|
| 487 | + lword_match = 1;
|
---|
| 488 | + else
|
---|
| 489 | + {
|
---|
| 490 | + assert (start > 0);
|
---|
| 491 | +#ifdef MBS_SUPPORT
|
---|
| 492 | + if (mb_cur_max > 1)
|
---|
| 493 | + {
|
---|
| 494 | + const char *s;
|
---|
| 495 | + int mr;
|
---|
| 496 | + wchar_t pwc;
|
---|
| 497 | + if (using_utf8)
|
---|
| 498 | + {
|
---|
| 499 | + s = match - 1;
|
---|
| 500 | + while (s > buf
|
---|
| 501 | + && (unsigned char) *s >= 0x80
|
---|
| 502 | + && (unsigned char) *s <= 0xbf)
|
---|
| 503 | + --s;
|
---|
| 504 | + }
|
---|
| 505 | + else
|
---|
| 506 | + s = last_char;
|
---|
| 507 | + mr = mbtowc (&pwc, s, match - s);
|
---|
| 508 | + if (mr <= 0)
|
---|
| 509 | + {
|
---|
| 510 | + memset (&mbs, '\0', sizeof (mbstate_t));
|
---|
| 511 | + lword_match = 1;
|
---|
| 512 | + }
|
---|
| 513 | + else if (!(iswalnum (pwc) || pwc == L'_')
|
---|
| 514 | + && mr == (int) (match - s))
|
---|
| 515 | + lword_match = 1;
|
---|
| 516 | + }
|
---|
| 517 | + else
|
---|
| 518 | +#endif /* MBS_SUPPORT */
|
---|
| 519 | + if (!WCHAR ((unsigned char) match[-1]))
|
---|
| 520 | + lword_match = 1;
|
---|
| 521 | + }
|
---|
| 522 | +
|
---|
| 523 | + if (lword_match)
|
---|
| 524 | + {
|
---|
| 525 | + int rword_match = 0;
|
---|
| 526 | + if (start + len == end - beg - 1)
|
---|
| 527 | + rword_match = 1;
|
---|
| 528 | + else
|
---|
| 529 | + {
|
---|
| 530 | +#ifdef MBS_SUPPORT
|
---|
| 531 | + if (mb_cur_max > 1)
|
---|
| 532 | + {
|
---|
| 533 | + wchar_t nwc;
|
---|
| 534 | + int mr;
|
---|
| 535 | +
|
---|
| 536 | + mr = mbtowc (&nwc, buf + start + len,
|
---|
| 537 | + end - buf - start - len - 1);
|
---|
| 538 | + if (mr <= 0)
|
---|
| 539 | + {
|
---|
| 540 | + memset (&mbs, '\0', sizeof (mbstate_t));
|
---|
| 541 | + rword_match = 1;
|
---|
| 542 | + }
|
---|
| 543 | + else if (!iswalnum (nwc) && nwc != L'_')
|
---|
| 544 | + rword_match = 1;
|
---|
| 545 | + }
|
---|
| 546 | + else
|
---|
| 547 | +#endif /* MBS_SUPPORT */
|
---|
| 548 | + if (!WCHAR ((unsigned char) match[len]))
|
---|
| 549 | + rword_match = 1;
|
---|
| 550 | + }
|
---|
| 551 | +
|
---|
| 552 | + if (rword_match)
|
---|
| 553 | + {
|
---|
| 554 | + if (!start_ptr)
|
---|
| 555 | + /* Returns the whole line. */
|
---|
| 556 | + goto success;
|
---|
| 557 | + else
|
---|
| 558 | + {
|
---|
| 559 | + goto assess_pattern_match;
|
---|
| 560 | + }
|
---|
| 561 | + }
|
---|
| 562 | + }
|
---|
| 563 | if (len > 0)
|
---|
| 564 | {
|
---|
| 565 | /* Try a shorter length anchored at the same place. */
|
---|
| 566 | @@ -475,24 +621,144 @@
|
---|
| 567 | *match_size = len;
|
---|
| 568 | ret_val = beg - buf;
|
---|
| 569 | out:
|
---|
| 570 | -#ifdef MBS_SUPPORT
|
---|
| 571 | - if (MB_CUR_MAX > 1)
|
---|
| 572 | - {
|
---|
| 573 | - if (match_icase)
|
---|
| 574 | - free((char*)buf);
|
---|
| 575 | - if (mb_properties)
|
---|
| 576 | - free(mb_properties);
|
---|
| 577 | - }
|
---|
| 578 | -#endif /* MBS_SUPPORT */
|
---|
| 579 | return ret_val;
|
---|
| 580 | }
|
---|
| 581 | #endif /* defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) */
|
---|
| 582 |
|
---|
| 583 | +#ifdef MBS_SUPPORT
|
---|
| 584 | +static int f_i_multibyte; /* whether we're using the new -Fi MB method */
|
---|
| 585 | +static struct
|
---|
| 586 | +{
|
---|
| 587 | + wchar_t **patterns;
|
---|
| 588 | + size_t count, maxlen;
|
---|
| 589 | + unsigned char *match;
|
---|
| 590 | +} Fimb;
|
---|
| 591 | +#endif
|
---|
| 592 | +
|
---|
| 593 | #if defined(GREP_PROGRAM) || defined(FGREP_PROGRAM)
|
---|
| 594 | COMPILE_FCT(Fcompile)
|
---|
| 595 | {
|
---|
| 596 | + int mb_cur_max = MB_CUR_MAX;
|
---|
| 597 | char const *beg, *lim, *err;
|
---|
| 598 |
|
---|
| 599 | + check_utf8 ();
|
---|
| 600 | +#ifdef MBS_SUPPORT
|
---|
| 601 | + /* Support -F -i for UTF-8 input. */
|
---|
| 602 | + if (match_icase && mb_cur_max > 1)
|
---|
| 603 | + {
|
---|
| 604 | + mbstate_t mbs;
|
---|
| 605 | + wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t));
|
---|
| 606 | + const char *patternend = pattern;
|
---|
| 607 | + size_t wcsize;
|
---|
| 608 | + kwset_t fimb_kwset = NULL;
|
---|
| 609 | + char *starts = NULL;
|
---|
| 610 | + wchar_t *wcbeg, *wclim;
|
---|
| 611 | + size_t allocated = 0;
|
---|
| 612 | +
|
---|
| 613 | + memset (&mbs, '\0', sizeof (mbs));
|
---|
| 614 | +# ifdef __GNU_LIBRARY__
|
---|
| 615 | + wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs);
|
---|
| 616 | + if (patternend != pattern + size)
|
---|
| 617 | + wcsize = (size_t) -1;
|
---|
| 618 | +# else
|
---|
| 619 | + {
|
---|
| 620 | + char *patterncopy = xmalloc (size + 1);
|
---|
| 621 | +
|
---|
| 622 | + memcpy (patterncopy, pattern, size);
|
---|
| 623 | + patterncopy[size] = '\0';
|
---|
| 624 | + patternend = patterncopy;
|
---|
| 625 | + wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs);
|
---|
| 626 | + if (patternend != patterncopy + size)
|
---|
| 627 | + wcsize = (size_t) -1;
|
---|
| 628 | + free (patterncopy);
|
---|
| 629 | + }
|
---|
| 630 | +# endif
|
---|
| 631 | + if (wcsize + 2 <= 2)
|
---|
| 632 | + {
|
---|
| 633 | +fimb_fail:
|
---|
| 634 | + free (wcpattern);
|
---|
| 635 | + free (starts);
|
---|
| 636 | + if (fimb_kwset)
|
---|
| 637 | + kwsfree (fimb_kwset);
|
---|
| 638 | + free (Fimb.patterns);
|
---|
| 639 | + Fimb.patterns = NULL;
|
---|
| 640 | + }
|
---|
| 641 | + else
|
---|
| 642 | + {
|
---|
| 643 | + if (!(fimb_kwset = kwsalloc (NULL)))
|
---|
| 644 | + error (2, 0, _("memory exhausted"));
|
---|
| 645 | +
|
---|
| 646 | + starts = xmalloc (mb_cur_max * 3);
|
---|
| 647 | + wcbeg = wcpattern;
|
---|
| 648 | + do
|
---|
| 649 | + {
|
---|
| 650 | + int i;
|
---|
| 651 | + size_t wclen;
|
---|
| 652 | +
|
---|
| 653 | + if (Fimb.count >= allocated)
|
---|
| 654 | + {
|
---|
| 655 | + if (allocated == 0)
|
---|
| 656 | + allocated = 128;
|
---|
| 657 | + else
|
---|
| 658 | + allocated *= 2;
|
---|
| 659 | + Fimb.patterns = xrealloc (Fimb.patterns,
|
---|
| 660 | + sizeof (wchar_t *) * allocated);
|
---|
| 661 | + }
|
---|
| 662 | + Fimb.patterns[Fimb.count++] = wcbeg;
|
---|
| 663 | + for (wclim = wcbeg;
|
---|
| 664 | + wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim)
|
---|
| 665 | + *wclim = towlower (*wclim);
|
---|
| 666 | + *wclim = L'\0';
|
---|
| 667 | + wclen = wclim - wcbeg;
|
---|
| 668 | + if (wclen > Fimb.maxlen)
|
---|
| 669 | + Fimb.maxlen = wclen;
|
---|
| 670 | + if (wclen > 3)
|
---|
| 671 | + wclen = 3;
|
---|
| 672 | + if (wclen == 0)
|
---|
| 673 | + {
|
---|
| 674 | + if ((err = kwsincr (fimb_kwset, "", 0)) != 0)
|
---|
| 675 | + error (2, 0, err);
|
---|
| 676 | + }
|
---|
| 677 | + else
|
---|
| 678 | + for (i = 0; i < (1 << wclen); i++)
|
---|
| 679 | + {
|
---|
| 680 | + char *p = starts;
|
---|
| 681 | + int j, k;
|
---|
| 682 | +
|
---|
| 683 | + for (j = 0; j < wclen; ++j)
|
---|
| 684 | + {
|
---|
| 685 | + wchar_t wc = wcbeg[j];
|
---|
| 686 | + if (i & (1 << j))
|
---|
| 687 | + {
|
---|
| 688 | + wc = towupper (wc);
|
---|
| 689 | + if (wc == wcbeg[j])
|
---|
| 690 | + continue;
|
---|
| 691 | + }
|
---|
| 692 | + k = wctomb (p, wc);
|
---|
| 693 | + if (k <= 0)
|
---|
| 694 | + goto fimb_fail;
|
---|
| 695 | + p += k;
|
---|
| 696 | + }
|
---|
| 697 | + if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0)
|
---|
| 698 | + error (2, 0, err);
|
---|
| 699 | + }
|
---|
| 700 | + if (wclim < wcpattern + wcsize)
|
---|
| 701 | + ++wclim;
|
---|
| 702 | + wcbeg = wclim;
|
---|
| 703 | + }
|
---|
| 704 | + while (wcbeg < wcpattern + wcsize);
|
---|
| 705 | + f_i_multibyte = 1;
|
---|
| 706 | + kwset = fimb_kwset;
|
---|
| 707 | + free (starts);
|
---|
| 708 | + Fimb.match = xmalloc (Fimb.count);
|
---|
| 709 | + if ((err = kwsprep (kwset)) != 0)
|
---|
| 710 | + error (2, 0, err);
|
---|
| 711 | + return;
|
---|
| 712 | + }
|
---|
| 713 | + }
|
---|
| 714 | +#endif /* MBS_SUPPORT */
|
---|
| 715 | +
|
---|
| 716 | +
|
---|
| 717 | kwsinit ();
|
---|
| 718 | beg = pattern;
|
---|
| 719 | do
|
---|
| 720 | @@ -511,6 +777,76 @@
|
---|
| 721 | error (2, 0, err);
|
---|
| 722 | }
|
---|
| 723 |
|
---|
| 724 | +#ifdef MBS_SUPPORT
|
---|
| 725 | +static int
|
---|
| 726 | +Fimbexec (const char *buf, size_t size, size_t *plen, int exact)
|
---|
| 727 | +{
|
---|
| 728 | + size_t len, letter, i;
|
---|
| 729 | + int ret = -1;
|
---|
| 730 | + mbstate_t mbs;
|
---|
| 731 | + wchar_t wc;
|
---|
| 732 | + int patterns_left;
|
---|
| 733 | +
|
---|
| 734 | + assert (match_icase && f_i_multibyte == 1);
|
---|
| 735 | + assert (MB_CUR_MAX > 1);
|
---|
| 736 | +
|
---|
| 737 | + memset (&mbs, '\0', sizeof (mbs));
|
---|
| 738 | + memset (Fimb.match, '\1', Fimb.count);
|
---|
| 739 | + letter = len = 0;
|
---|
| 740 | + patterns_left = 1;
|
---|
| 741 | + while (patterns_left && len <= size)
|
---|
| 742 | + {
|
---|
| 743 | + size_t c;
|
---|
| 744 | +
|
---|
| 745 | + patterns_left = 0;
|
---|
| 746 | + if (len < size)
|
---|
| 747 | + {
|
---|
| 748 | + c = mbrtowc (&wc, buf + len, size - len, &mbs);
|
---|
| 749 | + if (c + 2 <= 2)
|
---|
| 750 | + return ret;
|
---|
| 751 | +
|
---|
| 752 | + wc = towlower (wc);
|
---|
| 753 | + }
|
---|
| 754 | + else
|
---|
| 755 | + {
|
---|
| 756 | + c = 1;
|
---|
| 757 | + wc = L'\0';
|
---|
| 758 | + }
|
---|
| 759 | +
|
---|
| 760 | + for (i = 0; i < Fimb.count; i++)
|
---|
| 761 | + {
|
---|
| 762 | + if (Fimb.match[i])
|
---|
| 763 | + {
|
---|
| 764 | + if (Fimb.patterns[i][letter] == L'\0')
|
---|
| 765 | + {
|
---|
| 766 | + /* Found a match. */
|
---|
| 767 | + *plen = len;
|
---|
| 768 | + if (!exact && !match_words)
|
---|
| 769 | + return 0;
|
---|
| 770 | + else
|
---|
| 771 | + {
|
---|
| 772 | + /* For -w or exact look for longest match. */
|
---|
| 773 | + ret = 0;
|
---|
| 774 | + Fimb.match[i] = '\0';
|
---|
| 775 | + continue;
|
---|
| 776 | + }
|
---|
| 777 | + }
|
---|
| 778 | +
|
---|
| 779 | + if (Fimb.patterns[i][letter] == wc)
|
---|
| 780 | + patterns_left = 1;
|
---|
| 781 | + else
|
---|
| 782 | + Fimb.match[i] = '\0';
|
---|
| 783 | + }
|
---|
| 784 | + }
|
---|
| 785 | +
|
---|
| 786 | + len += c;
|
---|
| 787 | + letter++;
|
---|
| 788 | + }
|
---|
| 789 | +
|
---|
| 790 | + return ret;
|
---|
| 791 | +}
|
---|
| 792 | +#endif /* MBS_SUPPORT */
|
---|
| 793 | +
|
---|
| 794 | EXECUTE_FCT(Fexecute)
|
---|
| 795 | {
|
---|
| 796 | register char const *beg, *try, *end;
|
---|
| 797 | @@ -519,69 +855,256 @@
|
---|
| 798 | struct kwsmatch kwsmatch;
|
---|
| 799 | size_t ret_val;
|
---|
| 800 | #ifdef MBS_SUPPORT
|
---|
| 801 | - char *mb_properties = NULL;
|
---|
| 802 | - if (MB_CUR_MAX > 1)
|
---|
| 803 | - {
|
---|
| 804 | - if (match_icase)
|
---|
| 805 | - {
|
---|
| 806 | - char *case_buf = xmalloc(size);
|
---|
| 807 | - memcpy(case_buf, buf, size);
|
---|
| 808 | - if (start_ptr)
|
---|
| 809 | - start_ptr = case_buf + (start_ptr - buf);
|
---|
| 810 | - buf = case_buf;
|
---|
| 811 | - }
|
---|
| 812 | - mb_properties = check_multibyte_string(buf, size);
|
---|
| 813 | - }
|
---|
| 814 | + int mb_cur_max = MB_CUR_MAX;
|
---|
| 815 | + mbstate_t mbs;
|
---|
| 816 | + memset (&mbs, '\0', sizeof (mbstate_t));
|
---|
| 817 | + const char *last_char = NULL;
|
---|
| 818 | #endif /* MBS_SUPPORT */
|
---|
| 819 |
|
---|
| 820 | for (beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
|
---|
| 821 | {
|
---|
| 822 | size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
|
---|
| 823 | if (offset == (size_t) -1)
|
---|
| 824 | - goto failure;
|
---|
| 825 | + return offset;
|
---|
| 826 | #ifdef MBS_SUPPORT
|
---|
| 827 | - if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
|
---|
| 828 | - continue; /* It is a part of multibyte character. */
|
---|
| 829 | + if (mb_cur_max > 1 && !using_utf8)
|
---|
| 830 | + {
|
---|
| 831 | + size_t bytes_left = offset;
|
---|
| 832 | + while (bytes_left)
|
---|
| 833 | + {
|
---|
| 834 | + size_t mlen = mbrlen (beg, bytes_left, &mbs);
|
---|
| 835 | +
|
---|
| 836 | + last_char = beg;
|
---|
| 837 | + if (mlen == (size_t) -1 || mlen == 0)
|
---|
| 838 | + {
|
---|
| 839 | + /* Incomplete character: treat as single-byte. */
|
---|
| 840 | + memset (&mbs, '\0', sizeof (mbstate_t));
|
---|
| 841 | + beg++;
|
---|
| 842 | + bytes_left--;
|
---|
| 843 | + continue;
|
---|
| 844 | + }
|
---|
| 845 | +
|
---|
| 846 | + if (mlen == (size_t) -2)
|
---|
| 847 | + /* Offset points inside multibyte character: no good. */
|
---|
| 848 | + break;
|
---|
| 849 | +
|
---|
| 850 | + beg += mlen;
|
---|
| 851 | + bytes_left -= mlen;
|
---|
| 852 | + }
|
---|
| 853 | +
|
---|
| 854 | + if (bytes_left)
|
---|
| 855 | + continue;
|
---|
| 856 | + }
|
---|
| 857 | + else
|
---|
| 858 | #endif /* MBS_SUPPORT */
|
---|
| 859 | beg += offset;
|
---|
| 860 | +#ifdef MBS_SUPPORT
|
---|
| 861 | + /* For f_i_multibyte, the string at beg now matches first 3 chars of
|
---|
| 862 | + one of the search strings (less if there are shorter search strings).
|
---|
| 863 | + See if this is a real match. */
|
---|
| 864 | + if (f_i_multibyte
|
---|
| 865 | + && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], start_ptr == NULL))
|
---|
| 866 | + goto next_char;
|
---|
| 867 | +#endif /* MBS_SUPPORT */
|
---|
| 868 | len = kwsmatch.size[0];
|
---|
| 869 | if (start_ptr && !match_words)
|
---|
| 870 | goto success_in_beg_and_len;
|
---|
| 871 | if (match_lines)
|
---|
| 872 | {
|
---|
| 873 | if (beg > buf && beg[-1] != eol)
|
---|
| 874 | - continue;
|
---|
| 875 | + goto next_char;
|
---|
| 876 | if (beg + len < buf + size && beg[len] != eol)
|
---|
| 877 | - continue;
|
---|
| 878 | + goto next_char;
|
---|
| 879 | goto success;
|
---|
| 880 | }
|
---|
| 881 | else if (match_words)
|
---|
| 882 | - for (try = beg; len; )
|
---|
| 883 | - {
|
---|
| 884 | - if (try > buf && WCHAR((unsigned char) try[-1]))
|
---|
| 885 | - break;
|
---|
| 886 | - if (try + len < buf + size && WCHAR((unsigned char) try[len]))
|
---|
| 887 | - {
|
---|
| 888 | - offset = kwsexec (kwset, beg, --len, &kwsmatch);
|
---|
| 889 | - if (offset == (size_t) -1)
|
---|
| 890 | - break;
|
---|
| 891 | - try = beg + offset;
|
---|
| 892 | - len = kwsmatch.size[0];
|
---|
| 893 | - }
|
---|
| 894 | - else if (!start_ptr)
|
---|
| 895 | - goto success;
|
---|
| 896 | - else
|
---|
| 897 | - goto success_in_beg_and_len;
|
---|
| 898 | - } /* for (try) */
|
---|
| 899 | - else
|
---|
| 900 | + {
|
---|
| 901 | + while (len)
|
---|
| 902 | + {
|
---|
| 903 | + int word_match = 0;
|
---|
| 904 | + if (beg > buf)
|
---|
| 905 | + {
|
---|
| 906 | +#ifdef MBS_SUPPORT
|
---|
| 907 | + if (mb_cur_max > 1)
|
---|
| 908 | + {
|
---|
| 909 | + const char *s;
|
---|
| 910 | + int mr;
|
---|
| 911 | + wchar_t pwc;
|
---|
| 912 | +
|
---|
| 913 | + if (using_utf8)
|
---|
| 914 | + {
|
---|
| 915 | + s = beg - 1;
|
---|
| 916 | + while (s > buf
|
---|
| 917 | + && (unsigned char) *s >= 0x80
|
---|
| 918 | + && (unsigned char) *s <= 0xbf)
|
---|
| 919 | + --s;
|
---|
| 920 | + }
|
---|
| 921 | + else
|
---|
| 922 | + s = last_char;
|
---|
| 923 | + mr = mbtowc (&pwc, s, beg - s);
|
---|
| 924 | + if (mr <= 0)
|
---|
| 925 | + memset (&mbs, '\0', sizeof (mbstate_t));
|
---|
| 926 | + else if ((iswalnum (pwc) || pwc == L'_')
|
---|
| 927 | + && mr == (int) (beg - s))
|
---|
| 928 | + goto next_char;
|
---|
| 929 | + }
|
---|
| 930 | + else
|
---|
| 931 | +#endif /* MBS_SUPPORT */
|
---|
| 932 | + if (WCHAR ((unsigned char) beg[-1]))
|
---|
| 933 | + goto next_char;
|
---|
| 934 | + }
|
---|
| 935 | +#ifdef MBS_SUPPORT
|
---|
| 936 | + if (mb_cur_max > 1)
|
---|
| 937 | + {
|
---|
| 938 | + wchar_t nwc;
|
---|
| 939 | + int mr;
|
---|
| 940 | +
|
---|
| 941 | + mr = mbtowc (&nwc, beg + len, buf + size - beg - len);
|
---|
| 942 | + if (mr <= 0)
|
---|
| 943 | + {
|
---|
| 944 | + memset (&mbs, '\0', sizeof (mbstate_t));
|
---|
| 945 | + word_match = 1;
|
---|
| 946 | + }
|
---|
| 947 | + else if (!iswalnum (nwc) && nwc != L'_')
|
---|
| 948 | + word_match = 1;
|
---|
| 949 | + }
|
---|
| 950 | + else
|
---|
| 951 | +#endif /* MBS_SUPPORT */
|
---|
| 952 | + if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len]))
|
---|
| 953 | + word_match = 1;
|
---|
| 954 | + if (word_match)
|
---|
| 955 | + {
|
---|
| 956 | + if (start_ptr == NULL)
|
---|
| 957 | + /* Returns the whole line now we know there's a word match. */
|
---|
| 958 | + goto success;
|
---|
| 959 | + else {
|
---|
| 960 | + /* Returns just this word match. */
|
---|
| 961 | + *match_size = len;
|
---|
| 962 | + return beg - buf;
|
---|
| 963 | + }
|
---|
| 964 | + }
|
---|
| 965 | + if (len > 0)
|
---|
| 966 | + {
|
---|
| 967 | + /* Try a shorter length anchored at the same place. */
|
---|
| 968 | + --len;
|
---|
| 969 | + offset = kwsexec (kwset, beg, len, &kwsmatch);
|
---|
| 970 | +
|
---|
| 971 | + if (offset == -1)
|
---|
| 972 | + goto next_char; /* Try a different anchor. */
|
---|
| 973 | +#ifdef MBS_SUPPORT
|
---|
| 974 | +
|
---|
| 975 | + if (mb_cur_max > 1 && !using_utf8)
|
---|
| 976 | + {
|
---|
| 977 | + size_t bytes_left = offset;
|
---|
| 978 | + while (bytes_left)
|
---|
| 979 | + {
|
---|
| 980 | + size_t mlen = mbrlen (beg, bytes_left, &mbs);
|
---|
| 981 | +
|
---|
| 982 | + last_char = beg;
|
---|
| 983 | + if (mlen == (size_t) -1 || mlen == 0)
|
---|
| 984 | + {
|
---|
| 985 | + /* Incomplete character: treat as single-byte. */
|
---|
| 986 | + memset (&mbs, '\0', sizeof (mbstate_t));
|
---|
| 987 | + beg++;
|
---|
| 988 | + bytes_left--;
|
---|
| 989 | + continue;
|
---|
| 990 | + }
|
---|
| 991 | +
|
---|
| 992 | + if (mlen == (size_t) -2)
|
---|
| 993 | + {
|
---|
| 994 | + /* Offset points inside multibyte character:
|
---|
| 995 | + * no good. */
|
---|
| 996 | + break;
|
---|
| 997 | + }
|
---|
| 998 | +
|
---|
| 999 | + beg += mlen;
|
---|
| 1000 | + bytes_left -= mlen;
|
---|
| 1001 | + }
|
---|
| 1002 | +
|
---|
| 1003 | + if (bytes_left)
|
---|
| 1004 | + {
|
---|
| 1005 | + memset (&mbs, '\0', sizeof (mbstate_t));
|
---|
| 1006 | + goto next_char; /* Try a different anchor. */
|
---|
| 1007 | + }
|
---|
| 1008 | + }
|
---|
| 1009 | + else
|
---|
| 1010 | +#endif /* MBS_SUPPORT */
|
---|
| 1011 | + beg += offset;
|
---|
| 1012 | +#ifdef MBS_SUPPORT
|
---|
| 1013 | + /* The string at beg now matches first 3 chars of one of
|
---|
| 1014 | + the search strings (less if there are shorter search
|
---|
| 1015 | + strings). See if this is a real match. */
|
---|
| 1016 | + if (f_i_multibyte
|
---|
| 1017 | + && Fimbexec (beg, len - offset, &kwsmatch.size[0],
|
---|
| 1018 | + start_ptr == NULL))
|
---|
| 1019 | + goto next_char;
|
---|
| 1020 | +#endif /* MBS_SUPPORT */
|
---|
| 1021 | + len = kwsmatch.size[0];
|
---|
| 1022 | + }
|
---|
| 1023 | + }
|
---|
| 1024 | + }
|
---|
| 1025 | + else
|
---|
| 1026 | goto success;
|
---|
| 1027 | - } /* for (beg in buf) */
|
---|
| 1028 | +next_char:;
|
---|
| 1029 | +#ifdef MBS_SUPPORT
|
---|
| 1030 | + /* Advance to next character. For MB_CUR_MAX == 1 case this is handled
|
---|
| 1031 | + by ++beg above. */
|
---|
| 1032 | + if (mb_cur_max > 1)
|
---|
| 1033 | + {
|
---|
| 1034 | + if (using_utf8)
|
---|
| 1035 | + {
|
---|
| 1036 | + unsigned char c = *beg;
|
---|
| 1037 | + if (c >= 0xc2)
|
---|
| 1038 | + {
|
---|
| 1039 | + if (c < 0xe0)
|
---|
| 1040 | + ++beg;
|
---|
| 1041 | + else if (c < 0xf0)
|
---|
| 1042 | + beg += 2;
|
---|
| 1043 | + else if (c < 0xf8)
|
---|
| 1044 | + beg += 3;
|
---|
| 1045 | + else if (c < 0xfc)
|
---|
| 1046 | + beg += 4;
|
---|
| 1047 | + else if (c < 0xfe)
|
---|
| 1048 | + beg += 5;
|
---|
| 1049 | + }
|
---|
| 1050 | + }
|
---|
| 1051 | + else
|
---|
| 1052 | + {
|
---|
| 1053 | + size_t l = mbrlen (beg, buf + size - beg, &mbs);
|
---|
| 1054 |
|
---|
| 1055 | - failure:
|
---|
| 1056 | - ret_val = -1;
|
---|
| 1057 | - goto out;
|
---|
| 1058 | + last_char = beg;
|
---|
| 1059 | + if (l + 2 >= 2)
|
---|
| 1060 | + beg += l - 1;
|
---|
| 1061 | + else
|
---|
| 1062 | + memset (&mbs, '\0', sizeof (mbstate_t));
|
---|
| 1063 | + }
|
---|
| 1064 | + }
|
---|
| 1065 | +#endif /* MBS_SUPPORT */
|
---|
| 1066 | + }
|
---|
| 1067 | +
|
---|
| 1068 | + return -1;
|
---|
| 1069 |
|
---|
| 1070 | success:
|
---|
| 1071 | +#ifdef MBS_SUPPORT
|
---|
| 1072 | + if (mb_cur_max > 1 && !using_utf8)
|
---|
| 1073 | + {
|
---|
| 1074 | + end = beg + len;
|
---|
| 1075 | + while (end < buf + size)
|
---|
| 1076 | + {
|
---|
| 1077 | + size_t mlen = mbrlen (end, buf + size - end, &mbs);
|
---|
| 1078 | + if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
|
---|
| 1079 | + {
|
---|
| 1080 | + memset (&mbs, '\0', sizeof (mbstate_t));
|
---|
| 1081 | + mlen = 1;
|
---|
| 1082 | + }
|
---|
| 1083 | + if (mlen == 1 && *end == eol)
|
---|
| 1084 | + break;
|
---|
| 1085 | +
|
---|
| 1086 | + end += mlen;
|
---|
| 1087 | + }
|
---|
| 1088 | + }
|
---|
| 1089 | + else
|
---|
| 1090 | + #endif /* MBS_SUPPORT */
|
---|
| 1091 | end = memchr (beg + len, eol, (buf + size) - (beg + len));
|
---|
| 1092 | end++;
|
---|
| 1093 | while (buf < beg && beg[-1] != eol)
|
---|
| 1094 | @@ -591,15 +1114,6 @@
|
---|
| 1095 | *match_size = len;
|
---|
| 1096 | ret_val = beg - buf;
|
---|
| 1097 | out:
|
---|
| 1098 | -#ifdef MBS_SUPPORT
|
---|
| 1099 | - if (MB_CUR_MAX > 1)
|
---|
| 1100 | - {
|
---|
| 1101 | - if (match_icase)
|
---|
| 1102 | - free((char*)buf);
|
---|
| 1103 | - if (mb_properties)
|
---|
| 1104 | - free(mb_properties);
|
---|
| 1105 | - }
|
---|
| 1106 | -#endif /* MBS_SUPPORT */
|
---|
| 1107 | return ret_val;
|
---|
| 1108 | }
|
---|
| 1109 | #endif /* defined(GREP_PROGRAM) || defined(FGREP_PROGRAM) */
|
---|