source:
scripts/patches/grep-2.5.1a-redhat_fixes-2.patch@
4dd194e2
Last change on this file since 4dd194e2 was c0cf39e, checked in by , 19 years ago | |
---|---|
|
|
File size: 54.8 KB |
-
doc/grep.1
Submitted by: Alexander E. Patrakov
Date: 2005-08-13
Initial Package Version: 2.5.1a
Upstream Status: Partially accepted, partially rejected, but required for LSB >= 2.0 certification
Origin: RedHat
Description: Various fixes from RedHat. Individual patches:
    grep-2.5.1-fgrep.patch
    grep-2.5.1-bracket.patch
    grep-2.5-i18n.patch
    grep-2.5.1-oi.patch
    grep-2.5.1-manpage.patch
    grep-2.5.1-color.patch
    grep-2.5.1-icolor.patch
    grep-2.5.1-egf-speedup.patch
    grep-2.5.1-dfa-optional.patch
    grep-2.5.1-tests.patch
    grep-2.5.1-w.patch
Testcases:
    -fgrep: ???, but required for other patches
    -bracket: echo "[" | LANG=en_US.UTF-8 grep "[[:space:]]"
    -i18n: many fixes for multibyte locale support, required for LSB.
    -oi: echo xxYYzz | LANG=C grep -i -o yy
    -manpage: typo
    -color: restore the background color correctly
    -icolor: ??? echo 'spam foo SPAM FOO' | grep -i --color spam (but that's also fixed by -oi. Is this patch just a cleanup?)
    -egf-speedup: without this, grep is as slow as a snail in UTF-8 locales.
    -dfa-optional: disables dfa in multibyte locales by default.
    -w: (echo 'foo';echo 'fo') > /tmp/testfile && grep -F -w fo /tmp/testfile

diff -urN grep-2.5.1a.orig/doc/grep.1 grep-2.5.1a/doc/grep.1
old new 191 191 .I PATTERN 192 192 as a list of fixed strings, separated by newlines, 193 193 any of which is to be matched. 194 .TP 194 195 .BR \-P ", " \-\^\-perl-regexp 195 196 Interpret 196 197 .I PATTERN … … 302 303 This is especially useful for tools like zgrep, e.g. 303 304 .B "gzip -cd foo.gz |grep --label=foo something" 304 305 .TP 305 .BR \-\^\-line-buffer ing306 .BR \-\^\-line-buffered 306 307 Use line buffering, it can be a performance penality. 307 308 .TP 308 309 .BR \-q ", " \-\^\-quiet ", " \-\^\-silent -
lib/posix/regex.h
diff -urN grep-2.5.1a.orig/lib/posix/regex.h grep-2.5.1a/lib/posix/regex.h
old new 109 109 If not set, \{, \}, {, and } are literals. */ 110 110 #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) 111 111 112 /* If this bit is set, then ignore case when matching. 113 If not set, then case is significant. */ 114 #define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1) 115 112 116 /* If this bit is set, +, ? and | aren't recognized as operators. 113 117 If not set, they are. */ 114 118 #define RE_LIMITED_OPS (RE_INTERVALS << 1) -
grep-2.5.1a
diff -urN grep-2.5.1a.orig/src/dfa.c grep-2.5.1a/src/dfa.c
old new 414 414 415 415 /* This function fetch a wide character, and update cur_mb_len, 416 416 used only if the current locale is a multibyte environment. */ 417 static w char_t417 static wint_t 418 418 fetch_wc (char const *eoferr) 419 419 { 420 420 wchar_t wc; … … 423 423 if (eoferr != 0) 424 424 dfaerror (eoferr); 425 425 else 426 return -1;426 return WEOF; 427 427 } 428 428 429 429 cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs); … … 459 459 static void 460 460 parse_bracket_exp_mb () 461 461 { 462 w char_t wc, wc1, wc2;462 wint_t wc, wc1, wc2; 463 463 464 464 /* Work area to build a mb_char_classes. */ 465 465 struct mb_char_classes *work_mbc; … … 496 496 work_mbc->invert = 0; 497 497 do 498 498 { 499 wc1 = -1; /* mark wc1 is not initialized". */499 wc1 = WEOF; /* mark wc1 is not initialized". */ 500 500 501 501 /* Note that if we're looking at some other [:...:] construct, 502 502 we just treat it as a bunch of ordinary characters. We can do … … 586 586 work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem; 587 587 } 588 588 } 589 wc = -1;589 wc1 = wc = WEOF; 590 590 } 591 591 else 592 592 /* We treat '[' as a normal character here. */ … … 600 600 wc = fetch_wc(("Unbalanced [")); 601 601 } 602 602 603 if (wc1 == -1)603 if (wc1 == WEOF) 604 604 wc1 = fetch_wc(_("Unbalanced [")); 605 605 606 606 if (wc1 == L'-') … … 630 630 } 631 631 REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t, 632 632 range_sts_al, work_mbc->nranges + 1); 633 work_mbc->range_sts[work_mbc->nranges] = wc;633 work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc; 634 634 REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t, 635 635 range_ends_al, work_mbc->nranges + 1); 636 work_mbc->range_ends[work_mbc->nranges++] = wc2;636 work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2; 637 637 } 638 else if (wc != -1)638 else if (wc != WEOF) 639 639 /* build normal characters. 
*/ 640 640 { 641 641 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, 642 642 work_mbc->nchars + 1); 643 work_mbc->chars[work_mbc->nchars++] = wc;643 work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc; 644 644 } 645 645 } 646 646 while ((wc = wc1) != L']'); … … 2552 2552 } 2553 2553 2554 2554 /* match with a character? */ 2555 if (case_fold) 2556 wc = towlower (wc); 2555 2557 for (i = 0; i<work_mbc->nchars; i++) 2556 2558 { 2557 2559 if (wc == work_mbc->chars[i]) -
src/grep.c
diff -urN grep-2.5.1a.orig/src/grep.c grep-2.5.1a/src/grep.c
old new 30 30 # include <sys/time.h> 31 31 # include <sys/resource.h> 32 32 #endif 33 #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC 34 /* We can handle multibyte string. */ 35 # define MBS_SUPPORT 36 # include <wchar.h> 37 # include <wctype.h> 38 #endif 33 39 #include <stdio.h> 34 40 #include "system.h" 35 41 #include "getopt.h" … … 558 564 { 559 565 size_t match_size; 560 566 size_t match_offset; 561 if(match_icase)562 {563 /* Yuck, this is tricky */564 char *buf = (char*) xmalloc (lim - beg);565 char *ibeg = buf;566 char *ilim = ibeg + (lim - beg);567 int i;568 for (i = 0; i < lim - beg; i++)569 ibeg[i] = tolower (beg[i]);570 while ((match_offset = (*execute) (ibeg, ilim-ibeg, &match_size, 1))571 != (size_t) -1)572 {573 char const *b = beg + match_offset;574 if (b == lim)575 break;576 fwrite (beg, sizeof (char), match_offset, stdout);577 printf ("\33[%sm", grep_color);578 fwrite (b, sizeof (char), match_size, stdout);579 fputs ("\33[00m", stdout);580 beg = b + match_size;581 ibeg = ibeg + match_offset + match_size;582 }583 fwrite (beg, 1, lim - beg, stdout);584 free (buf);585 lastout = lim;586 return;587 }588 567 while (lim-beg && (match_offset = (*execute) (beg, lim - beg, &match_size, 1)) 589 568 != (size_t) -1) 590 569 { … … 601 580 fputs ("\33[00m", stdout); 602 581 beg = b + match_size; 603 582 } 583 fputs ("\33[K", stdout); 604 584 } 605 585 fwrite (beg, 1, lim - beg, stdout); 606 586 if (ferror (stdout)) … … 1697 1677 if (!install_matcher (matcher) && !install_matcher ("default")) 1698 1678 abort (); 1699 1679 1680 #ifdef MBS_SUPPORT 1681 if (MB_CUR_MAX != 1 && match_icase) 1682 { 1683 wchar_t wc; 1684 mbstate_t cur_state, prev_state; 1685 int i, len = strlen(keys); 1686 1687 memset(&cur_state, 0, sizeof(mbstate_t)); 1688 for (i = 0; i <= len ;) 1689 { 1690 size_t mbclen; 1691 mbclen = mbrtowc(&wc, keys + i, len - i, &cur_state); 1692 if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) 1693 { 1694 /* An invalid 
sequence, or a truncated multibyte character. 1695 We treat it as a singlebyte character. */ 1696 mbclen = 1; 1697 } 1698 else 1699 { 1700 if (iswupper((wint_t)wc)) 1701 { 1702 wc = towlower((wint_t)wc); 1703 wcrtomb(keys + i, wc, &cur_state); 1704 } 1705 } 1706 i += mbclen; 1707 } 1708 } 1709 #endif /* MBS_SUPPORT */ 1710 1700 1711 (*compile)(keys, keycc); 1701 1712 1702 1713 if ((argc - optind > 1 && !no_filenames) || with_filenames) -
src/search.c
diff -urN grep-2.5.1a.orig/src/search.c grep-2.5.1a/src/search.c
old new 18 18 19 19 /* Written August 1992 by Mike Haertel. */ 20 20 21 #ifndef _GNU_SOURCE 22 # define _GNU_SOURCE 1 23 #endif 21 24 #ifdef HAVE_CONFIG_H 22 25 # include <config.h> 23 26 #endif 27 #include <assert.h> 24 28 #include <sys/types.h> 25 29 #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC 26 30 /* We can handle multibyte string. */ … … 31 35 32 36 #include "system.h" 33 37 #include "grep.h" 34 #include "regex.h"38 #include <regex.h> 35 39 #include "dfa.h" 36 40 #include "kwset.h" 37 41 #include "error.h" … … 39 43 #ifdef HAVE_LIBPCRE 40 44 # include <pcre.h> 41 45 #endif 46 #ifdef HAVE_LANGINFO_CODESET 47 # include <langinfo.h> 48 #endif 42 49 43 50 #define NCHAR (UCHAR_MAX + 1) 44 51 … … 70 77 call the regexp matcher at all. */ 71 78 static int kwset_exact_matches; 72 79 73 #if defined(MBS_SUPPORT) 74 static char* check_multibyte_string PARAMS ((char const *buf, size_t size)); 75 #endif 80 /* UTF-8 encoding allows some optimizations that we can't otherwise 81 assume in a multibyte encoding. */ 82 static int using_utf8; 83 76 84 static void kwsinit PARAMS ((void)); 77 85 static void kwsmusts PARAMS ((void)); 78 86 static void Gcompile PARAMS ((char const *, size_t)); … … 84 92 static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int)); 85 93 86 94 void 95 check_utf8 (void) 96 { 97 #ifdef HAVE_LANGINFO_CODESET 98 if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0) 99 using_utf8 = 1; 100 #endif 101 } 102 103 void 87 104 dfaerror (char const *mesg) 88 105 { 89 106 error (2, 0, mesg); … … 141 158 } 142 159 } 143 160 144 #ifdef MBS_SUPPORT145 /* This function allocate the array which correspond to "buf".146 Then this check multibyte string and mark on the positions which147 are not singlebyte character nor the first byte of a multibyte148 character. Caller must free the array. 
*/149 static char*150 check_multibyte_string(char const *buf, size_t size)151 {152 char *mb_properties = malloc(size);153 mbstate_t cur_state;154 int i;155 memset(&cur_state, 0, sizeof(mbstate_t));156 memset(mb_properties, 0, sizeof(char)*size);157 for (i = 0; i < size ;)158 {159 size_t mbclen;160 mbclen = mbrlen(buf + i, size - i, &cur_state);161 162 if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)163 {164 /* An invalid sequence, or a truncated multibyte character.165 We treat it as a singlebyte character. */166 mbclen = 1;167 }168 mb_properties[i] = mbclen;169 i += mbclen;170 }171 172 return mb_properties;173 }174 #endif175 176 161 static void 177 162 Gcompile (char const *pattern, size_t size) 178 163 { … … 181 166 size_t total = size; 182 167 char const *motif = pattern; 183 168 184 re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE); 169 check_utf8 (); 170 re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0)); 185 171 dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); 186 172 187 173 /* For GNU regex compiler we have to pass the patterns separately to detect … … 233 219 static char const line_end[] = "\\)$"; 234 220 static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\("; 235 221 static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)"; 236 char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);222 char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end); 237 223 size_t i; 238 224 strcpy (n, match_lines ? line_beg : word_beg); 239 225 i = strlen (n); … … 257 243 size_t total = size; 258 244 char const *motif = pattern; 259 245 246 check_utf8 (); 260 247 if (strcmp (matcher, "awk") == 0) 261 248 { 262 re_set_syntax (RE_SYNTAX_AWK );249 re_set_syntax (RE_SYNTAX_AWK | (match_icase ? 
RE_ICASE : 0)); 263 250 dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte); 264 251 } 265 252 else 266 253 { 267 re_set_syntax (RE_SYNTAX_POSIX_EGREP );254 re_set_syntax (RE_SYNTAX_POSIX_EGREP | (match_icase ? RE_ICASE : 0)); 268 255 dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte); 269 256 } 270 257 … … 316 303 static char const line_end[] = ")$"; 317 304 static char const word_beg[] = "(^|[^[:alnum:]_])("; 318 305 static char const word_end[] = ")([^[:alnum:]_]|$)"; 319 char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);306 char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end); 320 307 size_t i; 321 308 strcpy (n, match_lines ? line_beg : word_beg); 322 309 i = strlen(n); … … 339 326 char eol = eolbyte; 340 327 int backref, start, len; 341 328 struct kwsmatch kwsm; 342 size_t i; 329 size_t i, ret_val; 330 static int use_dfa; 331 static int use_dfa_checked = 0; 343 332 #ifdef MBS_SUPPORT 344 char *mb_properties = NULL; 333 const char *last_char = NULL; 334 int mb_cur_max = MB_CUR_MAX; 335 mbstate_t mbs; 336 memset (&mbs, '\0', sizeof (mbstate_t)); 345 337 #endif /* MBS_SUPPORT */ 346 338 339 if (!use_dfa_checked) 340 { 341 char *grep_use_dfa = getenv ("GREP_USE_DFA"); 342 if (!grep_use_dfa) 343 { 347 344 #ifdef MBS_SUPPORT 348 if (MB_CUR_MAX > 1 && kwset) 349 mb_properties = check_multibyte_string(buf, size); 345 /* Turn off DFA when processing multibyte input. */ 346 use_dfa = (MB_CUR_MAX == 1); 347 #else 348 use_dfa = 1; 350 349 #endif /* MBS_SUPPORT */ 350 } 351 else 352 { 353 use_dfa = atoi (grep_use_dfa); 354 } 355 356 use_dfa_checked = 1; 357 } 351 358 352 359 buflim = buf + size; 353 360 … … 358 365 if (kwset) 359 366 { 360 367 /* Find a possible match using the KWset matcher. */ 361 size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); 368 #ifdef MBS_SUPPORT 369 size_t bytes_left = 0; 370 #endif /* MBS_SUPPORT */ 371 size_t offset; 372 #ifdef MBS_SUPPORT 373 /* kwsexec doesn't work with match_icase and multibyte input. 
*/ 374 if (match_icase && mb_cur_max > 1) 375 /* Avoid kwset */ 376 offset = 0; 377 else 378 #endif /* MBS_SUPPORT */ 379 offset = kwsexec (kwset, beg, buflim - beg, &kwsm); 362 380 if (offset == (size_t) -1) 363 {381 goto failure; 364 382 #ifdef MBS_SUPPORT 365 if (MB_CUR_MAX > 1) 366 free(mb_properties); 367 #endif 368 return (size_t)-1; 383 if (mb_cur_max > 1 && !using_utf8) 384 { 385 bytes_left = offset; 386 while (bytes_left) 387 { 388 size_t mlen = mbrlen (beg, bytes_left, &mbs); 389 390 last_char = beg; 391 if (mlen == (size_t) -1 || mlen == 0) 392 { 393 /* Incomplete character: treat as single-byte. */ 394 memset (&mbs, '\0', sizeof (mbstate_t)); 395 beg++; 396 bytes_left--; 397 continue; 398 } 399 400 if (mlen == (size_t) -2) 401 /* Offset points inside multibyte character: 402 * no good. */ 403 break; 404 405 beg += mlen; 406 bytes_left -= mlen; 407 } 369 408 } 409 else 410 #endif /* MBS_SUPPORT */ 370 411 beg += offset; 371 412 /* Narrow down to the line containing the candidate, and 372 413 run it through DFA. */ 373 414 end = memchr(beg, eol, buflim - beg); 374 415 end++; 375 416 #ifdef MBS_SUPPORT 376 if ( MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)417 if (mb_cur_max > 1 && bytes_left) 377 418 continue; 378 #endif 419 #endif /* MBS_SUPPORT */ 379 420 while (beg > buf && beg[-1] != eol) 380 421 --beg; 381 if (kwsm.index < kwset_exact_matches) 382 goto success; 383 if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) 422 if ( 423 #ifdef MBS_SUPPORT 424 !(match_icase && mb_cur_max > 1) && 425 #endif /* MBS_SUPPORT */ 426 (kwsm.index < kwset_exact_matches)) 427 goto success_in_beg_and_end; 428 if (use_dfa && 429 dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) 384 430 continue; 385 431 } 386 432 else 387 433 { 388 434 /* No good fixed strings; start with DFA. 
*/ 389 size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); 435 #ifdef MBS_SUPPORT 436 size_t bytes_left = 0; 437 #endif /* MBS_SUPPORT */ 438 size_t offset = 0; 439 if (use_dfa) 440 offset = dfaexec (&dfa, beg, buflim - beg, &backref); 390 441 if (offset == (size_t) -1) 391 442 break; 392 443 /* Narrow down to the line we've found. */ 444 #ifdef MBS_SUPPORT 445 if (mb_cur_max > 1 && !using_utf8) 446 { 447 bytes_left = offset; 448 while (bytes_left) 449 { 450 size_t mlen = mbrlen (beg, bytes_left, &mbs); 451 452 last_char = beg; 453 if (mlen == (size_t) -1 || mlen == 0) 454 { 455 /* Incomplete character: treat as single-byte. */ 456 memset (&mbs, '\0', sizeof (mbstate_t)); 457 beg++; 458 bytes_left--; 459 continue; 460 } 461 462 if (mlen == (size_t) -2) 463 /* Offset points inside multibyte character: 464 * no good. */ 465 break; 466 467 beg += mlen; 468 bytes_left -= mlen; 469 } 470 } 471 else 472 #endif /* MBS_SUPPORT */ 393 473 beg += offset; 394 474 end = memchr (beg, eol, buflim - beg); 395 475 end++; 476 #ifdef MBS_SUPPORT 477 if (mb_cur_max > 1 && bytes_left) 478 continue; 479 #endif /* MBS_SUPPORT */ 396 480 while (beg > buf && beg[-1] != eol) 397 481 --beg; 398 482 } 399 483 /* Successful, no backreferences encountered! */ 400 if ( !backref)401 goto success ;484 if (use_dfa && !backref) 485 goto success_in_beg_and_end; 402 486 } 403 487 else 404 488 end = beg + size; … … 413 497 end - beg - 1, &(patterns[i].regs)))) 414 498 { 415 499 len = patterns[i].regs.end[0] - start; 416 if (exact) 417 { 418 *match_size = len; 419 return start; 420 } 500 if (exact && !match_words) 501 goto success_in_start_and_len; 421 502 if ((!match_lines && !match_words) 422 503 || (match_lines && len == end - beg - 1)) 423 goto success ;504 goto success_in_beg_and_end; 424 505 /* If -w, check if the match aligns with word boundaries. 
425 506 We do this iteratively because: 426 507 (a) the line may contain more than one occurence of the … … 431 512 if (match_words) 432 513 while (start >= 0) 433 514 { 434 if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1])) 435 && (len == end - beg - 1 436 || !WCHAR ((unsigned char) beg[start + len]))) 437 goto success; 515 int lword_match = 0; 516 if (start == 0) 517 lword_match = 1; 518 else 519 { 520 assert (start > 0); 521 #ifdef MBS_SUPPORT 522 if (mb_cur_max > 1) 523 { 524 const char *s; 525 int mr; 526 wchar_t pwc; 527 528 if (using_utf8) 529 { 530 s = beg + start - 1; 531 while (s > buf 532 && (unsigned char) *s >= 0x80 533 && (unsigned char) *s <= 0xbf) 534 --s; 535 } 536 else 537 s = last_char; 538 mr = mbtowc (&pwc, s, beg + start - s); 539 if (mr <= 0) 540 { 541 memset (&mbs, '\0', sizeof (mbstate_t)); 542 lword_match = 1; 543 } 544 else if (!(iswalnum (pwc) || pwc == L'_') 545 && mr == (int) (beg + start - s)) 546 lword_match = 1; 547 } 548 else 549 #endif /* MBS_SUPPORT */ 550 if (!WCHAR ((unsigned char) beg[start - 1])) 551 lword_match = 1; 552 } 553 554 if (lword_match) 555 { 556 int rword_match = 0; 557 if (start + len == end - beg - 1) 558 rword_match = 1; 559 else 560 { 561 #ifdef MBS_SUPPORT 562 if (mb_cur_max > 1) 563 { 564 wchar_t nwc; 565 int mr; 566 567 mr = mbtowc (&nwc, beg + start + len, 568 end - beg - start - len - 1); 569 if (mr <= 0) 570 { 571 memset (&mbs, '\0', sizeof (mbstate_t)); 572 rword_match = 1; 573 } 574 else if (!iswalnum (nwc) && nwc != L'_') 575 rword_match = 1; 576 } 577 else 578 #endif /* MBS_SUPPORT */ 579 if (!WCHAR ((unsigned char) beg[start + len])) 580 rword_match = 1; 581 } 582 583 if (rword_match) 584 { 585 if (!exact) 586 /* Returns the whole line. */ 587 goto success_in_beg_and_end; 588 else 589 /* Returns just this word match. */ 590 goto success_in_start_and_len; 591 } 592 } 438 593 if (len > 0) 439 594 { 440 595 /* Try a shorter length anchored at the same place. 
*/ … … 461 616 } 462 617 } /* for Regex patterns. */ 463 618 } /* for (beg = end ..) */ 464 #ifdef MBS_SUPPORT 465 if (MB_CUR_MAX > 1 && mb_properties) 466 free (mb_properties); 467 #endif /* MBS_SUPPORT */ 619 620 failure: 468 621 return (size_t) -1; 469 622 470 success: 471 #ifdef MBS_SUPPORT 472 if (MB_CUR_MAX > 1 && mb_properties) 473 free (mb_properties); 474 #endif /* MBS_SUPPORT */ 475 *match_size = end - beg; 476 return beg - buf; 623 success_in_beg_and_end: 624 len = end - beg; 625 start = beg - buf; 626 /* FALLTHROUGH */ 627 628 success_in_start_and_len: 629 *match_size = len; 630 return start; 477 631 } 478 632 633 #ifdef MBS_SUPPORT 634 static int f_i_multibyte; /* whether we're using the new -Fi MB method */ 635 static struct 636 { 637 wchar_t **patterns; 638 size_t count, maxlen; 639 unsigned char *match; 640 } Fimb; 641 #endif 642 479 643 static void 480 644 Fcompile (char const *pattern, size_t size) 481 645 { 646 int mb_cur_max = MB_CUR_MAX; 482 647 char const *beg, *lim, *err; 483 648 649 check_utf8 (); 650 #ifdef MBS_SUPPORT 651 /* Support -F -i for UTF-8 input. 
*/ 652 if (match_icase && mb_cur_max > 1) 653 { 654 mbstate_t mbs; 655 wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t)); 656 const char *patternend = pattern; 657 size_t wcsize; 658 kwset_t fimb_kwset = NULL; 659 char *starts = NULL; 660 wchar_t *wcbeg, *wclim; 661 size_t allocated = 0; 662 663 memset (&mbs, '\0', sizeof (mbs)); 664 # ifdef __GNU_LIBRARY__ 665 wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs); 666 if (patternend != pattern + size) 667 wcsize = (size_t) -1; 668 # else 669 { 670 char *patterncopy = xmalloc (size + 1); 671 672 memcpy (patterncopy, pattern, size); 673 patterncopy[size] = '\0'; 674 patternend = patterncopy; 675 wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs); 676 if (patternend != patterncopy + size) 677 wcsize = (size_t) -1; 678 free (patterncopy); 679 } 680 # endif 681 if (wcsize + 2 <= 2) 682 { 683 fimb_fail: 684 free (wcpattern); 685 free (starts); 686 if (fimb_kwset) 687 kwsfree (fimb_kwset); 688 free (Fimb.patterns); 689 Fimb.patterns = NULL; 690 } 691 else 692 { 693 if (!(fimb_kwset = kwsalloc (NULL))) 694 error (2, 0, _("memory exhausted")); 695 696 starts = xmalloc (mb_cur_max * 3); 697 wcbeg = wcpattern; 698 do 699 { 700 int i; 701 size_t wclen; 702 703 if (Fimb.count >= allocated) 704 { 705 if (allocated == 0) 706 allocated = 128; 707 else 708 allocated *= 2; 709 Fimb.patterns = xrealloc (Fimb.patterns, 710 sizeof (wchar_t *) * allocated); 711 } 712 Fimb.patterns[Fimb.count++] = wcbeg; 713 for (wclim = wcbeg; 714 wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim) 715 *wclim = towlower (*wclim); 716 *wclim = L'\0'; 717 wclen = wclim - wcbeg; 718 if (wclen > Fimb.maxlen) 719 Fimb.maxlen = wclen; 720 if (wclen > 3) 721 wclen = 3; 722 if (wclen == 0) 723 { 724 if ((err = kwsincr (fimb_kwset, "", 0)) != 0) 725 error (2, 0, err); 726 } 727 else 728 for (i = 0; i < (1 << wclen); i++) 729 { 730 char *p = starts; 731 int j, k; 732 733 for (j = 0; j < wclen; ++j) 734 { 735 wchar_t wc = wcbeg[j]; 
736 if (i & (1 << j)) 737 { 738 wc = towupper (wc); 739 if (wc == wcbeg[j]) 740 continue; 741 } 742 k = wctomb (p, wc); 743 if (k <= 0) 744 goto fimb_fail; 745 p += k; 746 } 747 if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0) 748 error (2, 0, err); 749 } 750 if (wclim < wcpattern + wcsize) 751 ++wclim; 752 wcbeg = wclim; 753 } 754 while (wcbeg < wcpattern + wcsize); 755 f_i_multibyte = 1; 756 kwset = fimb_kwset; 757 free (starts); 758 Fimb.match = xmalloc (Fimb.count); 759 if ((err = kwsprep (kwset)) != 0) 760 error (2, 0, err); 761 return; 762 } 763 } 764 #endif /* MBS_SUPPORT */ 765 766 484 767 kwsinit (); 485 768 beg = pattern; 486 769 do … … 499 782 error (2, 0, err); 500 783 } 501 784 785 #ifdef MBS_SUPPORT 786 static int 787 Fimbexec (const char *buf, size_t size, size_t *plen, int exact) 788 { 789 size_t len, letter, i; 790 int ret = -1; 791 mbstate_t mbs; 792 wchar_t wc; 793 int patterns_left; 794 795 assert (match_icase && f_i_multibyte == 1); 796 assert (MB_CUR_MAX > 1); 797 798 memset (&mbs, '\0', sizeof (mbs)); 799 memset (Fimb.match, '\1', Fimb.count); 800 letter = len = 0; 801 patterns_left = 1; 802 while (patterns_left && len <= size) 803 { 804 size_t c; 805 806 patterns_left = 0; 807 if (len < size) 808 { 809 c = mbrtowc (&wc, buf + len, size - len, &mbs); 810 if (c + 2 <= 2) 811 return ret; 812 813 wc = towlower (wc); 814 } 815 else 816 { 817 c = 1; 818 wc = L'\0'; 819 } 820 821 for (i = 0; i < Fimb.count; i++) 822 { 823 if (Fimb.match[i]) 824 { 825 if (Fimb.patterns[i][letter] == L'\0') 826 { 827 /* Found a match. */ 828 *plen = len; 829 if (!exact && !match_words) 830 return 0; 831 else 832 { 833 /* For -w or exact look for longest match. 
*/ 834 ret = 0; 835 Fimb.match[i] = '\0'; 836 continue; 837 } 838 } 839 840 if (Fimb.patterns[i][letter] == wc) 841 patterns_left = 1; 842 else 843 Fimb.match[i] = '\0'; 844 } 845 } 846 847 len += c; 848 letter++; 849 } 850 851 return ret; 852 } 853 #endif /* MBS_SUPPORT */ 854 502 855 static size_t 503 856 Fexecute (char const *buf, size_t size, size_t *match_size, int exact) 504 857 { … … 506 859 register size_t len; 507 860 char eol = eolbyte; 508 861 struct kwsmatch kwsmatch; 862 size_t ret_val; 509 863 #ifdef MBS_SUPPORT 510 char *mb_properties; 511 if (MB_CUR_MAX > 1) 512 mb_properties = check_multibyte_string (buf, size); 864 int mb_cur_max = MB_CUR_MAX; 865 mbstate_t mbs; 866 memset (&mbs, '\0', sizeof (mbstate_t)); 867 const char *last_char = NULL; 513 868 #endif /* MBS_SUPPORT */ 514 869 515 870 for (beg = buf; beg <= buf + size; ++beg) 516 871 { 517 size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); 872 size_t offset; 873 offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); 874 518 875 if (offset == (size_t) -1) 519 {876 goto failure; 520 877 #ifdef MBS_SUPPORT 521 if (MB_CUR_MAX > 1) 522 free(mb_properties); 523 #endif /* MBS_SUPPORT */ 524 return offset; 878 if (mb_cur_max > 1 && !using_utf8) 879 { 880 size_t bytes_left = offset; 881 while (bytes_left) 882 { 883 size_t mlen = mbrlen (beg, bytes_left, &mbs); 884 885 last_char = beg; 886 if (mlen == (size_t) -1 || mlen == 0) 887 { 888 /* Incomplete character: treat as single-byte. */ 889 memset (&mbs, '\0', sizeof (mbstate_t)); 890 beg++; 891 bytes_left--; 892 continue; 893 } 894 895 if (mlen == (size_t) -2) 896 /* Offset points inside multibyte character: no good. */ 897 break; 898 899 beg += mlen; 900 bytes_left -= mlen; 901 } 902 903 if (bytes_left) 904 continue; 525 905 } 526 #ifdef MBS_SUPPORT 527 if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) 528 continue; /* It is a part of multibyte character. 
*/ 906 else 529 907 #endif /* MBS_SUPPORT */ 530 908 beg += offset; 531 len = kwsmatch.size[0];532 if (exact)533 {534 *match_size = len;535 909 #ifdef MBS_SUPPORT 536 if (MB_CUR_MAX > 1) 537 free (mb_properties); 910 /* For f_i_multibyte, the string at beg now matches first 3 chars of 911 one of the search strings (less if there are shorter search strings). 912 See if this is a real match. */ 913 if (f_i_multibyte 914 && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], exact)) 915 goto next_char; 538 916 #endif /* MBS_SUPPORT */ 539 return beg - buf; 540 } 917 len = kwsmatch.size[0]; 918 if (exact && !match_words) 919 goto success_in_beg_and_len; 541 920 if (match_lines) 542 921 { 543 922 if (beg > buf && beg[-1] != eol) 544 continue;923 goto next_char; 545 924 if (beg + len < buf + size && beg[len] != eol) 546 continue;925 goto next_char; 547 926 goto success; 548 927 } 549 928 else if (match_words) 550 for (try = beg; len; ) 551 { 552 if (try > buf && WCHAR((unsigned char) try[-1])) 553 break; 554 if (try + len < buf + size && WCHAR((unsigned char) try[len])) 555 { 556 offset = kwsexec (kwset, beg, --len, &kwsmatch); 557 if (offset == (size_t) -1) 558 { 929 { 930 while (len) 931 { 932 int word_match = 0; 933 if (beg > buf) 934 { 559 935 #ifdef MBS_SUPPORT 560 if (MB_CUR_MAX > 1) 561 free (mb_properties); 936 if (mb_cur_max > 1) 937 { 938 const char *s; 939 int mr; 940 wchar_t pwc; 941 942 if (using_utf8) 943 { 944 s = beg - 1; 945 while (s > buf 946 && (unsigned char) *s >= 0x80 947 && (unsigned char) *s <= 0xbf) 948 --s; 949 } 950 else 951 s = last_char; 952 mr = mbtowc (&pwc, s, beg - s); 953 if (mr <= 0) 954 memset (&mbs, '\0', sizeof (mbstate_t)); 955 else if ((iswalnum (pwc) || pwc == L'_') 956 && mr == (int) (beg - s)) 957 goto next_char; 958 } 959 else 562 960 #endif /* MBS_SUPPORT */ 563 return offset; 564 } 565 try = beg + offset; 566 len = kwsmatch.size[0]; 567 } 568 else 569 goto success; 570 } 961 if (WCHAR ((unsigned char) beg[-1])) 962 goto 
next_char; 963 } 964 #ifdef MBS_SUPPORT 965 if (mb_cur_max > 1) 966 { 967 wchar_t nwc; 968 int mr; 969 970 mr = mbtowc (&nwc, beg + len, buf + size - beg - len); 971 if (mr <= 0) 972 { 973 memset (&mbs, '\0', sizeof (mbstate_t)); 974 word_match = 1; 975 } 976 else if (!iswalnum (nwc) && nwc != L'_') 977 word_match = 1; 978 } 979 else 980 #endif /* MBS_SUPPORT */ 981 if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len])) 982 word_match = 1; 983 if (word_match) 984 { 985 if (!exact) 986 /* Returns the whole line now we know there's a word match. */ 987 goto success; 988 else 989 /* Returns just this word match. */ 990 goto success_in_beg_and_len; 991 } 992 if (len > 0) 993 { 994 /* Try a shorter length anchored at the same place. */ 995 --len; 996 offset = kwsexec (kwset, beg, len, &kwsmatch); 997 998 if (offset == -1) 999 goto next_char; /* Try a different anchor. */ 1000 #ifdef MBS_SUPPORT 1001 if (mb_cur_max > 1 && !using_utf8) 1002 { 1003 size_t bytes_left = offset; 1004 while (bytes_left) 1005 { 1006 size_t mlen = mbrlen (beg, bytes_left, &mbs); 1007 1008 last_char = beg; 1009 if (mlen == (size_t) -1 || mlen == 0) 1010 { 1011 /* Incomplete character: treat as single-byte. */ 1012 memset (&mbs, '\0', sizeof (mbstate_t)); 1013 beg++; 1014 bytes_left--; 1015 continue; 1016 } 1017 1018 if (mlen == (size_t) -2) 1019 { 1020 /* Offset points inside multibyte character: 1021 * no good. */ 1022 break; 1023 } 1024 1025 beg += mlen; 1026 bytes_left -= mlen; 1027 } 1028 1029 if (bytes_left) 1030 { 1031 memset (&mbs, '\0', sizeof (mbstate_t)); 1032 goto next_char; /* Try a different anchor. */ 1033 } 1034 } 1035 else 1036 #endif /* MBS_SUPPORT */ 1037 beg += offset; 1038 #ifdef MBS_SUPPORT 1039 /* The string at beg now matches first 3 chars of one of 1040 the search strings (less if there are shorter search 1041 strings). See if this is a real match. 
*/ 1042 if (f_i_multibyte 1043 && Fimbexec (beg, len - offset, &kwsmatch.size[0], 1044 exact)) 1045 goto next_char; 1046 #endif /* MBS_SUPPORT */ 1047 len = kwsmatch.size[0]; 1048 } 1049 } 1050 } 571 1051 else 572 1052 goto success; 573 } 574 1053 next_char:; 575 1054 #ifdef MBS_SUPPORT 576 if (MB_CUR_MAX > 1) 577 free (mb_properties); 1055 /* Advance to next character. For MB_CUR_MAX == 1 case this is handled 1056 by ++beg above. */ 1057 if (mb_cur_max > 1) 1058 { 1059 if (using_utf8) 1060 { 1061 unsigned char c = *beg; 1062 if (c >= 0xc2) 1063 { 1064 if (c < 0xe0) 1065 ++beg; 1066 else if (c < 0xf0) 1067 beg += 2; 1068 else if (c < 0xf8) 1069 beg += 3; 1070 else if (c < 0xfc) 1071 beg += 4; 1072 else if (c < 0xfe) 1073 beg += 5; 1074 } 1075 } 1076 else 1077 { 1078 size_t l = mbrlen (beg, buf + size - beg, &mbs); 1079 1080 last_char = beg; 1081 if (l + 2 >= 2) 1082 beg += l - 1; 1083 else 1084 memset (&mbs, '\0', sizeof (mbstate_t)); 1085 } 1086 } 578 1087 #endif /* MBS_SUPPORT */ 1088 } 1089 1090 failure: 579 1091 return -1; 580 1092 581 1093 success: 1094 #ifdef MBS_SUPPORT 1095 if (mb_cur_max > 1 && !using_utf8) 1096 { 1097 end = beg + len; 1098 while (end < buf + size) 1099 { 1100 size_t mlen = mbrlen (end, buf + size - end, &mbs); 1101 if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0) 1102 { 1103 memset (&mbs, '\0', sizeof (mbstate_t)); 1104 mlen = 1; 1105 } 1106 if (mlen == 1 && *end == eol) 1107 break; 1108 1109 end += mlen; 1110 } 1111 } 1112 else 1113 #endif /* MBS_SUPPORT */ 582 1114 end = memchr (beg + len, eol, (buf + size) - (beg + len)); 1115 583 1116 end++; 584 1117 while (buf < beg && beg[-1] != eol) 585 1118 --beg; 586 *match_size= end - beg;587 #ifdef MBS_SUPPORT 588 if (MB_CUR_MAX > 1) 589 free (mb_properties);590 #endif /* MBS_SUPPORT */ 1119 len = end - beg; 1120 /* FALLTHROUGH */ 1121 1122 success_in_beg_and_len: 1123 *match_size = len; 591 1124 return beg - buf; 592 1125 } 593 1126 -
src/search.c.orig
diff -urN grep-2.5.1a.orig/src/search.c.orig grep-2.5.1a/src/search.c.orig
old new 1 /* search.c - searching subroutines using dfa, kwset and regex for grep. 2 Copyright 1992, 1998, 2000 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 2, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 17 02111-1307, USA. */ 18 19 /* Written August 1992 by Mike Haertel. */ 20 21 #ifdef HAVE_CONFIG_H 22 # include <config.h> 23 #endif 24 #include <sys/types.h> 25 #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC 26 /* We can handle multibyte string. */ 27 # define MBS_SUPPORT 28 # include <wchar.h> 29 # include <wctype.h> 30 #endif 31 32 #include "system.h" 33 #include "grep.h" 34 #include "regex.h" 35 #include "dfa.h" 36 #include "kwset.h" 37 #include "error.h" 38 #include "xalloc.h" 39 #ifdef HAVE_LIBPCRE 40 # include <pcre.h> 41 #endif 42 43 #define NCHAR (UCHAR_MAX + 1) 44 45 /* For -w, we also consider _ to be word constituent. */ 46 #define WCHAR(C) (ISALNUM(C) || (C) == '_') 47 48 /* DFA compiled regexp. */ 49 static struct dfa dfa; 50 51 /* The Regex compiled patterns. */ 52 static struct patterns 53 { 54 /* Regex compiled regexp. */ 55 struct re_pattern_buffer regexbuf; 56 struct re_registers regs; /* This is here on account of a BRAIN-DEAD 57 Q@#%!# library interface in regex.c. */ 58 } patterns0; 59 60 struct patterns *patterns; 61 size_t pcount; 62 63 /* KWset compiled pattern. 
For Ecompile and Gcompile, we compile 64 a list of strings, at least one of which is known to occur in 65 any string matching the regexp. */ 66 static kwset_t kwset; 67 68 /* Number of compiled fixed strings known to exactly match the regexp. 69 If kwsexec returns < kwset_exact_matches, then we don't need to 70 call the regexp matcher at all. */ 71 static int kwset_exact_matches; 72 73 #if defined(MBS_SUPPORT) 74 static char* check_multibyte_string PARAMS ((char const *buf, size_t size)); 75 #endif 76 static void kwsinit PARAMS ((void)); 77 static void kwsmusts PARAMS ((void)); 78 static void Gcompile PARAMS ((char const *, size_t)); 79 static void Ecompile PARAMS ((char const *, size_t)); 80 static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int )); 81 static void Fcompile PARAMS ((char const *, size_t)); 82 static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int)); 83 static void Pcompile PARAMS ((char const *, size_t )); 84 static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int)); 85 86 void 87 dfaerror (char const *mesg) 88 { 89 error (2, 0, mesg); 90 } 91 92 static void 93 kwsinit (void) 94 { 95 static char trans[NCHAR]; 96 int i; 97 98 if (match_icase) 99 for (i = 0; i < NCHAR; ++i) 100 trans[i] = TOLOWER (i); 101 102 if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0))) 103 error (2, 0, _("memory exhausted")); 104 } 105 106 /* If the DFA turns out to have some set of fixed strings one of 107 which must occur in the match, then we build a kwset matcher 108 to find those strings, and thus quickly filter out impossible 109 matches. */ 110 static void 111 kwsmusts (void) 112 { 113 struct dfamust const *dm; 114 char const *err; 115 116 if (dfa.musts) 117 { 118 kwsinit (); 119 /* First, we compile in the substrings known to be exact 120 matches. The kwset matcher will return the index 121 of the matching string that it chooses. 
*/ 122 for (dm = dfa.musts; dm; dm = dm->next) 123 { 124 if (!dm->exact) 125 continue; 126 ++kwset_exact_matches; 127 if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0) 128 error (2, 0, err); 129 } 130 /* Now, we compile the substrings that will require 131 the use of the regexp matcher. */ 132 for (dm = dfa.musts; dm; dm = dm->next) 133 { 134 if (dm->exact) 135 continue; 136 if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0) 137 error (2, 0, err); 138 } 139 if ((err = kwsprep (kwset)) != 0) 140 error (2, 0, err); 141 } 142 } 143 144 #ifdef MBS_SUPPORT 145 /* This function allocate the array which correspond to "buf". 146 Then this check multibyte string and mark on the positions which 147 are not singlebyte character nor the first byte of a multibyte 148 character. Caller must free the array. */ 149 static char* 150 check_multibyte_string(char const *buf, size_t size) 151 { 152 char *mb_properties = malloc(size); 153 mbstate_t cur_state; 154 int i; 155 memset(&cur_state, 0, sizeof(mbstate_t)); 156 memset(mb_properties, 0, sizeof(char)*size); 157 for (i = 0; i < size ;) 158 { 159 size_t mbclen; 160 mbclen = mbrlen(buf + i, size - i, &cur_state); 161 162 if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) 163 { 164 /* An invalid sequence, or a truncated multibyte character. 165 We treat it as a singlebyte character. */ 166 mbclen = 1; 167 } 168 mb_properties[i] = mbclen; 169 i += mbclen; 170 } 171 172 return mb_properties; 173 } 174 #endif 175 176 static void 177 Gcompile (char const *pattern, size_t size) 178 { 179 const char *err; 180 char const *sep; 181 size_t total = size; 182 char const *motif = pattern; 183 184 re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE); 185 dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); 186 187 /* For GNU regex compiler we have to pass the patterns separately to detect 188 errors like "[\nallo\n]\n". 
The patterns here are "[", "allo" and "]" 189 GNU regex should have raise a syntax error. The same for backref, where 190 the backref should have been local to each pattern. */ 191 do 192 { 193 size_t len; 194 sep = memchr (motif, '\n', total); 195 if (sep) 196 { 197 len = sep - motif; 198 sep++; 199 total -= (len + 1); 200 } 201 else 202 { 203 len = total; 204 total = 0; 205 } 206 207 patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns)); 208 if (patterns == NULL) 209 error (2, errno, _("memory exhausted")); 210 211 patterns[pcount] = patterns0; 212 213 if ((err = re_compile_pattern (motif, len, 214 &(patterns[pcount].regexbuf))) != 0) 215 error (2, 0, err); 216 pcount++; 217 218 motif = sep; 219 } while (sep && total != 0); 220 221 /* In the match_words and match_lines cases, we use a different pattern 222 for the DFA matcher that will quickly throw out cases that won't work. 223 Then if DFA succeeds we do some hairy stuff using the regex matcher 224 to decide whether the match should really count. */ 225 if (match_words || match_lines) 226 { 227 /* In the whole-word case, we use the pattern: 228 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\). 229 In the whole-line case, we use the pattern: 230 ^\(userpattern\)$. */ 231 232 static char const line_beg[] = "^\\("; 233 static char const line_end[] = "\\)$"; 234 static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\("; 235 static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)"; 236 char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); 237 size_t i; 238 strcpy (n, match_lines ? line_beg : word_beg); 239 i = strlen (n); 240 memcpy (n + i, pattern, size); 241 i += size; 242 strcpy (n + i, match_lines ? 
line_end : word_end); 243 i += strlen (n + i); 244 pattern = n; 245 size = i; 246 } 247 248 dfacomp (pattern, size, &dfa, 1); 249 kwsmusts (); 250 } 251 252 static void 253 Ecompile (char const *pattern, size_t size) 254 { 255 const char *err; 256 const char *sep; 257 size_t total = size; 258 char const *motif = pattern; 259 260 if (strcmp (matcher, "awk") == 0) 261 { 262 re_set_syntax (RE_SYNTAX_AWK); 263 dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte); 264 } 265 else 266 { 267 re_set_syntax (RE_SYNTAX_POSIX_EGREP); 268 dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte); 269 } 270 271 /* For GNU regex compiler we have to pass the patterns separately to detect 272 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" 273 GNU regex should have raise a syntax error. The same for backref, where 274 the backref should have been local to each pattern. */ 275 do 276 { 277 size_t len; 278 sep = memchr (motif, '\n', total); 279 if (sep) 280 { 281 len = sep - motif; 282 sep++; 283 total -= (len + 1); 284 } 285 else 286 { 287 len = total; 288 total = 0; 289 } 290 291 patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns)); 292 if (patterns == NULL) 293 error (2, errno, _("memory exhausted")); 294 patterns[pcount] = patterns0; 295 296 if ((err = re_compile_pattern (motif, len, 297 &(patterns[pcount].regexbuf))) != 0) 298 error (2, 0, err); 299 pcount++; 300 301 motif = sep; 302 } while (sep && total != 0); 303 304 /* In the match_words and match_lines cases, we use a different pattern 305 for the DFA matcher that will quickly throw out cases that won't work. 306 Then if DFA succeeds we do some hairy stuff using the regex matcher 307 to decide whether the match should really count. */ 308 if (match_words || match_lines) 309 { 310 /* In the whole-word case, we use the pattern: 311 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$). 312 In the whole-line case, we use the pattern: 313 ^(userpattern)$. 
*/ 314 315 static char const line_beg[] = "^("; 316 static char const line_end[] = ")$"; 317 static char const word_beg[] = "(^|[^[:alnum:]_])("; 318 static char const word_end[] = ")([^[:alnum:]_]|$)"; 319 char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); 320 size_t i; 321 strcpy (n, match_lines ? line_beg : word_beg); 322 i = strlen(n); 323 memcpy (n + i, pattern, size); 324 i += size; 325 strcpy (n + i, match_lines ? line_end : word_end); 326 i += strlen (n + i); 327 pattern = n; 328 size = i; 329 } 330 331 dfacomp (pattern, size, &dfa, 1); 332 kwsmusts (); 333 } 334 335 static size_t 336 EGexecute (char const *buf, size_t size, size_t *match_size, int exact) 337 { 338 register char const *buflim, *beg, *end; 339 char eol = eolbyte; 340 int backref, start, len; 341 struct kwsmatch kwsm; 342 size_t i; 343 #ifdef MBS_SUPPORT 344 char *mb_properties = NULL; 345 #endif /* MBS_SUPPORT */ 346 347 #ifdef MBS_SUPPORT 348 if (MB_CUR_MAX > 1 && kwset) 349 mb_properties = check_multibyte_string(buf, size); 350 #endif /* MBS_SUPPORT */ 351 352 buflim = buf + size; 353 354 for (beg = end = buf; end < buflim; beg = end) 355 { 356 if (!exact) 357 { 358 if (kwset) 359 { 360 /* Find a possible match using the KWset matcher. */ 361 size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); 362 if (offset == (size_t) -1) 363 goto failure; 364 beg += offset; 365 /* Narrow down to the line containing the candidate, and 366 run it through DFA. */ 367 end = memchr(beg, eol, buflim - beg); 368 end++; 369 #ifdef MBS_SUPPORT 370 if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) 371 continue; 372 #endif 373 while (beg > buf && beg[-1] != eol) 374 --beg; 375 if (kwsm.index < kwset_exact_matches) 376 goto success_in_beg_and_end; 377 if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) 378 continue; 379 } 380 else 381 { 382 /* No good fixed strings; start with DFA. 
*/ 383 size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); 384 if (offset == (size_t) -1) 385 break; 386 /* Narrow down to the line we've found. */ 387 beg += offset; 388 end = memchr (beg, eol, buflim - beg); 389 end++; 390 while (beg > buf && beg[-1] != eol) 391 --beg; 392 } 393 /* Successful, no backreferences encountered! */ 394 if (!backref) 395 goto success_in_beg_and_end; 396 } 397 else 398 end = beg + size; 399 400 /* If we've made it to this point, this means DFA has seen 401 a probable match, and we need to run it through Regex. */ 402 for (i = 0; i < pcount; i++) 403 { 404 patterns[i].regexbuf.not_eol = 0; 405 if (0 <= (start = re_search (&(patterns[i].regexbuf), beg, 406 end - beg - 1, 0, 407 end - beg - 1, &(patterns[i].regs)))) 408 { 409 len = patterns[i].regs.end[0] - start; 410 if (exact && !match_words) 411 goto success_in_start_and_len; 412 if ((!match_lines && !match_words) 413 || (match_lines && len == end - beg - 1)) 414 goto success_in_beg_and_end; 415 /* If -w, check if the match aligns with word boundaries. 416 We do this iteratively because: 417 (a) the line may contain more than one occurence of the 418 pattern, and 419 (b) Several alternatives in the pattern might be valid at a 420 given point, and we may need to consider a shorter one to 421 find a word boundary. */ 422 if (match_words) 423 while (start >= 0) 424 { 425 if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1])) 426 && (len == end - beg - 1 427 || !WCHAR ((unsigned char) beg[start + len]))) 428 goto success_in_beg_and_end; 429 if (len > 0) 430 { 431 /* Try a shorter length anchored at the same place. */ 432 --len; 433 patterns[i].regexbuf.not_eol = 1; 434 len = re_match (&(patterns[i].regexbuf), beg, 435 start + len, start, 436 &(patterns[i].regs)); 437 } 438 if (len <= 0) 439 { 440 /* Try looking further on. 
*/ 441 if (start == end - beg - 1) 442 break; 443 ++start; 444 patterns[i].regexbuf.not_eol = 0; 445 start = re_search (&(patterns[i].regexbuf), beg, 446 end - beg - 1, 447 start, end - beg - 1 - start, 448 &(patterns[i].regs)); 449 len = patterns[i].regs.end[0] - start; 450 } 451 } 452 } 453 } /* for Regex patterns. */ 454 } /* for (beg = end ..) */ 455 456 failure: 457 #ifdef MBS_SUPPORT 458 if (MB_CUR_MAX > 1 && mb_properties) 459 free (mb_properties); 460 #endif /* MBS_SUPPORT */ 461 return (size_t) -1; 462 463 success_in_beg_and_end: 464 len = end - beg; 465 start = beg - buf; 466 /* FALLTHROUGH */ 467 468 success_in_start_and_len: 469 #ifdef MBS_SUPPORT 470 if (MB_CUR_MAX > 1 && mb_properties) 471 free (mb_properties); 472 #endif /* MBS_SUPPORT */ 473 *match_size = len; 474 return start; 475 } 476 477 static void 478 Fcompile (char const *pattern, size_t size) 479 { 480 char const *beg, *lim, *err; 481 482 kwsinit (); 483 beg = pattern; 484 do 485 { 486 for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim) 487 ; 488 if ((err = kwsincr (kwset, beg, lim - beg)) != 0) 489 error (2, 0, err); 490 if (lim < pattern + size) 491 ++lim; 492 beg = lim; 493 } 494 while (beg < pattern + size); 495 496 if ((err = kwsprep (kwset)) != 0) 497 error (2, 0, err); 498 } 499 500 static size_t 501 Fexecute (char const *buf, size_t size, size_t *match_size, int exact) 502 { 503 register char const *beg, *try, *end; 504 register size_t len; 505 char eol = eolbyte; 506 struct kwsmatch kwsmatch; 507 #ifdef MBS_SUPPORT 508 char *mb_properties; 509 if (MB_CUR_MAX > 1) 510 mb_properties = check_multibyte_string (buf, size); 511 #endif /* MBS_SUPPORT */ 512 513 for (beg = buf; beg <= buf + size; ++beg) 514 { 515 size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); 516 if (offset == (size_t) -1) 517 goto failure; 518 #ifdef MBS_SUPPORT 519 if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) 520 continue; /* It is a part of multibyte character. 
*/ 521 #endif /* MBS_SUPPORT */ 522 beg += offset; 523 len = kwsmatch.size[0]; 524 if (exact && !match_words) 525 goto success_in_beg_and_len; 526 if (match_lines) 527 { 528 if (beg > buf && beg[-1] != eol) 529 continue; 530 if (beg + len < buf + size && beg[len] != eol) 531 continue; 532 goto success; 533 } 534 else if (match_words) 535 for (try = beg; len; ) 536 { 537 if (try > buf && WCHAR((unsigned char) try[-1])) 538 break; 539 if (try + len < buf + size && WCHAR((unsigned char) try[len])) 540 { 541 offset = kwsexec (kwset, beg, --len, &kwsmatch); 542 if (offset == (size_t) -1) 543 { 544 #ifdef MBS_SUPPORT 545 if (MB_CUR_MAX > 1) 546 free (mb_properties); 547 #endif /* MBS_SUPPORT */ 548 return offset; 549 } 550 try = beg + offset; 551 len = kwsmatch.size[0]; 552 } 553 else 554 goto success; 555 } 556 else 557 goto success; 558 } 559 560 failure: 561 #ifdef MBS_SUPPORT 562 if (MB_CUR_MAX > 1) 563 free (mb_properties); 564 #endif /* MBS_SUPPORT */ 565 return -1; 566 567 success: 568 end = memchr (beg + len, eol, (buf + size) - (beg + len)); 569 end++; 570 while (buf < beg && beg[-1] != eol) 571 --beg; 572 len = end - beg; 573 /* FALLTHROUGH */ 574 575 success_in_beg_and_len: 576 *match_size = len; 577 #ifdef MBS_SUPPORT 578 if (MB_CUR_MAX > 1) 579 free (mb_properties); 580 #endif /* MBS_SUPPORT */ 581 return beg - buf; 582 } 583 584 #if HAVE_LIBPCRE 585 /* Compiled internal form of a Perl regular expression. */ 586 static pcre *cre; 587 588 /* Additional information about the pattern. */ 589 static pcre_extra *extra; 590 #endif 591 592 static void 593 Pcompile (char const *pattern, size_t size) 594 { 595 #if !HAVE_LIBPCRE 596 error (2, 0, _("The -P option is not supported")); 597 #else 598 int e; 599 char const *ep; 600 char *re = xmalloc (4 * size + 7); 601 int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0); 602 char const *patlim = pattern + size; 603 char *n = re; 604 char const *p; 605 char const *pnul; 606 607 /* FIXME: Remove this restriction. 
*/ 608 if (eolbyte != '\n') 609 error (2, 0, _("The -P and -z options cannot be combined")); 610 611 *n = '\0'; 612 if (match_lines) 613 strcpy (n, "^("); 614 if (match_words) 615 strcpy (n, "\\b("); 616 n += strlen (n); 617 618 /* The PCRE interface doesn't allow NUL bytes in the pattern, so 619 replace each NUL byte in the pattern with the four characters 620 "\000", removing a preceding backslash if there are an odd 621 number of backslashes before the NUL. 622 623 FIXME: This method does not work with some multibyte character 624 encodings, notably Shift-JIS, where a multibyte character can end 625 in a backslash byte. */ 626 for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1) 627 { 628 memcpy (n, p, pnul - p); 629 n += pnul - p; 630 for (p = pnul; pattern < p && p[-1] == '\\'; p--) 631 continue; 632 n -= (pnul - p) & 1; 633 strcpy (n, "\\000"); 634 n += 4; 635 } 636 637 memcpy (n, p, patlim - p); 638 n += patlim - p; 639 *n = '\0'; 640 if (match_words) 641 strcpy (n, ")\\b"); 642 if (match_lines) 643 strcpy (n, ")$"); 644 645 cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ()); 646 if (!cre) 647 error (2, 0, ep); 648 649 extra = pcre_study (cre, 0, &ep); 650 if (ep) 651 error (2, 0, ep); 652 653 free (re); 654 #endif 655 } 656 657 static size_t 658 Pexecute (char const *buf, size_t size, size_t *match_size, int exact) 659 { 660 #if !HAVE_LIBPCRE 661 abort (); 662 return -1; 663 #else 664 /* This array must have at least two elements; everything after that 665 is just for performance improvement in pcre_exec. */ 666 int sub[300]; 667 668 int e = pcre_exec (cre, extra, buf, size, 0, 0, 669 sub, sizeof sub / sizeof *sub); 670 671 if (e <= 0) 672 { 673 switch (e) 674 { 675 case PCRE_ERROR_NOMATCH: 676 return -1; 677 678 case PCRE_ERROR_NOMEMORY: 679 error (2, 0, _("Memory exhausted")); 680 681 default: 682 abort (); 683 } 684 } 685 else 686 { 687 /* Narrow down to the line we've found. 
*/ 688 char const *beg = buf + sub[0]; 689 char const *end = buf + sub[1]; 690 char const *buflim = buf + size; 691 char eol = eolbyte; 692 if (!exact) 693 { 694 end = memchr (end, eol, buflim - end); 695 end++; 696 while (buf < beg && beg[-1] != eol) 697 --beg; 698 } 699 700 *match_size = end - beg; 701 return beg - buf; 702 } 703 #endif 704 } 705 706 struct matcher const matchers[] = { 707 { "default", Gcompile, EGexecute }, 708 { "grep", Gcompile, EGexecute }, 709 { "egrep", Ecompile, EGexecute }, 710 { "awk", Ecompile, EGexecute }, 711 { "fgrep", Fcompile, Fexecute }, 712 { "perl", Pcompile, Pexecute }, 713 { "", 0, 0 }, 714 }; -
tests/fmbtest.sh
diff -urN grep-2.5.1a.orig/tests/fmbtest.sh grep-2.5.1a/tests/fmbtest.sh
#!/bin/sh
# Multibyte-locale regression test for grep -F, -G and -E (with and
# without -i and -w) plus --color match selection, run in the
# cs_CZ.UTF-8 locale.  Exits 0 on success, 1 if any test failed and 77
# when the locale is unavailable.
#
# NOTE(review): the non-ASCII literals below look mis-decoded in this
# copy (UTF-8 bytes shown as single-byte characters, some bytes possibly
# dropped).  They are kept exactly as found; verify against the upstream
# tests/fmbtest.sh before relying on the expected results.

: ${srcdir=.}

# Fall back to the grep found in PATH when the harness did not set GREP.
# Otherwise "${GREP} -q" expands to the command "-q", the locale probe
# below fails and the whole test is silently skipped.
: ${GREP=grep}

# If cs_CZ.UTF-8 locale doesn't work, skip this test silently
LC_ALL=cs_CZ.UTF-8 locale -k LC_CTYPE 2>/dev/null | ${GREP} -q charmap.*UTF-8 \
  || exit 77

failures=0

# Input lines carry a two-digit id; the tests reduce grep's output to
# those ids and compare the resulting list.
cat > csinput <<EOF
01 ÅœluÅ¥ouÄká ÄÃÅ¡e
ÄÃÅ E 02
03 Z ÄÃÅ¡Ã ÄiÅ¡Ã cosi
04 ÄÃ
Å e 05
06 ÄÄÄÄÄÄÄÃÅ¡ÄÃÅ ÄÃÅ¡
07 ÄÄÄ ÄÄÄÄÃÅ¡ÄÃÅ ÄÃÅ¡EEEE
ÄAs 08
09Äapka
10ÄaSy se mÄnÃ
ÄÃÅ¡E11
Äas12
ðÄÃÅ¡Eð13
ÅœÄÃÅ¡Eð14
ðÄÃÅ¡EÅœ15
ÅœÄÃÅ¡EÅœ16
ÄÃÅ¡Eð17
ÄÃÅ¡EÅœ18
19ðÄÃÅ¡e
20ÅœÄÃÅ¡e
EOF
cat > cspatfile <<EOF
ÄÃÅ¡E
Äas
EOF

for mode in F G E; do

# Test #1: case-sensitive match, patterns read from a file.
test1="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode} -f cspatfile csinput \
	| LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)"
if test "$test1" != "11 12 13 14 15 16 17 18"; then
  echo "Test #1 ${mode} failed: $test1"
  failures=1
fi

# Test #2: case-insensitive match, patterns read from a file.
test2="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode}i -f cspatfile csinput \
	| LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)"
if test "$test2" != "01 02 07 08 10 11 12 13 14 15 16 17 18 19 20"; then
  echo "Test #2 ${mode} failed: $test2"
  failures=1
fi

# Test #3: same as #2 but with the patterns given via -e.
test3="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode}i -e 'ÄÃÅ¡E' -e 'Äas' csinput \
	| LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)"
if test "$test3" != "01 02 07 08 10 11 12 13 14 15 16 17 18 19 20"; then
  echo "Test #3 ${mode} failed: $test3"
  failures=1
fi

# Test #4: case-insensitive whole-word match.
test4="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode}iw -f cspatfile csinput \
	| LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)"
if test "$test4" != "01 02 08 13 17 19"; then
  echo "Test #4 ${mode} failed: $test4"
  failures=1
fi

done

# Test that -F --color=always prefers longer matches.
test5="`echo 'Cosi tu ÄiÅ¡Ã...' \
	| LC_ALL=cs_CZ.UTF-8 ${GREP} --color=always -Fi -e 'ÄiÅ¡' -e 'ÄiÅ¡Ã'`"
if echo "$test5" | LC_ALL=C ${GREP} -q 'Cosi tu .*\[.*mÄiÅ¡Ã.*\[.*m\(.\[K\)\?\.\.\.'; then
  :
else
  echo "Test #5 F failed: $test5"
  failures=1
fi

for mode in G E; do

# Test that -{G,E} --color=always prefers earlier pattern matches.
test6="`echo 'Cosi tu ÄiÅ¡Ã...' \
	| LC_ALL=cs_CZ.UTF-8 ${GREP} --color=always -${mode}i -e 'ÄiÅ¡' -e 'ÄiÅ¡Ã'`"
if echo "$test6" | LC_ALL=C ${GREP} -q 'Cosi tu .*\[.*mÄiÅ¡.*\[.*m\(.\[K\)\?Ã\.\.\.'; then
  :
else
  echo "Test #6 ${mode} failed: $test6"
  failures=1
fi

# Test that -{G,E} --color=always prefers earlier pattern matches.
test7="`echo 'Cosi tu ÄiÅ¡Ã...' \
	| LC_ALL=cs_CZ.UTF-8 ${GREP} --color=always -${mode}i -e 'ÄiÅ¡Ã' -e 'ÄiÅ¡'`"
if echo "$test7" | LC_ALL=C ${GREP} -q 'Cosi tu .*\[.*mÄiÅ¡Ã.*\[.*m\(.\[K\)\?\.\.\.'; then
  :
else
  echo "Test #7 ${mode} failed: $test7"
  failures=1
fi

# Test #8: case-insensitive match with a wildcard and a bracket range.
test8="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode}i -e 'Ä.Å¡E' -e 'Ä[a-f]s' csinput \
	| LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)"
if test "$test8" != "01 02 07 08 10 11 12 13 14 15 16 17 18 19 20"; then
  echo "Test #8 ${mode} failed: $test8"
  failures=1
fi

done

exit $failures
tests/Makefile.am
diff -urN grep-2.5.1a.orig/tests/Makefile.am grep-2.5.1a/tests/Makefile.am
old new 3 3 AWK=@AWK@ 4 4 5 5 TESTS = warning.sh khadafy.sh spencer1.sh bre.sh ere.sh \ 6 status.sh empty.sh options.sh backref.sh file.sh 6 status.sh empty.sh options.sh backref.sh file.sh \ 7 fmbtest.sh 7 8 EXTRA_DIST = $(TESTS) \ 8 9 khadafy.lines khadafy.regexp \ 9 10 spencer1.awk spencer1.tests \ -
tests/Makefile.in
diff -urN grep-2.5.1a.orig/tests/Makefile.in grep-2.5.1a/tests/Makefile.in
old new 97 97 AWK = @AWK@ 98 98 99 99 TESTS = warning.sh khadafy.sh spencer1.sh bre.sh ere.sh \ 100 status.sh empty.sh options.sh backref.sh file.sh 100 status.sh empty.sh options.sh backref.sh file.sh \ 101 fmbtest.sh 101 102 102 103 EXTRA_DIST = $(TESTS) \ 103 104 khadafy.lines khadafy.regexp \
Note:
See TracBrowser
for help on using the repository browser.