Submitted by: Alexander E. Patrakov Date: 2005-08-13 Initial Package Version: 2.5.1a Upstream Status: Partially accepted, partially rejected, but required for LSB >= 2.0 certification Origin: RedHat Description: Various fixes from RedHat. Individual patches: grep-2.5.1-fgrep.patch grep-2.5.1-bracket.patch grep-2.5-i18n.patch grep-2.5.1-oi.patch grep-2.5.1-manpage.patch grep-2.5.1-color.patch grep-2.5.1-icolor.patch grep-2.5.1-egf-speedup.patch grep-2.5.1-dfa-optional.patch grep-2.5.1-tests.patch grep-2.5.1-w.patch Testcases: -fgrep: ???, but required for other patches -bracket: echo "[" | LANG=en_US.UTF-8 grep "[[:space:]]" -i18n: many fixes for multibyte locale support, required for LSB. -oi: echo xxYYzz | LANG=C grep -i -o yy -manpage: typo -color: restore the background color correctly -icolor: ??? echo 'spam foo SPAM FOO' | grep -i --color spam (but that's also fixed by -oi. Is this patch just a cleanup?) -egf-speedup: without this, grep is as slow as a snail in UTF-8 locales. -dfa-optional: disables dfa in multibyte locales by default. -w: (echo 'foo';echo 'fo') > /tmp/testfile && grep -F -w fo /tmp/testfile diff -urN grep-2.5.1a.orig/doc/grep.1 grep-2.5.1a/doc/grep.1 --- grep-2.5.1a.orig/doc/grep.1 2004-11-12 16:26:37.000000000 +0500 +++ grep-2.5.1a/doc/grep.1 2005-10-23 09:49:43.000000000 +0600 @@ -191,6 +191,7 @@ .I PATTERN as a list of fixed strings, separated by newlines, any of which is to be matched. +.TP .BR \-P ", " \-\^\-perl-regexp Interpret .I PATTERN @@ -302,7 +303,7 @@ This is especially useful for tools like zgrep, e.g. .B "gzip -cd foo.gz |grep --label=foo something" .TP -.BR \-\^\-line-buffering +.BR \-\^\-line-buffered Use line buffering, it can be a performance penality. 
.TP .BR \-q ", " \-\^\-quiet ", " \-\^\-silent diff -urN grep-2.5.1a.orig/lib/posix/regex.h grep-2.5.1a/lib/posix/regex.h --- grep-2.5.1a.orig/lib/posix/regex.h 2001-04-02 23:56:50.000000000 +0600 +++ grep-2.5.1a/lib/posix/regex.h 2005-10-23 09:49:31.000000000 +0600 @@ -109,6 +109,10 @@ If not set, \{, \}, {, and } are literals. */ #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) +/* If this bit is set, then ignore case when matching. + If not set, then case is significant. */ +#define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1) + /* If this bit is set, +, ? and | aren't recognized as operators. If not set, they are. */ #define RE_LIMITED_OPS (RE_INTERVALS << 1) diff -urN grep-2.5.1a.orig/src/dfa.c grep-2.5.1a/src/dfa.c --- grep-2.5.1a.orig/src/dfa.c 2001-09-26 22:57:55.000000000 +0600 +++ grep-2.5.1a/src/dfa.c 2005-10-23 09:49:17.000000000 +0600 @@ -414,7 +414,7 @@ /* This function fetch a wide character, and update cur_mb_len, used only if the current locale is a multibyte environment. */ -static wchar_t +static wint_t fetch_wc (char const *eoferr) { wchar_t wc; @@ -423,7 +423,7 @@ if (eoferr != 0) dfaerror (eoferr); else - return -1; + return WEOF; } cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs); @@ -459,7 +459,7 @@ static void parse_bracket_exp_mb () { - wchar_t wc, wc1, wc2; + wint_t wc, wc1, wc2; /* Work area to build a mb_char_classes. */ struct mb_char_classes *work_mbc; @@ -496,7 +496,7 @@ work_mbc->invert = 0; do { - wc1 = -1; /* mark wc1 is not initialized". */ + wc1 = WEOF; /* mark wc1 is not initialized". */ /* Note that if we're looking at some other [:...:] construct, we just treat it as a bunch of ordinary characters. We can do @@ -586,7 +586,7 @@ work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem; } } - wc = -1; + wc1 = wc = WEOF; } else /* We treat '[' as a normal character here. 
*/ @@ -600,7 +600,7 @@ wc = fetch_wc(("Unbalanced [")); } - if (wc1 == -1) + if (wc1 == WEOF) wc1 = fetch_wc(_("Unbalanced [")); if (wc1 == L'-') @@ -630,17 +630,17 @@ } REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t, range_sts_al, work_mbc->nranges + 1); - work_mbc->range_sts[work_mbc->nranges] = wc; + work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc; REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t, range_ends_al, work_mbc->nranges + 1); - work_mbc->range_ends[work_mbc->nranges++] = wc2; + work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2; } - else if (wc != -1) + else if (wc != WEOF) /* build normal characters. */ { REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, work_mbc->nchars + 1); - work_mbc->chars[work_mbc->nchars++] = wc; + work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc; } } while ((wc = wc1) != L']'); @@ -2552,6 +2552,8 @@ } /* match with a character? */ + if (case_fold) + wc = towlower (wc); for (i = 0; i<work_mbc->nchars; i++) { if (wc == work_mbc->chars[i]) diff -urN grep-2.5.1a.orig/src/grep.c grep-2.5.1a/src/grep.c --- grep-2.5.1a.orig/src/grep.c 2004-11-12 16:25:35.000000000 +0500 +++ grep-2.5.1a/src/grep.c 2005-10-23 09:50:06.000000000 +0600 @@ -30,6 +30,12 @@ # include # include #endif +#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC +/* We can handle multibyte string. 
*/ +# define MBS_SUPPORT +# include +# include +#endif #include #include "system.h" #include "getopt.h" @@ -558,33 +564,6 @@ { size_t match_size; size_t match_offset; - if(match_icase) - { - /* Yuck, this is tricky */ - char *buf = (char*) xmalloc (lim - beg); - char *ibeg = buf; - char *ilim = ibeg + (lim - beg); - int i; - for (i = 0; i < lim - beg; i++) - ibeg[i] = tolower (beg[i]); - while ((match_offset = (*execute) (ibeg, ilim-ibeg, &match_size, 1)) - != (size_t) -1) - { - char const *b = beg + match_offset; - if (b == lim) - break; - fwrite (beg, sizeof (char), match_offset, stdout); - printf ("\33[%sm", grep_color); - fwrite (b, sizeof (char), match_size, stdout); - fputs ("\33[00m", stdout); - beg = b + match_size; - ibeg = ibeg + match_offset + match_size; - } - fwrite (beg, 1, lim - beg, stdout); - free (buf); - lastout = lim; - return; - } while (lim-beg && (match_offset = (*execute) (beg, lim - beg, &match_size, 1)) != (size_t) -1) { @@ -601,6 +580,7 @@ fputs ("\33[00m", stdout); beg = b + match_size; } + fputs ("\33[K", stdout); } fwrite (beg, 1, lim - beg, stdout); if (ferror (stdout)) @@ -1697,6 +1677,37 @@ if (!install_matcher (matcher) && !install_matcher ("default")) abort (); +#ifdef MBS_SUPPORT + if (MB_CUR_MAX != 1 && match_icase) + { + wchar_t wc; + mbstate_t cur_state, prev_state; + int i, len = strlen(keys); + + memset(&cur_state, 0, sizeof(mbstate_t)); + for (i = 0; i <= len ;) + { + size_t mbclen; + mbclen = mbrtowc(&wc, keys + i, len - i, &cur_state); + if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) + { + /* An invalid sequence, or a truncated multibyte character. + We treat it as a singlebyte character. 
*/ + mbclen = 1; + } + else + { + if (iswupper((wint_t)wc)) + { + wc = towlower((wint_t)wc); + wcrtomb(keys + i, wc, &cur_state); + } + } + i += mbclen; + } + } +#endif /* MBS_SUPPORT */ + (*compile)(keys, keycc); if ((argc - optind > 1 && !no_filenames) || with_filenames) diff -urN grep-2.5.1a.orig/src/search.c grep-2.5.1a/src/search.c --- grep-2.5.1a.orig/src/search.c 2001-04-19 09:42:14.000000000 +0600 +++ grep-2.5.1a/src/search.c 2005-10-23 09:51:25.000000000 +0600 @@ -18,9 +18,13 @@ /* Written August 1992 by Mike Haertel. */ +#ifndef _GNU_SOURCE +# define _GNU_SOURCE 1 +#endif #ifdef HAVE_CONFIG_H # include #endif +#include #include #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC /* We can handle multibyte string. */ @@ -31,7 +35,7 @@ #include "system.h" #include "grep.h" -#include "regex.h" +#include #include "dfa.h" #include "kwset.h" #include "error.h" @@ -39,6 +43,9 @@ #ifdef HAVE_LIBPCRE # include #endif +#ifdef HAVE_LANGINFO_CODESET +# include +#endif #define NCHAR (UCHAR_MAX + 1) @@ -70,9 +77,10 @@ call the regexp matcher at all. */ static int kwset_exact_matches; -#if defined(MBS_SUPPORT) -static char* check_multibyte_string PARAMS ((char const *buf, size_t size)); -#endif +/* UTF-8 encoding allows some optimizations that we can't otherwise + assume in a multibyte encoding. */ +static int using_utf8; + static void kwsinit PARAMS ((void)); static void kwsmusts PARAMS ((void)); static void Gcompile PARAMS ((char const *, size_t)); @@ -84,6 +92,15 @@ static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int)); void +check_utf8 (void) +{ +#ifdef HAVE_LANGINFO_CODESET + if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0) + using_utf8 = 1; +#endif +} + +void dfaerror (char const *mesg) { error (2, 0, mesg); @@ -141,38 +158,6 @@ } } -#ifdef MBS_SUPPORT -/* This function allocate the array which correspond to "buf". 
- Then this check multibyte string and mark on the positions which - are not singlebyte character nor the first byte of a multibyte - character. Caller must free the array. */ -static char* -check_multibyte_string(char const *buf, size_t size) -{ - char *mb_properties = malloc(size); - mbstate_t cur_state; - int i; - memset(&cur_state, 0, sizeof(mbstate_t)); - memset(mb_properties, 0, sizeof(char)*size); - for (i = 0; i < size ;) - { - size_t mbclen; - mbclen = mbrlen(buf + i, size - i, &cur_state); - - if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) - { - /* An invalid sequence, or a truncated multibyte character. - We treat it as a singlebyte character. */ - mbclen = 1; - } - mb_properties[i] = mbclen; - i += mbclen; - } - - return mb_properties; -} -#endif - static void Gcompile (char const *pattern, size_t size) { @@ -181,7 +166,8 @@ size_t total = size; char const *motif = pattern; - re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE); + check_utf8 (); + re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0)); dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); /* For GNU regex compiler we have to pass the patterns separately to detect @@ -233,7 +219,7 @@ static char const line_end[] = "\\)$"; static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\("; static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)"; - char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); + char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end); size_t i; strcpy (n, match_lines ? line_beg : word_beg); i = strlen (n); @@ -257,14 +243,15 @@ size_t total = size; char const *motif = pattern; + check_utf8 (); if (strcmp (matcher, "awk") == 0) { - re_set_syntax (RE_SYNTAX_AWK); + re_set_syntax (RE_SYNTAX_AWK | (match_icase ? 
RE_ICASE : 0)); dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte); } else { - re_set_syntax (RE_SYNTAX_POSIX_EGREP); + re_set_syntax (RE_SYNTAX_POSIX_EGREP | (match_icase ? RE_ICASE : 0)); dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte); } @@ -316,7 +303,7 @@ static char const line_end[] = ")$"; static char const word_beg[] = "(^|[^[:alnum:]_])("; static char const word_end[] = ")([^[:alnum:]_]|$)"; - char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); + char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end); size_t i; strcpy (n, match_lines ? line_beg : word_beg); i = strlen(n); @@ -339,15 +326,35 @@ char eol = eolbyte; int backref, start, len; struct kwsmatch kwsm; - size_t i; + size_t i, ret_val; + static int use_dfa; + static int use_dfa_checked = 0; #ifdef MBS_SUPPORT - char *mb_properties = NULL; + const char *last_char = NULL; + int mb_cur_max = MB_CUR_MAX; + mbstate_t mbs; + memset (&mbs, '\0', sizeof (mbstate_t)); #endif /* MBS_SUPPORT */ + if (!use_dfa_checked) + { + char *grep_use_dfa = getenv ("GREP_USE_DFA"); + if (!grep_use_dfa) + { #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1 && kwset) - mb_properties = check_multibyte_string(buf, size); + /* Turn off DFA when processing multibyte input. */ + use_dfa = (MB_CUR_MAX == 1); +#else + use_dfa = 1; #endif /* MBS_SUPPORT */ + } + else + { + use_dfa = atoi (grep_use_dfa); + } + + use_dfa_checked = 1; + } buflim = buf + size; @@ -358,47 +365,124 @@ if (kwset) { /* Find a possible match using the KWset matcher. */ - size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); +#ifdef MBS_SUPPORT + size_t bytes_left = 0; +#endif /* MBS_SUPPORT */ + size_t offset; +#ifdef MBS_SUPPORT + /* kwsexec doesn't work with match_icase and multibyte input. 
*/ + if (match_icase && mb_cur_max > 1) + /* Avoid kwset */ + offset = 0; + else +#endif /* MBS_SUPPORT */ + offset = kwsexec (kwset, beg, buflim - beg, &kwsm); if (offset == (size_t) -1) - { + goto failure; #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - free(mb_properties); -#endif - return (size_t)-1; + if (mb_cur_max > 1 && !using_utf8) + { + bytes_left = offset; + while (bytes_left) + { + size_t mlen = mbrlen (beg, bytes_left, &mbs); + + last_char = beg; + if (mlen == (size_t) -1 || mlen == 0) + { + /* Incomplete character: treat as single-byte. */ + memset (&mbs, '\0', sizeof (mbstate_t)); + beg++; + bytes_left--; + continue; + } + + if (mlen == (size_t) -2) + /* Offset points inside multibyte character: + * no good. */ + break; + + beg += mlen; + bytes_left -= mlen; + } } + else +#endif /* MBS_SUPPORT */ beg += offset; /* Narrow down to the line containing the candidate, and run it through DFA. */ end = memchr(beg, eol, buflim - beg); end++; #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) + if (mb_cur_max > 1 && bytes_left) continue; -#endif +#endif /* MBS_SUPPORT */ while (beg > buf && beg[-1] != eol) --beg; - if (kwsm.index < kwset_exact_matches) - goto success; - if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) + if ( +#ifdef MBS_SUPPORT + !(match_icase && mb_cur_max > 1) && +#endif /* MBS_SUPPORT */ + (kwsm.index < kwset_exact_matches)) + goto success_in_beg_and_end; + if (use_dfa && + dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) continue; } else { /* No good fixed strings; start with DFA. */ - size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); +#ifdef MBS_SUPPORT + size_t bytes_left = 0; +#endif /* MBS_SUPPORT */ + size_t offset = 0; + if (use_dfa) + offset = dfaexec (&dfa, beg, buflim - beg, &backref); if (offset == (size_t) -1) break; /* Narrow down to the line we've found. 
*/ +#ifdef MBS_SUPPORT + if (mb_cur_max > 1 && !using_utf8) + { + bytes_left = offset; + while (bytes_left) + { + size_t mlen = mbrlen (beg, bytes_left, &mbs); + + last_char = beg; + if (mlen == (size_t) -1 || mlen == 0) + { + /* Incomplete character: treat as single-byte. */ + memset (&mbs, '\0', sizeof (mbstate_t)); + beg++; + bytes_left--; + continue; + } + + if (mlen == (size_t) -2) + /* Offset points inside multibyte character: + * no good. */ + break; + + beg += mlen; + bytes_left -= mlen; + } + } + else +#endif /* MBS_SUPPORT */ beg += offset; end = memchr (beg, eol, buflim - beg); end++; +#ifdef MBS_SUPPORT + if (mb_cur_max > 1 && bytes_left) + continue; +#endif /* MBS_SUPPORT */ while (beg > buf && beg[-1] != eol) --beg; } /* Successful, no backreferences encountered! */ - if (!backref) - goto success; + if (use_dfa && !backref) + goto success_in_beg_and_end; } else end = beg + size; @@ -413,14 +497,11 @@ end - beg - 1, &(patterns[i].regs)))) { len = patterns[i].regs.end[0] - start; - if (exact) - { - *match_size = len; - return start; - } + if (exact && !match_words) + goto success_in_start_and_len; if ((!match_lines && !match_words) || (match_lines && len == end - beg - 1)) - goto success; + goto success_in_beg_and_end; /* If -w, check if the match aligns with word boundaries. 
We do this iteratively because: (a) the line may contain more than one occurence of the @@ -431,10 +512,84 @@ if (match_words) while (start >= 0) { - if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1])) - && (len == end - beg - 1 - || !WCHAR ((unsigned char) beg[start + len]))) - goto success; + int lword_match = 0; + if (start == 0) + lword_match = 1; + else + { + assert (start > 0); +#ifdef MBS_SUPPORT + if (mb_cur_max > 1) + { + const char *s; + int mr; + wchar_t pwc; + + if (using_utf8) + { + s = beg + start - 1; + while (s > buf + && (unsigned char) *s >= 0x80 + && (unsigned char) *s <= 0xbf) + --s; + } + else + s = last_char; + mr = mbtowc (&pwc, s, beg + start - s); + if (mr <= 0) + { + memset (&mbs, '\0', sizeof (mbstate_t)); + lword_match = 1; + } + else if (!(iswalnum (pwc) || pwc == L'_') + && mr == (int) (beg + start - s)) + lword_match = 1; + } + else +#endif /* MBS_SUPPORT */ + if (!WCHAR ((unsigned char) beg[start - 1])) + lword_match = 1; + } + + if (lword_match) + { + int rword_match = 0; + if (start + len == end - beg - 1) + rword_match = 1; + else + { +#ifdef MBS_SUPPORT + if (mb_cur_max > 1) + { + wchar_t nwc; + int mr; + + mr = mbtowc (&nwc, beg + start + len, + end - beg - start - len - 1); + if (mr <= 0) + { + memset (&mbs, '\0', sizeof (mbstate_t)); + rword_match = 1; + } + else if (!iswalnum (nwc) && nwc != L'_') + rword_match = 1; + } + else +#endif /* MBS_SUPPORT */ + if (!WCHAR ((unsigned char) beg[start + len])) + rword_match = 1; + } + + if (rword_match) + { + if (!exact) + /* Returns the whole line. */ + goto success_in_beg_and_end; + else + /* Returns just this word match. */ + goto success_in_start_and_len; + } + } if (len > 0) { /* Try a shorter length anchored at the same place. */ @@ -461,26 +616,154 @@ } } /* for Regex patterns. */ } /* for (beg = end ..) 
*/ -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1 && mb_properties) - free (mb_properties); -#endif /* MBS_SUPPORT */ + + failure: return (size_t) -1; - success: -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1 && mb_properties) - free (mb_properties); -#endif /* MBS_SUPPORT */ - *match_size = end - beg; - return beg - buf; + success_in_beg_and_end: + len = end - beg; + start = beg - buf; + /* FALLTHROUGH */ + + success_in_start_and_len: + *match_size = len; + return start; } +#ifdef MBS_SUPPORT +static int f_i_multibyte; /* whether we're using the new -Fi MB method */ +static struct +{ + wchar_t **patterns; + size_t count, maxlen; + unsigned char *match; +} Fimb; +#endif + static void Fcompile (char const *pattern, size_t size) { + int mb_cur_max = MB_CUR_MAX; char const *beg, *lim, *err; + check_utf8 (); +#ifdef MBS_SUPPORT + /* Support -F -i for UTF-8 input. */ + if (match_icase && mb_cur_max > 1) + { + mbstate_t mbs; + wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t)); + const char *patternend = pattern; + size_t wcsize; + kwset_t fimb_kwset = NULL; + char *starts = NULL; + wchar_t *wcbeg, *wclim; + size_t allocated = 0; + + memset (&mbs, '\0', sizeof (mbs)); +# ifdef __GNU_LIBRARY__ + wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs); + if (patternend != pattern + size) + wcsize = (size_t) -1; +# else + { + char *patterncopy = xmalloc (size + 1); + + memcpy (patterncopy, pattern, size); + patterncopy[size] = '\0'; + patternend = patterncopy; + wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs); + if (patternend != patterncopy + size) + wcsize = (size_t) -1; + free (patterncopy); + } +# endif + if (wcsize + 2 <= 2) + { +fimb_fail: + free (wcpattern); + free (starts); + if (fimb_kwset) + kwsfree (fimb_kwset); + free (Fimb.patterns); + Fimb.patterns = NULL; + } + else + { + if (!(fimb_kwset = kwsalloc (NULL))) + error (2, 0, _("memory exhausted")); + + starts = xmalloc (mb_cur_max * 3); + wcbeg = wcpattern; + do + { + int i; + size_t wclen; + + if 
(Fimb.count >= allocated) + { + if (allocated == 0) + allocated = 128; + else + allocated *= 2; + Fimb.patterns = xrealloc (Fimb.patterns, + sizeof (wchar_t *) * allocated); + } + Fimb.patterns[Fimb.count++] = wcbeg; + for (wclim = wcbeg; + wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim) + *wclim = towlower (*wclim); + *wclim = L'\0'; + wclen = wclim - wcbeg; + if (wclen > Fimb.maxlen) + Fimb.maxlen = wclen; + if (wclen > 3) + wclen = 3; + if (wclen == 0) + { + if ((err = kwsincr (fimb_kwset, "", 0)) != 0) + error (2, 0, err); + } + else + for (i = 0; i < (1 << wclen); i++) + { + char *p = starts; + int j, k; + + for (j = 0; j < wclen; ++j) + { + wchar_t wc = wcbeg[j]; + if (i & (1 << j)) + { + wc = towupper (wc); + if (wc == wcbeg[j]) + continue; + } + k = wctomb (p, wc); + if (k <= 0) + goto fimb_fail; + p += k; + } + if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0) + error (2, 0, err); + } + if (wclim < wcpattern + wcsize) + ++wclim; + wcbeg = wclim; + } + while (wcbeg < wcpattern + wcsize); + f_i_multibyte = 1; + kwset = fimb_kwset; + free (starts); + Fimb.match = xmalloc (Fimb.count); + if ((err = kwsprep (kwset)) != 0) + error (2, 0, err); + return; + } + } +#endif /* MBS_SUPPORT */ + + kwsinit (); beg = pattern; do @@ -499,6 +782,76 @@ error (2, 0, err); } +#ifdef MBS_SUPPORT +static int +Fimbexec (const char *buf, size_t size, size_t *plen, int exact) +{ + size_t len, letter, i; + int ret = -1; + mbstate_t mbs; + wchar_t wc; + int patterns_left; + + assert (match_icase && f_i_multibyte == 1); + assert (MB_CUR_MAX > 1); + + memset (&mbs, '\0', sizeof (mbs)); + memset (Fimb.match, '\1', Fimb.count); + letter = len = 0; + patterns_left = 1; + while (patterns_left && len <= size) + { + size_t c; + + patterns_left = 0; + if (len < size) + { + c = mbrtowc (&wc, buf + len, size - len, &mbs); + if (c + 2 <= 2) + return ret; + + wc = towlower (wc); + } + else + { + c = 1; + wc = L'\0'; + } + + for (i = 0; i < Fimb.count; i++) + { + if 
(Fimb.match[i]) + { + if (Fimb.patterns[i][letter] == L'\0') + { + /* Found a match. */ + *plen = len; + if (!exact && !match_words) + return 0; + else + { + /* For -w or exact look for longest match. */ + ret = 0; + Fimb.match[i] = '\0'; + continue; + } + } + + if (Fimb.patterns[i][letter] == wc) + patterns_left = 1; + else + Fimb.match[i] = '\0'; + } + } + + len += c; + letter++; + } + + return ret; +} +#endif /* MBS_SUPPORT */ + static size_t Fexecute (char const *buf, size_t size, size_t *match_size, int exact) { @@ -506,88 +859,268 @@ register size_t len; char eol = eolbyte; struct kwsmatch kwsmatch; + size_t ret_val; #ifdef MBS_SUPPORT - char *mb_properties; - if (MB_CUR_MAX > 1) - mb_properties = check_multibyte_string (buf, size); + int mb_cur_max = MB_CUR_MAX; + mbstate_t mbs; + memset (&mbs, '\0', sizeof (mbstate_t)); + const char *last_char = NULL; #endif /* MBS_SUPPORT */ for (beg = buf; beg <= buf + size; ++beg) { - size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); + size_t offset; + offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); + if (offset == (size_t) -1) - { + goto failure; #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - free(mb_properties); -#endif /* MBS_SUPPORT */ - return offset; + if (mb_cur_max > 1 && !using_utf8) + { + size_t bytes_left = offset; + while (bytes_left) + { + size_t mlen = mbrlen (beg, bytes_left, &mbs); + + last_char = beg; + if (mlen == (size_t) -1 || mlen == 0) + { + /* Incomplete character: treat as single-byte. */ + memset (&mbs, '\0', sizeof (mbstate_t)); + beg++; + bytes_left--; + continue; + } + + if (mlen == (size_t) -2) + /* Offset points inside multibyte character: no good. */ + break; + + beg += mlen; + bytes_left -= mlen; + } + + if (bytes_left) + continue; } -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) - continue; /* It is a part of multibyte character. 
*/ + else #endif /* MBS_SUPPORT */ beg += offset; - len = kwsmatch.size[0]; - if (exact) - { - *match_size = len; #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - free (mb_properties); + /* For f_i_multibyte, the string at beg now matches first 3 chars of + one of the search strings (less if there are shorter search strings). + See if this is a real match. */ + if (f_i_multibyte + && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], exact)) + goto next_char; #endif /* MBS_SUPPORT */ - return beg - buf; - } + len = kwsmatch.size[0]; + if (exact && !match_words) + goto success_in_beg_and_len; if (match_lines) { if (beg > buf && beg[-1] != eol) - continue; + goto next_char; if (beg + len < buf + size && beg[len] != eol) - continue; + goto next_char; goto success; } else if (match_words) - for (try = beg; len; ) - { - if (try > buf && WCHAR((unsigned char) try[-1])) - break; - if (try + len < buf + size && WCHAR((unsigned char) try[len])) - { - offset = kwsexec (kwset, beg, --len, &kwsmatch); - if (offset == (size_t) -1) - { + { + while (len) + { + int word_match = 0; + if (beg > buf) + { #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - free (mb_properties); + if (mb_cur_max > 1) + { + const char *s; + int mr; + wchar_t pwc; + + if (using_utf8) + { + s = beg - 1; + while (s > buf + && (unsigned char) *s >= 0x80 + && (unsigned char) *s <= 0xbf) + --s; + } + else + s = last_char; + mr = mbtowc (&pwc, s, beg - s); + if (mr <= 0) + memset (&mbs, '\0', sizeof (mbstate_t)); + else if ((iswalnum (pwc) || pwc == L'_') + && mr == (int) (beg - s)) + goto next_char; + } + else #endif /* MBS_SUPPORT */ - return offset; - } - try = beg + offset; - len = kwsmatch.size[0]; - } - else - goto success; - } + if (WCHAR ((unsigned char) beg[-1])) + goto next_char; + } +#ifdef MBS_SUPPORT + if (mb_cur_max > 1) + { + wchar_t nwc; + int mr; + + mr = mbtowc (&nwc, beg + len, buf + size - beg - len); + if (mr <= 0) + { + memset (&mbs, '\0', sizeof (mbstate_t)); + word_match = 1; + } + else if (!iswalnum 
(nwc) && nwc != L'_') + word_match = 1; + } + else +#endif /* MBS_SUPPORT */ + if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len])) + word_match = 1; + if (word_match) + { + if (!exact) + /* Returns the whole line now we know there's a word match. */ + goto success; + else + /* Returns just this word match. */ + goto success_in_beg_and_len; + } + if (len > 0) + { + /* Try a shorter length anchored at the same place. */ + --len; + offset = kwsexec (kwset, beg, len, &kwsmatch); + + if (offset == -1) + goto next_char; /* Try a different anchor. */ +#ifdef MBS_SUPPORT + if (mb_cur_max > 1 && !using_utf8) + { + size_t bytes_left = offset; + while (bytes_left) + { + size_t mlen = mbrlen (beg, bytes_left, &mbs); + + last_char = beg; + if (mlen == (size_t) -1 || mlen == 0) + { + /* Incomplete character: treat as single-byte. */ + memset (&mbs, '\0', sizeof (mbstate_t)); + beg++; + bytes_left--; + continue; + } + + if (mlen == (size_t) -2) + { + /* Offset points inside multibyte character: + * no good. */ + break; + } + + beg += mlen; + bytes_left -= mlen; + } + + if (bytes_left) + { + memset (&mbs, '\0', sizeof (mbstate_t)); + goto next_char; /* Try a different anchor. */ + } + } + else +#endif /* MBS_SUPPORT */ + beg += offset; +#ifdef MBS_SUPPORT + /* The string at beg now matches first 3 chars of one of + the search strings (less if there are shorter search + strings). See if this is a real match. */ + if (f_i_multibyte + && Fimbexec (beg, len - offset, &kwsmatch.size[0], + exact)) + goto next_char; +#endif /* MBS_SUPPORT */ + len = kwsmatch.size[0]; + } + } + } else goto success; - } - +next_char:; #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - free (mb_properties); + /* Advance to next character. For MB_CUR_MAX == 1 case this is handled + by ++beg above. 
*/ + if (mb_cur_max > 1) + { + if (using_utf8) + { + unsigned char c = *beg; + if (c >= 0xc2) + { + if (c < 0xe0) + ++beg; + else if (c < 0xf0) + beg += 2; + else if (c < 0xf8) + beg += 3; + else if (c < 0xfc) + beg += 4; + else if (c < 0xfe) + beg += 5; + } + } + else + { + size_t l = mbrlen (beg, buf + size - beg, &mbs); + + last_char = beg; + if (l + 2 >= 2) + beg += l - 1; + else + memset (&mbs, '\0', sizeof (mbstate_t)); + } + } #endif /* MBS_SUPPORT */ + } + + failure: return -1; success: +#ifdef MBS_SUPPORT + if (mb_cur_max > 1 && !using_utf8) + { + end = beg + len; + while (end < buf + size) + { + size_t mlen = mbrlen (end, buf + size - end, &mbs); + if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0) + { + memset (&mbs, '\0', sizeof (mbstate_t)); + mlen = 1; + } + if (mlen == 1 && *end == eol) + break; + + end += mlen; + } + } + else +#endif /* MBS_SUPPORT */ end = memchr (beg + len, eol, (buf + size) - (beg + len)); + end++; while (buf < beg && beg[-1] != eol) --beg; - *match_size = end - beg; -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - free (mb_properties); -#endif /* MBS_SUPPORT */ + len = end - beg; + /* FALLTHROUGH */ + + success_in_beg_and_len: + *match_size = len; return beg - buf; } diff -urN grep-2.5.1a.orig/src/search.c.orig grep-2.5.1a/src/search.c.orig --- grep-2.5.1a.orig/src/search.c.orig 1970-01-01 05:00:00.000000000 +0500 +++ grep-2.5.1a/src/search.c.orig 2005-10-23 09:48:39.000000000 +0600 @@ -0,0 +1,714 @@ +/* search.c - searching subroutines using dfa, kwset and regex for grep. + Copyright 1992, 1998, 2000 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+   02111-1307, USA. */
+
+/* Written August 1992 by Mike Haertel. */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+#include <sys/types.h>
+#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
+/* We can handle multibyte string. */
+# define MBS_SUPPORT
+# include <wchar.h>
+# include <wctype.h>
+#endif
+
+#include "system.h"
+#include "grep.h"
+#include "regex.h"
+#include "dfa.h"
+#include "kwset.h"
+#include "error.h"
+#include "xalloc.h"
+#ifdef HAVE_LIBPCRE
+# include <pcre.h>
+#endif
+
+#define NCHAR (UCHAR_MAX + 1)
+
+/* For -w, we also consider _ to be word constituent. */
+#define WCHAR(C) (ISALNUM(C) || (C) == '_')
+
+/* DFA compiled regexp. */
+static struct dfa dfa;
+
+/* The Regex compiled patterns. */
+static struct patterns
+{
+  /* Regex compiled regexp. */
+  struct re_pattern_buffer regexbuf;
+  struct re_registers regs; /* This is here on account of a BRAIN-DEAD
+                               Q@#%!# library interface in regex.c. */
+} patterns0;
+
+struct patterns *patterns;
+size_t pcount;
+
+/* KWset compiled pattern. For Ecompile and Gcompile, we compile
+   a list of strings, at least one of which is known to occur in
+   any string matching the regexp. */
+static kwset_t kwset;
+
+/* Number of compiled fixed strings known to exactly match the regexp.
+   If kwsexec returns < kwset_exact_matches, then we don't need to
+   call the regexp matcher at all. */
+static int kwset_exact_matches;
+
+#if defined(MBS_SUPPORT)
+static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
+#endif
+static void kwsinit PARAMS ((void));
+static void kwsmusts PARAMS ((void));
+static void Gcompile PARAMS ((char const *, size_t));
+static void Ecompile PARAMS ((char const *, size_t));
+static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int ));
+static void Fcompile PARAMS ((char const *, size_t));
+static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int));
+static void Pcompile PARAMS ((char const *, size_t ));
+static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
+
+void
+dfaerror (char const *mesg)
+{
+  error (2, 0, "%s", mesg); /* MESG is data, never the format string. */
+}
+
+static void
+kwsinit (void)
+{
+  static char trans[NCHAR];
+  int i;
+
+  if (match_icase)
+    for (i = 0; i < NCHAR; ++i)
+      trans[i] = TOLOWER (i);
+
+  if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0)))
+    error (2, 0, _("memory exhausted"));
+}
+
+/* If the DFA turns out to have some set of fixed strings one of
+   which must occur in the match, then we build a kwset matcher
+   to find those strings, and thus quickly filter out impossible
+   matches. */
+static void
+kwsmusts (void)
+{
+  struct dfamust const *dm;
+  char const *err;
+
+  if (dfa.musts)
+    {
+      kwsinit ();
+      /* First, we compile in the substrings known to be exact
+         matches. The kwset matcher will return the index
+         of the matching string that it chooses. */
+      for (dm = dfa.musts; dm; dm = dm->next)
+        {
+          if (!dm->exact)
+            continue;
+          ++kwset_exact_matches;
+          if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
+            error (2, 0, "%s", err);
+        }
+      /* Now, we compile the substrings that will require
+         the use of the regexp matcher. */
+      for (dm = dfa.musts; dm; dm = dm->next)
+        {
+          if (dm->exact)
+            continue;
+          if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
+            error (2, 0, "%s", err);
+        }
+      if ((err = kwsprep (kwset)) != 0)
+        error (2, 0, "%s", err);
+    }
+}
+
+#ifdef MBS_SUPPORT
+/* Allocate and return an array with one entry per byte of BUF (SIZE
+   bytes). An entry is nonzero (the character's byte length) at a
+   single-byte character or at the first byte of a multibyte character,
+   and zero at every other position. Caller must free the array. */
+static char*
+check_multibyte_string(char const *buf, size_t size)
+{
+  char *mb_properties = xmalloc(size); /* checked allocator from xalloc.h */
+  mbstate_t cur_state;
+  size_t i; /* same type as SIZE, so the loop bound cannot wrap */
+  memset(&cur_state, 0, sizeof(mbstate_t));
+  memset(mb_properties, 0, sizeof(char)*size);
+  for (i = 0; i < size ;)
+    {
+      size_t mbclen;
+      mbclen = mbrlen(buf + i, size - i, &cur_state);
+
+      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
+        {
+          /* An invalid sequence, or a truncated multibyte character.
+             We treat it as a singlebyte character. */
+          mbclen = 1;
+        }
+      mb_properties[i] = mbclen;
+      i += mbclen;
+    }
+
+  return mb_properties;
+}
+#endif
+
+static void
+Gcompile (char const *pattern, size_t size)
+{
+  const char *err;
+  char const *sep;
+  size_t total = size;
+  char const *motif = pattern;
+
+  re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
+  dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
+
+  /* For GNU regex compiler we have to pass the patterns separately to detect
+     errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
+     GNU regex should have raised a syntax error. The same for backref, where
+     the backref should have been local to each pattern. */
+  do
+    {
+      size_t len;
+      sep = memchr (motif, '\n', total);
+      if (sep)
+        {
+          len = sep - motif;
+          sep++;
+          total -= (len + 1);
+        }
+      else
+        {
+          len = total;
+          total = 0;
+        }
+
+      patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
+      if (patterns == NULL)
+        error (2, errno, _("memory exhausted"));
+
+      patterns[pcount] = patterns0;
+
+      if ((err = re_compile_pattern (motif, len,
+                                    &(patterns[pcount].regexbuf))) != 0)
+        error (2, 0, "%s", err);
+      pcount++;
+
+      motif = sep;
+    } while (sep && total != 0);
+
+  /* In the match_words and match_lines cases, we use a different pattern
+     for the DFA matcher that will quickly throw out cases that won't work.
+     Then if DFA succeeds we do some hairy stuff using the regex matcher
+     to decide whether the match should really count. */
+  if (match_words || match_lines)
+    {
+      /* In the whole-word case, we use the pattern:
+         \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]\|$\).
+         In the whole-line case, we use the pattern:
+         ^\(userpattern\)$. */
+
+      static char const line_beg[] = "^\\(";
+      static char const line_end[] = "\\)$";
+      static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
+      static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
+      char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end);
+      size_t i;
+      strcpy (n, match_lines ? line_beg : word_beg);
+      i = strlen (n);
+      memcpy (n + i, pattern, size);
+      i += size;
+      strcpy (n + i, match_lines ? line_end : word_end);
+      i += strlen (n + i);
+      pattern = n;
+      size = i;
+    }
+
+  dfacomp (pattern, size, &dfa, 1);
+  kwsmusts ();
+}
+
+static void
+Ecompile (char const *pattern, size_t size)
+{
+  const char *err;
+  const char *sep;
+  size_t total = size;
+  char const *motif = pattern;
+
+  if (strcmp (matcher, "awk") == 0)
+    {
+      re_set_syntax (RE_SYNTAX_AWK);
+      dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
+    }
+  else
+    {
+      re_set_syntax (RE_SYNTAX_POSIX_EGREP);
+      dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
+    }
+
+  /* For GNU regex compiler we have to pass the patterns separately to detect
+     errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
+     GNU regex should have raised a syntax error. The same for backref, where
+     the backref should have been local to each pattern. */
+  do
+    {
+      size_t len;
+      sep = memchr (motif, '\n', total);
+      if (sep)
+        {
+          len = sep - motif;
+          sep++;
+          total -= (len + 1);
+        }
+      else
+        {
+          len = total;
+          total = 0;
+        }
+
+      patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
+      if (patterns == NULL)
+        error (2, errno, _("memory exhausted"));
+      patterns[pcount] = patterns0;
+
+      if ((err = re_compile_pattern (motif, len,
+                                    &(patterns[pcount].regexbuf))) != 0)
+        error (2, 0, "%s", err);
+      pcount++;
+
+      motif = sep;
+    } while (sep && total != 0);
+
+  /* In the match_words and match_lines cases, we use a different pattern
+     for the DFA matcher that will quickly throw out cases that won't work.
+     Then if DFA succeeds we do some hairy stuff using the regex matcher
+     to decide whether the match should really count. */
+  if (match_words || match_lines)
+    {
+      /* In the whole-word case, we use the pattern:
+         (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
+         In the whole-line case, we use the pattern:
+         ^(userpattern)$. */
+
+      static char const line_beg[] = "^(";
+      static char const line_end[] = ")$";
+      static char const word_beg[] = "(^|[^[:alnum:]_])(";
+      static char const word_end[] = ")([^[:alnum:]_]|$)";
+      char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end);
+      size_t i;
+      strcpy (n, match_lines ? line_beg : word_beg);
+      i = strlen(n);
+      memcpy (n + i, pattern, size);
+      i += size;
+      strcpy (n + i, match_lines ? line_end : word_end);
+      i += strlen (n + i);
+      pattern = n;
+      size = i;
+    }
+
+  dfacomp (pattern, size, &dfa, 1);
+  kwsmusts ();
+}
+
+static size_t
+EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
+{
+  register char const *buflim, *beg, *end;
+  char eol = eolbyte;
+  int backref, start, len;
+  struct kwsmatch kwsm;
+  size_t i;
+#ifdef MBS_SUPPORT
+  char *mb_properties = NULL;
+#endif /* MBS_SUPPORT */
+
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1 && kwset)
+    mb_properties = check_multibyte_string(buf, size);
+#endif /* MBS_SUPPORT */
+
+  buflim = buf + size;
+
+  for (beg = end = buf; end < buflim; beg = end)
+    {
+      if (!exact)
+        {
+          if (kwset)
+            {
+              /* Find a possible match using the KWset matcher. */
+              size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
+              if (offset == (size_t) -1)
+                goto failure;
+              beg += offset;
+              /* Narrow down to the line containing the candidate, and
+                 run it through DFA. */
+              end = memchr(beg, eol, buflim - beg);
+              end = end ? end + 1 : buflim; /* no EOL: stop at buffer end */
+#ifdef MBS_SUPPORT
+              if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
+                continue;
+#endif
+              while (beg > buf && beg[-1] != eol)
+                --beg;
+              if (kwsm.index < kwset_exact_matches)
+                goto success_in_beg_and_end;
+              if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
+                continue;
+            }
+          else
+            {
+              /* No good fixed strings; start with DFA. */
+              size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
+              if (offset == (size_t) -1)
+                break;
+              /* Narrow down to the line we've found. */
+              beg += offset;
+              end = memchr (beg, eol, buflim - beg);
+              end = end ? end + 1 : buflim; /* no EOL: stop at buffer end */
+              while (beg > buf && beg[-1] != eol)
+                --beg;
+            }
+          /* Successful, no backreferences encountered! */
+          if (!backref)
+            goto success_in_beg_and_end;
+        }
+      else
+        end = beg + size;
+
+      /* If we've made it to this point, this means DFA has seen
+         a probable match, and we need to run it through Regex. */
+      for (i = 0; i < pcount; i++)
+        {
+          patterns[i].regexbuf.not_eol = 0;
+          if (0 <= (start = re_search (&(patterns[i].regexbuf), beg,
+                                       end - beg - 1, 0,
+                                       end - beg - 1, &(patterns[i].regs))))
+            {
+              len = patterns[i].regs.end[0] - start;
+              if (exact && !match_words)
+                goto success_in_start_and_len;
+              if ((!match_lines && !match_words)
+                  || (match_lines && len == end - beg - 1))
+                goto success_in_beg_and_end;
+              /* If -w, check if the match aligns with word boundaries.
+                 We do this iteratively because:
+                 (a) the line may contain more than one occurrence of the
+                 pattern, and
+                 (b) Several alternatives in the pattern might be valid at a
+                 given point, and we may need to consider a shorter one to
+                 find a word boundary. */
+              if (match_words)
+                while (start >= 0)
+                  {
+                    if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
+                        && (len == end - beg - 1
+                            || !WCHAR ((unsigned char) beg[start + len])))
+                      goto success_in_beg_and_end;
+                    if (len > 0)
+                      {
+                        /* Try a shorter length anchored at the same place. */
+                        --len;
+                        patterns[i].regexbuf.not_eol = 1;
+                        len = re_match (&(patterns[i].regexbuf), beg,
+                                        start + len, start,
+                                        &(patterns[i].regs));
+                      }
+                    if (len <= 0)
+                      {
+                        /* Try looking further on. */
+                        if (start == end - beg - 1)
+                          break;
+                        ++start;
+                        patterns[i].regexbuf.not_eol = 0;
+                        start = re_search (&(patterns[i].regexbuf), beg,
+                                           end - beg - 1,
+                                           start, end - beg - 1 - start,
+                                           &(patterns[i].regs));
+                        len = patterns[i].regs.end[0] - start;
+                      }
+                  }
+            }
+        } /* for Regex patterns. */
+    } /* for (beg = end ..) */
+
+ failure:
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1 && mb_properties)
+    free (mb_properties);
+#endif /* MBS_SUPPORT */
+  return (size_t) -1;
+
+ success_in_beg_and_end:
+  len = end - beg;
+  start = beg - buf;
+  /* FALLTHROUGH */
+
+ success_in_start_and_len:
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1 && mb_properties)
+    free (mb_properties);
+#endif /* MBS_SUPPORT */
+  *match_size = len;
+  return start;
+}
+
+static void
+Fcompile (char const *pattern, size_t size)
+{
+  char const *beg, *lim, *err;
+
+  kwsinit ();
+  beg = pattern;
+  do
+    {
+      for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
+        ;
+      if ((err = kwsincr (kwset, beg, lim - beg)) != 0)
+        error (2, 0, "%s", err);
+      if (lim < pattern + size)
+        ++lim;
+      beg = lim;
+    }
+  while (beg < pattern + size);
+
+  if ((err = kwsprep (kwset)) != 0)
+    error (2, 0, "%s", err);
+}
+
+static size_t
+Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
+{
+  register char const *beg, *try, *end;
+  register size_t len;
+  char eol = eolbyte;
+  struct kwsmatch kwsmatch;
+#ifdef MBS_SUPPORT
+  char *mb_properties = NULL; /* only allocated when MB_CUR_MAX > 1 */
+  if (MB_CUR_MAX > 1)
+    mb_properties = check_multibyte_string (buf, size);
+#endif /* MBS_SUPPORT */
+
+  for (beg = buf; beg <= buf + size; ++beg)
+    {
+      size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
+      if (offset == (size_t) -1)
+        goto failure;
+#ifdef MBS_SUPPORT
+      if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
+        continue; /* It is a part of multibyte character. */
+#endif /* MBS_SUPPORT */
+      beg += offset;
+      len = kwsmatch.size[0];
+      if (exact && !match_words)
+        goto success_in_beg_and_len;
+      if (match_lines)
+        {
+          if (beg > buf && beg[-1] != eol)
+            continue;
+          if (beg + len < buf + size && beg[len] != eol)
+            continue;
+          goto success;
+        }
+      else if (match_words)
+        for (try = beg; len; )
+          {
+            if (try > buf && WCHAR((unsigned char) try[-1]))
+              break;
+            if (try + len < buf + size && WCHAR((unsigned char) try[len]))
+              {
+                offset = kwsexec (kwset, beg, --len, &kwsmatch);
+                if (offset == (size_t) -1)
+                  {
+#ifdef MBS_SUPPORT
+                    if (MB_CUR_MAX > 1)
+                      free (mb_properties);
+#endif /* MBS_SUPPORT */
+                    return offset;
+                  }
+                try = beg + offset;
+                len = kwsmatch.size[0];
+              }
+            else
+              goto success;
+          }
+      else
+        goto success;
+    }
+
+ failure:
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    free (mb_properties);
+#endif /* MBS_SUPPORT */
+  return (size_t) -1;
+
+ success:
+  end = memchr (beg + len, eol, (buf + size) - (beg + len));
+  end = end ? end + 1 : buf + size; /* no EOL: stop at buffer end */
+  while (buf < beg && beg[-1] != eol)
+    --beg;
+  len = end - beg;
+  /* FALLTHROUGH */
+
+ success_in_beg_and_len:
+  *match_size = len;
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    free (mb_properties);
+#endif /* MBS_SUPPORT */
+  return beg - buf;
+}
+
+#if HAVE_LIBPCRE
+/* Compiled internal form of a Perl regular expression. */
+static pcre *cre;
+
+/* Additional information about the pattern. */
+static pcre_extra *extra;
+#endif
+
+static void
+Pcompile (char const *pattern, size_t size)
+{
+#if !HAVE_LIBPCRE
+  error (2, 0, _("The -P option is not supported"));
+#else
+  int e;
+  char const *ep;
+  char *re = xmalloc (4 * size + 7);
+  int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
+  char const *patlim = pattern + size;
+  char *n = re;
+  char const *p;
+  char const *pnul;
+
+  /* FIXME: Remove this restriction. */
+  if (eolbyte != '\n')
+    error (2, 0, _("The -P and -z options cannot be combined"));
+
+  *n = '\0';
+  if (match_lines)
+    strcpy (n, "^(");
+  if (match_words)
+    strcpy (n, "\\b(");
+  n += strlen (n);
+
+  /* The PCRE interface doesn't allow NUL bytes in the pattern, so
+     replace each NUL byte in the pattern with the four characters
+     "\000", removing a preceding backslash if there are an odd
+     number of backslashes before the NUL.
+
+     FIXME: This method does not work with some multibyte character
+     encodings, notably Shift-JIS, where a multibyte character can end
+     in a backslash byte. */
+  for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
+    {
+      memcpy (n, p, pnul - p);
+      n += pnul - p;
+      for (p = pnul; pattern < p && p[-1] == '\\'; p--)
+        continue;
+      n -= (pnul - p) & 1;
+      strcpy (n, "\\000");
+      n += 4;
+    }
+
+  memcpy (n, p, patlim - p);
+  n += patlim - p;
+  *n = '\0';
+  if (match_words)
+    strcpy (n, ")\\b");
+  if (match_lines)
+    strcpy (n, ")$");
+
+  cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
+  if (!cre)
+    error (2, 0, "%s", ep);
+
+  extra = pcre_study (cre, 0, &ep);
+  if (ep)
+    error (2, 0, "%s", ep);
+
+  free (re);
+#endif
+}
+
+static size_t
+Pexecute (char const *buf, size_t size, size_t *match_size, int exact)
+{
+#if !HAVE_LIBPCRE
+  abort ();
+  return (size_t) -1;
+#else
+  /* This array must have at least two elements; everything after that
+     is just for performance improvement in pcre_exec. */
+  int sub[300];
+
+  int e = pcre_exec (cre, extra, buf, size, 0, 0,
+                     sub, sizeof sub / sizeof *sub);
+
+  if (e <= 0)
+    {
+      switch (e)
+        {
+        case PCRE_ERROR_NOMATCH:
+          return (size_t) -1;
+
+        case PCRE_ERROR_NOMEMORY:
+          error (2, 0, _("Memory exhausted"));
+
+        default:
+          abort ();
+        }
+    }
+  else
+    {
+      /* Narrow down to the line we've found. */
+      char const *beg = buf + sub[0];
+      char const *end = buf + sub[1];
+      char const *buflim = buf + size;
+      char eol = eolbyte;
+      if (!exact)
+        {
+          end = memchr (end, eol, buflim - end);
+          end = end ? end + 1 : buflim; /* no EOL: stop at buffer end */
+          while (buf < beg && beg[-1] != eol)
+            --beg;
+        }
+
+      *match_size = end - beg;
+      return beg - buf;
+    }
+#endif
+}
+
+struct matcher const matchers[] = {
+  { "default", Gcompile, EGexecute },
+  { "grep", Gcompile, EGexecute },
+  { "egrep", Ecompile, EGexecute },
+  { "awk", Ecompile, EGexecute },
+  { "fgrep", Fcompile, Fexecute },
+  { "perl", Pcompile, Pexecute },
+  { "", 0, 0 },
+};
diff -urN grep-2.5.1a.orig/tests/fmbtest.sh grep-2.5.1a/tests/fmbtest.sh
--- grep-2.5.1a.orig/tests/fmbtest.sh 1970-01-01 05:00:00.000000000 +0500
+++ grep-2.5.1a/tests/fmbtest.sh 2005-10-23 09:51:12.000000000 +0600
@@ -0,0 +1,111 @@
+#!/bin/sh
+
+: ${srcdir=.}
+
+# If cs_CZ.UTF-8 locale doesn't work, skip this test silently
+LC_ALL=cs_CZ.UTF-8 locale -k LC_CTYPE 2>/dev/null | ${GREP} -q charmap.*UTF-8 \
+  || exit 77
+
+failures=0
+
+cat > csinput < cspatfile <