Submitted By: Ken Moffat Date: 2008-02-19 Initial Package Version: 2.5.3 Upstream Status: uncertain Origin: from debian. Description: Various fixes, particularly speed improvements for UTF-8 locales. Also adds a 'standard input' marker into the results for certain obscure uses. diff -Naur grep-2.5.3.orig/lib/posix/regex.h grep-2.5.3.lfs/lib/posix/regex.h --- grep-2.5.3.orig/lib/posix/regex.h 2007-06-28 19:57:18.000000000 +0100 +++ grep-2.5.3.lfs/lib/posix/regex.h 2008-02-10 18:56:07.000000000 +0000 @@ -165,6 +165,10 @@ treated as 'a\{1'. */ #define RE_INVALID_INTERVAL_ORD (RE_DEBUG << 1) +/* If this bit is set, then ignore case when matching. + If not set, then case is significant. */ +#define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1) + /* This global variable defines the particular regexp syntax to use (for some interfaces). When a regexp is compiled, the syntax used is stored in the pattern buffer, so changing this does not affect diff -Naur grep-2.5.3.orig/src/dfa.c grep-2.5.3.lfs/src/dfa.c --- grep-2.5.3.orig/src/dfa.c 2007-06-28 19:57:19.000000000 +0100 +++ grep-2.5.3.lfs/src/dfa.c 2008-02-10 18:55:29.000000000 +0000 @@ -594,6 +594,17 @@ /* build character class. */ { wctype_t wt; + /* NOTE: + * when case_fold, character class [:upper:] and [:lower:] + * should be treated as [:alpha:], this is the same way + * of glibc/posix/regcomp.c:build_charclass(). + * reported by Bug#276202 + * - fixed by Fumitoshi UKAI + */ + if (case_fold + && (strcmp (str, "upper") == 0 || strcmp (str, "lower") == 0)) + strcpy (str, "alpha"); + /* Query the character class as wctype_t. */ wt = wctype (str); @@ -681,6 +692,29 @@ REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t, range_ends_al, work_mbc->nranges + 1); work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2; + if (case_fold + && (iswlower((wint_t)wc) || iswupper((wint_t)wc)) + && (iswlower((wint_t)wc2) || iswupper((wint_t)wc2))) { + wint_t altcase; + altcase = wc; + if (iswlower((wint_t)wc)) + altcase = towupper((wint_t)wc); + else + altcase = towlower((wint_t)wc); + REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t, + range_sts_al, work_mbc->nranges + 1); + work_mbc->range_sts[work_mbc->nranges] = (wchar_t)altcase; + + altcase = wc2; + if (iswlower((wint_t)wc2)) + altcase = towupper((wint_t)wc2); + else + altcase = towlower((wint_t)wc2); + REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t, + range_ends_al, work_mbc->nranges + 1); + work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)altcase; + + } } else if (wc != WEOF) /* build normal characters. */ @@ -688,6 +722,20 @@ REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, work_mbc->nchars + 1); work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc; + if (case_fold && (iswlower((wint_t) wc) || iswupper((wint_t) wc))) + { + wint_t altcase; + + altcase = wc; /* keeps compiler happy */ + if (iswlower((wint_t) wc)) + altcase = towupper((wint_t) wc); + else if (iswupper((wint_t) wc)) + altcase = towlower((wint_t) wc); + + REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, + work_mbc->nchars + 1); + work_mbc->chars[work_mbc->nchars++] = (wchar_t) altcase; + } } } while ((wc = wc1) != L']'); diff -Naur grep-2.5.3.orig/src/grep.c grep-2.5.3.lfs/src/grep.c --- grep-2.5.3.orig/src/grep.c 2007-06-28 19:57:19.000000000 +0100 +++ grep-2.5.3.lfs/src/grep.c 2008-02-10 18:54:53.000000000 +0000 @@ -274,6 +274,12 @@ #endif ; +/* Default for `file_list' if no files are given on the command line. */ +static char *stdin_argv[] = +{ + "-", NULL +}; + /* Non-boolean long options that have no corresponding short equivalents. */ enum { @@ -534,7 +540,16 @@ for byte sentinels fore and aft. */ newalloc = newsize + pagesize + 1; - newbuf = bufalloc < newalloc ? xmalloc (bufalloc = newalloc) : buffer; + newbuf = bufalloc < newalloc ? malloc (bufalloc = newalloc) : buffer; + if (newbuf == NULL) + { + int saved_errno = errno; + free (buffer); + bufalloc = ALIGN_TO (INITIAL_BUFSIZE, pagesize) + pagesize + 1; + buffer = xmalloc (bufalloc); + errno = saved_errno; + return 0; + } readbuf = ALIGN_TO (newbuf + 1 + save, pagesize); bufbeg = readbuf - save; memmove (bufbeg, buffer + saved_offset, save); @@ -1825,6 +1840,7 @@ FILE *fp; extern char *optarg; extern int optind; + char **file_list; initialize_main (&argc, &argv); program_name = argv[0]; @@ -2244,29 +2260,29 @@ if (max_count == 0) exit (1); - if (optind < argc) + file_list = (optind == argc ? stdin_argv : &argv[optind]); + + status = 1; + while (1) { - status = 1; - do + char *file = *file_list++; + + if (file == NULL) + break; + + if ((included_patterns || excluded_patterns) + && !isdir (file)) { - char *file = argv[optind]; - if ((included_patterns || excluded_patterns) - && !isdir (file)) - { - if (included_patterns && - ! excluded_filename (included_patterns, file, 0)) - continue; - if (excluded_patterns && - excluded_filename (excluded_patterns, file, 0)) - continue; - } - status &= grepfile (strcmp (file, "-") == 0 ? (char *) NULL : file, - &stats_base); + if (included_patterns && + ! excluded_filename (included_patterns, file, 0)) + continue; + if (excluded_patterns && + excluded_filename (excluded_patterns, file, 0)) + continue; } - while ( ++optind < argc); + status &= grepfile (strcmp (file, "-") == 0 + ? (char *) NULL : file, &stats_base); } - else - status = grepfile ((char *) NULL, &stats_base); /* We register via atexit() to test stdout. */ exit (errseen ? 2 : status); diff -Naur grep-2.5.3.orig/src/search.c grep-2.5.3.lfs/src/search.c --- grep-2.5.3.orig/src/search.c 2007-06-28 19:57:19.000000000 +0100 +++ grep-2.5.3.lfs/src/search.c 2008-02-10 18:56:18.000000000 +0000 @@ -18,10 +18,15 @@ /* Written August 1992 by Mike Haertel. */ +#ifndef _GNU_SOURCE +# define _GNU_SOURCE 1 +#endif #ifdef HAVE_CONFIG_H # include #endif +#include + #include #include "mbsupport.h" @@ -43,6 +48,9 @@ #ifdef HAVE_LIBPCRE # include #endif +#ifdef HAVE_LANGINFO_CODESET +# include +#endif #define NCHAR (UCHAR_MAX + 1) @@ -68,6 +76,19 @@ error (2, 0, _("memory exhausted")); } +/* UTF-8 encoding allows some optimizations that we can't otherwise + assume in a multibyte encoding. */ +static int using_utf8; + +void +check_utf8 (void) +{ +#ifdef HAVE_LANGINFO_CODESET + if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0) + using_utf8 = 1; +#endif +} + #ifndef FGREP_PROGRAM /* DFA compiled regexp. */ static struct dfa dfa; @@ -134,49 +155,6 @@ } #endif /* !FGREP_PROGRAM */ -#ifdef MBS_SUPPORT -/* This function allocate the array which correspond to "buf". - Then this check multibyte string and mark on the positions which - are not single byte character nor the first byte of a multibyte - character. Caller must free the array. */ -static char* -check_multibyte_string(char const *buf, size_t size) -{ - char *mb_properties = xmalloc(size); - mbstate_t cur_state; - wchar_t wc; - int i; - - memset(&cur_state, 0, sizeof(mbstate_t)); - memset(mb_properties, 0, sizeof(char)*size); - - for (i = 0; i < size ;) - { - size_t mbclen; - mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state); - - if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) - { - /* An invalid sequence, or a truncated multibyte character. - We treat it as a single byte character. */ - mbclen = 1; - } - else if (match_icase) - { - if (iswupper((wint_t)wc)) - { - wc = towlower((wint_t)wc); - wcrtomb(buf + i, wc, &cur_state); - } - } - mb_properties[i] = mbclen; - i += mbclen; - } - - return mb_properties; -} -#endif /* MBS_SUPPORT */ - #if defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) #ifdef EGREP_PROGRAM COMPILE_FCT(Ecompile) @@ -193,10 +171,9 @@ size_t total = size; char const *motif = pattern; -#if 0 + check_utf8 (); if (match_icase) syntax_bits |= RE_ICASE; -#endif re_set_syntax (syntax_bits); dfasyntax (syntax_bits, match_icase, eolbyte); @@ -301,23 +278,35 @@ char eol = eolbyte; int backref, start, len, best_len; struct kwsmatch kwsm; + static int use_dfa; + static int use_dfa_checked = 0; size_t i, ret_val; #ifdef MBS_SUPPORT - char *mb_properties = NULL; - if (MB_CUR_MAX > 1) + const char *last_char = NULL; + int mb_cur_max = MB_CUR_MAX; + mbstate_t mbs; + memset (&mbs, '\0', sizeof (mbstate_t)); +#endif /* MBS_SUPPORT */ + + if (!use_dfa_checked) { - if (match_icase) - { - char *case_buf = xmalloc(size); - memcpy(case_buf, buf, size); - if (start_ptr) - start_ptr = case_buf + (start_ptr - buf); - buf = case_buf; - } - if (kwset) - mb_properties = check_multibyte_string(buf, size); - } + char *grep_use_dfa = getenv ("GREP_USE_DFA"); + if (!grep_use_dfa) + { +#ifdef MBS_SUPPORT + /* Turn off DFA when processing multibyte input. */ + use_dfa = (MB_CUR_MAX == 1); +#else + use_dfa = 1; #endif /* MBS_SUPPORT */ + } + else + { + use_dfa = atoi (grep_use_dfa); + } + + use_dfa_checked = 1; + } buflim = buf + size; @@ -329,40 +318,123 @@ if (kwset) { /* Find a possible match using the KWset matcher. */ - size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); +#ifdef MBS_SUPPORT + size_t bytes_left = 0; +#endif /* MBS_SUPPORT */ + size_t offset; +#ifdef MBS_SUPPORT + /* kwsexec doesn't work with match_icase and multibyte input. */ + if (match_icase && mb_cur_max > 1) + /* Avoid kwset */ + offset = 0; + else +#endif /* MBS_SUPPORT */ + offset = kwsexec (kwset, beg, buflim - beg, &kwsm); if (offset == (size_t) -1) - goto failure; + return (size_t)-1; +#ifdef MBS_SUPPORT + if (mb_cur_max > 1 && !using_utf8) + { + bytes_left = offset; + while (bytes_left) + { + size_t mlen = mbrlen (beg, bytes_left, &mbs); + + last_char = beg; + if (mlen == (size_t) -1 || mlen == 0) + { + /* Incomplete character: treat as single-byte. */ + memset (&mbs, '\0', sizeof (mbstate_t)); + beg++; + bytes_left--; + continue; + } + + if (mlen == (size_t) -2) + /* Offset points inside multibyte character: + * no good. */ + break; + + beg += mlen; + bytes_left -= mlen; + } + } + else +#endif /* MBS_SUPPORT */ beg += offset; /* Narrow down to the line containing the candidate, and run it through DFA. */ end = memchr(beg, eol, buflim - beg); end++; #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) + if (mb_cur_max > 1 && bytes_left) continue; #endif while (beg > buf && beg[-1] != eol) --beg; - if (kwsm.index < kwset_exact_matches) + if ( +#ifdef MBS_SUPPORT + !(match_icase && mb_cur_max > 1) && +#endif /* MBS_SUPPORT */ + (kwsm.index < kwset_exact_matches)) goto success; - if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) + if (use_dfa && + dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) continue; } else { /* No good fixed strings; start with DFA. */ - size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); +#ifdef MBS_SUPPORT + size_t bytes_left = 0; +#endif /* MBS_SUPPORT */ + size_t offset = 0; + if (use_dfa) + offset = dfaexec (&dfa, beg, buflim - beg, &backref); if (offset == (size_t) -1) break; /* Narrow down to the line we've found. */ +#ifdef MBS_SUPPORT + if (mb_cur_max > 1 && !using_utf8) + { + bytes_left = offset; + while (bytes_left) + { + size_t mlen = mbrlen (beg, bytes_left, &mbs); + + last_char = beg; + if (mlen == (size_t) -1 || mlen == 0) + { + /* Incomplete character: treat as single-byte. */ + memset (&mbs, '\0', sizeof (mbstate_t)); + beg++; + bytes_left--; + continue; + } + + if (mlen == (size_t) -2) + /* Offset points inside multibyte character: + * no good. */ + break; + + beg += mlen; + bytes_left -= mlen; + } + } + else +#endif /* MBS_SUPPORT */ beg += offset; end = memchr (beg, eol, buflim - beg); end++; +#ifdef MBS_SUPPORT + if (mb_cur_max > 1 && bytes_left) + continue; +#endif /* MBS_SUPPORT */ while (beg > buf && beg[-1] != eol) --beg; } /* Successful, no backreferences encountered! */ - if (!backref) + if (use_dfa && !backref) goto success; } else @@ -408,10 +480,84 @@ if (match_words) while (match <= best_match) { - if ((match == buf || !WCHAR ((unsigned char) match[-1])) - && (len == end - beg - 1 - || !WCHAR ((unsigned char) match[len]))) - goto assess_pattern_match; + int lword_match = 0; + if (match == buf) + lword_match = 1; + else + { + assert (start > 0); +#ifdef MBS_SUPPORT + if (mb_cur_max > 1) + { + const char *s; + int mr; + wchar_t pwc; + if (using_utf8) + { + s = match - 1; + while (s > buf + && (unsigned char) *s >= 0x80 + && (unsigned char) *s <= 0xbf) + --s; + } + else + s = last_char; + mr = mbtowc (&pwc, s, match - s); + if (mr <= 0) + { + memset (&mbs, '\0', sizeof (mbstate_t)); + lword_match = 1; + } + else if (!(iswalnum (pwc) || pwc == L'_') + && mr == (int) (match - s)) + lword_match = 1; + } + else +#endif /* MBS_SUPPORT */ + if (!WCHAR ((unsigned char) match[-1])) + lword_match = 1; + } + + if (lword_match) + { + int rword_match = 0; + if (start + len == end - beg - 1) + rword_match = 1; + else + { +#ifdef MBS_SUPPORT + if (mb_cur_max > 1) + { + wchar_t nwc; + int mr; + + mr = mbtowc (&nwc, buf + start + len, + end - buf - start - len - 1); + if (mr <= 0) + { + memset (&mbs, '\0', sizeof (mbstate_t)); + rword_match = 1; + } + else if (!iswalnum (nwc) && nwc != L'_') + rword_match = 1; + } + else +#endif /* MBS_SUPPORT */ + if (!WCHAR ((unsigned char) match[len])) + rword_match = 1; + } + + if (rword_match) + { + if (!start_ptr) + /* Returns the whole line. */ + goto success; + else + { + goto assess_pattern_match; + } + } + } if (len > 0) { /* Try a shorter length anchored at the same place. */ @@ -475,24 +621,144 @@ *match_size = len; ret_val = beg - buf; out: -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - if (match_icase) - free((char*)buf); - if (mb_properties) - free(mb_properties); - } -#endif /* MBS_SUPPORT */ return ret_val; } #endif /* defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) */ +#ifdef MBS_SUPPORT +static int f_i_multibyte; /* whether we're using the new -Fi MB method */ +static struct +{ + wchar_t **patterns; + size_t count, maxlen; + unsigned char *match; +} Fimb; +#endif + #if defined(GREP_PROGRAM) || defined(FGREP_PROGRAM) COMPILE_FCT(Fcompile) { + int mb_cur_max = MB_CUR_MAX; char const *beg, *lim, *err; + check_utf8 (); +#ifdef MBS_SUPPORT + /* Support -F -i for UTF-8 input. */ + if (match_icase && mb_cur_max > 1) + { + mbstate_t mbs; + wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t)); + const char *patternend = pattern; + size_t wcsize; + kwset_t fimb_kwset = NULL; + char *starts = NULL; + wchar_t *wcbeg, *wclim; + size_t allocated = 0; + + memset (&mbs, '\0', sizeof (mbs)); +# ifdef __GNU_LIBRARY__ + wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs); + if (patternend != pattern + size) + wcsize = (size_t) -1; +# else + { + char *patterncopy = xmalloc (size + 1); + + memcpy (patterncopy, pattern, size); + patterncopy[size] = '\0'; + patternend = patterncopy; + wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs); + if (patternend != patterncopy + size) + wcsize = (size_t) -1; + free (patterncopy); + } +# endif + if (wcsize + 2 <= 2) + { +fimb_fail: + free (wcpattern); + free (starts); + if (fimb_kwset) + kwsfree (fimb_kwset); + free (Fimb.patterns); + Fimb.patterns = NULL; + } + else + { + if (!(fimb_kwset = kwsalloc (NULL))) + error (2, 0, _("memory exhausted")); + + starts = xmalloc (mb_cur_max * 3); + wcbeg = wcpattern; + do + { + int i; + size_t wclen; + + if (Fimb.count >= allocated) + { + if (allocated == 0) + allocated = 128; + else + allocated *= 2; + Fimb.patterns = xrealloc (Fimb.patterns, + sizeof (wchar_t *) * allocated); + } + Fimb.patterns[Fimb.count++] = wcbeg; + for (wclim = wcbeg; + wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim) + *wclim = towlower (*wclim); + *wclim = L'\0'; + wclen = wclim - wcbeg; + if (wclen > Fimb.maxlen) + Fimb.maxlen = wclen; + if (wclen > 3) + wclen = 3; + if (wclen == 0) + { + if ((err = kwsincr (fimb_kwset, "", 0)) != 0) + error (2, 0, err); + } + else + for (i = 0; i < (1 << wclen); i++) + { + char *p = starts; + int j, k; + + for (j = 0; j < wclen; ++j) + { + wchar_t wc = wcbeg[j]; + if (i & (1 << j)) + { + wc = towupper (wc); + if (wc == wcbeg[j]) + continue; + } + k = wctomb (p, wc); + if (k <= 0) + goto fimb_fail; + p += k; + } + if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0) + error (2, 0, err); + } + if (wclim < wcpattern + wcsize) + ++wclim; + wcbeg = wclim; + } + while (wcbeg < wcpattern + wcsize); + f_i_multibyte = 1; + kwset = fimb_kwset; + free (starts); + Fimb.match = xmalloc (Fimb.count); + if ((err = kwsprep (kwset)) != 0) + error (2, 0, err); + return; + } + } +#endif /* MBS_SUPPORT */ + + kwsinit (); beg = pattern; do @@ -511,6 +777,76 @@ error (2, 0, err); } +#ifdef MBS_SUPPORT +static int +Fimbexec (const char *buf, size_t size, size_t *plen, int exact) +{ + size_t len, letter, i; + int ret = -1; + mbstate_t mbs; + wchar_t wc; + int patterns_left; + + assert (match_icase && f_i_multibyte == 1); + assert (MB_CUR_MAX > 1); + + memset (&mbs, '\0', sizeof (mbs)); + memset (Fimb.match, '\1', Fimb.count); + letter = len = 0; + patterns_left = 1; + while (patterns_left && len <= size) + { + size_t c; + + patterns_left = 0; + if (len < size) + { + c = mbrtowc (&wc, buf + len, size - len, &mbs); + if (c + 2 <= 2) + return ret; + + wc = towlower (wc); + } + else + { + c = 1; + wc = L'\0'; + } + + for (i = 0; i < Fimb.count; i++) + { + if (Fimb.match[i]) + { + if (Fimb.patterns[i][letter] == L'\0') + { + /* Found a match. */ + *plen = len; + if (!exact && !match_words) + return 0; + else + { + /* For -w or exact look for longest match. */ + ret = 0; + Fimb.match[i] = '\0'; + continue; + } + } + + if (Fimb.patterns[i][letter] == wc) + patterns_left = 1; + else + Fimb.match[i] = '\0'; + } + } + + len += c; + letter++; + } + + return ret; +} +#endif /* MBS_SUPPORT */ + EXECUTE_FCT(Fexecute) { register char const *beg, *try, *end; @@ -519,69 +855,256 @@ struct kwsmatch kwsmatch; size_t ret_val; #ifdef MBS_SUPPORT - char *mb_properties = NULL; - if (MB_CUR_MAX > 1) - { - if (match_icase) - { - char *case_buf = xmalloc(size); - memcpy(case_buf, buf, size); - if (start_ptr) - start_ptr = case_buf + (start_ptr - buf); - buf = case_buf; - } - mb_properties = check_multibyte_string(buf, size); - } + int mb_cur_max = MB_CUR_MAX; + mbstate_t mbs; + memset (&mbs, '\0', sizeof (mbstate_t)); + const char *last_char = NULL; #endif /* MBS_SUPPORT */ for (beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++) { size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); if (offset == (size_t) -1) - goto failure; + return offset; #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) - continue; /* It is a part of multibyte character. */ + if (mb_cur_max > 1 && !using_utf8) + { + size_t bytes_left = offset; + while (bytes_left) + { + size_t mlen = mbrlen (beg, bytes_left, &mbs); + + last_char = beg; + if (mlen == (size_t) -1 || mlen == 0) + { + /* Incomplete character: treat as single-byte. */ + memset (&mbs, '\0', sizeof (mbstate_t)); + beg++; + bytes_left--; + continue; + } + + if (mlen == (size_t) -2) + /* Offset points inside multibyte character: no good. */ + break; + + beg += mlen; + bytes_left -= mlen; + } + + if (bytes_left) + continue; + } + else #endif /* MBS_SUPPORT */ beg += offset; +#ifdef MBS_SUPPORT + /* For f_i_multibyte, the string at beg now matches first 3 chars of + one of the search strings (less if there are shorter search strings). + See if this is a real match. */ + if (f_i_multibyte + && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], start_ptr == NULL)) + goto next_char; +#endif /* MBS_SUPPORT */ len = kwsmatch.size[0]; if (start_ptr && !match_words) goto success_in_beg_and_len; if (match_lines) { if (beg > buf && beg[-1] != eol) - continue; + goto next_char; if (beg + len < buf + size && beg[len] != eol) - continue; + goto next_char; goto success; } else if (match_words) - for (try = beg; len; ) - { - if (try > buf && WCHAR((unsigned char) try[-1])) - break; - if (try + len < buf + size && WCHAR((unsigned char) try[len])) - { - offset = kwsexec (kwset, beg, --len, &kwsmatch); - if (offset == (size_t) -1) - break; - try = beg + offset; - len = kwsmatch.size[0]; - } - else if (!start_ptr) - goto success; - else - goto success_in_beg_and_len; - } /* for (try) */ - else + { + while (len) + { + int word_match = 0; + if (beg > buf) + { +#ifdef MBS_SUPPORT + if (mb_cur_max > 1) + { + const char *s; + int mr; + wchar_t pwc; + + if (using_utf8) + { + s = beg - 1; + while (s > buf + && (unsigned char) *s >= 0x80 + && (unsigned char) *s <= 0xbf) + --s; + } + else + s = last_char; + mr = mbtowc (&pwc, s, beg - s); + if (mr <= 0) + memset (&mbs, '\0', sizeof (mbstate_t)); + else if ((iswalnum (pwc) || pwc == L'_') + && mr == (int) (beg - s)) + goto next_char; + } + else +#endif /* MBS_SUPPORT */ + if (WCHAR ((unsigned char) beg[-1])) + goto next_char; + } +#ifdef MBS_SUPPORT + if (mb_cur_max > 1) + { + wchar_t nwc; + int mr; + + mr = mbtowc (&nwc, beg + len, buf + size - beg - len); + if (mr <= 0) + { + memset (&mbs, '\0', sizeof (mbstate_t)); + word_match = 1; + } + else if (!iswalnum (nwc) && nwc != L'_') + word_match = 1; + } + else +#endif /* MBS_SUPPORT */ + if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len])) + word_match = 1; + if (word_match) + { + if (start_ptr == NULL) + /* Returns the whole line now we know there's a word match. */ + goto success; + else { + /* Returns just this word match. */ + *match_size = len; + return beg - buf; + } + } + if (len > 0) + { + /* Try a shorter length anchored at the same place. */ + --len; + offset = kwsexec (kwset, beg, len, &kwsmatch); + + if (offset == -1) + goto next_char; /* Try a different anchor. */ +#ifdef MBS_SUPPORT + + if (mb_cur_max > 1 && !using_utf8) + { + size_t bytes_left = offset; + while (bytes_left) + { + size_t mlen = mbrlen (beg, bytes_left, &mbs); + + last_char = beg; + if (mlen == (size_t) -1 || mlen == 0) + { + /* Incomplete character: treat as single-byte. */ + memset (&mbs, '\0', sizeof (mbstate_t)); + beg++; + bytes_left--; + continue; + } + + if (mlen == (size_t) -2) + { + /* Offset points inside multibyte character: + * no good. */ + break; + } + + beg += mlen; + bytes_left -= mlen; + } + + if (bytes_left) + { + memset (&mbs, '\0', sizeof (mbstate_t)); + goto next_char; /* Try a different anchor. */ + } + } + else +#endif /* MBS_SUPPORT */ + beg += offset; +#ifdef MBS_SUPPORT + /* The string at beg now matches first 3 chars of one of + the search strings (less if there are shorter search + strings). See if this is a real match. */ + if (f_i_multibyte + && Fimbexec (beg, len - offset, &kwsmatch.size[0], + start_ptr == NULL)) + goto next_char; +#endif /* MBS_SUPPORT */ + len = kwsmatch.size[0]; + } + } + } + else goto success; - } /* for (beg in buf) */ +next_char:; +#ifdef MBS_SUPPORT + /* Advance to next character. For MB_CUR_MAX == 1 case this is handled + by ++beg above. */ + if (mb_cur_max > 1) + { + if (using_utf8) + { + unsigned char c = *beg; + if (c >= 0xc2) + { + if (c < 0xe0) + ++beg; + else if (c < 0xf0) + beg += 2; + else if (c < 0xf8) + beg += 3; + else if (c < 0xfc) + beg += 4; + else if (c < 0xfe) + beg += 5; + } + } + else + { + size_t l = mbrlen (beg, buf + size - beg, &mbs); - failure: - ret_val = -1; - goto out; + last_char = beg; + if (l + 2 >= 2) + beg += l - 1; + else + memset (&mbs, '\0', sizeof (mbstate_t)); + } + } +#endif /* MBS_SUPPORT */ + } + + return -1; success: +#ifdef MBS_SUPPORT + if (mb_cur_max > 1 && !using_utf8) + { + end = beg + len; + while (end < buf + size) + { + size_t mlen = mbrlen (end, buf + size - end, &mbs); + if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0) + { + memset (&mbs, '\0', sizeof (mbstate_t)); + mlen = 1; + } + if (mlen == 1 && *end == eol) + break; + + end += mlen; + } + } + else + #endif /* MBS_SUPPORT */ end = memchr (beg + len, eol, (buf + size) - (beg + len)); end++; while (buf < beg && beg[-1] != eol) @@ -591,15 +1114,6 @@ *match_size = len; ret_val = beg - buf; out: -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - if (match_icase) - free((char*)buf); - if (mb_properties) - free(mb_properties); - } -#endif /* MBS_SUPPORT */ return ret_val; } #endif /* defined(GREP_PROGRAM) || defined(FGREP_PROGRAM) */