source: patches/grep-2.5.3-i18n-1.patch@ 1f7883fe

clfs-1.2 clfs-2.1 clfs-3.0.0-systemd clfs-3.0.0-sysvinit systemd sysvinit
Last change on this file since 1f7883fe was 318408a, checked in by Jim Gifford <clfs@…>, 16 years ago

Added Internationalization Patch to Grep

  • Property mode set to 100644
File size: 26.9 KB
RevLine 
[318408a]1Submitted By: Ken Moffat <ken at linuxfromscratch dot org>
2Date: 2008-02-19
3Initial Package Version: 2.5.3
4Upstream Status: uncertain
5Origin: from Debian.
6Description: Various fixes, particularly speed improvements for UTF-8 locales.
7Also adds a 'standard input' marker into the results for certain obscure uses.
8
9diff -Naur grep-2.5.3.orig/lib/posix/regex.h grep-2.5.3.lfs/lib/posix/regex.h
10--- grep-2.5.3.orig/lib/posix/regex.h 2007-06-28 19:57:18.000000000 +0100
11+++ grep-2.5.3.lfs/lib/posix/regex.h 2008-02-10 18:56:07.000000000 +0000
12@@ -165,6 +165,10 @@
13 treated as 'a\{1'. */
14 #define RE_INVALID_INTERVAL_ORD (RE_DEBUG << 1)
15
16+/* If this bit is set, then ignore case when matching.
17+ If not set, then case is significant. */
18+#define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1)
19+
20 /* This global variable defines the particular regexp syntax to use (for
21 some interfaces). When a regexp is compiled, the syntax used is
22 stored in the pattern buffer, so changing this does not affect
23diff -Naur grep-2.5.3.orig/src/dfa.c grep-2.5.3.lfs/src/dfa.c
24--- grep-2.5.3.orig/src/dfa.c 2007-06-28 19:57:19.000000000 +0100
25+++ grep-2.5.3.lfs/src/dfa.c 2008-02-10 18:55:29.000000000 +0000
26@@ -594,6 +594,17 @@
27 /* build character class. */
28 {
29 wctype_t wt;
30+ /* NOTE:
31+ * When case_fold is set, the character classes [:upper:] and
32+ * [:lower:] are treated as [:alpha:], the same way
33+ * glibc/posix/regcomp.c:build_charclass() does it.
34+ * Reported as Bug#276202,
35+ * fixed by Fumitoshi UKAI.
36+ */
37+ if (case_fold
38+ && (strcmp (str, "upper") == 0 || strcmp (str, "lower") == 0))
39+ strcpy (str, "alpha");
40+
41 /* Query the character class as wctype_t. */
42 wt = wctype (str);
43
44@@ -681,6 +692,29 @@
45 REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
46 range_ends_al, work_mbc->nranges + 1);
47 work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
48+ if (case_fold
49+ && (iswlower((wint_t)wc) || iswupper((wint_t)wc))
50+ && (iswlower((wint_t)wc2) || iswupper((wint_t)wc2))) {
51+ wint_t altcase;
52+ altcase = wc;
53+ if (iswlower((wint_t)wc))
54+ altcase = towupper((wint_t)wc);
55+ else
56+ altcase = towlower((wint_t)wc);
57+ REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
58+ range_sts_al, work_mbc->nranges + 1);
59+ work_mbc->range_sts[work_mbc->nranges] = (wchar_t)altcase;
60+
61+ altcase = wc2;
62+ if (iswlower((wint_t)wc2))
63+ altcase = towupper((wint_t)wc2);
64+ else
65+ altcase = towlower((wint_t)wc2);
66+ REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
67+ range_ends_al, work_mbc->nranges + 1);
68+ work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)altcase;
69+
70+ }
71 }
72 else if (wc != WEOF)
73 /* build normal characters. */
74@@ -688,6 +722,20 @@
75 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
76 work_mbc->nchars + 1);
77 work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
78+ if (case_fold && (iswlower((wint_t) wc) || iswupper((wint_t) wc)))
79+ {
80+ wint_t altcase;
81+
82+ altcase = wc; /* keeps compiler happy */
83+ if (iswlower((wint_t) wc))
84+ altcase = towupper((wint_t) wc);
85+ else if (iswupper((wint_t) wc))
86+ altcase = towlower((wint_t) wc);
87+
88+ REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
89+ work_mbc->nchars + 1);
90+ work_mbc->chars[work_mbc->nchars++] = (wchar_t) altcase;
91+ }
92 }
93 }
94 while ((wc = wc1) != L']');
95diff -Naur grep-2.5.3.orig/src/grep.c grep-2.5.3.lfs/src/grep.c
96--- grep-2.5.3.orig/src/grep.c 2007-06-28 19:57:19.000000000 +0100
97+++ grep-2.5.3.lfs/src/grep.c 2008-02-10 18:54:53.000000000 +0000
98@@ -274,6 +274,12 @@
99 #endif
100 ;
101
102+/* Default for `file_list' if no files are given on the command line. */
103+static char *stdin_argv[] =
104+{
105+ "-", NULL
106+};
107+
108 /* Non-boolean long options that have no corresponding short equivalents. */
109 enum
110 {
111@@ -534,7 +540,16 @@
112 for byte sentinels fore and aft. */
113 newalloc = newsize + pagesize + 1;
114
115- newbuf = bufalloc < newalloc ? xmalloc (bufalloc = newalloc) : buffer;
116+ newbuf = bufalloc < newalloc ? malloc (bufalloc = newalloc) : buffer;
117+ if (newbuf == NULL)
118+ {
119+ int saved_errno = errno;
120+ free (buffer);
121+ bufalloc = ALIGN_TO (INITIAL_BUFSIZE, pagesize) + pagesize + 1;
122+ buffer = xmalloc (bufalloc);
123+ errno = saved_errno;
124+ return 0;
125+ }
126 readbuf = ALIGN_TO (newbuf + 1 + save, pagesize);
127 bufbeg = readbuf - save;
128 memmove (bufbeg, buffer + saved_offset, save);
129@@ -1825,6 +1840,7 @@
130 FILE *fp;
131 extern char *optarg;
132 extern int optind;
133+ char **file_list;
134
135 initialize_main (&argc, &argv);
136 program_name = argv[0];
137@@ -2244,29 +2260,29 @@
138 if (max_count == 0)
139 exit (1);
140
141- if (optind < argc)
142+ file_list = (optind == argc ? stdin_argv : &argv[optind]);
143+
144+ status = 1;
145+ while (1)
146 {
147- status = 1;
148- do
149+ char *file = *file_list++;
150+
151+ if (file == NULL)
152+ break;
153+
154+ if ((included_patterns || excluded_patterns)
155+ && !isdir (file))
156 {
157- char *file = argv[optind];
158- if ((included_patterns || excluded_patterns)
159- && !isdir (file))
160- {
161- if (included_patterns &&
162- ! excluded_filename (included_patterns, file, 0))
163- continue;
164- if (excluded_patterns &&
165- excluded_filename (excluded_patterns, file, 0))
166- continue;
167- }
168- status &= grepfile (strcmp (file, "-") == 0 ? (char *) NULL : file,
169- &stats_base);
170+ if (included_patterns &&
171+ ! excluded_filename (included_patterns, file, 0))
172+ continue;
173+ if (excluded_patterns &&
174+ excluded_filename (excluded_patterns, file, 0))
175+ continue;
176 }
177- while ( ++optind < argc);
178+ status &= grepfile (strcmp (file, "-") == 0
179+ ? (char *) NULL : file, &stats_base);
180 }
181- else
182- status = grepfile ((char *) NULL, &stats_base);
183
184 /* We register via atexit() to test stdout. */
185 exit (errseen ? 2 : status);
186diff -Naur grep-2.5.3.orig/src/search.c grep-2.5.3.lfs/src/search.c
187--- grep-2.5.3.orig/src/search.c 2007-06-28 19:57:19.000000000 +0100
188+++ grep-2.5.3.lfs/src/search.c 2008-02-10 18:56:18.000000000 +0000
189@@ -18,10 +18,15 @@
190
191 /* Written August 1992 by Mike Haertel. */
192
193+#ifndef _GNU_SOURCE
194+# define _GNU_SOURCE 1
195+#endif
196 #ifdef HAVE_CONFIG_H
197 # include <config.h>
198 #endif
199
200+#include <assert.h>
201+
202 #include <sys/types.h>
203
204 #include "mbsupport.h"
205@@ -43,6 +48,9 @@
206 #ifdef HAVE_LIBPCRE
207 # include <pcre.h>
208 #endif
209+#ifdef HAVE_LANGINFO_CODESET
210+# include <langinfo.h>
211+#endif
212
213 #define NCHAR (UCHAR_MAX + 1)
214
215@@ -68,6 +76,19 @@
216 error (2, 0, _("memory exhausted"));
217 }
218
219+/* UTF-8 encoding allows some optimizations that we can't otherwise
220+ assume in a multibyte encoding. */
221+static int using_utf8;
222+
223+void
224+check_utf8 (void)
225+{
226+#ifdef HAVE_LANGINFO_CODESET
227+ if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0)
228+ using_utf8 = 1;
229+#endif
230+}
231+
232 #ifndef FGREP_PROGRAM
233 /* DFA compiled regexp. */
234 static struct dfa dfa;
235@@ -134,49 +155,6 @@
236 }
237 #endif /* !FGREP_PROGRAM */
238
239-#ifdef MBS_SUPPORT
240-/* This function allocate the array which correspond to "buf".
241- Then this check multibyte string and mark on the positions which
242- are not single byte character nor the first byte of a multibyte
243- character. Caller must free the array. */
244-static char*
245-check_multibyte_string(char const *buf, size_t size)
246-{
247- char *mb_properties = xmalloc(size);
248- mbstate_t cur_state;
249- wchar_t wc;
250- int i;
251-
252- memset(&cur_state, 0, sizeof(mbstate_t));
253- memset(mb_properties, 0, sizeof(char)*size);
254-
255- for (i = 0; i < size ;)
256- {
257- size_t mbclen;
258- mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state);
259-
260- if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
261- {
262- /* An invalid sequence, or a truncated multibyte character.
263- We treat it as a single byte character. */
264- mbclen = 1;
265- }
266- else if (match_icase)
267- {
268- if (iswupper((wint_t)wc))
269- {
270- wc = towlower((wint_t)wc);
271- wcrtomb(buf + i, wc, &cur_state);
272- }
273- }
274- mb_properties[i] = mbclen;
275- i += mbclen;
276- }
277-
278- return mb_properties;
279-}
280-#endif /* MBS_SUPPORT */
281-
282 #if defined(GREP_PROGRAM) || defined(EGREP_PROGRAM)
283 #ifdef EGREP_PROGRAM
284 COMPILE_FCT(Ecompile)
285@@ -193,10 +171,9 @@
286 size_t total = size;
287 char const *motif = pattern;
288
289-#if 0
290+ check_utf8 ();
291 if (match_icase)
292 syntax_bits |= RE_ICASE;
293-#endif
294 re_set_syntax (syntax_bits);
295 dfasyntax (syntax_bits, match_icase, eolbyte);
296
297@@ -301,23 +278,35 @@
298 char eol = eolbyte;
299 int backref, start, len, best_len;
300 struct kwsmatch kwsm;
301+ static int use_dfa;
302+ static int use_dfa_checked = 0;
303 size_t i, ret_val;
304 #ifdef MBS_SUPPORT
305- char *mb_properties = NULL;
306- if (MB_CUR_MAX > 1)
307+ const char *last_char = NULL;
308+ int mb_cur_max = MB_CUR_MAX;
309+ mbstate_t mbs;
310+ memset (&mbs, '\0', sizeof (mbstate_t));
311+#endif /* MBS_SUPPORT */
312+
313+ if (!use_dfa_checked)
314 {
315- if (match_icase)
316- {
317- char *case_buf = xmalloc(size);
318- memcpy(case_buf, buf, size);
319- if (start_ptr)
320- start_ptr = case_buf + (start_ptr - buf);
321- buf = case_buf;
322- }
323- if (kwset)
324- mb_properties = check_multibyte_string(buf, size);
325- }
326+ char *grep_use_dfa = getenv ("GREP_USE_DFA");
327+ if (!grep_use_dfa)
328+ {
329+#ifdef MBS_SUPPORT
330+ /* Turn off DFA when processing multibyte input. */
331+ use_dfa = (MB_CUR_MAX == 1);
332+#else
333+ use_dfa = 1;
334 #endif /* MBS_SUPPORT */
335+ }
336+ else
337+ {
338+ use_dfa = atoi (grep_use_dfa);
339+ }
340+
341+ use_dfa_checked = 1;
342+ }
343
344 buflim = buf + size;
345
346@@ -329,40 +318,123 @@
347 if (kwset)
348 {
349 /* Find a possible match using the KWset matcher. */
350- size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
351+#ifdef MBS_SUPPORT
352+ size_t bytes_left = 0;
353+#endif /* MBS_SUPPORT */
354+ size_t offset;
355+#ifdef MBS_SUPPORT
356+ /* kwsexec doesn't work with match_icase and multibyte input. */
357+ if (match_icase && mb_cur_max > 1)
358+ /* Avoid kwset */
359+ offset = 0;
360+ else
361+#endif /* MBS_SUPPORT */
362+ offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
363 if (offset == (size_t) -1)
364- goto failure;
365+ return (size_t)-1;
366+#ifdef MBS_SUPPORT
367+ if (mb_cur_max > 1 && !using_utf8)
368+ {
369+ bytes_left = offset;
370+ while (bytes_left)
371+ {
372+ size_t mlen = mbrlen (beg, bytes_left, &mbs);
373+
374+ last_char = beg;
375+ if (mlen == (size_t) -1 || mlen == 0)
376+ {
377+ /* Invalid sequence or NUL character: treat as single-byte. */
378+ memset (&mbs, '\0', sizeof (mbstate_t));
379+ beg++;
380+ bytes_left--;
381+ continue;
382+ }
383+
384+ if (mlen == (size_t) -2)
385+ /* Offset points inside multibyte character:
386+ * no good. */
387+ break;
388+
389+ beg += mlen;
390+ bytes_left -= mlen;
391+ }
392+ }
393+ else
394+#endif /* MBS_SUPPORT */
395 beg += offset;
396 /* Narrow down to the line containing the candidate, and
397 run it through DFA. */
398 end = memchr(beg, eol, buflim - beg);
399 end++;
400 #ifdef MBS_SUPPORT
401- if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
402+ if (mb_cur_max > 1 && bytes_left)
403 continue;
404 #endif
405 while (beg > buf && beg[-1] != eol)
406 --beg;
407- if (kwsm.index < kwset_exact_matches)
408+ if (
409+#ifdef MBS_SUPPORT
410+ !(match_icase && mb_cur_max > 1) &&
411+#endif /* MBS_SUPPORT */
412+ (kwsm.index < kwset_exact_matches))
413 goto success;
414- if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
415+ if (use_dfa &&
416+ dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
417 continue;
418 }
419 else
420 {
421 /* No good fixed strings; start with DFA. */
422- size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
423+#ifdef MBS_SUPPORT
424+ size_t bytes_left = 0;
425+#endif /* MBS_SUPPORT */
426+ size_t offset = 0;
427+ if (use_dfa)
428+ offset = dfaexec (&dfa, beg, buflim - beg, &backref);
429 if (offset == (size_t) -1)
430 break;
431 /* Narrow down to the line we've found. */
432+#ifdef MBS_SUPPORT
433+ if (mb_cur_max > 1 && !using_utf8)
434+ {
435+ bytes_left = offset;
436+ while (bytes_left)
437+ {
438+ size_t mlen = mbrlen (beg, bytes_left, &mbs);
439+
440+ last_char = beg;
441+ if (mlen == (size_t) -1 || mlen == 0)
442+ {
443+ /* Invalid sequence or NUL character: treat as single-byte. */
444+ memset (&mbs, '\0', sizeof (mbstate_t));
445+ beg++;
446+ bytes_left--;
447+ continue;
448+ }
449+
450+ if (mlen == (size_t) -2)
451+ /* Offset points inside multibyte character:
452+ * no good. */
453+ break;
454+
455+ beg += mlen;
456+ bytes_left -= mlen;
457+ }
458+ }
459+ else
460+#endif /* MBS_SUPPORT */
461 beg += offset;
462 end = memchr (beg, eol, buflim - beg);
463 end++;
464+#ifdef MBS_SUPPORT
465+ if (mb_cur_max > 1 && bytes_left)
466+ continue;
467+#endif /* MBS_SUPPORT */
468 while (beg > buf && beg[-1] != eol)
469 --beg;
470 }
471 /* Successful, no backreferences encountered! */
472- if (!backref)
473+ if (use_dfa && !backref)
474 goto success;
475 }
476 else
477@@ -408,10 +480,84 @@
478 if (match_words)
479 while (match <= best_match)
480 {
481- if ((match == buf || !WCHAR ((unsigned char) match[-1]))
482- && (len == end - beg - 1
483- || !WCHAR ((unsigned char) match[len])))
484- goto assess_pattern_match;
485+ int lword_match = 0;
486+ if (match == buf)
487+ lword_match = 1;
488+ else
489+ {
490+ assert (start > 0);
491+#ifdef MBS_SUPPORT
492+ if (mb_cur_max > 1)
493+ {
494+ const char *s;
495+ int mr;
496+ wchar_t pwc;
497+ if (using_utf8)
498+ {
499+ s = match - 1;
500+ while (s > buf
501+ && (unsigned char) *s >= 0x80
502+ && (unsigned char) *s <= 0xbf)
503+ --s;
504+ }
505+ else
506+ s = last_char;
507+ mr = mbtowc (&pwc, s, match - s);
508+ if (mr <= 0)
509+ {
510+ memset (&mbs, '\0', sizeof (mbstate_t));
511+ lword_match = 1;
512+ }
513+ else if (!(iswalnum (pwc) || pwc == L'_')
514+ && mr == (int) (match - s))
515+ lword_match = 1;
516+ }
517+ else
518+#endif /* MBS_SUPPORT */
519+ if (!WCHAR ((unsigned char) match[-1]))
520+ lword_match = 1;
521+ }
522+
523+ if (lword_match)
524+ {
525+ int rword_match = 0;
526+ if (start + len == end - beg - 1)
527+ rword_match = 1;
528+ else
529+ {
530+#ifdef MBS_SUPPORT
531+ if (mb_cur_max > 1)
532+ {
533+ wchar_t nwc;
534+ int mr;
535+
536+ mr = mbtowc (&nwc, buf + start + len,
537+ end - buf - start - len - 1);
538+ if (mr <= 0)
539+ {
540+ memset (&mbs, '\0', sizeof (mbstate_t));
541+ rword_match = 1;
542+ }
543+ else if (!iswalnum (nwc) && nwc != L'_')
544+ rword_match = 1;
545+ }
546+ else
547+#endif /* MBS_SUPPORT */
548+ if (!WCHAR ((unsigned char) match[len]))
549+ rword_match = 1;
550+ }
551+
552+ if (rword_match)
553+ {
554+ if (!start_ptr)
555+ /* Returns the whole line. */
556+ goto success;
557+ else
558+ {
559+ goto assess_pattern_match;
560+ }
561+ }
562+ }
563 if (len > 0)
564 {
565 /* Try a shorter length anchored at the same place. */
566@@ -475,24 +621,144 @@
567 *match_size = len;
568 ret_val = beg - buf;
569 out:
570-#ifdef MBS_SUPPORT
571- if (MB_CUR_MAX > 1)
572- {
573- if (match_icase)
574- free((char*)buf);
575- if (mb_properties)
576- free(mb_properties);
577- }
578-#endif /* MBS_SUPPORT */
579 return ret_val;
580 }
581 #endif /* defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) */
582
583+#ifdef MBS_SUPPORT
584+static int f_i_multibyte; /* whether we're using the new -Fi MB method */
585+static struct
586+{
587+ wchar_t **patterns;
588+ size_t count, maxlen;
589+ unsigned char *match;
590+} Fimb;
591+#endif
592+
593 #if defined(GREP_PROGRAM) || defined(FGREP_PROGRAM)
594 COMPILE_FCT(Fcompile)
595 {
596+ int mb_cur_max = MB_CUR_MAX;
597 char const *beg, *lim, *err;
598
599+ check_utf8 ();
600+#ifdef MBS_SUPPORT
601+ /* Support -F -i for multibyte (e.g. UTF-8) input. */
602+ if (match_icase && mb_cur_max > 1)
603+ {
604+ mbstate_t mbs;
605+ wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t));
606+ const char *patternend = pattern;
607+ size_t wcsize;
608+ kwset_t fimb_kwset = NULL;
609+ char *starts = NULL;
610+ wchar_t *wcbeg, *wclim;
611+ size_t allocated = 0;
612+
613+ memset (&mbs, '\0', sizeof (mbs));
614+# ifdef __GNU_LIBRARY__
615+ wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs);
616+ if (patternend != pattern + size)
617+ wcsize = (size_t) -1;
618+# else
619+ {
620+ char *patterncopy = xmalloc (size + 1);
621+
622+ memcpy (patterncopy, pattern, size);
623+ patterncopy[size] = '\0';
624+ patternend = patterncopy;
625+ wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs);
626+ if (patternend != patterncopy + size)
627+ wcsize = (size_t) -1;
628+ free (patterncopy);
629+ }
630+# endif
631+ if (wcsize + 2 <= 2)
632+ {
633+fimb_fail:
634+ free (wcpattern);
635+ free (starts);
636+ if (fimb_kwset)
637+ kwsfree (fimb_kwset);
638+ free (Fimb.patterns);
639+ Fimb.patterns = NULL;
640+ }
641+ else
642+ {
643+ if (!(fimb_kwset = kwsalloc (NULL)))
644+ error (2, 0, _("memory exhausted"));
645+
646+ starts = xmalloc (mb_cur_max * 3);
647+ wcbeg = wcpattern;
648+ do
649+ {
650+ int i;
651+ size_t wclen;
652+
653+ if (Fimb.count >= allocated)
654+ {
655+ if (allocated == 0)
656+ allocated = 128;
657+ else
658+ allocated *= 2;
659+ Fimb.patterns = xrealloc (Fimb.patterns,
660+ sizeof (wchar_t *) * allocated);
661+ }
662+ Fimb.patterns[Fimb.count++] = wcbeg;
663+ for (wclim = wcbeg;
664+ wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim)
665+ *wclim = towlower (*wclim);
666+ *wclim = L'\0';
667+ wclen = wclim - wcbeg;
668+ if (wclen > Fimb.maxlen)
669+ Fimb.maxlen = wclen;
670+ if (wclen > 3)
671+ wclen = 3;
672+ if (wclen == 0)
673+ {
674+ if ((err = kwsincr (fimb_kwset, "", 0)) != 0)
675+ error (2, 0, err);
676+ }
677+ else
678+ for (i = 0; i < (1 << wclen); i++)
679+ {
680+ char *p = starts;
681+ int j, k;
682+
683+ for (j = 0; j < wclen; ++j)
684+ {
685+ wchar_t wc = wcbeg[j];
686+ if (i & (1 << j))
687+ {
688+ wc = towupper (wc);
689+ if (wc == wcbeg[j])
690+ continue;
691+ }
692+ k = wctomb (p, wc);
693+ if (k <= 0)
694+ goto fimb_fail;
695+ p += k;
696+ }
697+ if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0)
698+ error (2, 0, err);
699+ }
700+ if (wclim < wcpattern + wcsize)
701+ ++wclim;
702+ wcbeg = wclim;
703+ }
704+ while (wcbeg < wcpattern + wcsize);
705+ f_i_multibyte = 1;
706+ kwset = fimb_kwset;
707+ free (starts);
708+ Fimb.match = xmalloc (Fimb.count);
709+ if ((err = kwsprep (kwset)) != 0)
710+ error (2, 0, err);
711+ return;
712+ }
713+ }
714+#endif /* MBS_SUPPORT */
715+
716+
717 kwsinit ();
718 beg = pattern;
719 do
720@@ -511,6 +777,76 @@
721 error (2, 0, err);
722 }
723
724+#ifdef MBS_SUPPORT
725+static int
726+Fimbexec (const char *buf, size_t size, size_t *plen, int exact)
727+{
728+ size_t len, letter, i;
729+ int ret = -1;
730+ mbstate_t mbs;
731+ wchar_t wc;
732+ int patterns_left;
733+
734+ assert (match_icase && f_i_multibyte == 1);
735+ assert (MB_CUR_MAX > 1);
736+
737+ memset (&mbs, '\0', sizeof (mbs));
738+ memset (Fimb.match, '\1', Fimb.count);
739+ letter = len = 0;
740+ patterns_left = 1;
741+ while (patterns_left && len <= size)
742+ {
743+ size_t c;
744+
745+ patterns_left = 0;
746+ if (len < size)
747+ {
748+ c = mbrtowc (&wc, buf + len, size - len, &mbs);
749+ if (c + 2 <= 2)
750+ return ret;
751+
752+ wc = towlower (wc);
753+ }
754+ else
755+ {
756+ c = 1;
757+ wc = L'\0';
758+ }
759+
760+ for (i = 0; i < Fimb.count; i++)
761+ {
762+ if (Fimb.match[i])
763+ {
764+ if (Fimb.patterns[i][letter] == L'\0')
765+ {
766+ /* Found a match. */
767+ *plen = len;
768+ if (!exact && !match_words)
769+ return 0;
770+ else
771+ {
772+ /* For -w or exact look for longest match. */
773+ ret = 0;
774+ Fimb.match[i] = '\0';
775+ continue;
776+ }
777+ }
778+
779+ if (Fimb.patterns[i][letter] == wc)
780+ patterns_left = 1;
781+ else
782+ Fimb.match[i] = '\0';
783+ }
784+ }
785+
786+ len += c;
787+ letter++;
788+ }
789+
790+ return ret;
791+}
792+#endif /* MBS_SUPPORT */
793+
794 EXECUTE_FCT(Fexecute)
795 {
796 register char const *beg, *try, *end;
797@@ -519,69 +855,256 @@
798 struct kwsmatch kwsmatch;
799 size_t ret_val;
800 #ifdef MBS_SUPPORT
801- char *mb_properties = NULL;
802- if (MB_CUR_MAX > 1)
803- {
804- if (match_icase)
805- {
806- char *case_buf = xmalloc(size);
807- memcpy(case_buf, buf, size);
808- if (start_ptr)
809- start_ptr = case_buf + (start_ptr - buf);
810- buf = case_buf;
811- }
812- mb_properties = check_multibyte_string(buf, size);
813- }
814+ int mb_cur_max = MB_CUR_MAX;
815+ mbstate_t mbs;
816+ memset (&mbs, '\0', sizeof (mbstate_t));
817+ const char *last_char = NULL;
818 #endif /* MBS_SUPPORT */
819
820 for (beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
821 {
822 size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
823 if (offset == (size_t) -1)
824- goto failure;
825+ return offset;
826 #ifdef MBS_SUPPORT
827- if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
828- continue; /* It is a part of multibyte character. */
829+ if (mb_cur_max > 1 && !using_utf8)
830+ {
831+ size_t bytes_left = offset;
832+ while (bytes_left)
833+ {
834+ size_t mlen = mbrlen (beg, bytes_left, &mbs);
835+
836+ last_char = beg;
837+ if (mlen == (size_t) -1 || mlen == 0)
838+ {
839+ /* Invalid sequence or NUL character: treat as single-byte. */
840+ memset (&mbs, '\0', sizeof (mbstate_t));
841+ beg++;
842+ bytes_left--;
843+ continue;
844+ }
845+
846+ if (mlen == (size_t) -2)
847+ /* Offset points inside multibyte character: no good. */
848+ break;
849+
850+ beg += mlen;
851+ bytes_left -= mlen;
852+ }
853+
854+ if (bytes_left)
855+ continue;
856+ }
857+ else
858 #endif /* MBS_SUPPORT */
859 beg += offset;
860+#ifdef MBS_SUPPORT
861+ /* For f_i_multibyte, the string at beg now matches first 3 chars of
862+ one of the search strings (less if there are shorter search strings).
863+ See if this is a real match. */
864+ if (f_i_multibyte
865+ && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], start_ptr == NULL))
866+ goto next_char;
867+#endif /* MBS_SUPPORT */
868 len = kwsmatch.size[0];
869 if (start_ptr && !match_words)
870 goto success_in_beg_and_len;
871 if (match_lines)
872 {
873 if (beg > buf && beg[-1] != eol)
874- continue;
875+ goto next_char;
876 if (beg + len < buf + size && beg[len] != eol)
877- continue;
878+ goto next_char;
879 goto success;
880 }
881 else if (match_words)
882- for (try = beg; len; )
883- {
884- if (try > buf && WCHAR((unsigned char) try[-1]))
885- break;
886- if (try + len < buf + size && WCHAR((unsigned char) try[len]))
887- {
888- offset = kwsexec (kwset, beg, --len, &kwsmatch);
889- if (offset == (size_t) -1)
890- break;
891- try = beg + offset;
892- len = kwsmatch.size[0];
893- }
894- else if (!start_ptr)
895- goto success;
896- else
897- goto success_in_beg_and_len;
898- } /* for (try) */
899- else
900+ {
901+ while (len)
902+ {
903+ int word_match = 0;
904+ if (beg > buf)
905+ {
906+#ifdef MBS_SUPPORT
907+ if (mb_cur_max > 1)
908+ {
909+ const char *s;
910+ int mr;
911+ wchar_t pwc;
912+
913+ if (using_utf8)
914+ {
915+ s = beg - 1;
916+ while (s > buf
917+ && (unsigned char) *s >= 0x80
918+ && (unsigned char) *s <= 0xbf)
919+ --s;
920+ }
921+ else
922+ s = last_char;
923+ mr = mbtowc (&pwc, s, beg - s);
924+ if (mr <= 0)
925+ memset (&mbs, '\0', sizeof (mbstate_t));
926+ else if ((iswalnum (pwc) || pwc == L'_')
927+ && mr == (int) (beg - s))
928+ goto next_char;
929+ }
930+ else
931+#endif /* MBS_SUPPORT */
932+ if (WCHAR ((unsigned char) beg[-1]))
933+ goto next_char;
934+ }
935+#ifdef MBS_SUPPORT
936+ if (mb_cur_max > 1)
937+ {
938+ wchar_t nwc;
939+ int mr;
940+
941+ mr = mbtowc (&nwc, beg + len, buf + size - beg - len);
942+ if (mr <= 0)
943+ {
944+ memset (&mbs, '\0', sizeof (mbstate_t));
945+ word_match = 1;
946+ }
947+ else if (!iswalnum (nwc) && nwc != L'_')
948+ word_match = 1;
949+ }
950+ else
951+#endif /* MBS_SUPPORT */
952+ if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len]))
953+ word_match = 1;
954+ if (word_match)
955+ {
956+ if (start_ptr == NULL)
957+ /* Returns the whole line now we know there's a word match. */
958+ goto success;
959+ else {
960+ /* Returns just this word match. */
961+ *match_size = len;
962+ return beg - buf;
963+ }
964+ }
965+ if (len > 0)
966+ {
967+ /* Try a shorter length anchored at the same place. */
968+ --len;
969+ offset = kwsexec (kwset, beg, len, &kwsmatch);
970+
971+ if (offset == -1)
972+ goto next_char; /* Try a different anchor. */
973+#ifdef MBS_SUPPORT
974+
975+ if (mb_cur_max > 1 && !using_utf8)
976+ {
977+ size_t bytes_left = offset;
978+ while (bytes_left)
979+ {
980+ size_t mlen = mbrlen (beg, bytes_left, &mbs);
981+
982+ last_char = beg;
983+ if (mlen == (size_t) -1 || mlen == 0)
984+ {
985+ /* Invalid sequence or NUL character: treat as single-byte. */
986+ memset (&mbs, '\0', sizeof (mbstate_t));
987+ beg++;
988+ bytes_left--;
989+ continue;
990+ }
991+
992+ if (mlen == (size_t) -2)
993+ {
994+ /* Offset points inside multibyte character:
995+ * no good. */
996+ break;
997+ }
998+
999+ beg += mlen;
1000+ bytes_left -= mlen;
1001+ }
1002+
1003+ if (bytes_left)
1004+ {
1005+ memset (&mbs, '\0', sizeof (mbstate_t));
1006+ goto next_char; /* Try a different anchor. */
1007+ }
1008+ }
1009+ else
1010+#endif /* MBS_SUPPORT */
1011+ beg += offset;
1012+#ifdef MBS_SUPPORT
1013+ /* The string at beg now matches first 3 chars of one of
1014+ the search strings (less if there are shorter search
1015+ strings). See if this is a real match. */
1016+ if (f_i_multibyte
1017+ && Fimbexec (beg, len - offset, &kwsmatch.size[0],
1018+ start_ptr == NULL))
1019+ goto next_char;
1020+#endif /* MBS_SUPPORT */
1021+ len = kwsmatch.size[0];
1022+ }
1023+ }
1024+ }
1025+ else
1026 goto success;
1027- } /* for (beg in buf) */
1028+next_char:;
1029+#ifdef MBS_SUPPORT
1030+ /* Advance to next character. For MB_CUR_MAX == 1 case this is handled
1031+ by ++beg above. */
1032+ if (mb_cur_max > 1)
1033+ {
1034+ if (using_utf8)
1035+ {
1036+ unsigned char c = *beg;
1037+ if (c >= 0xc2)
1038+ {
1039+ if (c < 0xe0)
1040+ ++beg;
1041+ else if (c < 0xf0)
1042+ beg += 2;
1043+ else if (c < 0xf8)
1044+ beg += 3;
1045+ else if (c < 0xfc)
1046+ beg += 4;
1047+ else if (c < 0xfe)
1048+ beg += 5;
1049+ }
1050+ }
1051+ else
1052+ {
1053+ size_t l = mbrlen (beg, buf + size - beg, &mbs);
1054
1055- failure:
1056- ret_val = -1;
1057- goto out;
1058+ last_char = beg;
1059+ if (l + 2 >= 2)
1060+ beg += l - 1;
1061+ else
1062+ memset (&mbs, '\0', sizeof (mbstate_t));
1063+ }
1064+ }
1065+#endif /* MBS_SUPPORT */
1066+ }
1067+
1068+ return -1;
1069
1070 success:
1071+#ifdef MBS_SUPPORT
1072+ if (mb_cur_max > 1 && !using_utf8)
1073+ {
1074+ end = beg + len;
1075+ while (end < buf + size)
1076+ {
1077+ size_t mlen = mbrlen (end, buf + size - end, &mbs);
1078+ if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
1079+ {
1080+ memset (&mbs, '\0', sizeof (mbstate_t));
1081+ mlen = 1;
1082+ }
1083+ if (mlen == 1 && *end == eol)
1084+ break;
1085+
1086+ end += mlen;
1087+ }
1088+ }
1089+ else
1090+ #endif /* MBS_SUPPORT */
1091 end = memchr (beg + len, eol, (buf + size) - (beg + len));
1092 end++;
1093 while (buf < beg && beg[-1] != eol)
1094@@ -591,15 +1114,6 @@
1095 *match_size = len;
1096 ret_val = beg - buf;
1097 out:
1098-#ifdef MBS_SUPPORT
1099- if (MB_CUR_MAX > 1)
1100- {
1101- if (match_icase)
1102- free((char*)buf);
1103- if (mb_properties)
1104- free(mb_properties);
1105- }
1106-#endif /* MBS_SUPPORT */
1107 return ret_val;
1108 }
1109 #endif /* defined(GREP_PROGRAM) || defined(FGREP_PROGRAM) */
Note: See TracBrowser for help on using the repository browser.