source: patches/coreutils-6.12-i18n-1.patch@ c4a6f7f

clfs-1.2 clfs-2.1 clfs-3.0.0-systemd clfs-3.0.0-sysvinit systemd sysvinit
Last change on this file since c4a6f7f was a85d737, checked in by Jim Gifford <clfs@…>, 16 years ago

Didn't like the naming changed Unicode to Internationalization.

  • Property mode set to 100644
File size: 101.3 KB
RevLine 
[577dd2d]1Submitted By: Jim Gifford <jim at cross-lfs dot org>
2Date: 2009-01-08
3Initial Package Version: 6.12
4Upstream Status: Unknown
5Origin: Fedora and Bryan Kadzban
[a85d737]6Description: i18n Updates
[577dd2d]7
8diff -Naur coreutils-6.12.orig/lib/linebuffer.h coreutils-6.12/lib/linebuffer.h
9--- coreutils-6.12.orig/lib/linebuffer.h 2007-10-17 06:47:26.000000000 -0700
10+++ coreutils-6.12/lib/linebuffer.h 2009-01-08 12:56:49.000000000 -0800
11@@ -21,6 +21,11 @@
12
13 # include <stdio.h>
14
15+/* Get mbstate_t. */
16+# if HAVE_WCHAR_H
17+# include <wchar.h>
18+# endif
19+
20 /* A `struct linebuffer' holds a line of text. */
21
22 struct linebuffer
23@@ -28,6 +33,9 @@
24 size_t size; /* Allocated. */
25 size_t length; /* Used. */
26 char *buffer;
27+# if HAVE_WCHAR_H
28+ mbstate_t state;
29+# endif
30 };
31
32 /* Initialize linebuffer LINEBUFFER for use. */
33diff -Naur coreutils-6.12.orig/src/cut.c coreutils-6.12/src/cut.c
34--- coreutils-6.12.orig/src/cut.c 2008-05-25 23:40:33.000000000 -0700
35+++ coreutils-6.12/src/cut.c 2009-01-08 12:56:49.000000000 -0800
36@@ -28,6 +28,11 @@
37 #include <assert.h>
38 #include <getopt.h>
39 #include <sys/types.h>
40+
41+/* Get mbstate_t, mbrtowc(). */
42+#if HAVE_WCHAR_H
43+# include <wchar.h>
44+#endif
45 #include "system.h"
46
47 #include "error.h"
48@@ -36,6 +41,18 @@
49 #include "quote.h"
50 #include "xstrndup.h"
51
52+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
53+ installation; work around this configuration error. */
54+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
55+# undef MB_LEN_MAX
56+# define MB_LEN_MAX 16
57+#endif
58+
59+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
60+#if HAVE_MBRTOWC && defined mbstate_t
61+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
62+#endif
63+
64 /* The official name of this program (e.g., no `g' prefix). */
65 #define PROGRAM_NAME "cut"
66
67@@ -71,6 +88,52 @@
68 } \
69 while (0)
70
71+/* Refill BUF from STREAM when fewer than MB_LEN_MAX bytes remain, keeping the unread bytes. */
72+#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM) \
73+ do \
74+ { \
75+ if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM)) \
76+ { \
77+ memmove (BUF, BUFPOS, BUFLEN); \
78+ BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \
79+ BUFPOS = BUF; \
80+ } \
81+ } \
82+ while (0)
83+
84+/* Decode the character at BUFPOS into WC and its byte length into MBLENGTH; BUFPOS is not advanced.
85+ CONVFAIL is 1 if the bytes are not a valid character, otherwise 0. WC is WEOF if BUFLEN < 1. */
86+#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \
87+ do \
88+ { \
89+ mbstate_t state_bak; \
90+ \
91+ CONVFAIL = 0; /* Always defined, even on the WEOF early-out. */ \
92+ if (BUFLEN < 1) \
93+ { \
94+ WC = WEOF; \
95+ break; \
96+ } \
97+ \
98+ /* Get a wide character. */ \
99+ state_bak = STATE; \
100+ MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE); \
101+ \
102+ switch (MBLENGTH) \
103+ { \
104+ case (size_t)-1: \
105+ case (size_t)-2: \
106+ CONVFAIL++; \
107+ STATE = state_bak; \
108+ /* Fall through. */ \
109+ \
110+ case 0: \
111+ MBLENGTH = 1; \
112+ break; \
113+ } \
114+ } \
115+ while (0)
116+
117 struct range_pair
118 {
119 size_t lo;
120@@ -89,7 +152,7 @@
121 /* The number of bytes allocated for FIELD_1_BUFFER. */
122 static size_t field_1_bufsize;
123
124-/* The largest field or byte index used as an endpoint of a closed
125+/* The largest byte, character or field index used as an endpoint of a closed
126 or degenerate range specification; this doesn't include the starting
127 index of right-open-ended ranges. For example, with either range spec
128 `2-5,9-', `2-3,5,9-' this variable would be set to 5. */
129@@ -101,10 +164,11 @@
130
131 /* This is a bit vector.
132 In byte mode, which bytes to output.
133+ In character mode, which characters to output.
134 In field mode, which DELIM-separated fields to output.
135- Both bytes and fields are numbered starting with 1,
136+ Bytes, characters and fields are numbered starting with 1,
137 so the zeroth bit of this array is unused.
138- A field or byte K has been selected if
139+ A byte, character or field K has been selected if
140 (K <= MAX_RANGE_ENDPOINT and is_printable_field(K))
141 || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START). */
142 static unsigned char *printable_field;
143@@ -113,9 +177,12 @@
144 {
145 undefined_mode,
146
147- /* Output characters that are in the given bytes. */
148+ /* Output bytes that are at the given positions. */
149 byte_mode,
150
151+ /* Output characters that are at the given positions. */
152+ character_mode,
153+
154 /* Output the given delimeter-separated fields. */
155 field_mode
156 };
157@@ -125,6 +192,13 @@
158
159 static enum operating_mode operating_mode;
160
161+/* If nonzero, when in byte mode, don't split multibyte characters. */
162+static int byte_mode_character_aware;
163+
164+/* If nonzero, use the single-byte functions even when
165+ this program runs in a multibyte locale. */
166+static int force_singlebyte_mode;
167+
168 /* If true do not output lines containing no delimeter characters.
169 Otherwise, all such lines are printed. This option is valid only
170 with field mode. */
171@@ -136,6 +210,9 @@
172
173 /* The delimeter character for field mode. */
174 static unsigned char delim;
175+#if HAVE_WCHAR_H
176+static wchar_t wcdelim;
177+#endif
178
179 /* True if the --output-delimiter=STRING option was specified. */
180 static bool output_delimiter_specified;
181@@ -209,7 +286,7 @@
182 -f, --fields=LIST select only these fields; also print any line\n\
183 that contains no delimiter character, unless\n\
184 the -s option is specified\n\
185- -n (ignored)\n\
186+ -n with -b: don't split multibyte characters\n\
187 "), stdout);
188 fputs (_("\
189 --complement complement the set of selected bytes, characters\n\
190@@ -368,7 +445,7 @@
191 in_digits = false;
192 /* Starting a range. */
193 if (dash_found)
194- FATAL_ERROR (_("invalid byte or field list"));
195+ FATAL_ERROR (_("invalid byte, character or field list"));
196 dash_found = true;
197 fieldstr++;
198
199@@ -392,14 +469,16 @@
200 if (!rhs_specified)
201 {
202 /* `n-'. From `initial' to end of line. */
203- eol_range_start = initial;
204+ if (eol_range_start == 0 ||
205+ (eol_range_start != 0 && eol_range_start > initial))
206+ eol_range_start = initial;
207 field_found = true;
208 }
209 else
210 {
211 /* `m-n' or `-n' (1-n). */
212 if (value < initial)
213- FATAL_ERROR (_("invalid decreasing range"));
214+ FATAL_ERROR (_("invalid byte, character or field list"));
215
216 /* Is there already a range going to end of line? */
217 if (eol_range_start != 0)
218@@ -479,6 +558,9 @@
219 if (operating_mode == byte_mode)
220 error (0, 0,
221 _("byte offset %s is too large"), quote (bad_num));
222+ else if (operating_mode == character_mode)
223+ error (0, 0,
224+ _("character offset %s is too large"), quote (bad_num));
225 else
226 error (0, 0,
227 _("field number %s is too large"), quote (bad_num));
228@@ -489,7 +571,7 @@
229 fieldstr++;
230 }
231 else
232- FATAL_ERROR (_("invalid byte or field list"));
233+ FATAL_ERROR (_("invalid byte, character or field list"));
234 }
235
236 max_range_endpoint = 0;
237@@ -582,6 +664,63 @@
238 }
239 }
240
241+#if HAVE_MBRTOWC
242+/* This function is used in the following two cases.
243+
244+ 1. Read from the stream STREAM, printing to standard output any selected
245+ characters.
246+
247+ 2. Read from stream STREAM, printing to standard output any selected bytes,
248+ without splitting multibyte characters. */
249+
250+static void
251+cut_characters_or_cut_bytes_no_split (FILE *stream)
252+{
253+ int idx; /* number of bytes or characters in the line so far. */
254+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
255+ char *bufpos; /* Next read position of BUF. */
256+ size_t buflen; /* The length of the byte sequence in buf. */
257+ wint_t wc = 0; /* A gotten wide character; not updated when conversion fails. */
258+ size_t mblength; /* The byte size of a multibyte character which shows
259+ as same character as WC. */
260+ mbstate_t state; /* State of the stream. */
261+ int convfail = 0; /* 1, when conversion is failed. Otherwise 0. */
262+
263+ idx = 0;
264+ buflen = 0;
265+ bufpos = buf;
266+ memset (&state, '\0', sizeof(mbstate_t));
267+
268+ while (1)
269+ {
270+ REFILL_BUFFER (buf, bufpos, buflen, stream);
271+
272+ GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail);
273+
274+ if (wc == WEOF)
275+ {
276+ if (idx > 0)
277+ putchar ('\n');
278+ break;
279+ }
280+ else if (!convfail && wc == L'\n') /* WC is stale after a failed conversion. */
281+ {
282+ putchar ('\n');
283+ idx = 0;
284+ }
285+ else
286+ {
287+ idx += (operating_mode == byte_mode) ? mblength : 1;
288+ if (print_kth (idx, NULL))
289+ fwrite (bufpos, sizeof(char), mblength, stdout);
290+ }
291+
292+ buflen -= mblength;
293+ bufpos += mblength;
294+ }
295+}
296+#endif
297+
298 /* Read from stream STREAM, printing to standard output any selected fields. */
299
300 static void
301@@ -704,13 +843,192 @@
302 }
303 }
304
305+#if HAVE_MBRTOWC /* Field mode for multibyte locales; see cut_stream. */
306+static void
307+cut_fields_mb (FILE *stream)
308+{
309+ int c;
310+ unsigned int field_idx;
311+ int found_any_selected_field;
312+ int buffer_first_field;
313+ int empty_input;
314+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
315+ char *bufpos; /* Next read position of BUF. */
316+ size_t buflen; /* The length of the byte sequence in buf. */
317+ wint_t wc = 0; /* A gotten wide character. */
318+ size_t mblength; /* The byte size of a multibyte character which shows
319+ as same character as WC. */
320+ mbstate_t state; /* State of the stream. */
321+ int convfail = 0; /* 1, when conversion is failed. Otherwise 0. */
322+
323+ found_any_selected_field = 0;
324+ field_idx = 1;
325+ bufpos = buf;
326+ buflen = 0;
327+ memset (&state, '\0', sizeof(mbstate_t));
328+
329+ c = getc (stream);
330+ empty_input = (c == EOF);
331+ if (c != EOF)
332+ ungetc (c, stream);
333+ else
334+ wc = WEOF;
335+
336+ /* To support the semantics of the -s flag, we may have to buffer
337+ all of the first field to determine whether it is `delimited.'
338+ But that is unnecessary if all non-delimited lines must be printed
339+ and the first field has been selected, or if non-delimited lines
340+ must be suppressed and the first field has *not* been selected.
341+ That is because a non-delimited line has exactly one field. */
342+ buffer_first_field = (suppress_non_delimited ^ !print_kth (1, NULL));
343+
344+ while (1)
345+ {
346+ if (field_idx == 1 && buffer_first_field)
347+ {
348+ int len = 0;
349+
350+ while (1)
351+ {
352+ REFILL_BUFFER (buf, bufpos, buflen, stream);
353+
354+ GET_NEXT_WC_FROM_BUFFER
355+ (wc, bufpos, buflen, mblength, state, convfail);
356+
357+ if (wc == WEOF)
358+ break;
359+
360+ field_1_buffer = xrealloc (field_1_buffer, len + mblength);
361+ memcpy (field_1_buffer + len, bufpos, mblength);
362+ len += mblength;
363+ buflen -= mblength;
364+ bufpos += mblength;
365+
366+ if (!convfail && (wc == L'\n' || wc == wcdelim))
367+ break;
368+ }
369+
370+ if (wc == WEOF)
371+ break;
372+
373+ /* If the first field extends to the end of line (it is not
374+ delimited) and we are printing all non-delimited lines,
375+ print this one. */
376+ if (convfail || (!convfail && wc != wcdelim))
377+ {
378+ if (suppress_non_delimited)
379+ {
380+ /* Empty. */
381+ }
382+ else
383+ {
384+ fwrite (field_1_buffer, sizeof (char), len, stdout);
385+ /* Make sure the output line is newline terminated. */
386+ if (convfail || (!convfail && wc != L'\n'))
387+ putchar ('\n');
388+ }
389+ continue;
390+ }
391+
392+ if (print_kth (1, NULL))
393+ {
394+ /* Print the field, but not the trailing delimiter (MBLENGTH bytes long). */
395+ fwrite (field_1_buffer, sizeof (char), len - mblength, stdout);
396+ found_any_selected_field = 1;
397+ }
398+ ++field_idx;
399+ }
400+
401+ if (wc != WEOF)
402+ {
403+ if (print_kth (field_idx, NULL))
404+ {
405+ if (found_any_selected_field)
406+ {
407+ fwrite (output_delimiter_string, sizeof (char),
408+ output_delimiter_length, stdout);
409+ }
410+ found_any_selected_field = 1;
411+ }
412+
413+ while (1)
414+ {
415+ REFILL_BUFFER (buf, bufpos, buflen, stream);
416+
417+ GET_NEXT_WC_FROM_BUFFER
418+ (wc, bufpos, buflen, mblength, state, convfail);
419+
420+ if (wc == WEOF)
421+ break;
422+ else if (!convfail && (wc == wcdelim || wc == L'\n'))
423+ {
424+ buflen -= mblength;
425+ bufpos += mblength;
426+ break;
427+ }
428+
429+ if (print_kth (field_idx, NULL))
430+ fwrite (bufpos, sizeof(char), mblength, stdout);
431+
432+ buflen -= mblength;
433+ bufpos += mblength;
434+ }
435+ }
436+
437+ if ((!convfail || wc == L'\n') && buflen < 1)
438+ wc = WEOF;
439+
440+ if (!convfail && wc == wcdelim)
441+ ++field_idx;
442+ else if (wc == WEOF || (!convfail && wc == L'\n'))
443+ {
444+ if (found_any_selected_field
445+ || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
446+ putchar ('\n');
447+ if (wc == WEOF)
448+ break;
449+ field_idx = 1;
450+ found_any_selected_field = 0;
451+ }
452+ }
453+}
454+#endif
455+
456 static void
457 cut_stream (FILE *stream)
458 {
459- if (operating_mode == byte_mode)
460- cut_bytes (stream);
461+#if HAVE_MBRTOWC
462+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
463+ {
464+ switch (operating_mode)
465+ {
466+ case byte_mode:
467+ if (byte_mode_character_aware)
468+ cut_characters_or_cut_bytes_no_split (stream);
469+ else
470+ cut_bytes (stream);
471+ break;
472+
473+ case character_mode:
474+ cut_characters_or_cut_bytes_no_split (stream);
475+ break;
476+
477+ case field_mode:
478+ cut_fields_mb (stream);
479+ break;
480+
481+ default:
482+ abort ();
483+ }
484+ }
485 else
486- cut_fields (stream);
487+#endif
488+ {
489+ if (operating_mode == field_mode)
490+ cut_fields (stream);
491+ else
492+ cut_bytes (stream);
493+ }
494 }
495
496 /* Process file FILE to standard output.
497@@ -760,6 +1078,8 @@
498 bool ok;
499 bool delim_specified = false;
500 char *spec_list_string IF_LINT(= NULL);
501+ char mbdelim[MB_LEN_MAX + 1];
502+ size_t delimlen = 0;
503
504 initialize_main (&argc, &argv);
505 program_name = argv[0];
506@@ -782,7 +1102,6 @@
507 switch (optc)
508 {
509 case 'b':
510- case 'c':
511 /* Build the byte list. */
512 if (operating_mode != undefined_mode)
513 FATAL_ERROR (_("only one type of list may be specified"));
514@@ -790,6 +1109,14 @@
515 spec_list_string = optarg;
516 break;
517
518+ case 'c':
519+ /* Build the character list. */
520+ if (operating_mode != undefined_mode)
521+ FATAL_ERROR (_("only one type of list may be specified"));
522+ operating_mode = character_mode;
523+ spec_list_string = optarg;
524+ break;
525+
526 case 'f':
527 /* Build the field list. */
528 if (operating_mode != undefined_mode)
529@@ -801,10 +1128,35 @@
530 case 'd':
531 /* New delimiter. */
532 /* Interpret -d '' to mean `use the NUL byte as the delimiter.' */
533- if (optarg[0] != '\0' && optarg[1] != '\0')
534- FATAL_ERROR (_("the delimiter must be a single character"));
535- delim = optarg[0];
536- delim_specified = true;
537+ {
538+#if HAVE_MBRTOWC
539+ if(MB_CUR_MAX > 1)
540+ {
541+ mbstate_t state;
542+
543+ memset (&state, '\0', sizeof(mbstate_t));
544+ delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state);
545+
546+ if (delimlen == (size_t)-1 || delimlen == (size_t)-2)
547+ ++force_singlebyte_mode;
548+ else
549+ {
550+ delimlen = (delimlen < 1) ? 1 : delimlen;
551+ if (wcdelim != L'\0' && *(optarg + delimlen) != '\0')
552+ FATAL_ERROR (_("the delimiter must be a single character"));
553+ memcpy (mbdelim, optarg, delimlen);
554+ }
555+ }
556+
557+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
558+#endif
559+ {
560+ if (optarg[0] != '\0' && optarg[1] != '\0')
561+ FATAL_ERROR (_("the delimiter must be a single character"));
562+ delim = (unsigned char) optarg[0];
563+ }
564+ delim_specified = true;
565+ }
566 break;
567
568 case OUTPUT_DELIMITER_OPTION:
569@@ -817,6 +1169,7 @@
570 break;
571
572 case 'n':
573+ byte_mode_character_aware = 1;
574 break;
575
576 case 's':
577@@ -839,7 +1192,7 @@
578 if (operating_mode == undefined_mode)
579 FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
580
581- if (delim != '\0' && operating_mode != field_mode)
582+ if (delim_specified && operating_mode != field_mode)
583 FATAL_ERROR (_("an input delimiter may be specified only\
584 when operating on fields"));
585
586@@ -866,15 +1219,34 @@
587 }
588
589 if (!delim_specified)
590- delim = '\t';
591+ {
592+ delim = '\t';
593+#ifdef HAVE_MBRTOWC
594+ wcdelim = L'\t';
595+ mbdelim[0] = '\t';
596+ mbdelim[1] = '\0';
597+ delimlen = 1;
598+#endif
599+ }
600
601 if (output_delimiter_string == NULL)
602 {
603- static char dummy[2];
604- dummy[0] = delim;
605- dummy[1] = '\0';
606- output_delimiter_string = dummy;
607- output_delimiter_length = 1;
608+#ifdef HAVE_MBRTOWC
609+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
610+ {
611+ output_delimiter_string = xstrdup(mbdelim);
612+ output_delimiter_length = delimlen;
613+ }
614+
615+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
616+#endif
617+ {
618+ static char dummy[2];
619+ dummy[0] = delim;
620+ dummy[1] = '\0';
621+ output_delimiter_string = dummy;
622+ output_delimiter_length = 1;
623+ }
624 }
625
626 if (optind == argc)
627diff -Naur coreutils-6.12.orig/src/expand.c coreutils-6.12/src/expand.c
628--- coreutils-6.12.orig/src/expand.c 2008-05-25 23:40:33.000000000 -0700
629+++ coreutils-6.12/src/expand.c 2009-01-08 12:56:49.000000000 -0800
630@@ -37,11 +37,28 @@
631 #include <stdio.h>
632 #include <getopt.h>
633 #include <sys/types.h>
634+
635+/* Get mbstate_t, mbrtowc(), wcwidth(). */
636+#if HAVE_WCHAR_H
637+# include <wchar.h>
638+#endif
639+
640 #include "system.h"
641 #include "error.h"
642 #include "quote.h"
643 #include "xstrndup.h"
644
645+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
646+ installation; work around this configuration error. */
647+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
648+# define MB_LEN_MAX 16
649+#endif
650+
651+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
652+#if HAVE_MBRTOWC && defined mbstate_t
653+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
654+#endif
655+
656 /* The official name of this program (e.g., no `g' prefix). */
657 #define PROGRAM_NAME "expand"
658
659@@ -182,6 +199,7 @@
660 stops = num_start + len - 1;
661 }
662 }
663+
664 else
665 {
666 error (0, 0, _("tab size contains invalid character(s): %s"),
667@@ -364,6 +382,142 @@
668 }
669 }
670
671+#if HAVE_MBRTOWC /* Multibyte-aware variant of expand (); main uses it when MB_CUR_MAX > 1. */
672+static void
673+expand_multibyte (void)
674+{
675+ FILE *fp; /* Input stream. */
676+ mbstate_t i_state; /* Current shift state of the input stream. */
677+ mbstate_t i_state_bak; /* Back up the I_STATE. */
678+ mbstate_t o_state; /* Current shift state of the output stream. */
679+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
680+ char *bufpos = buf; /* Next read position of BUF; set before the first refill reads it. */
681+ size_t buflen = 0; /* The length of the byte sequence in buf. */
682+ wchar_t wc; /* A gotten wide character. */
683+ size_t mblength; /* The byte size of a multibyte character
684+ which shows as same character as WC. */
685+ int tab_index = 0; /* Index in `tab_list' of next tabstop. */
686+ int column = 0; /* Column on screen of the next char. */
687+ int next_tab_column; /* Column the next tab stop is on. */
688+ int convert = 1; /* If nonzero, perform translations. */
689+
690+ fp = next_file ((FILE *) NULL);
691+ if (fp == NULL)
692+ return;
693+
694+ memset (&o_state, '\0', sizeof(mbstate_t));
695+ memset (&i_state, '\0', sizeof(mbstate_t));
696+
697+ for (;;)
698+ {
699+ /* Refill the buffer BUF. */
700+ if (buflen < MB_LEN_MAX && !feof(fp) && !ferror(fp))
701+ {
702+ memmove (buf, bufpos, buflen);
703+ buflen += fread (buf + buflen, sizeof(char), BUFSIZ, fp);
704+ bufpos = buf;
705+ }
706+
707+ /* No character is left in BUF. */
708+ if (buflen < 1)
709+ {
710+ fp = next_file (fp);
711+
712+ if (fp == NULL)
713+ break; /* No more files. */
714+ else
715+ {
716+ memset (&i_state, '\0', sizeof(mbstate_t));
717+ continue;
718+ }
719+ }
720+
721+ /* Get a wide character. */
722+ i_state_bak = i_state;
723+ mblength = mbrtowc (&wc, bufpos, buflen, &i_state);
724+
725+ switch (mblength)
726+ {
727+ case (size_t)-1: /* illegal byte sequence. */
728+ case (size_t)-2:
729+ mblength = 1;
730+ i_state = i_state_bak;
731+ if (convert)
732+ {
733+ ++column;
734+ if (convert_entire_line == 0)
735+ convert = 0;
736+ }
737+ putchar (*bufpos);
738+ break;
739+
740+ case 0: /* null. */
741+ mblength = 1;
742+ if (convert && convert_entire_line == 0)
743+ convert = 0;
744+ putchar ('\0');
745+ break;
746+
747+ default:
748+ if (wc == L'\n') /* LF. */
749+ {
750+ tab_index = 0;
751+ column = 0;
752+ convert = 1;
753+ putchar ('\n');
754+ }
755+ else if (wc == L'\t' && convert) /* Tab. */
756+ {
757+ if (tab_size == 0)
758+ {
759+ /* Do not let tab_index == first_free_tab;
760+ stop when it is 1 less. */
761+ while (tab_index < first_free_tab - 1
762+ && column >= tab_list[tab_index])
763+ tab_index++;
764+ next_tab_column = tab_list[tab_index];
765+ if (tab_index < first_free_tab - 1)
766+ tab_index++;
767+ if (column >= next_tab_column)
768+ next_tab_column = column + 1;
769+ }
770+ else
771+ next_tab_column = column + tab_size - column % tab_size;
772+
773+ while (column < next_tab_column)
774+ {
775+ putchar (' ');
776+ ++column;
777+ }
778+ }
779+ else /* Others. */
780+ {
781+ if (convert)
782+ {
783+ if (wc == L'\b')
784+ {
785+ if (column > 0)
786+ --column;
787+ }
788+ else
789+ {
790+ int width; /* The width of WC. */
791+
792+ width = wcwidth (wc);
793+ column += (width > 0) ? width : 0;
794+ if (convert_entire_line == 0)
795+ convert = 0;
796+ }
797+ }
798+ fwrite (bufpos, sizeof(char), mblength, stdout);
799+ }
800+ }
801+ buflen -= mblength;
802+ bufpos += mblength;
803+ }
804+}
805+#endif
806+
807 int
808 main (int argc, char **argv)
809 {
810@@ -428,7 +582,12 @@
811
812 file_list = (optind < argc ? &argv[optind] : stdin_argv);
813
814- expand ();
815+#if HAVE_MBRTOWC
816+ if (MB_CUR_MAX > 1)
817+ expand_multibyte ();
818+ else
819+#endif
820+ expand ();
821
822 if (have_read_stdin && fclose (stdin) != 0)
823 error (EXIT_FAILURE, errno, "-");
824diff -Naur coreutils-6.12.orig/src/fold.c coreutils-6.12/src/fold.c
825--- coreutils-6.12.orig/src/fold.c 2008-05-25 23:40:33.000000000 -0700
826+++ coreutils-6.12/src/fold.c 2009-01-08 12:56:49.000000000 -0800
827@@ -22,11 +22,33 @@
828 #include <getopt.h>
829 #include <sys/types.h>
830
831+/* Get mbstate_t, mbrtowc(), wcwidth(). */
832+#if HAVE_WCHAR_H
833+# include <wchar.h>
834+#endif
835+
836+/* Get iswprint(), iswblank(). */
837+#if HAVE_WCTYPE_H
838+# include <wctype.h>
839+#endif
840+
841 #include "system.h"
842 #include "error.h"
843 #include "quote.h"
844 #include "xstrtol.h"
845
846+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
847+ installation; work around this configuration error. */
848+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
849+# undef MB_LEN_MAX
850+# define MB_LEN_MAX 16
851+#endif
852+
853+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
854+#if HAVE_MBRTOWC && defined mbstate_t
855+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
856+#endif
857+
858 #define TAB_WIDTH 8
859
860 /* The official name of this program (e.g., no `g' prefix). */
861@@ -34,23 +56,44 @@
862
863 #define AUTHORS proper_name ("David MacKenzie")
864
865+#define FATAL_ERROR(Message) /* Report Message as plain text, then call usage (2). */ \
866+ do \
867+ { \
868+ error (0, 0, "%s", (Message)); \
869+ usage (2); \
870+ } \
871+ while (0)
872+
873+enum operating_mode
874+{
875+ /* Count screen columns when measuring a line (the default). */
876+ column_mode,
877+
878+ /* Count bytes rather than columns (-b). */
879+ byte_mode,
880+
881+ /* Count characters rather than columns (-c). */
882+ character_mode,
883+};
884+
885 /* The name this program was run with. */
886 char *program_name;
887
888+/* The current folding mode (default: column_mode). */
889+static enum operating_mode operating_mode;
890+
891 /* If nonzero, try to break on whitespace. */
892 static bool break_spaces;
893
894-/* If nonzero, count bytes, not column positions. */
895-static bool count_bytes;
896-
897 /* If nonzero, at least one of the files we read was standard input. */
898 static bool have_read_stdin;
899
900-static char const shortopts[] = "bsw:0::1::2::3::4::5::6::7::8::9::";
901+static char const shortopts[] = "bcsw:0::1::2::3::4::5::6::7::8::9::";
902
903 static struct option const longopts[] =
904 {
905 {"bytes", no_argument, NULL, 'b'},
906+ {"characters", no_argument, NULL, 'c'},
907 {"spaces", no_argument, NULL, 's'},
908 {"width", required_argument, NULL, 'w'},
909 {GETOPT_HELP_OPTION_DECL},
910@@ -80,6 +123,7 @@
911 "), stdout);
912 fputs (_("\
913 -b, --bytes count bytes rather than columns\n\
914+ -c, --characters count characters rather than columns\n\
915 -s, --spaces break at spaces\n\
916 -w, --width=WIDTH use WIDTH columns instead of 80\n\
917 "), stdout);
918@@ -97,7 +141,7 @@
919 static size_t
920 adjust_column (size_t column, char c)
921 {
922- if (!count_bytes)
923+ if (operating_mode != byte_mode)
924 {
925 if (c == '\b')
926 {
927@@ -120,30 +164,14 @@
928 to stdout, with maximum line length WIDTH.
929 Return true if successful. */
930
931-static bool
932-fold_file (char const *filename, size_t width)
933+static void
934+fold_text (FILE *istream, size_t width, int *saved_errno)
935 {
936- FILE *istream;
937 int c;
938 size_t column = 0; /* Screen column where next char will go. */
939 size_t offset_out = 0; /* Index in `line_out' for next char. */
940 static char *line_out = NULL;
941 static size_t allocated_out = 0;
942- int saved_errno;
943-
944- if (STREQ (filename, "-"))
945- {
946- istream = stdin;
947- have_read_stdin = true;
948- }
949- else
950- istream = fopen (filename, "r");
951-
952- if (istream == NULL)
953- {
954- error (0, errno, "%s", filename);
955- return false;
956- }
957
958 while ((c = getc (istream)) != EOF)
959 {
960@@ -171,6 +199,15 @@
961 bool found_blank = false;
962 size_t logical_end = offset_out;
963
964+ /* If LINE_OUT is still empty,
965+ store the byte C in LINE_OUT
966+ even if column is bigger than width. */
967+ if (offset_out == 0)
968+ {
969+ line_out[offset_out++] = c;
970+ continue;
971+ }
972+
973 /* Look for the last blank. */
974 while (logical_end)
975 {
976@@ -217,11 +254,225 @@
977 line_out[offset_out++] = c;
978 }
979
980- saved_errno = errno;
981+ *saved_errno = errno;
982+
983+ if (offset_out)
984+ fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
985+
986+ free(line_out);
987+}
988+
989+#if HAVE_MBRTOWC /* Multibyte-aware variant of fold_text; fold_file uses it when MB_CUR_MAX > 1. */
990+static void
991+fold_multibyte_text (FILE *istream, size_t width, int *saved_errno)
992+{
993+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
994+ size_t buflen = 0; /* The length of the byte sequence in buf. */
995+ char *bufpos = buf; /* Next read position of BUF; set before the first refill reads it. */
996+ wint_t wc; /* A gotten wide character. */
997+ size_t mblength; /* The byte size of a multibyte character which shows
998+ as same character as WC. */
999+ mbstate_t state, state_bak; /* State of the stream. */
1000+ int convfail; /* 1, when conversion is failed. Otherwise 0. */
1001+
1002+ char *line_out = NULL;
1003+ size_t offset_out = 0; /* Index in `line_out' for next char. */
1004+ size_t allocated_out = 0;
1005+
1006+ int increment;
1007+ size_t column = 0;
1008+
1009+ size_t last_blank_pos;
1010+ size_t last_blank_column;
1011+ int is_blank_seen;
1012+ int last_blank_increment;
1013+ int is_bs_following_last_blank;
1014+ size_t bs_following_last_blank_num;
1015+ int is_cr_after_last_blank;
1016+
1017+#define CLEAR_FLAGS \
1018+ do \
1019+ { \
1020+ last_blank_pos = 0; \
1021+ last_blank_column = 0; \
1022+ is_blank_seen = 0; \
1023+ is_bs_following_last_blank = 0; \
1024+ bs_following_last_blank_num = 0; \
1025+ is_cr_after_last_blank = 0; \
1026+ } \
1027+ while (0)
1028+
1029+#define START_NEW_LINE \
1030+ do \
1031+ { \
1032+ putchar ('\n'); \
1033+ column = 0; \
1034+ offset_out = 0; \
1035+ CLEAR_FLAGS; \
1036+ } \
1037+ while (0)
1038+
1039+ CLEAR_FLAGS;
1040+ memset (&state, '\0', sizeof(mbstate_t));
1041+
1042+ for (;; bufpos += mblength, buflen -= mblength)
1043+ {
1044+ if (buflen < MB_LEN_MAX && !feof (istream) && !ferror (istream))
1045+ {
1046+ memmove (buf, bufpos, buflen);
1047+ buflen += fread (buf + buflen, sizeof(char), BUFSIZ, istream);
1048+ bufpos = buf;
1049+ }
1050+
1051+ if (buflen < 1)
1052+ break;
1053+
1054+ /* Get a wide character. */
1055+ convfail = 0;
1056+ state_bak = state;
1057+ mblength = mbrtowc ((wchar_t *)&wc, bufpos, buflen, &state);
1058+
1059+ switch (mblength)
1060+ {
1061+ case (size_t)-1:
1062+ case (size_t)-2:
1063+ convfail++;
1064+ state = state_bak;
1065+ /* Fall through. */
1066+
1067+ case 0:
1068+ mblength = 1;
1069+ break;
1070+ }
1071+
1072+rescan:
1073+ /* A newline ends the output line in every mode, not only in column
1074+ mode. WC is meaningful only when the conversion succeeded. */
1075+ if (!convfail && wc == L'\n')
1076+ {
1077+ fwrite (line_out, sizeof(char), offset_out, stdout);
1078+ START_NEW_LINE;
1079+ continue;
1080+ }
1081+ else if (operating_mode == byte_mode) /* byte mode */
1082+ increment = mblength;
1083+ else if (operating_mode == character_mode) /* character mode */
1084+ increment = 1;
1085+ else if (convfail) /* column mode, undecodable byte */
1086+ increment = 1;
1087+ else /* column mode */
1088+ {
1089+ switch (wc)
1090+ {
1091+ case L'\b':
1092+ increment = (column > 0) ? -1 : 0;
1093+ break;
1094+
1095+ case L'\r':
1096+ increment = -1 * column;
1097+ break;
1098+
1099+ case L'\t':
1100+ increment = TAB_WIDTH - column % TAB_WIDTH;
1101+ break;
1102+
1103+ default:
1104+ increment = wcwidth (wc);
1105+ increment = (increment < 0) ? 0 : increment;
1106+ }
1107+ }
1108+
1109+ if (column + increment > width && break_spaces && last_blank_pos)
1110+ {
1111+ fwrite (line_out, sizeof(char), last_blank_pos, stdout);
1112+ putchar ('\n');
1113+
1114+ offset_out = offset_out - last_blank_pos;
1115+ column = column - last_blank_column + ((is_cr_after_last_blank)
1116+ ? last_blank_increment : bs_following_last_blank_num);
1117+ memmove (line_out, line_out + last_blank_pos, offset_out);
1118+ CLEAR_FLAGS;
1119+ goto rescan;
1120+ }
1121+
1122+ if (column + increment > width && column != 0)
1123+ {
1124+ fwrite (line_out, sizeof(char), offset_out, stdout);
1125+ START_NEW_LINE;
1126+ goto rescan;
1127+ }
1128+
1129+ if (allocated_out < offset_out + mblength)
1130+ {
1131+ allocated_out += 1024;
1132+ line_out = xrealloc (line_out, allocated_out);
1133+ }
1134+
1135+ memcpy (line_out + offset_out, bufpos, mblength);
1136+ offset_out += mblength;
1137+ column += increment;
1138+
1139+ if (is_blank_seen && !convfail && wc == L'\r')
1140+ is_cr_after_last_blank = 1;
1141+
1142+ if (is_bs_following_last_blank && !convfail && wc == L'\b')
1143+ ++bs_following_last_blank_num;
1144+ else
1145+ is_bs_following_last_blank = 0;
1146+
1147+ if (break_spaces && !convfail && iswblank (wc))
1148+ {
1149+ last_blank_pos = offset_out;
1150+ last_blank_column = column;
1151+ is_blank_seen = 1;
1152+ last_blank_increment = increment;
1153+ is_bs_following_last_blank = 1;
1154+ bs_following_last_blank_num = 0;
1155+ is_cr_after_last_blank = 0;
1156+ }
1157+ }
1158+
1159+ *saved_errno = errno;
1160
1161 if (offset_out)
1162 fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
1163
1164+ free(line_out);
1165+}
1166+#endif
1167+
1168+/* Fold file FILENAME, or standard input if FILENAME is "-",
1169+ to stdout, with maximum line length WIDTH.
1170+ Return 0 if successful, 1 if an error occurs. */
1171+
1172+static bool
1173+fold_file (char *filename, size_t width)
1174+{
1175+ FILE *istream;
1176+ int saved_errno;
1177+
1178+ if (STREQ (filename, "-"))
1179+ {
1180+ istream = stdin;
1181+ have_read_stdin = 1;
1182+ }
1183+ else
1184+ istream = fopen (filename, "r");
1185+
1186+ if (istream == NULL)
1187+ {
1188+ error (0, errno, "%s", filename);
1189+ return 1;
1190+ }
1191+
1192+ /* Define how ISTREAM is being folded. */
1193+#if HAVE_MBRTOWC
1194+ if (MB_CUR_MAX > 1)
1195+ fold_multibyte_text (istream, width, &saved_errno);
1196+ else
1197+#endif
1198+ fold_text (istream, width, &saved_errno);
1199+
1200 if (ferror (istream))
1201 {
1202 error (0, saved_errno, "%s", filename);
1203@@ -254,7 +505,8 @@
1204
1205 atexit (close_stdout);
1206
1207- break_spaces = count_bytes = have_read_stdin = false;
1208+ operating_mode = column_mode;
1209+ break_spaces = have_read_stdin = false;
1210
1211 while ((optc = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1)
1212 {
1213@@ -263,7 +515,15 @@
1214 switch (optc)
1215 {
1216 case 'b': /* Count bytes rather than columns. */
1217- count_bytes = true;
1218+ if (operating_mode != column_mode)
1219+ FATAL_ERROR (_("only one way of folding may be specified"));
1220+ operating_mode = byte_mode;
1221+ break;
1222+
1223+ case 'c':
1224+ if (operating_mode != column_mode)
1225+ FATAL_ERROR (_("only one way of folding may be specified"));
1226+ operating_mode = character_mode;
1227 break;
1228
1229 case 's': /* Break at word boundaries. */
1230diff -Naur coreutils-6.12.orig/src/join.c coreutils-6.12/src/join.c
1231--- coreutils-6.12.orig/src/join.c 2008-05-25 23:40:32.000000000 -0700
1232+++ coreutils-6.12/src/join.c 2009-01-08 12:56:49.000000000 -0800
1233@@ -22,17 +22,31 @@
1234 #include <sys/types.h>
1235 #include <getopt.h>
1236
1237+/* Get mbstate_t, mbrtowc(), wcrtomb(), wcwidth(). */
1238+#if HAVE_WCHAR_H
1239+# include <wchar.h>
1240+#endif
1241+
1242+/* Get iswblank(), towupper(). */
1243+#if HAVE_WCTYPE_H
1244+# include <wctype.h>
1245+#endif
1246+
1247 #include "system.h"
1248 #include "error.h"
1249 #include "hard-locale.h"
1250 #include "linebuffer.h"
1251-#include "memcasecmp.h"
1252 #include "quote.h"
1253 #include "stdio--.h"
1254 #include "xmemcoll.h"
1255 #include "xstrtol.h"
1256 #include "argmatch.h"
1257
1258+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
1259+#if HAVE_MBRTOWC && defined mbstate_t
1260+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
1261+#endif
1262+
1263 /* The official name of this program (e.g., no `g' prefix). */
1264 #define PROGRAM_NAME "join"
1265
1266@@ -113,10 +127,12 @@
1267 /* Last element in `outlist', where a new element can be added. */
1268 static struct outlist *outlist_end = &outlist_head;
1269
1270-/* Tab character separating fields. If negative, fields are separated
1271- by any nonempty string of blanks, otherwise by exactly one
1272- tab character whose value (when cast to unsigned char) equals TAB. */
1273-static int tab = -1;
1274+/* Tab character separating fields. If NULL, fields are separated
1275+ by any nonempty string of blanks. */
1276+static char *tab = NULL;
1277+
1278+/* The number of bytes used for tab. */
1279+static size_t tablen = 0;
1280
1281 /* If nonzero, check that the input is correctly ordered. */
1282 static enum
1283@@ -230,10 +246,11 @@
1284 if (ptr == lim)
1285 return;
1286
1287- if (0 <= tab)
1288+ if (tab != NULL)
1289 {
1290+ unsigned char t = tab[0];
1291 char *sep;
1292- for (; (sep = memchr (ptr, tab, lim - ptr)) != NULL; ptr = sep + 1)
1293+ for (; (sep = memchr (ptr, t, lim - ptr)) != NULL; ptr = sep + 1)
1294 extract_field (line, ptr, sep - ptr);
1295 }
1296 else
1297@@ -260,6 +277,148 @@
1298 extract_field (line, ptr, lim - ptr);
1299 }
1300
1301+#if HAVE_MBRTOWC
1302+static void
1303+xfields_multibyte (struct line *line)
1304+{
1305+ char *ptr = line->buf.buffer;
1306+ char const *lim = ptr + line->buf.length - 1; /* Last byte of the buffer. */
1307+ wchar_t wc = 0;
1308+ size_t mblength = 1;
1309+ mbstate_t state, state_bak;
1310+ /* Split LINE's buffer into fields, decoding it as multibyte text. */
1311+ memset (&state, 0, sizeof (mbstate_t));
1312+
1313+ if (ptr == lim)
1314+ return;
1315+
1316+ if (tab != NULL)
1317+ {
1318+ /* TAB may be multibyte: match all TABLEN bytes of it, not only tab[0]. */
1319+ char *sep = ptr;
1320+ for (; ptr < lim; ptr = sep + mblength)
1321+ {
1322+ sep = ptr;
1323+ while (sep < lim)
1324+ {
1325+ state_bak = state;
1326+ mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1327+
1328+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1329+ {
1330+ mblength = 1; /* Step over an undecodable byte as one character. */
1331+ state = state_bak;
1332+ }
1333+ mblength = (mblength < 1) ? 1 : mblength; /* mbrtowc gives 0 for NUL. */
1334+
1335+ if (mblength == tablen && !memcmp (sep, tab, mblength))
1336+ break;
1337+ else
1338+ {
1339+ sep += mblength;
1340+ continue;
1341+ }
1342+ }
1343+
1344+ if (sep == lim)
1345+ break;
1346+
1347+ extract_field (line, ptr, sep - ptr);
1348+ }
1349+ }
1350+ else
1351+ {
1352+ /* Skip leading blanks before the first field. */
1353+ while(ptr < lim)
1354+ {
1355+ state_bak = state;
1356+ mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1357+
1358+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1359+ {
1360+ mblength = 1;
1361+ state = state_bak;
1362+ break;
1363+ }
1364+ mblength = (mblength < 1) ? 1 : mblength;
1365+
1366+ if (!iswblank(wc))
1367+ break;
1368+ ptr += mblength;
1369+ }
1370+ if (ptr == lim) return; /* The line holds only blanks: it has no fields. */
1371+ do
1372+ {
1373+ char *sep;
1374+ state_bak = state;
1375+ mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1376+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1377+ {
1378+ mblength = 1;
1379+ state = state_bak;
1380+ break;
1381+ }
1382+ mblength = (mblength < 1) ? 1 : mblength;
1383+
1384+ sep = ptr + mblength;
1385+ while (sep != lim)
1386+ {
1387+ state_bak = state;
1388+ mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1389+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1390+ {
1391+ mblength = 1;
1392+ state = state_bak;
1393+ break; /* NOTE(review): an undecodable byte ends the field here - confirm intended. */
1394+ }
1395+ mblength = (mblength < 1) ? 1 : mblength;
1396+
1397+ if (iswblank (wc))
1398+ break;
1399+
1400+ sep += mblength;
1401+ }
1402+
1403+ extract_field (line, ptr, sep - ptr);
1404+ if (sep == lim)
1405+ return;
1406+
1407+ state_bak = state;
1408+ mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1409+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1410+ {
1411+ mblength = 1;
1412+ state = state_bak;
1413+ break;
1414+ }
1415+ mblength = (mblength < 1) ? 1 : mblength;
1416+
1417+ ptr = sep + mblength; /* Step past the blank that ended the field. */
1418+ while (ptr != lim)
1419+ {
1420+ state_bak = state;
1421+ mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1422+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1423+ {
1424+ mblength = 1;
1425+ state = state_bak;
1426+ break;
1427+ }
1428+ mblength = (mblength < 1) ? 1 : mblength;
1429+
1430+ if (!iswblank (wc))
1431+ break;
1432+
1433+ ptr += mblength;
1434+ }
1435+ }
1436+ while (ptr != lim);
1437+ }
1438+
1439+ extract_field (line, ptr, lim - ptr);
1440+}
1441+#endif
1442+
1443 static struct line *
1444 dup_line (const struct line *old)
1445 {
1446@@ -305,56 +464,115 @@
1447 size_t jf_1, size_t jf_2)
1448 {
1449 /* Start of field to compare in each file. */
1450- char *beg1;
1451- char *beg2;
1452-
1453- size_t len1;
1454- size_t len2; /* Length of fields to compare. */
1455+ char *beg[2];
1456+ char *copy[2];
1457+ size_t len[2]; /* Length of fields to compare. */
1458 int diff;
1459+ int i, j;
1460
1461 if (jf_1 < line1->nfields)
1462 {
1463- beg1 = line1->fields[jf_1].beg;
1464- len1 = line1->fields[jf_1].len;
1465+ beg[0] = line1->fields[jf_1].beg;
1466+ len[0] = line1->fields[jf_1].len;
1467 }
1468 else
1469 {
1470- beg1 = NULL;
1471- len1 = 0;
1472+ beg[0] = NULL;
1473+ len[0] = 0;
1474 }
1475
1476 if (jf_2 < line2->nfields)
1477 {
1478- beg2 = line2->fields[jf_2].beg;
1479- len2 = line2->fields[jf_2].len;
1480+ beg[1] = line2->fields[jf_2].beg;
1481+ len[1] = line2->fields[jf_2].len;
1482 }
1483 else
1484 {
1485- beg2 = NULL;
1486- len2 = 0;
1487+ beg[1] = NULL;
1488+ len[1] = 0;
1489 }
1490
1491- if (len1 == 0)
1492- return len2 == 0 ? 0 : -1;
1493- if (len2 == 0)
1494+ if (len[0] == 0)
1495+ return len[1] == 0 ? 0 : -1;
1496+ if (len[1] == 0)
1497 return 1;
1498
1499 if (ignore_case)
1500 {
1501- /* FIXME: ignore_case does not work with NLS (in particular,
1502- with multibyte chars). */
1503- diff = memcasecmp (beg1, beg2, MIN (len1, len2));
1504+#ifdef HAVE_MBRTOWC
1505+ if (MB_CUR_MAX > 1)
1506+ {
1507+ size_t mblength;
1508+ wchar_t wc, uwc;
1509+ mbstate_t state, state_bak;
1510+
1511+ memset (&state, '\0', sizeof (mbstate_t));
1512+
1513+ for (i = 0; i < 2; i++)
1514+ {
1515+ copy[i] = alloca (len[i] + 1);
1516+
1517+ for (j = 0; j < MIN (len[0], len[1]);)
1518+ {
1519+ state_bak = state;
1520+ mblength = mbrtowc (&wc, beg[i] + j, len[i] - j, &state);
1521+
1522+ switch (mblength)
1523+ {
1524+ case (size_t) -1:
1525+ case (size_t) -2:
1526+ state = state_bak;
1527+ /* Fall through */
1528+ case 0:
1529+ mblength = 1;
1530+ break;
1531+
1532+ default:
1533+ uwc = towupper (wc);
1534+
1535+ if (uwc != wc)
1536+ {
1537+ mbstate_t state_wc;
1538+
1539+ memset (&state_wc, '\0', sizeof (mbstate_t));
1540+ wcrtomb (copy[i] + j, uwc, &state_wc);
1541+ }
1542+ else
1543+ memcpy (copy[i] + j, beg[i] + j, mblength);
1544+ }
1545+ j += mblength;
1546+ }
1547+ copy[i][j] = '\0';
1548+ }
1549+ }
1550+ else
1551+#endif
1552+ {
1553+ for (i = 0; i < 2; i++)
1554+ {
1555+ copy[i] = alloca (len[i] + 1);
1556+
1557+ for (j = 0; j < MIN (len[0], len[1]); j++)
1558+ copy[i][j] = toupper (beg[i][j]);
1559+
1560+ copy[i][j] = '\0';
1561+ }
1562+ }
1563 }
1564 else
1565 {
1566- if (hard_LC_COLLATE)
1567- return xmemcoll (beg1, len1, beg2, len2);
1568- diff = memcmp (beg1, beg2, MIN (len1, len2));
1569+ copy[0] = (unsigned char *) beg[0];
1570+ copy[1] = (unsigned char *) beg[1];
1571 }
1572
1573+ if (hard_LC_COLLATE)
1574+ return xmemcoll ((char *) copy[0], len[0], (char *) copy[1], len[1]);
1575+ diff = memcmp (copy[0], copy[1], MIN (len[0], len[1]));
1576+
1577+
1578 if (diff)
1579 return diff;
1580- return len1 < len2 ? -1 : len1 != len2;
1581+ return len[0] - len[1];
1582 }
1583
1584 /* Check that successive input lines PREV and CURRENT from input file
1585@@ -413,6 +631,11 @@
1586 line->nfields_allocated = 0;
1587 line->nfields = 0;
1588 line->fields = NULL;
1589+#if HAVE_MBRTOWC
1590+ if (MB_CUR_MAX > 1)
1591+ xfields_multibyte (line);
1592+ else
1593+#endif
1594 xfields (line);
1595
1596 if (prevline[which - 1])
1597@@ -509,11 +732,18 @@
1598
1599 /* Print the join of LINE1 and LINE2. */
1600
1601+#define PUT_TAB_CHAR /* Write the output field separator to stdout. */ \
1602+ do \
1603+ { \
1604+ if (tab == NULL) putchar (' '); \
1605+ else fwrite (tab, sizeof (char), tablen, stdout); \
1606+ } \
1607+ while (0)
1608+
1609 static void
1610 prjoin (struct line const *line1, struct line const *line2)
1611 {
1612 const struct outlist *outlist;
1613- char output_separator = tab < 0 ? ' ' : tab;
1614
1615 outlist = outlist_head.next;
1616 if (outlist)
1617@@ -529,12 +759,12 @@
1618 if (o->file == 0)
1619 {
1620 if (line1 == &uni_blank)
1621- {
1622+ {
1623 line = line2;
1624 field = join_field_2;
1625 }
1626 else
1627- {
1628+ {
1629 line = line1;
1630 field = join_field_1;
1631 }
1632@@ -548,7 +778,7 @@
1633 o = o->next;
1634 if (o == NULL)
1635 break;
1636- putchar (output_separator);
1637+ PUT_TAB_CHAR;
1638 }
1639 putchar ('\n');
1640 }
1641@@ -566,23 +796,23 @@
1642 prfield (join_field_1, line1);
1643 for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
1644 {
1645- putchar (output_separator);
1646+ PUT_TAB_CHAR;
1647 prfield (i, line1);
1648 }
1649 for (i = join_field_1 + 1; i < line1->nfields; ++i)
1650 {
1651- putchar (output_separator);
1652+ PUT_TAB_CHAR;
1653 prfield (i, line1);
1654 }
1655
1656 for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
1657 {
1658- putchar (output_separator);
1659+ PUT_TAB_CHAR;
1660 prfield (i, line2);
1661 }
1662 for (i = join_field_2 + 1; i < line2->nfields; ++i)
1663 {
1664- putchar (output_separator);
1665+ PUT_TAB_CHAR;
1666 prfield (i, line2);
1667 }
1668 putchar ('\n');
1669@@ -1016,20 +1246,41 @@
1670
1671 case 't':
1672 {
1673- unsigned char newtab = optarg[0];
1674- if (! newtab)
1675+ char *newtab;
1676+ size_t newtablen;
1677+ if (! optarg[0])
1678 error (EXIT_FAILURE, 0, _("empty tab"));
1679- if (optarg[1])
1680+ newtab = xstrdup (optarg);
1681+#if HAVE_MBRTOWC
1682+ if (MB_CUR_MAX > 1)
1683+ {
1684+ mbstate_t state;
1685+
1686+ memset (&state, 0, sizeof (mbstate_t));
1687+ newtablen = mbrtowc (NULL, newtab,
1688+ strnlen (newtab, MB_LEN_MAX),
1689+ &state);
1690+ if (newtablen == (size_t) 0
1691+ || newtablen == (size_t) -1
1692+ || newtablen == (size_t) -2)
1693+ newtablen = 1;
1694+ }
1695+ else
1696+#endif
1697+ newtablen = 1;
1698+
1699+ if (newtablen == 1 && newtab[1])
1700+ {
1701+ if (STREQ (newtab, "\\0"))
1702+ newtab[0] = '\0';
1703+ }
1704+ if (tab != NULL && strcmp (tab, newtab))
1705 {
1706- if (STREQ (optarg, "\\0"))
1707- newtab = '\0';
1708- else
1709- error (EXIT_FAILURE, 0, _("multi-character tab %s"),
1710- quote (optarg));
1711+ free (newtab);
1712+ error (EXIT_FAILURE, 0, _("incompatible tabs"));
1713 }
1714- if (0 <= tab && tab != newtab)
1715- error (EXIT_FAILURE, 0, _("incompatible tabs"));
1716 tab = newtab;
1717+ tablen = newtablen;
1718 }
1719 break;
1720
1721diff -Naur coreutils-6.12.orig/src/pr.c coreutils-6.12/src/pr.c
1722--- coreutils-6.12.orig/src/pr.c 2008-05-25 23:40:32.000000000 -0700
1723+++ coreutils-6.12/src/pr.c 2009-01-08 12:56:50.000000000 -0800
1724@@ -312,6 +312,32 @@
1725
1726 #include <getopt.h>
1727 #include <sys/types.h>
1728+
1729+/* Get MB_LEN_MAX. */
1730+#include <limits.h>
1731+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
1732+ installation; work around this configuration error. */
1733+#if !defined MB_LEN_MAX || MB_LEN_MAX == 1
1734+# define MB_LEN_MAX 16
1735+#endif
1736+
1737+/* Get MB_CUR_MAX. */
1738+#include <stdlib.h>
1739+
1740+/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
1741+/* Get mbstate_t, mbrtowc(), wcwidth(). */
1742+#if HAVE_WCHAR_H
1743+# include <wchar.h>
1744+#endif
1745+
1746+/* Get iswprint(). -- for wcwidth(). */
1747+#if HAVE_WCTYPE_H
1748+# include <wctype.h>
1749+#endif
1750+#if !defined iswprint && !HAVE_ISWPRINT
1751+# define iswprint(wc) 1
1752+#endif
1753+
1754 #include "system.h"
1755 #include "error.h"
1756 #include "hard-locale.h"
1757@@ -322,6 +348,18 @@
1758 #include "strftime.h"
1759 #include "xstrtol.h"
1760
1761+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
1762+#if HAVE_MBRTOWC && defined mbstate_t
1763+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
1764+#endif
1765+
1766+#ifndef HAVE_DECL_WCWIDTH
1767+"this configure-time declaration test was not run"
1768+#endif
1769+#if !HAVE_DECL_WCWIDTH
1770+extern int wcwidth ();
1771+#endif
1772+
1773 /* The official name of this program (e.g., no `g' prefix). */
1774 #define PROGRAM_NAME "pr"
1775
1776@@ -416,7 +454,20 @@
1777
1778 #define NULLCOL (COLUMN *)0
1779
1780-static int char_to_clump (char c);
1781+/* Function pointers to switch between the functions for single byte locale
1782+ and for multibyte locale. If multibyte functions do not exist on your system,
1783+ these pointers always point to the functions for single byte locale. */
1784+static void (*print_char) (char c);
1785+static int (*char_to_clump) (char c);
1786+
1787+/* Functions for single byte locale. */
1788+static void print_char_single (char c);
1789+static int char_to_clump_single (char c);
1790+
1791+/* Functions for multibyte locale. */
1792+static void print_char_multi (char c);
1793+static int char_to_clump_multi (char c);
1794+
1795 static bool read_line (COLUMN *p);
1796 static bool print_page (void);
1797 static bool print_stored (COLUMN *p);
1798@@ -426,6 +477,7 @@
1799 static void pad_across_to (int position);
1800 static void add_line_number (COLUMN *p);
1801 static void getoptarg (char *arg, char switch_char, char *character,
1802+ int *character_length, int *character_width,
1803 int *number);
1804 void usage (int status);
1805 static void print_files (int number_of_files, char **av);
1806@@ -440,7 +492,6 @@
1807 static void pad_down (int lines);
1808 static void read_rest_of_line (COLUMN *p);
1809 static void skip_read (COLUMN *p, int column_number);
1810-static void print_char (char c);
1811 static void cleanup (void);
1812 static void print_sep_string (void);
1813 static void separator_string (const char *optarg_S);
1814@@ -455,7 +506,7 @@
1815 we store the leftmost columns contiguously in buff.
1816 To print a line from buff, get the index of the first character
1817 from line_vector[i], and print up to line_vector[i + 1]. */
1818-static char *buff;
1819+static unsigned char *buff;
1820
1821 /* Index of the position in buff where the next character
1822 will be stored. */
1823@@ -559,7 +610,7 @@
1824 static bool untabify_input = false;
1825
1826 /* (-e) The input tab character. */
1827-static char input_tab_char = '\t';
1828+static char input_tab_char[MB_LEN_MAX] = "\t";
1829
1830 /* (-e) Tabstops are at chars_per_tab, 2*chars_per_tab, 3*chars_per_tab, ...
1831 where the leftmost column is 1. */
1832@@ -569,7 +620,10 @@
1833 static bool tabify_output = false;
1834
1835 /* (-i) The output tab character. */
1836-static char output_tab_char = '\t';
1837+static char output_tab_char[MB_LEN_MAX] = "\t";
1838+
1839+/* (-i) The byte length of output tab character. */
1840+static int output_tab_char_length = 1;
1841
1842 /* (-i) The width of the output tab. */
1843 static int chars_per_output_tab = 8;
1844@@ -643,7 +697,13 @@
1845 static bool numbered_lines = false;
1846
1847 /* (-n) Character which follows each line number. */
1848-static char number_separator = '\t';
1849+static char number_separator[MB_LEN_MAX] = "\t";
1850+
1851+/* (-n) The byte length of the character which follows each line number. */
1852+static int number_separator_length = 1;
1853+
1854+/* (-n) The character width of the character which follows each line number. */
1855+static int number_separator_width = 0;
1856
1857 /* (-n) line counting starts with 1st line of input file (not with 1st
1858 line of 1st page printed). */
1859@@ -696,6 +756,7 @@
1860 -a|COLUMN|-m is a `space' and with the -J option a `tab'. */
1861 static char *col_sep_string = "";
1862 static int col_sep_length = 0;
1863+static int col_sep_width = 0;
1864 static char *column_separator = " ";
1865 static char *line_separator = "\t";
1866
1867@@ -852,6 +913,13 @@
1868 col_sep_length = (int) strlen (optarg_S);
1869 col_sep_string = xmalloc (col_sep_length + 1);
1870 strcpy (col_sep_string, optarg_S);
1871+
1872+#if HAVE_MBRTOWC
1873+ if (MB_CUR_MAX > 1)
1874+ col_sep_width = mbswidth (col_sep_string, 0);
1875+ else
1876+#endif
1877+ col_sep_width = col_sep_length;
1878 }
1879
1880 int
1881@@ -876,6 +944,21 @@
1882
1883 atexit (close_stdout);
1884
1885+/* Define which functions are used, the ones for single byte locale or the ones
1886+ for multibyte locale. */
1887+#if HAVE_MBRTOWC
1888+ if (MB_CUR_MAX > 1)
1889+ {
1890+ print_char = print_char_multi;
1891+ char_to_clump = char_to_clump_multi;
1892+ }
1893+ else
1894+#endif
1895+ {
1896+ print_char = print_char_single;
1897+ char_to_clump = char_to_clump_single;
1898+ }
1899+
1900 n_files = 0;
1901 file_names = (argc > 1
1902 ? xmalloc ((argc - 1) * sizeof (char *))
1903@@ -952,8 +1035,12 @@
1904 break;
1905 case 'e':
1906 if (optarg)
1907- getoptarg (optarg, 'e', &input_tab_char,
1908- &chars_per_input_tab);
1909+ {
1910+ int dummy_length, dummy_width;
1911+
1912+ getoptarg (optarg, 'e', input_tab_char, &dummy_length,
1913+ &dummy_width, &chars_per_input_tab);
1914+ }
1915 /* Could check tab width > 0. */
1916 untabify_input = true;
1917 break;
1918@@ -966,8 +1053,12 @@
1919 break;
1920 case 'i':
1921 if (optarg)
1922- getoptarg (optarg, 'i', &output_tab_char,
1923- &chars_per_output_tab);
1924+ {
1925+ int dummy_width;
1926+
1927+ getoptarg (optarg, 'i', output_tab_char, &output_tab_char_length,
1928+ &dummy_width, &chars_per_output_tab);
1929+ }
1930 /* Could check tab width > 0. */
1931 tabify_output = true;
1932 break;
1933@@ -994,8 +1085,8 @@
1934 case 'n':
1935 numbered_lines = true;
1936 if (optarg)
1937- getoptarg (optarg, 'n', &number_separator,
1938- &chars_per_number);
1939+ getoptarg (optarg, 'n', number_separator, &number_separator_length,
1940+ &number_separator_width, &chars_per_number);
1941 break;
1942 case 'N':
1943 skip_count = false;
1944@@ -1034,7 +1125,7 @@
1945 old_s = false;
1946 /* Reset an additional input of -s, -S dominates -s */
1947 col_sep_string = "";
1948- col_sep_length = 0;
1949+ col_sep_length = col_sep_width = 0;
1950 use_col_separator = true;
1951 if (optarg)
1952 separator_string (optarg);
1953@@ -1191,10 +1282,45 @@
1954 a number. */
1955
1956 static void
1957-getoptarg (char *arg, char switch_char, char *character, int *number)
1958+getoptarg (char *arg, char switch_char, char *character, int *character_length,
1959+ int *character_width, int *number)
1960 {
1961 if (!ISDIGIT (*arg))
1962- *character = *arg++;
1963+ {
1964+#ifdef HAVE_MBRTOWC
1965+ if (MB_CUR_MAX > 1) /* for multibyte locale. */
1966+ {
1967+ wchar_t wc;
1968+ size_t mblength;
1969+ int width;
1970+ mbstate_t state = {'\0'};
1971+
1972+ mblength = mbrtowc (&wc, arg, strnlen(arg, MB_LEN_MAX), &state);
1973+
1974+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1975+ {
1976+ *character_length = 1;
1977+ *character_width = 1;
1978+ }
1979+ else
1980+ {
1981+ *character_length = (mblength < 1) ? 1 : mblength;
1982+ width = wcwidth (wc);
1983+ *character_width = (width < 0) ? 0 : width;
1984+ }
1985+
1986+ strncpy (character, arg, *character_length);
1987+ arg += *character_length;
1988+ }
1989+ else /* for single byte locale. */
1990+#endif
1991+ {
1992+ *character = *arg++;
1993+ *character_length = 1;
1994+ *character_width = 1;
1995+ }
1996+ }
1997+
1998 if (*arg)
1999 {
2000 long int tmp_long;
2001@@ -1253,7 +1379,7 @@
2002 else
2003 col_sep_string = column_separator;
2004
2005- col_sep_length = 1;
2006+ col_sep_length = col_sep_width = 1;
2007 use_col_separator = true;
2008 }
2009 /* It's rather pointless to define a TAB separator with column
2010@@ -1284,11 +1410,11 @@
2011 TAB_WIDTH (chars_per_input_tab, chars_per_number); */
2012
2013 /* Estimate chars_per_text without any margin and keep it constant. */
2014- if (number_separator == '\t')
2015+ if (number_separator[0] == '\t')
2016 number_width = chars_per_number +
2017 TAB_WIDTH (chars_per_default_tab, chars_per_number);
2018 else
2019- number_width = chars_per_number + 1;
2020+ number_width = chars_per_number + number_separator_width;
2021
2022 /* The number is part of the column width unless we are
2023 printing files in parallel. */
2024@@ -1303,7 +1429,7 @@
2025 }
2026
2027 chars_per_column = (chars_per_line - chars_used_by_number -
2028- (columns - 1) * col_sep_length) / columns;
2029+ (columns - 1) * col_sep_width) / columns;
2030
2031 if (chars_per_column < 1)
2032 error (EXIT_FAILURE, 0, _("page width too narrow"));
2033@@ -1428,7 +1554,7 @@
2034
2035 /* Enlarge p->start_position of first column to use the same form of
2036 padding_not_printed with all columns. */
2037- h = h + col_sep_length;
2038+ h = h + col_sep_width;
2039
2040 /* This loop takes care of all but the rightmost column. */
2041
2042@@ -1462,7 +1588,7 @@
2043 }
2044 else
2045 {
2046- h = h_next + col_sep_length;
2047+ h = h_next + col_sep_width;
2048 h_next = h + chars_per_column;
2049 }
2050 }
2051@@ -1752,9 +1878,9 @@
2052 align_column (COLUMN *p)
2053 {
2054 padding_not_printed = p->start_position;
2055- if (padding_not_printed - col_sep_length > 0)
2056+ if (padding_not_printed - col_sep_width > 0)
2057 {
2058- pad_across_to (padding_not_printed - col_sep_length);
2059+ pad_across_to (padding_not_printed - col_sep_width);
2060 padding_not_printed = ANYWHERE;
2061 }
2062
2063@@ -2025,13 +2151,13 @@
2064 /* May be too generous. */
2065 buff = X2REALLOC (buff, &buff_allocated);
2066 }
2067- buff[buff_current++] = c;
2068+ buff[buff_current++] = (unsigned char) c;
2069 }
2070
2071 static void
2072 add_line_number (COLUMN *p)
2073 {
2074- int i;
2075+ int i, j;
2076 char *s;
2077 int left_cut;
2078
2079@@ -2054,22 +2180,24 @@
2080 /* Tabification is assumed for multiple columns, also for n-separators,
2081 but `default n-separator = TAB' hasn't been given priority over
2082 equal column_width also specified by POSIX. */
2083- if (number_separator == '\t')
2084+ if (number_separator[0] == '\t')
2085 {
2086 i = number_width - chars_per_number;
2087 while (i-- > 0)
2088 (p->char_func) (' ');
2089 }
2090 else
2091- (p->char_func) (number_separator);
2092+ for (j = 0; j < number_separator_length; j++)
2093+ (p->char_func) (number_separator[j]);
2094 }
2095 else
2096 /* To comply with POSIX, we avoid any expansion of default TAB
2097 separator with a single column output. No column_width requirement
2098 has to be considered. */
2099 {
2100- (p->char_func) (number_separator);
2101- if (number_separator == '\t')
2102+ for (j = 0; j < number_separator_length; j++)
2103+ (p->char_func) (number_separator[j]);
2104+ if (number_separator[0] == '\t')
2105 output_position = POS_AFTER_TAB (chars_per_output_tab,
2106 output_position);
2107 }
2108@@ -2230,7 +2358,7 @@
2109 while (goal - h_old > 1
2110 && (h_new = POS_AFTER_TAB (chars_per_output_tab, h_old)) <= goal)
2111 {
2112- putchar (output_tab_char);
2113+ fwrite (output_tab_char, sizeof(char), output_tab_char_length, stdout);
2114 h_old = h_new;
2115 }
2116 while (++h_old <= goal)
2117@@ -2250,6 +2378,7 @@
2118 {
2119 char *s;
2120 int l = col_sep_length;
2121+ int not_space_flag;
2122
2123 s = col_sep_string;
2124
2125@@ -2263,6 +2392,7 @@
2126 {
2127 for (; separators_not_printed > 0; --separators_not_printed)
2128 {
2129+ not_space_flag = 0;
2130 while (l-- > 0)
2131 {
2132 /* 3 types of sep_strings: spaces only, spaces and chars,
2133@@ -2276,12 +2406,15 @@
2134 }
2135 else
2136 {
2137+ not_space_flag = 1;
2138 if (spaces_not_printed > 0)
2139 print_white_space ();
2140 putchar (*s++);
2141- ++output_position;
2142 }
2143 }
2144+ if (not_space_flag)
2145+ output_position += col_sep_width;
2146+
2147 /* sep_string ends with some spaces */
2148 if (spaces_not_printed > 0)
2149 print_white_space ();
2150@@ -2309,7 +2442,7 @@
2151 required number of tabs and spaces. */
2152
2153 static void
2154-print_char (char c)
2155+print_char_single (char c)
2156 {
2157 if (tabify_output)
2158 {
2159@@ -2333,6 +2466,74 @@
2160 putchar (c);
2161 }
2162
2163+#ifdef HAVE_MBRTOWC
2164+static void
2165+print_char_multi (char c)
2166+{
2167+ static size_t mbc_pos = 0; /* Number of bytes currently held in MBC. */
2168+ static char mbc[MB_LEN_MAX] = {'\0'}; /* Bytes kept between calls until they decode. */
2169+ static mbstate_t state = {'\0'};
2170+ mbstate_t state_bak;
2171+ wchar_t wc;
2172+ size_t mblength;
2173+ int width;
2174+
2175+ if (tabify_output)
2176+ {
2177+ state_bak = state;
2178+ mbc[mbc_pos++] = c;
2179+ mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
2180+
2181+ while (mbc_pos > 0)
2182+ {
2183+ switch (mblength)
2184+ {
2185+ case (size_t)-2: /* Incomplete character: wait for the next byte. */
2186+ state = state_bak;
2187+ return;
2188+
2189+ case (size_t)-1: /* Invalid sequence: emit its first byte as is. */
2190+ state = state_bak;
2191+ ++output_position;
2192+ putchar (mbc[0]);
2193+ memmove (mbc, mbc + 1, MB_CUR_MAX - 1);
2194+ --mbc_pos;
2195+ break;
2196+
2197+ case 0:
2198+ mblength = 1;
2199+ /* Fall through */
2200+ default:
2201+ if (wc == L' ')
2202+ {
2203+ memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2204+ mbc_pos -= mblength; /* Drop every byte that was shifted out above. */
2205+ ++spaces_not_printed;
2206+ return;
2207+ }
2208+ else if (spaces_not_printed > 0)
2209+ print_white_space ();
2210+
2211+ /* Nonprintables are assumed to have width 0, except L'\b'. */
2212+ if ((width = wcwidth (wc)) < 1)
2213+ {
2214+ if (wc == L'\b')
2215+ --output_position;
2216+ }
2217+ else
2218+ output_position += width;
2219+
2220+ fwrite (mbc, sizeof(char), mblength, stdout);
2221+ memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2222+ mbc_pos -= mblength;
2223+ }
2224+ }
2225+ return;
2226+ }
2227+ putchar (c); /* Without -i processing, bytes pass through unchanged. */
2228+}
2229+#endif
2230+
2231 /* Skip to page PAGE before printing.
2232 PAGE may be larger than total number of pages. */
2233
2234@@ -2510,9 +2711,9 @@
2235 align_empty_cols = false;
2236 }
2237
2238- if (padding_not_printed - col_sep_length > 0)
2239+ if (padding_not_printed - col_sep_width > 0)
2240 {
2241- pad_across_to (padding_not_printed - col_sep_length);
2242+ pad_across_to (padding_not_printed - col_sep_width);
2243 padding_not_printed = ANYWHERE;
2244 }
2245
2246@@ -2613,9 +2814,9 @@
2247 }
2248 }
2249
2250- if (padding_not_printed - col_sep_length > 0)
2251+ if (padding_not_printed - col_sep_width > 0)
2252 {
2253- pad_across_to (padding_not_printed - col_sep_length);
2254+ pad_across_to (padding_not_printed - col_sep_width);
2255 padding_not_printed = ANYWHERE;
2256 }
2257
2258@@ -2628,8 +2829,8 @@
2259 if (spaces_not_printed == 0)
2260 {
2261 output_position = p->start_position + end_vector[line];
2262- if (p->start_position - col_sep_length == chars_per_margin)
2263- output_position -= col_sep_length;
2264+ if (p->start_position - col_sep_width == chars_per_margin)
2265+ output_position -= col_sep_width;
2266 }
2267
2268 return true;
2269@@ -2648,7 +2849,7 @@
2270 number of characters is 1.) */
2271
2272 static int
2273-char_to_clump (char c)
2274+char_to_clump_single (char c)
2275 {
2276 unsigned char uc = c;
2277 char *s = clump_buff;
2278@@ -2658,10 +2859,10 @@
2279 int chars;
2280 int chars_per_c = 8;
2281
2282- if (c == input_tab_char)
2283+ if (c == input_tab_char[0])
2284 chars_per_c = chars_per_input_tab;
2285
2286- if (c == input_tab_char || c == '\t')
2287+ if (c == input_tab_char[0] || c == '\t')
2288 {
2289 width = TAB_WIDTH (chars_per_c, input_position);
2290
2291@@ -2742,6 +2943,154 @@
2292 return chars;
2293 }
2294
2295+#ifdef HAVE_MBRTOWC
2296+static int
2297+char_to_clump_multi (char c)
2298+{
2299+ static size_t mbc_pos = 0; /* Number of bytes currently held in MBC. */
2300+ static char mbc[MB_LEN_MAX] = {'\0'}; /* Bytes kept between calls until they decode. */
2301+ static mbstate_t state = {'\0'};
2302+ mbstate_t state_bak;
2303+ wchar_t wc;
2304+ size_t mblength;
2305+ int wc_width;
2306+ register char *s = clump_buff;
2307+ register int i, j;
2308+ char esc_buff[4]; /* Room for three octal digits and the NUL only. */
2309+ int width;
2310+ int chars;
2311+ int chars_per_c = 8;
2312+
2313+ state_bak = state;
2314+ mbc[mbc_pos++] = c;
2315+ mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
2316+
2317+ width = 0;
2318+ chars = 0;
2319+ while (mbc_pos > 0) /* NOTE(review): mblength and wc are not recomputed per pass - confirm. */
2320+ {
2321+ switch (mblength)
2322+ {
2323+ case (size_t)-2: /* Incomplete character: wait for the next byte. */
2324+ state = state_bak;
2325+ return 0;
2326+
2327+ case (size_t)-1: /* Invalid sequence: clump its first byte alone. */
2328+ state = state_bak;
2329+ mblength = 1;
2330+
2331+ if (use_esc_sequence || use_cntrl_prefix)
2332+ {
2333+ width += 4;
2334+ chars += 4;
2335+ *s++ = '\\';
2336+ sprintf (esc_buff, "%03o", (unsigned char) mbc[0]); /* A negative char would print 11 digits. */
2337+ for (i = 0; i <= 2; ++i)
2338+ *s++ = (int) esc_buff[i];
2339+ }
2340+ else
2341+ {
2342+ width += 1;
2343+ chars += 1;
2344+ *s++ = mbc[0];
2345+ }
2346+ break;
2347+
2348+ case 0:
2349+ mblength = 1;
2350+ /* Fall through */
2351+
2352+ default:
2353+ if (memcmp (mbc, input_tab_char, mblength) == 0)
2354+ chars_per_c = chars_per_input_tab;
2355+
2356+ if (memcmp (mbc, input_tab_char, mblength) == 0 || c == '\t')
2357+ {
2358+ int width_inc;
2359+
2360+ width_inc = TAB_WIDTH (chars_per_c, input_position);
2361+ width += width_inc;
2362+
2363+ if (untabify_input)
2364+ {
2365+ for (i = width_inc; i; --i)
2366+ *s++ = ' ';
2367+ chars += width_inc;
2368+ }
2369+ else
2370+ {
2371+ for (i = 0; i < mblength; i++)
2372+ *s++ = mbc[i];
2373+ chars += mblength;
2374+ }
2375+ }
2376+ else if ((wc_width = wcwidth (wc)) < 1)
2377+ {
2378+ if (use_esc_sequence)
2379+ {
2380+ for (i = 0; i < mblength; i++)
2381+ {
2382+ width += 4;
2383+ chars += 4;
2384+ *s++ = '\\';
2385+ sprintf (esc_buff, "%03o", (unsigned char) mbc[i]); /* Escape each byte of the character. */
2386+ for (j = 0; j <= 2; ++j)
2387+ *s++ = (int) esc_buff[j];
2388+ }
2389+ }
2390+ else if (use_cntrl_prefix)
2391+ {
2392+ if (wc < 0200)
2393+ {
2394+ width += 2;
2395+ chars += 2;
2396+ *s++ = '^';
2397+ *s++ = wc ^ 0100;
2398+ }
2399+ else
2400+ {
2401+ for (i = 0; i < mblength; i++)
2402+ {
2403+ width += 4;
2404+ chars += 4;
2405+ *s++ = '\\';
2406+ sprintf (esc_buff, "%03o", (unsigned char) mbc[i]); /* Escape each byte of the character. */
2407+ for (j = 0; j <= 2; ++j)
2408+ *s++ = (int) esc_buff[j];
2409+ }
2410+ }
2411+ }
2412+ else if (wc == L'\b')
2413+ {
2414+ width += -1;
2415+ chars += 1;
2416+ *s++ = c;
2417+ }
2418+ else
2419+ {
2420+ width += 0;
2421+ chars += mblength;
2422+ for (i = 0; i < mblength; i++)
2423+ *s++ = mbc[i];
2424+ }
2425+ }
2426+ else
2427+ {
2428+ width += wc_width;
2429+ chars += mblength;
2430+ for (i = 0; i < mblength; i++)
2431+ *s++ = mbc[i];
2432+ }
2433+ }
2434+ memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2435+ mbc_pos -= mblength;
2436+ }
2437+
2438+ input_position += width;
2439+ return chars;
2440+}
2441+#endif
2442+
2443 /* We've just printed some files and need to clean up things before
2444 looking for more options and printing the next batch of files.
2445
2446diff -Naur coreutils-6.12.orig/src/sort.c coreutils-6.12/src/sort.c
2447--- coreutils-6.12.orig/src/sort.c 2008-05-25 23:40:32.000000000 -0700
2448+++ coreutils-6.12/src/sort.c 2009-01-08 12:56:50.000000000 -0800
2449@@ -22,10 +22,19 @@
2450
2451 #include <config.h>
2452
2453+#include <assert.h>
2454 #include <getopt.h>
2455 #include <sys/types.h>
2456 #include <sys/wait.h>
2457 #include <signal.h>
2458+#if HAVE_WCHAR_H
2459+# include <wchar.h>
2460+#endif
2461+/* Get isw* functions. */
2462+#if HAVE_WCTYPE_H
2463+# include <wctype.h>
2464+#endif
2465+
2466 #include "system.h"
2467 #include "argmatch.h"
2468 #include "error.h"
2469@@ -117,14 +126,38 @@
2470 /* Thousands separator; if -1, then there isn't one. */
2471 static int thousands_sep;
2472
2473+static int force_general_numcompare = 0;
2474+
2475 /* Nonzero if the corresponding locales are hard. */
2476 static bool hard_LC_COLLATE;
2477-#if HAVE_NL_LANGINFO
2478+#if HAVE_LANGINFO_CODESET
2479 static bool hard_LC_TIME;
2480 #endif
2481
2482 #define NONZERO(x) ((x) != 0)
2483
2484+/* Store in MBLENGTH the byte length of the multibyte character at PTR, scanning no further than LIM. Invalid or incomplete sequences and NUL count as 1 byte; STATE is restored after a failed conversion. */
2485+#define GET_BYTELEN_OF_CHAR(LIM, PTR, MBLENGTH, STATE) \
2486+ do \
2487+ { \
2488+ wchar_t wc; \
2489+ mbstate_t state_bak; \
2490+ \
2491+ state_bak = STATE; \
2492+ MBLENGTH = mbrtowc (&wc, PTR, LIM - PTR, &STATE); \
2493+ \
2494+ switch (MBLENGTH) \
2495+ { \
2496+ case (size_t)-1: \
2497+ case (size_t)-2: \
2498+ STATE = state_bak; \
2499+ /* Fall through. */ \
2500+ case 0: \
2501+ MBLENGTH = 1; \
2502+ } \
2503+ } \
2504+ while (0)
2505+
2506 /* The kind of blanks for '-b' to skip in various options. */
2507 enum blanktype { bl_start, bl_end, bl_both };
2508
2509@@ -262,13 +295,11 @@
2510 they were read if all keys compare equal. */
2511 static bool stable;
2512
2513-/* If TAB has this value, blanks separate fields. */
2514-enum { TAB_DEFAULT = CHAR_MAX + 1 };
2515-
2516-/* Tab character separating fields. If TAB_DEFAULT, then fields are
2517+/* Tab character separating fields. If tab_length is 0, then fields are
2518 separated by the empty string between a non-blank character and a blank
2519 character. */
2520-static int tab = TAB_DEFAULT;
2521+static char tab[MB_LEN_MAX + 1];
2522+static size_t tab_length = 0;
2523
2524 /* Flag to remove consecutive duplicate lines from the output.
2525 Only the last of a sequence of equal lines will be output. */
2526@@ -655,6 +686,44 @@
2527 update_proc (pid);
2528 }
2529
2530+/* Function pointers. Each is assigned its _mb variant when MB_CUR_MAX > 1 (and HAVE_MBRTOWC is set), otherwise its _uni variant. */
2531+static void
2532+(*inittables) (void);
2533+static char *
2534+(*begfield) (const struct line*, const struct keyfield *);
2535+static char *
2536+(*limfield) (const struct line*, const struct keyfield *);
2537+static int
2538+(*getmonth) (char const *, size_t);
2539+static int
2540+(*keycompare) (const struct line *, const struct line *);
2541+static int
2542+(*numcompare) (const char *, const char *);
2543+
2544+/* Return nonzero if the multibyte character at STR (LEN bytes available) is
2545+ a blank according to iswblank. Store its byte length in *LENGTH; an invalid or incomplete sequence is never blank and, like NUL, counts as 1 byte. */
2546+#if HAVE_MBRTOWC
2547+static int
2548+ismbblank (const char *str, size_t len, size_t *length)
2549+{
2550+ size_t mblength;
2551+ wchar_t wc;
2552+ mbstate_t state;
2553+
2554+ memset (&state, '\0', sizeof(mbstate_t)); /* Always decode from the initial shift state. */
2555+ mblength = mbrtowc (&wc, str, len, &state);
2556+
2557+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
2558+ {
2559+ *length = 1;
2560+ return 0;
2561+ }
2562+
2563+ *length = (mblength < 1) ? 1 : mblength;
2564+ return iswblank (wc);
2565+}
2566+#endif
2567+
2568 /* Clean up any remaining temporary files. */
2569
2570 static void
2571@@ -994,7 +1063,7 @@
2572 free (node);
2573 }
2574
2575-#if HAVE_NL_LANGINFO
2576+#if HAVE_LANGINFO_CODESET
2577
2578 static int
2579 struct_month_cmp (const void *m1, const void *m2)
2580@@ -1009,7 +1078,7 @@
2581 /* Initialize the character class tables. */
2582
2583 static void
2584-inittables (void)
2585+inittables_uni (void)
2586 {
2587 size_t i;
2588
2589@@ -1021,7 +1090,7 @@
2590 fold_toupper[i] = toupper (i);
2591 }
2592
2593-#if HAVE_NL_LANGINFO
2594+#if HAVE_LANGINFO_CODESET
2595 /* If we're not in the "C" locale, read different names for months. */
2596 if (hard_LC_TIME)
2597 {
2598@@ -1047,6 +1116,64 @@
2599 #endif
2600 }
2601
2602+#if HAVE_MBRTOWC
2603+static void
2604+inittables_mb (void)
2605+{ /* Fill monthtab with the locale's abbreviated month names (leading blanks dropped, upper-cased), then sort it with struct_month_cmp. */
2606+ int i, j, k, l;
2607+ char *name, *s;
2608+ size_t s_len, mblength;
2609+ char mbc[MB_LEN_MAX];
2610+ wchar_t wc, pwc;
2611+ mbstate_t state_mb, state_wc;
2612+
2613+ for (i = 0; i < MONTHS_PER_YEAR; i++)
2614+ {
2615+ s = (char *) nl_langinfo (ABMON_1 + i);
2616+ s_len = strlen (s);
2617+ monthtab[i].name = name = (char *) xmalloc (s_len * MB_LEN_MAX + 1); /* An upper-cased character may need more bytes than the original. */
2618+ monthtab[i].val = i + 1;
2619+
2620+ memset (&state_mb, '\0', sizeof (mbstate_t));
2621+ memset (&state_wc, '\0', sizeof (mbstate_t));
2622+
2623+ for (j = 0; j < s_len;) /* Skip leading blanks. */
2624+ {
2625+ if (!ismbblank (s + j, s_len - j, &mblength))
2626+ break;
2627+ j += mblength;
2628+ }
2629+
2630+ for (k = 0; j < s_len;) /* Copy the rest, upper-casing one character at a time. */
2631+ {
2632+ mblength = mbrtowc (&wc, (s + j), (s_len - j), &state_mb);
2633+ assert (mblength != (size_t)-1 && mblength != (size_t)-2);
2634+ if (mblength == 0)
2635+ break;
2636+
2637+ pwc = towupper (wc);
2638+ if (pwc == wc)
2639+ {
2640+ memcpy (mbc, s + j, mblength); /* Already upper case: keep the original bytes. */
2641+ j += mblength;
2642+ }
2643+ else
2644+ {
2645+ j += mblength;
2646+ mblength = wcrtomb (mbc, pwc, &state_wc);
2647+ assert (mblength != (size_t)0 && mblength != (size_t)-1);
2648+ }
2649+
2650+ for (l = 0; l < mblength; l++)
2651+ name[k++] = mbc[l];
2652+ }
2653+ name[k] = '\0';
2654+ }
2655+ qsort ((void *) monthtab, MONTHS_PER_YEAR,
2656+ sizeof (struct month), struct_month_cmp);
2657+}
2658+#endif
2659+
2660 /* Specify the amount of main memory to use when sorting. */
2661 static void
2662 specify_sort_size (int oi, char c, char const *s)
2663@@ -1257,7 +1384,7 @@
2664 by KEY in LINE. */
2665
2666 static char *
2667-begfield (const struct line *line, const struct keyfield *key)
2668+begfield_uni (const struct line *line, const struct keyfield *key)
2669 {
2670 char *ptr = line->text, *lim = ptr + line->length - 1;
2671 size_t sword = key->sword;
2672@@ -1267,10 +1394,10 @@
2673 /* The leading field separator itself is included in a field when -t
2674 is absent. */
2675
2676- if (tab != TAB_DEFAULT)
2677+ if (tab_length)
2678 while (ptr < lim && sword--)
2679 {
2680- while (ptr < lim && *ptr != tab)
2681+ while (ptr < lim && *ptr != tab[0])
2682 ++ptr;
2683 if (ptr < lim)
2684 ++ptr;
2685@@ -1298,11 +1425,70 @@
2686 return ptr;
2687 }
2688
2689+#if HAVE_MBRTOWC
2690+static char *
2691+begfield_mb (const struct line *line, const struct keyfield *key)
2692+{ /* Multibyte begfield: return a pointer into LINE after skipping key->sword fields and then key->schar characters. */
2693+ int i;
2694+ char *ptr = line->text, *lim = ptr + line->length - 1;
2695+ size_t sword = key->sword;
2696+ size_t schar = key->schar;
2697+ size_t mblength;
2698+ mbstate_t state;
2699+
2700+ memset (&state, '\0', sizeof(mbstate_t));
2701+ /* Skip SWORD fields: delimited by the tab string when one is set, otherwise by runs of blanks. */
2702+ if (tab_length)
2703+ while (ptr < lim && sword--)
2704+ {
2705+ while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
2706+ {
2707+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2708+ ptr += mblength;
2709+ }
2710+ if (ptr < lim) /* Step over the delimiter itself. */
2711+ {
2712+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2713+ ptr += mblength;
2714+ }
2715+ }
2716+ else
2717+ while (ptr < lim && sword--)
2718+ {
2719+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2720+ ptr += mblength;
2721+ if (ptr < lim)
2722+ {
2723+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2724+ ptr += mblength;
2725+ }
2726+ while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
2727+ ptr += mblength;
2728+ }
2729+ /* Optionally skip the blanks that lead the field. */
2730+ if (key->skipsblanks)
2731+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2732+ ptr += mblength;
2733+ /* Advance SCHAR characters, stopping rather than stepping past LIM. */
2734+ for (i = 0; i < schar; i++)
2735+ {
2736+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2737+
2738+ if (ptr + mblength > lim)
2739+ break;
2740+ else
2741+ ptr += mblength;
2742+ }
2743+
2744+ return ptr;
2745+}
2746+#endif
2747+
2748 /* Return the limit of (a pointer to the first character after) the field
2749 in LINE specified by KEY. */
2750
2751 static char *
2752-limfield (const struct line *line, const struct keyfield *key)
2753+limfield_uni (const struct line *line, const struct keyfield *key)
2754 {
2755 char *ptr = line->text, *lim = ptr + line->length - 1;
2756 size_t eword = key->eword, echar = key->echar;
2757@@ -1315,10 +1501,10 @@
2758 `beginning' is the first character following the delimiting TAB.
2759 Otherwise, leave PTR pointing at the first `blank' character after
2760 the preceding field. */
2761- if (tab != TAB_DEFAULT)
2762+ if (tab_length)
2763 while (ptr < lim && eword--)
2764 {
2765- while (ptr < lim && *ptr != tab)
2766+ while (ptr < lim && *ptr != tab[0])
2767 ++ptr;
2768 if (ptr < lim && (eword | echar))
2769 ++ptr;
2770@@ -1364,10 +1550,10 @@
2771 */
2772
2773 /* Make LIM point to the end of (one byte past) the current field. */
2774- if (tab != TAB_DEFAULT)
2775+ if (tab_length)
2776 {
2777 char *newlim;
2778- newlim = memchr (ptr, tab, lim - ptr);
2779+ newlim = memchr (ptr, tab[0], lim - ptr);
2780 if (newlim)
2781 lim = newlim;
2782 }
2783@@ -1400,6 +1586,107 @@
2784 return ptr;
2785 }
2786
2787+#if HAVE_MBRTOWC
2788+static char *
2789+limfield_mb (const struct line *line, const struct keyfield *key)
2790+{ /* Multibyte limfield: return a pointer into LINE after skipping key->eword fields and then key->echar characters. */
2791+ char *ptr = line->text, *lim = ptr + line->length - 1;
2792+ size_t eword = key->eword, echar = key->echar;
2793+ int i;
2794+ size_t mblength;
2795+ mbstate_t state;
2796+
2797+ memset (&state, '\0', sizeof(mbstate_t));
2798+
2799+ if (tab_length)
2800+ while (ptr < lim && eword--)
2801+ {
2802+ while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
2803+ {
2804+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2805+ ptr += mblength;
2806+ }
2807+ if (ptr < lim && (eword | echar))
2808+ {
2809+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2810+ ptr += mblength;
2811+ }
2812+ }
2813+ else
2814+ while (ptr < lim && eword--)
2815+ {
2816+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2817+ ptr += mblength;
2818+ if (ptr < lim)
2819+ {
2820+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2821+ ptr += mblength;
2822+ }
2823+ while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
2824+ ptr += mblength;
2825+ }
2826+
2827+
2828+# ifdef POSIX_UNSPECIFIED
2829+ /* Make LIM point to the end of (one byte past) the current field. */
2830+ if (tab_length)
2831+ {
2832+ char *newlim, *p;
2833+
2834+ newlim = NULL;
2835+ for (p = ptr; p < lim;)
2836+ {
2837+ if (memcmp (p, tab, tab_length) == 0)
2838+ {
2839+ newlim = p;
2840+ break;
2841+ }
2842+ GET_BYTELEN_OF_CHAR (lim, p, mblength, state); /* Measure the character at P, the scan position. */
2843+ p += mblength;
2844+ }
2845+ if (newlim) lim = newlim; /* Shrink LIM only when a delimiter was found. */
2846+ }
2847+ else
2848+ {
2849+ char *newlim;
2850+ newlim = ptr;
2851+
2852+ while (newlim < lim && ismbblank (newlim, lim - newlim, &mblength))
2853+ newlim += mblength;
2854+ if (newlim < lim) /* Advance the scan pointer, not PTR. */
2855+ {
2856+ GET_BYTELEN_OF_CHAR (lim, newlim, mblength, state);
2857+ newlim += mblength;
2858+ }
2859+ while (newlim < lim && !ismbblank (newlim, lim - newlim, &mblength))
2860+ newlim += mblength;
2861+ lim = newlim;
2862+ }
2863+# endif
2864+
2865+ /* If we're skipping leading blanks, don't start counting characters
2866+ * until after skipping past any leading blanks. NOTE(review): this tests skipsblanks; confirm against limfield_uni that this is the intended flag. */
2867+ if (key->skipsblanks)
2868+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2869+ ptr += mblength;
2870+
2871+ memset (&state, '\0', sizeof(mbstate_t));
2872+
2873+ /* Advance PTR by ECHAR (if possible), but no further than LIM. */
2874+ for (i = 0; i < echar; i++)
2875+ {
2876+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2877+
2878+ if (ptr + mblength > lim)
2879+ break;
2880+ else
2881+ ptr += mblength;
2882+ }
2883+
2884+ return ptr;
2885+}
2886+#endif
2887+
2888 /* Fill BUF reading from FP, moving buf->left bytes from the end
2889 of buf->buf to the beginning first. If EOF is reached and the
2890 file wasn't terminated by a newline, supply one. Set up BUF's line
2891@@ -1482,8 +1769,24 @@
2892 else
2893 {
2894 if (key->skipsblanks)
2895- while (blanks[to_uchar (*line_start)])
2896- line_start++;
2897+ {
2898+#if HAVE_MBRTOWC
2899+ if (MB_CUR_MAX > 1)
2900+ {
2901+ size_t mblength;
2902+ mbstate_t state;
2903+ memset (&state, '\0', sizeof(mbstate_t));
2904+ while (line_start < line->keylim &&
2905+ ismbblank (line_start,
2906+ line->keylim - line_start,
2907+ &mblength))
2908+ line_start += mblength;
2909+ }
2910+ else
2911+#endif
2912+ while (blanks[to_uchar (*line_start)])
2913+ line_start++;
2914+ }
2915 line->keybeg = line_start;
2916 }
2917 }
2918@@ -1521,7 +1824,7 @@
2919 hideously fast. */
2920
2921 static int
2922-numcompare (const char *a, const char *b)
2923+numcompare_uni (const char *a, const char *b)
2924 {
2925 while (blanks[to_uchar (*a)])
2926 a++;
2927@@ -1531,6 +1834,25 @@
2928 return strnumcmp (a, b, decimal_point, thousands_sep);
2929 }
2930
2931+#if HAVE_MBRTOWC
2932+static int
2933+numcompare_mb (const char *a, const char *b)
2934+{ /* Multibyte numcompare: skip leading blanks in the NUL-terminated strings A and B, then compare them with strnumcmp. */
2935+ size_t mblength, len;
2936+ len = strlen (a); /* okay for UTF-8 */
2937+ while (*a && ismbblank (a, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength))
2938+ {
2939+ a += mblength;
2940+ len -= mblength;
2941+ }
2942+ len = strlen (b); /* okay for UTF-8 */
2943+ while (*b && ismbblank (b, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength))
2944+ b += mblength, len -= mblength; /* Keep LEN in step with B, as is done for A. */
2945+
2946+ return strnumcmp (a, b, decimal_point, thousands_sep);
2947+}
2948+#endif /* HAVE_MBRTOWC */
2949+
2950 static int
2951 general_numcompare (const char *sa, const char *sb)
2952 {
2953@@ -1564,7 +1886,7 @@
2954 Return 0 if the name in S is not recognized. */
2955
2956 static int
2957-getmonth (char const *month, size_t len)
2958+getmonth_uni (char const *month, size_t len)
2959 {
2960 size_t lo = 0;
2961 size_t hi = MONTHS_PER_YEAR;
2962@@ -1719,11 +2041,79 @@
2963 return diff;
2964 }
2965
2966+#if HAVE_MBRTOWC
2967+static int
2968+getmonth_mb (const char *s, size_t len)
2969+{ /* Multibyte getmonth: return the monthtab value whose name is a prefix of the upper-cased, blank-stripped text S (LEN bytes), or 0 if none matches or S cannot be converted. */
2970+ char *month;
2971+ register size_t i;
2972+ register int lo = 0, hi = MONTHS_PER_YEAR, result;
2973+ char *tmp;
2974+ size_t wclength, mblength;
2975+ const char **pp;
2976+ const wchar_t **wpp;
2977+ wchar_t *month_wcs;
2978+ mbstate_t state;
2979+
2980+ while (len > 0 && ismbblank (s, len, &mblength))
2981+ {
2982+ s += mblength;
2983+ len -= mblength;
2984+ }
2985+
2986+ if (len == 0)
2987+ return 0;
2988+
2989+ month = (char *) alloca (len + 1);
2990+
2991+ tmp = (char *) alloca (len + 1);
2992+ memcpy (tmp, s, len);
2993+ tmp[len] = '\0';
2994+ pp = (const char **)&tmp;
2995+ month_wcs = (wchar_t *) alloca ((len + 1) * sizeof (wchar_t));
2996+ memset (&state, '\0', sizeof(mbstate_t));
2997+
2998+ wclength = mbsrtowcs (month_wcs, pp, len + 1, &state);
2999+ if (wclength == (size_t)-1 || *pp != NULL) return 0; /* Undecodable input is not a month; do not abort on bad data. */
3000+
3001+ for (i = 0; i < wclength; i++) /* Upper-case, and cut the text at the first blank. */
3002+ {
3003+ month_wcs[i] = towupper(month_wcs[i]);
3004+ if (iswblank (month_wcs[i]))
3005+ {
3006+ month_wcs[i] = L'\0';
3007+ break;
3008+ }
3009+ }
3010+
3011+ wpp = (const wchar_t **)&month_wcs;
3012+
3013+ mblength = wcsrtombs (month, wpp, len + 1, &state);
3014+ if (mblength == (size_t)-1 || *wpp != NULL) return 0; /* Unconvertible, or the upper-cased text did not fit in MONTH (so it is unterminated). */
3015+
3016+ do /* Binary search over monthtab. */
3017+ {
3018+ int ix = (lo + hi) / 2;
3019+
3020+ if (strncmp (month, monthtab[ix].name, strlen (monthtab[ix].name)) < 0)
3021+ hi = ix;
3022+ else
3023+ lo = ix;
3024+ }
3025+ while (hi - lo > 1);
3026+
3027+ result = (!strncmp (month, monthtab[lo].name, strlen (monthtab[lo].name))
3028+ ? monthtab[lo].val : 0);
3029+
3030+ return result;
3031+}
3032+#endif
3033+
3034 /* Compare two lines A and B trying every key in sequence until there
3035 are no more keys or a difference is found. */
3036
3037 static int
3038-keycompare (const struct line *a, const struct line *b)
3039+keycompare_uni (const struct line *a, const struct line *b)
3040 {
3041 struct keyfield const *key = keylist;
3042
3043@@ -1896,6 +2286,179 @@
3044 return key->reverse ? -diff : diff;
3045 }
3046
3047+#if HAVE_MBRTOWC
3048+static int
3049+keycompare_mb (const struct line *a, const struct line *b)
3050+{ /* Multibyte keycompare: compare lines A and B key by key; return 0 if all keys compare equal, else the (possibly reversed) difference for the first key that differs. */
3051+ struct keyfield *key = keylist;
3052+
3053+ /* For the first iteration only, the key positions have been
3054+ precomputed for us. */
3055+ char *texta = a->keybeg;
3056+ char *textb = b->keybeg;
3057+ char *lima = a->keylim;
3058+ char *limb = b->keylim;
3059+
3060+ size_t mblength_a, mblength_b;
3061+ wchar_t wc_a, wc_b;
3062+ mbstate_t state_a, state_b;
3063+
3064+ int diff;
3065+
3066+ memset (&state_a, '\0', sizeof(mbstate_t));
3067+ memset (&state_b, '\0', sizeof(mbstate_t));
3068+
3069+ for (;;)
3070+ {
3071+ unsigned char *translate = (unsigned char *) key->translate;
3072+ bool const *ignore = key->ignore;
3073+
3074+ /* Find the lengths. */
3075+ size_t lena = lima <= texta ? 0 : lima - texta;
3076+ size_t lenb = limb <= textb ? 0 : limb - textb;
3077+
3078+ /* Actually compare the fields. */
3079+ if (key->random)
3080+ diff = compare_random (texta, lena, textb, lenb);
3081+ else if (key->numeric | key->general_numeric)
3082+ {
3083+ char savea = *lima, saveb = *limb;
3084+
3085+ *lima = *limb = '\0';
3086+ if (force_general_numcompare)
3087+ diff = general_numcompare (texta, textb);
3088+ else
3089+ diff = ((key->numeric ? numcompare : general_numcompare)
3090+ (texta, textb));
3091+ *lima = savea, *limb = saveb;
3092+ }
3093+ else if (key->month)
3094+ diff = getmonth (texta, lena) - getmonth (textb, lenb);
3095+ else
3096+ {
3097+ if (ignore || translate)
3098+ {
3099+ char *copy_a = (char *) alloca ((lena + lenb + 2) * (translate ? MB_CUR_MAX : 1)); /* Case translation can lengthen a character's encoding. */
3100+ char *copy_b = copy_a + lena * (translate ? MB_CUR_MAX : 1) + 1;
3101+ size_t new_len_a, new_len_b;
3102+ size_t i, j;
3103+
3104+ /* Ignore and/or translate chars before comparing. */
3105+# define IGNORE_CHARS(NEW_LEN, LEN, TEXT, COPY, WC, MBLENGTH, STATE) \
3106+ do \
3107+ { \
3108+ wchar_t uwc; \
3109+ char mbc[MB_LEN_MAX]; \
3110+ mbstate_t state_wc; \
3111+ \
3112+ for (NEW_LEN = i = 0; i < LEN;) \
3113+ { \
3114+ mbstate_t state_bak; \
3115+ \
3116+ state_bak = STATE; \
3117+ MBLENGTH = mbrtowc (&WC, TEXT + i, LEN - i, &STATE); \
3118+ \
3119+ if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1 \
3120+ || MBLENGTH == 0) \
3121+ { \
3122+ if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1) \
3123+ STATE = state_bak; \
3124+ if (!ignore) COPY[NEW_LEN++] = TEXT[i]; \
3125+ i++; /* Always consume the byte; otherwise an ignored one loops forever. */ \
3126+ continue; \
3127+ } \
3128+ \
3129+ if (ignore) \
3130+ { \
3131+ if ((ignore == nonprinting && !iswprint (WC)) \
3132+ || (ignore == nondictionary \
3133+ && !iswalnum (WC) && !iswblank (WC))) \
3134+ { \
3135+ i += MBLENGTH; \
3136+ continue; \
3137+ } \
3138+ } \
3139+ \
3140+ if (translate) \
3141+ { \
3142+ \
3143+ uwc = towupper(WC); \
3144+ if (WC == uwc) \
3145+ { \
3146+ memcpy (mbc, TEXT + i, MBLENGTH); \
3147+ i += MBLENGTH; \
3148+ } \
3149+ else \
3150+ { \
3151+ i += MBLENGTH; \
3152+ WC = uwc; \
3153+ memset (&state_wc, '\0', sizeof (mbstate_t)); \
3154+ \
3155+ MBLENGTH = wcrtomb (mbc, WC, &state_wc); \
3156+ assert (MBLENGTH != (size_t)-1 && MBLENGTH != 0); \
3157+ } \
3158+ \
3159+ for (j = 0; j < MBLENGTH; j++) \
3160+ COPY[NEW_LEN++] = mbc[j]; \
3161+ } \
3162+ else \
3163+ for (j = 0; j < MBLENGTH; j++) \
3164+ COPY[NEW_LEN++] = TEXT[i++]; \
3165+ } \
3166+ COPY[NEW_LEN] = '\0'; \
3167+ } \
3168+ while (0)
3169+ IGNORE_CHARS (new_len_a, lena, texta, copy_a,
3170+ wc_a, mblength_a, state_a);
3171+ IGNORE_CHARS (new_len_b, lenb, textb, copy_b,
3172+ wc_b, mblength_b, state_b);
3173+ diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
3174+ }
3175+ else if (lena == 0)
3176+ diff = - NONZERO (lenb);
3177+ else if (lenb == 0)
3178+ goto greater;
3179+ else
3180+ diff = xmemcoll (texta, lena, textb, lenb);
3181+ }
3182+
3183+ if (diff)
3184+ goto not_equal;
3185+
3186+ key = key->next;
3187+ if (! key)
3188+ break;
3189+
3190+ /* Find the beginning and limit of the next field. */
3191+ if (key->eword != -1)
3192+ lima = limfield (a, key), limb = limfield (b, key);
3193+ else
3194+ lima = a->text + a->length - 1, limb = b->text + b->length - 1;
3195+
3196+ if (key->sword != -1)
3197+ texta = begfield (a, key), textb = begfield (b, key);
3198+ else
3199+ {
3200+ texta = a->text, textb = b->text;
3201+ if (key->skipsblanks)
3202+ {
3203+ while (texta < lima && ismbblank (texta, lima - texta, &mblength_a))
3204+ texta += mblength_a;
3205+ while (textb < limb && ismbblank (textb, limb - textb, &mblength_b))
3206+ textb += mblength_b;
3207+ }
3208+ }
3209+ }
3210+
3211+ return 0;
3212+
3213+greater:
3214+ diff = 1;
3215+not_equal:
3216+ return key->reverse ? -diff : diff;
3217+}
3218+#endif
3219+
3220 /* Compare two lines A and B, returning negative, zero, or positive
3221 depending on whether A compares less than, equal to, or greater than B. */
3222
3223@@ -2765,7 +3328,7 @@
3224 initialize_exit_failure (SORT_FAILURE);
3225
3226 hard_LC_COLLATE = hard_locale (LC_COLLATE);
3227-#if HAVE_NL_LANGINFO
3228+#if HAVE_LANGINFO_CODESET
3229 hard_LC_TIME = hard_locale (LC_TIME);
3230 #endif
3231
3232@@ -2786,6 +3349,27 @@
3233 thousands_sep = -1;
3234 }
3235
3236+#if HAVE_MBRTOWC
3237+ if (MB_CUR_MAX > 1)
3238+ {
3239+ inittables = inittables_mb;
3240+ begfield = begfield_mb;
3241+ limfield = limfield_mb;
3242+ getmonth = getmonth_mb;
3243+ keycompare = keycompare_mb;
3244+ numcompare = numcompare_mb;
3245+ }
3246+ else
3247+#endif
3248+ {
3249+ inittables = inittables_uni;
3250+ begfield = begfield_uni;
3251+ limfield = limfield_uni;
3252+ getmonth = getmonth_uni;
3253+ keycompare = keycompare_uni;
3254+ numcompare = numcompare_uni;
3255+ }
3256+
3257 have_read_stdin = false;
3258 inittables ();
3259
3260@@ -3037,13 +3621,35 @@
3261
3262 case 't':
3263 {
3264- char newtab = optarg[0];
3265- if (! newtab)
3266+ char newtab[MB_LEN_MAX + 1];
3267+ size_t newtab_length = 1;
3268+ strncpy (newtab, optarg, MB_LEN_MAX);
3269+ if (! newtab[0])
3270 error (SORT_FAILURE, 0, _("empty tab"));
3271- if (optarg[1])
3272+#if HAVE_MBRTOWC
3273+ if (MB_CUR_MAX > 1)
3274+ {
3275+ wchar_t wc;
3276+ mbstate_t state;
3277+ size_t i;
3278+
3279+ memset (&state, '\0', sizeof (mbstate_t));
3280+ newtab_length = mbrtowc (&wc, newtab, strnlen (newtab,
3281+ MB_LEN_MAX),
3282+ &state);
3283+ switch (newtab_length)
3284+ {
3285+ case (size_t) -1:
3286+ case (size_t) -2:
3287+ case 0:
3288+ newtab_length = 1;
3289+ }
3290+ }
3291+#endif
3292+ if (newtab_length == 1 && optarg[1])
3293 {
3294 if (STREQ (optarg, "\\0"))
3295- newtab = '\0';
3296+ newtab[0] = '\0';
3297 else
3298 {
3299 /* Provoke with `sort -txx'. Complain about
3300@@ -3054,9 +3660,12 @@
3301 quote (optarg));
3302 }
3303 }
3304- if (tab != TAB_DEFAULT && tab != newtab)
3305+ if (tab_length
3306+ && (tab_length != newtab_length
3307+ || memcmp (tab, newtab, tab_length) != 0))
3308 error (SORT_FAILURE, 0, _("incompatible tabs"));
3309- tab = newtab;
3310+ memcpy (tab, newtab, newtab_length);
3311+ tab_length = newtab_length;
3312 }
3313 break;
3314
3315diff -Naur coreutils-6.12.orig/src/unexpand.c coreutils-6.12/src/unexpand.c
3316--- coreutils-6.12.orig/src/unexpand.c 2008-05-25 23:40:33.000000000 -0700
3317+++ coreutils-6.12/src/unexpand.c 2009-01-08 12:56:50.000000000 -0800
3318@@ -38,11 +38,28 @@
3319 #include <stdio.h>
3320 #include <getopt.h>
3321 #include <sys/types.h>
3322+
3323+/* Get mbstate_t, mbrtowc(), wcwidth(). */
3324+#if HAVE_WCHAR_H
3325+# include <wchar.h>
3326+#endif
3327+
3328 #include "system.h"
3329 #include "error.h"
3330 #include "quote.h"
3331 #include "xstrndup.h"
3332
3333+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
3334+ installation; work around this configuration error. */
3335+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
3336+# define MB_LEN_MAX 16
3337+#endif
3338+
3339+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
3340+#if HAVE_MBRTOWC && defined mbstate_t
3341+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
3342+#endif
3343+
3344 /* The official name of this program (e.g., no `g' prefix). */
3345 #define PROGRAM_NAME "unexpand"
3346
3347@@ -109,6 +126,208 @@
3348 {NULL, 0, NULL, 0}
3349 };
3350
3351+static FILE *next_file (FILE *fp);
3352+
3353+#if HAVE_MBRTOWC
3354+static void
3355+unexpand_multibyte (void)
3356+{ /* Multibyte unexpand: copy every input file to stdout, replacing runs of blanks with tabs where tab stops allow, measuring columns with wcwidth. */
3357+ FILE *fp; /* Input stream. */
3358+ mbstate_t i_state; /* Current shift state of the input stream. */
3359+ mbstate_t i_state_bak; /* Back up the I_STATE. */
3360+ mbstate_t o_state; /* Current shift state of the output stream. */
3361+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
3362+ char *bufpos = buf; /* Next read position of BUF. */
3363+ size_t buflen = 0; /* The length of the byte sequence in buf. */
3364+ wint_t wc; /* A gotten wide character. */
3365+ size_t mblength; /* The byte size of a multibyte character
3366+ which shows as same character as WC. */
3367+
3368+ /* Index in `tab_list' of next tabstop: */
3369+ int tab_index = 0; /* For calculating width of pending tabs. */
3370+ int print_tab_index = 0; /* For printing as many tabs as possible. */
3371+ unsigned int column = 0; /* Column on screen of next char. */
3372+ int next_tab_column; /* Column the next tab stop is on. */
3373+ int convert = 1; /* If nonzero, perform translations. */
3374+ unsigned int pending = 0; /* Pending columns of blanks. */
3375+
3376+ fp = next_file ((FILE *) NULL);
3377+ if (fp == NULL)
3378+ return;
3379+
3380+ memset (&o_state, '\0', sizeof(mbstate_t));
3381+ memset (&i_state, '\0', sizeof(mbstate_t));
3382+
3383+ for (;;)
3384+ {
3385+ if (buflen < MB_LEN_MAX && !feof(fp) && !ferror(fp))
3386+ {
3387+ memmove (buf, bufpos, buflen); /* Slide the unread tail to the front before refilling. */
3388+ buflen += fread (buf + buflen, sizeof(char), BUFSIZ, fp);
3389+ bufpos = buf;
3390+ }
3391+
3392+ /* Get a wide character. */
3393+ if (buflen < 1)
3394+ {
3395+ mblength = 1;
3396+ wc = WEOF;
3397+ }
3398+ else
3399+ {
3400+ i_state_bak = i_state;
3401+ mblength = mbrtowc ((wchar_t *)&wc, bufpos, buflen, &i_state);
3402+ }
3403+
3404+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
3405+ {
3406+ i_state = i_state_bak;
3407+ wc = L'\0';
3408+ }
3409+
3410+ if (wc == L' ' && convert && column < INT_MAX)
3411+ {
3412+ ++pending;
3413+ ++column;
3414+ }
3415+ else if (wc == L'\t' && convert)
3416+ {
3417+ if (tab_size == 0)
3418+ {
3419+ /* Do not let tab_index == first_free_tab;
3420+ stop when it is 1 less. */
3421+ while (tab_index < first_free_tab - 1
3422+ && column >= tab_list[tab_index])
3423+ tab_index++;
3424+ next_tab_column = tab_list[tab_index];
3425+ if (tab_index < first_free_tab - 1)
3426+ tab_index++;
3427+ if (column >= next_tab_column)
3428+ {
3429+ convert = 0; /* Ran out of tab stops. */
3430+ goto flush_pend_mb;
3431+ }
3432+ }
3433+ else
3434+ {
3435+ next_tab_column = column + tab_size - column % tab_size;
3436+ }
3437+ pending += next_tab_column - column;
3438+ column = next_tab_column;
3439+ }
3440+ else
3441+ {
3442+flush_pend_mb:
3443+ /* Flush pending spaces. Print as many tabs as possible,
3444+ then print the rest as spaces. */
3445+ if (pending == 1)
3446+ {
3447+ putchar (' ');
3448+ pending = 0;
3449+ }
3450+ column -= pending;
3451+ while (pending > 0)
3452+ {
3453+ if (tab_size == 0)
3454+ {
3455+ /* Do not let print_tab_index == first_free_tab;
3456+ stop when it is 1 less. */
3457+ while (print_tab_index < first_free_tab - 1
3458+ && column >= tab_list[print_tab_index])
3459+ print_tab_index++;
3460+ next_tab_column = tab_list[print_tab_index];
3461+ if (print_tab_index < first_free_tab - 1)
3462+ print_tab_index++;
3463+ }
3464+ else
3465+ {
3466+ next_tab_column =
3467+ column + tab_size - column % tab_size;
3468+ }
3469+ if (next_tab_column - column <= pending)
3470+ {
3471+ putchar ('\t');
3472+ pending -= next_tab_column - column;
3473+ column = next_tab_column;
3474+ }
3475+ else
3476+ {
3477+ --print_tab_index;
3478+ column += pending;
3479+ while (pending != 0)
3480+ {
3481+ putchar (' ');
3482+ pending--;
3483+ }
3484+ }
3485+ }
3486+
3487+ if (wc == WEOF)
3488+ {
3489+ fp = next_file (fp);
3490+ if (fp == NULL)
3491+ break; /* No more files. */
3492+ else
3493+ {
3494+ memset (&i_state, '\0', sizeof(mbstate_t));
3495+ continue;
3496+ }
3497+ }
3498+
3499+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
3500+ {
3501+ if (convert)
3502+ {
3503+ ++column;
3504+ if (convert_entire_line == 0)
3505+ convert = 0;
3506+ }
3507+ mblength = 1;
3508+ putchar (*bufpos); /* Echo the undecodable byte at the current read position. */
3509+ }
3510+ else if (mblength == 0)
3511+ {
3512+ if (convert && convert_entire_line == 0)
3513+ convert = 0;
3514+ mblength = 1;
3515+ putchar ('\0');
3516+ }
3517+ else
3518+ {
3519+ if (convert)
3520+ {
3521+ if (wc == L'\b')
3522+ {
3523+ if (column > 0)
3524+ --column;
3525+ }
3526+ else
3527+ {
3528+ int width; /* The width of WC. */
3529+
3530+ width = wcwidth (wc);
3531+ column += (width > 0) ? width : 0;
3532+ if (convert_entire_line == 0)
3533+ convert = 0;
3534+ }
3535+ }
3536+
3537+ if (wc == L'\n')
3538+ {
3539+ tab_index = print_tab_index = 0;
3540+ column = pending = 0;
3541+ convert = 1;
3542+ }
3543+ fwrite (bufpos, sizeof(char), mblength, stdout);
3544+ }
3545+ }
3546+ buflen -= mblength;
3547+ bufpos += mblength;
3548+ }
3549+}
3550+#endif
3551+
3552+
3553 void
3554 usage (int status)
3555 {
3556@@ -530,7 +749,12 @@
3557
3558 file_list = (optind < argc ? &argv[optind] : stdin_argv);
3559
3560- unexpand ();
3561+#if HAVE_MBRTOWC
3562+ if (MB_CUR_MAX > 1)
3563+ unexpand_multibyte ();
3564+ else
3565+#endif
3566+ unexpand ();
3567
3568 if (have_read_stdin && fclose (stdin) != 0)
3569 error (EXIT_FAILURE, errno, "-");
3570diff -Naur coreutils-6.12.orig/src/uniq.c coreutils-6.12/src/uniq.c
3571--- coreutils-6.12.orig/src/uniq.c 2008-05-25 23:40:32.000000000 -0700
3572+++ coreutils-6.12/src/uniq.c 2009-01-08 12:56:50.000000000 -0800
3573@@ -22,6 +22,16 @@
3574 #include <getopt.h>
3575 #include <sys/types.h>
3576
3577+/* Get mbstate_t, mbrtowc(). */
3578+#if HAVE_WCHAR_H
3579+# include <wchar.h>
3580+#endif
3581+
3582+/* Get isw* functions. */
3583+#if HAVE_WCTYPE_H
3584+# include <wctype.h>
3585+#endif
3586+
3587 #include "system.h"
3588 #include "argmatch.h"
3589 #include "linebuffer.h"
3590@@ -31,7 +41,19 @@
3591 #include "quote.h"
3592 #include "xmemcoll.h"
3593 #include "xstrtol.h"
3594-#include "memcasecmp.h"
3595+#include "xmemcoll.h"
3596+
3597+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
3598+ installation; work around this configuration error. */
3599+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
3600+# define MB_LEN_MAX 16
3601+#endif
3602+
3603+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
3604+#if HAVE_MBRTOWC && defined mbstate_t
3605+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
3606+#endif
3607+
3608
3609 /* The official name of this program (e.g., no `g' prefix). */
3610 #define PROGRAM_NAME "uniq"
3611@@ -110,6 +132,10 @@
3612 /* Select whether/how to delimit groups of duplicate lines. */
3613 static enum delimit_method delimit_groups;
3614
3615+/* Function pointers. */
3616+static char *
3617+(*find_field) (struct linebuffer *line);
3618+
3619 static struct option const longopts[] =
3620 {
3621 {"count", no_argument, NULL, 'c'},
3622@@ -206,7 +232,7 @@
3623 return a pointer to the beginning of the line's field to be compared. */
3624
3625 static char *
3626-find_field (const struct linebuffer *line)
3627+find_field_uni (struct linebuffer *line)
3628 {
3629 size_t count;
3630 char *lp = line->buffer;
3631@@ -227,6 +253,83 @@
3632 return lp + i;
3633 }
3634
3635+#if HAVE_MBRTOWC
3636+/* Decode one multibyte character at LP + POS (data ends at LP + SIZE) into WC; MBLENGTH gets the bytes to advance, CONVFAIL is nonzero on a decoding error. */
3637+# define MBCHAR_TO_WCHAR(WC, MBLENGTH, LP, POS, SIZE, STATEP, CONVFAIL) \
3638+ do \
3639+ { \
3640+ mbstate_t state_bak; \
3641+ /* CONVFAIL is reset on every expansion. */ \
3642+ CONVFAIL = 0; \
3643+ state_bak = *STATEP; \
3644+ /* MBLENGTH first holds mbrtowc's raw result; it is normalised below. */ \
3645+ MBLENGTH = mbrtowc (&WC, LP + POS, SIZE - POS, STATEP); \
3646+ \
3647+ switch (MBLENGTH) \
3648+ { \
3649+ case (size_t)-2: \
3650+ case (size_t)-1: \
3651+ *STATEP = state_bak; /* roll back the state touched by the failed decode */ \
3652+ CONVFAIL++; \
3653+ /* Fall through */ \
3654+ case 0: \
3655+ MBLENGTH = 1; /* errors and a decoded NUL both advance by one byte */ \
3656+ } \
3657+ } \
3658+ while (0)
3659+/* Multibyte counterpart of find_field_uni: return a pointer to the start of LINE's field to be compared. */
3660+static char *
3661+find_field_multi (struct linebuffer *line)
3662+{
3663+ size_t count;
3664+ char *lp = line->buffer;
3665+ size_t size = line->length - 1; /* presumably drops the trailing delimiter -- confirm against readlinebuffer_delim */
3666+ size_t pos; /* byte offset into lp */
3667+ size_t mblength; /* bytes consumed by the last decoded character */
3668+ wchar_t wc;
3669+ mbstate_t *statep;
3670+ int convfail;
3671+
3672+ pos = 0;
3673+ statep = &(line->state);
3674+
3675+ /* Skip the first skip_fields fields. */
3676+ for (count = 0; count < skip_fields && pos < size; count++)
3677+ {
3678+ while (pos < size) /* consume blanks, then the first non-blank or undecodable character */
3679+ {
3680+ MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
3681+
3682+ if (convfail || !iswblank (wc))
3683+ {
3684+ pos += mblength;
3685+ break;
3686+ }
3687+ pos += mblength;
3688+ }
3689+
3690+ while (pos < size) /* consume the rest of the field; stop in front of the next blank */
3691+ {
3692+ MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
3693+
3694+ if (!convfail && iswblank (wc))
3695+ break;
3696+
3697+ pos += mblength;
3698+ }
3699+ }
3700+
3701+ /* Skip the first skip_chars characters (not bytes) after the skipped fields. */
3702+ for (count = 0; count < skip_chars && pos < size; count++)
3703+ {
3704+ MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
3705+ pos += mblength;
3706+ }
3707+
3708+ return lp + pos;
3709+}
3710+#endif
3711+
3712 /* Return false if two strings OLD and NEW match, true if not.
3713 OLD and NEW point not to the beginnings of the lines
3714 but rather to the beginnings of the fields to compare.
3715@@ -235,6 +338,8 @@
3716 static bool
3717 different (char *old, char *new, size_t oldlen, size_t newlen)
3718 {
3719+ char *copy_old, *copy_new; /* the buffers actually handed to xmemcoll */
3720+
3721 if (check_chars < oldlen)
3722 oldlen = check_chars;
3723 if (check_chars < newlen)
3724@@ -242,14 +347,92 @@
3725
3726 if (ignore_case)
3727 {
3728- /* FIXME: This should invoke strcoll somehow. */
3729- return oldlen != newlen || memcasecmp (old, new, oldlen);
3730+ size_t i;
3731+ /* Size and fill each copy from its own length: OLDLEN and NEWLEN may differ. */
3732+ copy_old = alloca (oldlen + 1);
3733+ copy_new = alloca (newlen + 1);
3734+
3735+ for (i = 0; i < oldlen; i++)
3736+ copy_old[i] = toupper ((unsigned char) old[i]);
3737+
3738+ for (i = 0; i < newlen; i++)
3739+ copy_new[i] = toupper ((unsigned char) new[i]);
3740 }
3741- else if (hard_LC_COLLATE)
3742- return xmemcoll (old, oldlen, new, newlen) != 0;
3743 else
3744- return oldlen != newlen || memcmp (old, new, oldlen);
3745+ {
3746+ copy_old = (char *)old;
3747+ copy_new = (char *)new;
3748+ }
3749+
3750+ return xmemcoll (copy_old, oldlen, copy_new, newlen);
3751+}
3752+
3753+#if HAVE_MBRTOWC
3754+static int
3755+different_multi (const char *old, const char *new, size_t oldlen, size_t newlen, mbstate_t oldstate, mbstate_t newstate)
3756+{
3757+ size_t i, j, chars;
3758+ const char *str[2];
3759+ char *copy[2];
3760+ size_t len[2];
3761+ mbstate_t state[2];
3762+ size_t mblength;
3763+ wchar_t wc, uwc;
3764+ mbstate_t state_bak;
3765+ /* Index 0 is OLD, index 1 is NEW; returns xmemcoll's result (callers treat nonzero as "different"). */
3766+ str[0] = old;
3767+ str[1] = new;
3768+ len[0] = oldlen;
3769+ len[1] = newlen;
3770+ state[0] = oldstate;
3771+ state[1] = newstate;
3772+
3773+ for (i = 0; i < 2; i++)
3774+ {
3775+ copy[i] = alloca (len[i] + 1);
3776+ /* Copy at most check_chars characters, upper-casing them when ignore_case is set. */
3777+ for (j = 0, chars = 0; j < len[i] && chars < check_chars; chars++)
3778+ {
3779+ state_bak = state[i];
3780+ mblength = mbrtowc (&wc, str[i] + j, len[i] - j, &(state[i]));
3781+
3782+ switch (mblength)
3783+ {
3784+ case (size_t)-1:
3785+ case (size_t)-2:
3786+ state[i] = state_bak;
3787+ /* Fall through */
3788+ case 0:
3789+ mblength = 1;
3790+ copy[i][j] = str[i][j]; /* keep the raw byte so copy[] is never left unset */
3791+ break;
3792+
3793+ default:
3794+ memcpy (copy[i] + j, str[i] + j, mblength);
3795+ if (ignore_case)
3796+ {
3797+ uwc = towupper (wc);
3798+ if (uwc != wc)
3799+ {
3800+ mbstate_t state_wc;
3801+ char upper[MB_LEN_MAX];
3802+
3803+ memset (&state_wc, '\0', sizeof(mbstate_t));
3804+ /* Fold in place only when the upper-case form has the same byte length; otherwise keep the original bytes. */
3805+ if (wcrtomb (upper, uwc, &state_wc) == mblength)
3806+ memcpy (copy[i] + j, upper, mblength);
3807+ }
3808+ }
3809+ }
3810+ j += mblength;
3811+ }
3812+ copy[i][j] = '\0';
3813+ len[i] = j;
3814+ }
3815+
3816+ return xmemcoll (copy[0], len[0], copy[1], len[1]);
3817 }
3818+#endif
3819
3820 /* Output the line in linebuffer LINE to standard output
3821 provided that the switches say it should be output.
3822@@ -303,15 +486,43 @@
3823 {
3824 char *prevfield IF_LINT (= NULL);
3825 size_t prevlen IF_LINT (= 0);
3826+#if HAVE_MBRTOWC
3827+ mbstate_t prevstate;
3828+
3829+ memset (&prevstate, '\0', sizeof (mbstate_t));
3830+#endif
3831
3832 while (!feof (stdin))
3833 {
3834 char *thisfield;
3835 size_t thislen;
3836+#if HAVE_MBRTOWC
3837+ mbstate_t thisstate;
3838+#endif
3839+
3840 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
3841 break;
3842 thisfield = find_field (thisline);
3843 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
3844+#if HAVE_MBRTOWC
3845+ if (MB_CUR_MAX > 1)
3846+ {
3847+ thisstate = thisline->state;
3848+
3849+ if (prevline->length == 0 || different_multi
3850+ (thisfield, prevfield, thislen, prevlen, thisstate, prevstate))
3851+ {
3852+ fwrite (thisline->buffer, sizeof (char),
3853+ thisline->length, stdout);
3854+
3855+ SWAP_LINES (prevline, thisline);
3856+ prevfield = thisfield;
3857+ prevlen = thislen;
3858+ prevstate = thisstate;
3859+ }
3860+ }
3861+ else
3862+#endif
3863 if (prevline->length == 0
3864 || different (thisfield, prevfield, thislen, prevlen))
3865 {
3866@@ -330,17 +541,26 @@
3867 size_t prevlen;
3868 uintmax_t match_count = 0;
3869 bool first_delimiter = true;
3870+#if HAVE_MBRTOWC
3871+ mbstate_t prevstate;
3872+#endif
3873
3874 if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
3875 goto closefiles;
3876 prevfield = find_field (prevline);
3877 prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
3878+#if HAVE_MBRTOWC
3879+ prevstate = prevline->state;
3880+#endif
3881
3882 while (!feof (stdin))
3883 {
3884 bool match;
3885 char *thisfield;
3886 size_t thislen;
3887+#if HAVE_MBRTOWC
3888+ mbstate_t thisstate;
3889+#endif
3890 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
3891 {
3892 if (ferror (stdin))
3893@@ -349,6 +569,15 @@
3894 }
3895 thisfield = find_field (thisline);
3896 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
3897+#if HAVE_MBRTOWC
3898+ if (MB_CUR_MAX > 1)
3899+ {
3900+ thisstate = thisline->state;
3901+ match = !different_multi (thisfield, prevfield,
3902+ thislen, prevlen, thisstate, prevstate);
3903+ }
3904+ else
3905+#endif
3906 match = !different (thisfield, prevfield, thislen, prevlen);
3907 match_count += match;
3908
3909@@ -381,6 +610,9 @@
3910 SWAP_LINES (prevline, thisline);
3911 prevfield = thisfield;
3912 prevlen = thislen;
3913+#if HAVE_MBRTOWC
3914+ prevstate = thisstate;
3915+#endif
3916 if (!match)
3917 match_count = 0;
3918 }
3919@@ -426,6 +658,19 @@
3920
3921 atexit (close_stdout);
3922
3923+#if HAVE_MBRTOWC
3924+ if (MB_CUR_MAX > 1)
3925+ {
3926+ find_field = find_field_multi;
3927+ }
3928+ else
3929+#endif
3930+ {
3931+ find_field = find_field_uni;
3932+ }
3933+
3934+
3935+
3936 skip_chars = 0;
3937 skip_fields = 0;
3938 check_chars = SIZE_MAX;
3939diff -Naur coreutils-6.12.orig/tests/Makefile.am coreutils-6.12/tests/Makefile.am
3940--- coreutils-6.12.orig/tests/Makefile.am 2008-05-27 04:47:53.000000000 -0700
3941+++ coreutils-6.12/tests/Makefile.am 2009-01-08 12:56:50.000000000 -0800
3942@@ -191,6 +191,7 @@
3943 misc/shuf \
3944 misc/sort \
3945 misc/sort-compress \
3946+ misc/sort-mb-tests \
3947 misc/sort-merge \
3948 misc/sort-rand \
3949 misc/split-a \
3950@@ -391,6 +392,10 @@
3951 $(root_tests)
3952
3953 pr_data = \
3954+ misc/mb1.X \
3955+ misc/mb1.I \
3956+ misc/mb2.X \
3957+ misc/mb2.I \
3958 pr/0F \
3959 pr/0FF \
3960 pr/0FFnt \
3961diff -Naur coreutils-6.12.orig/tests/misc/cut coreutils-6.12/tests/misc/cut
3962--- coreutils-6.12.orig/tests/misc/cut 2008-05-16 23:41:11.000000000 -0700
3963+++ coreutils-6.12/tests/misc/cut 2009-01-08 12:56:50.000000000 -0800
3964@@ -26,7 +26,7 @@
3965 my $prog = 'cut';
3966 my $try = "Try \`$prog --help' for more information.\n";
3967 my $from_1 = "$prog: fields and positions are numbered from 1\n$try";
3968-my $inval = "$prog: invalid byte or field list\n$try";
3969+my $inval = "$prog: invalid byte, character or field list\n$try";
3970 my $no_endpoint = "$prog: invalid range with no endpoint: -\n$try";
3971
3972 my @Tests =
3973@@ -141,7 +141,7 @@
3974
3975 # None of the following invalid ranges provoked an error up to coreutils-6.9.
3976 ['inval1', qw(-f 2-0), {IN=>''}, {OUT=>''}, {EXIT=>1},
3977- {ERR=>"$prog: invalid decreasing range\n$try"}],
3978+ {ERR=>"$inval"}],
3979 ['inval2', qw(-f -), {IN=>''}, {OUT=>''}, {EXIT=>1}, {ERR=>$no_endpoint}],
3980 ['inval3', '-f', '4,-', {IN=>''}, {OUT=>''}, {EXIT=>1}, {ERR=>$no_endpoint}],
3981 ['inval4', '-f', '1-2,-', {IN=>''}, {OUT=>''}, {EXIT=>1}, {ERR=>$no_endpoint}],
3982diff -Naur coreutils-6.12.orig/tests/misc/mb1.I coreutils-6.12/tests/misc/mb1.I
3983--- coreutils-6.12.orig/tests/misc/mb1.I 1969-12-31 16:00:00.000000000 -0800
3984+++ coreutils-6.12/tests/misc/mb1.I 2009-01-08 12:56:50.000000000 -0800
3985@@ -0,0 +1,4 @@
3986+Apple10
3987+Banana5
3988+Citrus20
3989+Cherry30
3990diff -Naur coreutils-6.12.orig/tests/misc/mb1.X coreutils-6.12/tests/misc/mb1.X
3991--- coreutils-6.12.orig/tests/misc/mb1.X 1969-12-31 16:00:00.000000000 -0800
3992+++ coreutils-6.12/tests/misc/mb1.X 2009-01-08 12:56:50.000000000 -0800
3993@@ -0,0 +1,4 @@
3994+Banana5
3995+Apple10
3996+Citrus20
3997+Cherry30
3998diff -Naur coreutils-6.12.orig/tests/misc/mb2.I coreutils-6.12/tests/misc/mb2.I
3999--- coreutils-6.12.orig/tests/misc/mb2.I 1969-12-31 16:00:00.000000000 -0800
4000+++ coreutils-6.12/tests/misc/mb2.I 2009-01-08 12:56:50.000000000 -0800
4001@@ -0,0 +1,4 @@
4002+Apple1020
4003+Banana530
4004+Citrus205
4005+Cherry3010
4006diff -Naur coreutils-6.12.orig/tests/misc/mb2.X coreutils-6.12/tests/misc/mb2.X
4007--- coreutils-6.12.orig/tests/misc/mb2.X 1969-12-31 16:00:00.000000000 -0800
4008+++ coreutils-6.12/tests/misc/mb2.X 2009-01-08 12:56:50.000000000 -0800
4009@@ -0,0 +1,4 @@
4010+Citrus205
4011+Cherry3010
4012+Apple1020
4013+Banana530
4014diff -Naur coreutils-6.12.orig/tests/misc/sort-mb-tests coreutils-6.12/tests/misc/sort-mb-tests
4015--- coreutils-6.12.orig/tests/misc/sort-mb-tests 1969-12-31 16:00:00.000000000 -0800
4016+++ coreutils-6.12/tests/misc/sort-mb-tests 2009-01-08 12:56:50.000000000 -0800
4017@@ -0,0 +1,58 @@
4018+#! /bin/sh
4019+case $# in
4020+ 0) xx='../src/sort';;
4021+ *) xx="$1";;
4022+esac
4023+test "$VERBOSE" && echo=echo || echo=:
4024+$echo testing program: $xx
4025+errors=0
4026+test "$srcdir" || srcdir=.
4027+test "$VERBOSE" && $xx --version 2> /dev/null
4028+
4029+export LC_ALL=en_US.UTF-8
4030+locale -k LC_CTYPE 2>&1 | grep -q charmap.*UTF-8 || exit 77 # 77: automake's "test skipped" status
4031+errors=0
4032+# mb1: numeric sort on field 2, fields separated by the multibyte character U+FF20.
4033+$xx -t ＠ -k2 -n misc/mb1.I > misc/mb1.O
4034+code=$?
4035+if test $code != 0; then
4036+ $echo "Test mb1 failed: $xx return code $code differs from expected value 0" 1>&2
4037+ errors=`expr $errors + 1`
4038+else
4039+ cmp misc/mb1.O $srcdir/misc/mb1.X > /dev/null 2>&1
4040+ case $? in
4041+ 0) if test "$VERBOSE"; then $echo "passed mb1"; fi;;
4042+ 1) $echo "Test mb1 failed: files misc/mb1.O and $srcdir/misc/mb1.X differ" 1>&2
4043+ (diff -c misc/mb1.O $srcdir/misc/mb1.X) 2> /dev/null
4044+ errors=`expr $errors + 1`;;
4045+ 2) $echo "Test mb1 may have failed." 1>&2
4046+ $echo The command "cmp misc/mb1.O $srcdir/misc/mb1.X" failed. 1>&2
4047+ errors=`expr $errors + 1`;;
4048+ esac
4049+fi
4050+# mb2: numeric sort on field 4, same multibyte separator, with an empty third field.
4051+$xx -t ＠ -k4 -n misc/mb2.I > misc/mb2.O
4052+code=$?
4053+if test $code != 0; then
4054+ $echo "Test mb2 failed: $xx return code $code differs from expected value 0" 1>&2
4055+ errors=`expr $errors + 1`
4056+else
4057+ cmp misc/mb2.O $srcdir/misc/mb2.X > /dev/null 2>&1
4058+ case $? in
4059+ 0) if test "$VERBOSE"; then $echo "passed mb2"; fi;;
4060+ 1) $echo "Test mb2 failed: files misc/mb2.O and $srcdir/misc/mb2.X differ" 1>&2
4061+ (diff -c misc/mb2.O $srcdir/misc/mb2.X) 2> /dev/null
4062+ errors=`expr $errors + 1`;;
4063+ 2) $echo "Test mb2 may have failed." 1>&2
4064+ $echo The command "cmp misc/mb2.O $srcdir/misc/mb2.X" failed. 1>&2
4065+ errors=`expr $errors + 1`;;
4066+ esac
4067+fi
4068+
4069+if test $errors = 0; then
4070+ $echo Passed all 2 tests. 1>&2
4071+else
4072+ $echo Failed $errors tests. 1>&2
4073+fi
4074+test $errors = 0 || errors=1
4075+exit $errors
4076
4077
Note: See TracBrowser for help on using the repository browser.