Blame SOURCES/coreutils-i18n-cut-old.patch

e63663
diff --git a/src/cut.c b/src/cut.c
e63663
index 7ab6be4..022d0ad 100644
e63663
--- a/src/cut.c
e63663
+++ b/src/cut.c
e63663
@@ -28,6 +28,11 @@
e63663
 #include <assert.h>
e63663
 #include <getopt.h>
e63663
 #include <sys/types.h>
e63663
+
e63663
+/* Get mbstate_t, mbrtowc().  */
e63663
+#if HAVE_WCHAR_H
e63663
+# include <wchar.h>
e63663
+#endif
e63663
 #include "system.h"
e63663
 
e63663
 #include "error.h"
e63663
@@ -38,6 +43,18 @@
e63663
 
e63663
 #include "set-fields.h"
e63663
 
e63663
+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
e63663
+   installation; work around this configuration error.        */
e63663
+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
e63663
+# undef MB_LEN_MAX
e63663
+# define MB_LEN_MAX 16
e63663
+#endif
e63663
+
e63663
+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t.  */
e63663
+#if HAVE_MBRTOWC && defined mbstate_t
e63663
+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
e63663
+#endif
e63663
+
e63663
 /* The official name of this program (e.g., no 'g' prefix).  */
e63663
 #define PROGRAM_NAME "cut"
e63663
 
e63663
@@ -54,6 +71,52 @@
e63663
     }									\
e63663
   while (0)
e63663
 
e63663
+/* Refill the buffer BUF to get a multibyte character. */
e63663
+#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM)                        \
e63663
+  do                                                                        \
e63663
+    {                                                                        \
e63663
+      if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM))        \
e63663
+        {                                                                \
e63663
+          memmove (BUF, BUFPOS, BUFLEN);                                \
e63663
+          BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \
e63663
+          BUFPOS = BUF;                                                        \
e63663
+        }                                                                \
e63663
+    }                                                                        \
e63663
+  while (0)
e63663
+
e63663
+/* Get the wide character at BUFPOS.  BUFPOS itself is not advanced.
e63663
+   CONVFAIL is set to true if the byte sequence is not a valid character, else false. */
e63663
+#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \
e63663
+  do                                                                        \
e63663
+    {                                                                        \
e63663
+      mbstate_t state_bak;                                                \
e63663
+                                                                        \
e63663
+      if (BUFLEN < 1)                                                        \
e63663
+        {                                                                \
e63663
+          WC = WEOF;                                                        \
e63663
+          break;                                                        \
e63663
+        }                                                                \
e63663
+                                                                        \
e63663
+      /* Get a wide character. */                                        \
e63663
+      CONVFAIL = false;                                                        \
e63663
+      state_bak = STATE;                                                \
e63663
+      MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE);        \
e63663
+                                                                        \
e63663
+      switch (MBLENGTH)                                                        \
e63663
+        {                                                                \
e63663
+        case (size_t)-1:                                                \
e63663
+        case (size_t)-2:                                                \
e63663
+          CONVFAIL = true;                                                        \
e63663
+          STATE = state_bak;                                                \
e63663
+          /* Fall through. */                                                \
e63663
+                                                                        \
e63663
+        case 0:                                                                \
e63663
+          MBLENGTH = 1;                                                        \
e63663
+          break;                                                        \
e63663
+        }                                                                \
e63663
+    }                                                                        \
e63663
+  while (0)
e63663
+
e63663
 
e63663
 /* Pointer inside RP.  When checking if a byte or field is selected
e63663
    by a finite range, we check if it is between CURRENT_RP.LO
e63663
@@ -61,6 +124,9 @@
e63663
    CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */
e63663
 static struct field_range_pair *current_rp;
e63663
 
e63663
+/* Length of the delimiter given as argument to -d.  */
e63663
+size_t delimlen;
e63663
+
e63663
 /* This buffer is used to support the semantics of the -s option
e63663
    (or lack of same) when the specified field list includes (does
e63663
    not include) the first field.  In both of those cases, the entire
e63663
@@ -77,15 +143,25 @@ enum operating_mode
e63663
   {
e63663
     undefined_mode,
e63663
 
e63663
-    /* Output characters that are in the given bytes. */
e63663
+    /* Output bytes that are at the given positions. */
e63663
     byte_mode,
e63663
 
e63663
+    /* Output characters that are at the given positions. */
e63663
+    character_mode,
e63663
+
e63663
     /* Output the given delimiter-separated fields. */
e63663
     field_mode
e63663
   };
e63663
 
e63663
 static enum operating_mode operating_mode;
e63663
 
e63663
+/* If nonzero, when in byte mode, don't split multibyte characters.  */
e63663
+static int byte_mode_character_aware;
e63663
+
e63663
+/* If nonzero, use the single-byte functions even when this program
e63663
+   runs in a multibyte locale. */
e63663
+static int force_singlebyte_mode;
e63663
+
e63663
 /* If true do not output lines containing no delimiter characters.
e63663
    Otherwise, all such lines are printed.  This option is valid only
e63663
    with field mode.  */
e63663
@@ -97,6 +173,9 @@ static bool complement;
e63663
 
e63663
 /* The delimiter character for field mode. */
e63663
 static unsigned char delim;
e63663
+#if HAVE_WCHAR_H
e63663
+static wchar_t wcdelim;
e63663
+#endif
e63663
 
e63663
 /* The delimiter for each line/record. */
e63663
 static unsigned char line_delim = '\n';
e63663
@@ -164,7 +243,7 @@ Print selected parts of lines from each FILE to standard output.\n\
e63663
   -f, --fields=LIST       select only these fields;  also print any line\n\
e63663
                             that contains no delimiter character, unless\n\
e63663
                             the -s option is specified\n\
e63663
-  -n                      (ignored)\n\
e63663
+  -n                      with -b: don't split multibyte characters\n\
e63663
 "), stdout);
e63663
       fputs (_("\
e63663
       --complement        complement the set of selected bytes, characters\n\
e63663
@@ -280,6 +359,82 @@ cut_bytes (FILE *stream)
e63663
     }
e63663
 }
e63663
 
e63663
+#if HAVE_MBRTOWC
e63663
+/* This function is used in the following cases.
e63663
+
e63663
+   1. Read from the stream STREAM, printing to standard output any selected
e63663
+   characters.
e63663
+
e63663
+   2. Read from stream STREAM, printing to standard output any selected bytes,
e63663
+   without splitting multibyte characters.  */
e63663
+
e63663
+static void
e63663
+cut_characters_or_cut_bytes_no_split (FILE *stream)
e63663
+{
e63663
+  uintmax_t idx;             /* number of bytes or characters in the line so far. */
e63663
+  char buf[MB_LEN_MAX + BUFSIZ];  /* For spooling a read byte sequence. */
e63663
+  char *bufpos;                /* Next read position of BUF. */
e63663
+  size_t buflen;        /* The length of the byte sequence in buf. */
e63663
+  wint_t wc;                /* The wide character just read. */
e63663
+  size_t mblength;        /* The length in bytes of the multibyte character
e63663
+                           that was converted to WC. */
e63663
+  mbstate_t state;        /* State of the stream. */
e63663
+  bool convfail = false;  /* true, when conversion failed. Otherwise false. */
e63663
+  /* Whether to begin printing delimiters between ranges for the current line.
e63663
+     Set after we've begun printing data corresponding to the first range.  */
e63663
+  bool print_delimiter = false;
e63663
+
e63663
+  idx = 0;
e63663
+  buflen = 0;
e63663
+  bufpos = buf;
e63663
+  memset (&state, '\0', sizeof(mbstate_t));
e63663
+
e63663
+  current_rp = frp;
e63663
+
e63663
+  while (1)
e63663
+    {
e63663
+      REFILL_BUFFER (buf, bufpos, buflen, stream);
e63663
+
e63663
+      GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail);
e63663
+      (void) convfail;  /* ignore unused */
e63663
+
e63663
+      if (wc == WEOF)
e63663
+        {
e63663
+          if (idx > 0)
e63663
+            putchar (line_delim);
e63663
+          break;
e63663
+        }
e63663
+      else if (wc == line_delim)
e63663
+        {
e63663
+          putchar (line_delim);
e63663
+          idx = 0;
e63663
+          print_delimiter = false;
e63663
+          current_rp = frp;
e63663
+        }
e63663
+      else
e63663
+        {
e63663
+          next_item (&idx);
e63663
+          if (print_kth (idx))
e63663
+            {
e63663
+              if (output_delimiter_specified)
e63663
+                {
e63663
+                  if (print_delimiter && is_range_start_index (idx))
e63663
+                    {
e63663
+                      fwrite (output_delimiter_string, sizeof (char),
e63663
+                              output_delimiter_length, stdout);
e63663
+                    }
e63663
+                  print_delimiter = true;
e63663
+                }
e63663
+              fwrite (bufpos, mblength, sizeof(char), stdout);
e63663
+            }
e63663
+        }
e63663
+
e63663
+      buflen -= mblength;
e63663
+      bufpos += mblength;
e63663
+    }
e63663
+}
e63663
+#endif
e63663
+
e63663
 /* Read from stream STREAM, printing to standard output any selected fields.  */
e63663
 
e63663
 static void
e63663
@@ -425,13 +580,211 @@ cut_fields (FILE *stream)
e63663
     }
e63663
 }
e63663
 
e63663
+#if HAVE_MBRTOWC
e63663
+static void
e63663
+cut_fields_mb (FILE *stream)
e63663
+{
e63663
+  int c;
e63663
+  uintmax_t field_idx;
e63663
+  int found_any_selected_field;
e63663
+  int buffer_first_field;
e63663
+  int empty_input;
e63663
+  char buf[MB_LEN_MAX + BUFSIZ];  /* For spooling a read byte sequence. */
e63663
+  char *bufpos;                /* Next read position of BUF. */
e63663
+  size_t buflen;        /* The length of the byte sequence in buf. */
e63663
+  wint_t wc = 0;        /* The wide character just read. */
e63663
+  size_t mblength;        /* The length in bytes of the multibyte character
e63663
+                           that was converted to WC. */
e63663
+  mbstate_t state;        /* State of the stream. */
e63663
+  bool convfail = false;  /* true, when conversion failed. Otherwise false. */
e63663
+
e63663
+  current_rp = frp;
e63663
+
e63663
+  found_any_selected_field = 0;
e63663
+  field_idx = 1;
e63663
+  bufpos = buf;
e63663
+  buflen = 0;
e63663
+  memset (&state, '\0', sizeof(mbstate_t));
e63663
+
e63663
+  c = getc (stream);
e63663
+  empty_input = (c == EOF);
e63663
+  if (c != EOF)
e63663
+  {
e63663
+    ungetc (c, stream);
e63663
+    wc = 0;
e63663
+  }
e63663
+  else
e63663
+    wc = WEOF;
e63663
+
e63663
+  /* To support the semantics of the -s flag, we may have to buffer
e63663
+     all of the first field to determine whether it is `delimited.'
e63663
+     But that is unnecessary if all non-delimited lines must be printed
e63663
+     and the first field has been selected, or if non-delimited lines
e63663
+     must be suppressed and the first field has *not* been selected.
e63663
+     That is because a non-delimited line has exactly one field.  */
e63663
+  buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
e63663
+
e63663
+  while (1)
e63663
+    {
e63663
+      if (field_idx == 1 && buffer_first_field)
e63663
+        {
e63663
+          int len = 0;
e63663
+
e63663
+          while (1)
e63663
+            {
e63663
+              REFILL_BUFFER (buf, bufpos, buflen, stream);
e63663
+
e63663
+              GET_NEXT_WC_FROM_BUFFER
e63663
+                (wc, bufpos, buflen, mblength, state, convfail);
e63663
+
e63663
+              if (wc == WEOF)
e63663
+                break;
e63663
+
e63663
+              field_1_buffer = xrealloc (field_1_buffer, len + mblength);
e63663
+              memcpy (field_1_buffer + len, bufpos, mblength);
e63663
+              len += mblength;
e63663
+              buflen -= mblength;
e63663
+              bufpos += mblength;
e63663
+
e63663
+              if (!convfail && (wc == line_delim || wc == wcdelim))
e63663
+                break;
e63663
+            }
e63663
+
e63663
+          if (len <= 0 && wc == WEOF)
e63663
+            break;
e63663
+
e63663
+          /* If the first field extends to the end of line (it is not
e63663
+             delimited) and we are printing all non-delimited lines,
e63663
+             print this one.  */
e63663
+          if (convfail || (!convfail && wc != wcdelim))
e63663
+            {
e63663
+              if (suppress_non_delimited)
e63663
+                {
e63663
+                  /* Empty.        */
e63663
+                }
e63663
+              else
e63663
+                {
e63663
+                  fwrite (field_1_buffer, sizeof (char), len, stdout);
e63663
+                  /* Make sure the output line is newline terminated.  */
e63663
+                  if (convfail || (!convfail && wc != line_delim))
e63663
+                    putchar (line_delim);
e63663
+                }
e63663
+              continue;
e63663
+            }
e63663
+
e63663
+          if (print_kth (1))
e63663
+            {
e63663
+              /* Print the field, but not the trailing delimiter.  */
e63663
+              fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
e63663
+              found_any_selected_field = 1;
e63663
+            }
e63663
+          next_item (&field_idx);
e63663
+        }
e63663
+
e63663
+      if (wc != WEOF)
e63663
+        {
e63663
+          if (print_kth (field_idx))
e63663
+            {
e63663
+              if (found_any_selected_field)
e63663
+                {
e63663
+                  fwrite (output_delimiter_string, sizeof (char),
e63663
+                          output_delimiter_length, stdout);
e63663
+                }
e63663
+              found_any_selected_field = 1;
e63663
+            }
e63663
+
e63663
+          while (1)
e63663
+            {
e63663
+              REFILL_BUFFER (buf, bufpos, buflen, stream);
e63663
+
e63663
+              GET_NEXT_WC_FROM_BUFFER
e63663
+                (wc, bufpos, buflen, mblength, state, convfail);
e63663
+
e63663
+              if (wc == WEOF)
e63663
+                break;
e63663
+              else if (!convfail && (wc == wcdelim || wc == line_delim))
e63663
+                {
e63663
+                  buflen -= mblength;
e63663
+                  bufpos += mblength;
e63663
+                  break;
e63663
+                }
e63663
+
e63663
+              if (print_kth (field_idx))
e63663
+                fwrite (bufpos, mblength, sizeof(char), stdout);
e63663
+
e63663
+              buflen -= mblength;
e63663
+              bufpos += mblength;
e63663
+            }
e63663
+        }
e63663
+
e63663
+      if ((!convfail || wc == line_delim) && buflen < 1)
e63663
+        wc = WEOF;
e63663
+
e63663
+      if (!convfail && wc == wcdelim)
e63663
+        next_item (&field_idx);
e63663
+      else if (wc == WEOF || (!convfail && wc == line_delim))
e63663
+        {
e63663
+          if (found_any_selected_field
e63663
+              || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
e63663
+            putchar (line_delim);
e63663
+          if (wc == WEOF)
e63663
+            break;
e63663
+          field_idx = 1;
e63663
+          current_rp = frp;
e63663
+          found_any_selected_field = 0;
e63663
+        }
e63663
+    }
e63663
+}
e63663
+#endif
e63663
+
e63663
 static void
e63663
 cut_stream (FILE *stream)
e63663
 {
e63663
-  if (operating_mode == byte_mode)
e63663
-    cut_bytes (stream);
e63663
+#if HAVE_MBRTOWC
e63663
+  if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
e63663
+    {
e63663
+      switch (operating_mode)
e63663
+        {
e63663
+        case byte_mode:
e63663
+          if (byte_mode_character_aware)
e63663
+            cut_characters_or_cut_bytes_no_split (stream);
e63663
+          else
e63663
+            cut_bytes (stream);
e63663
+          break;
e63663
+
e63663
+        case character_mode:
e63663
+          cut_characters_or_cut_bytes_no_split (stream);
e63663
+          break;
e63663
+
e63663
+        case field_mode:
e63663
+          if (delimlen == 1)
e63663
+            {
e63663
+              /* Check if we have utf8 multibyte locale, so we can use this
e63663
+                 optimization because of uniqueness of characters, which is
e63663
+                 not true for e.g. SJIS */
e63663
+              char * loc = setlocale(LC_CTYPE, NULL);
e63663
+              if (loc && (strstr (loc, "UTF-8") || strstr (loc, "utf-8") ||
e63663
+                  strstr (loc, "UTF8") || strstr (loc, "utf8")))
e63663
+                {
e63663
+                  cut_fields (stream);
e63663
+                  break;
e63663
+                }
e63663
+            }
e63663
+          cut_fields_mb (stream);
e63663
+          break;
e63663
+
e63663
+        default:
e63663
+          abort ();
e63663
+        }
e63663
+    }
e63663
   else
e63663
-    cut_fields (stream);
e63663
+#endif
e63663
+    {
e63663
+      if (operating_mode == field_mode)
e63663
+        cut_fields (stream);
e63663
+      else
e63663
+        cut_bytes (stream);
e63663
+    }
e63663
 }
e63663
 
e63663
 /* Process file FILE to standard output.
e63663
@@ -483,6 +836,7 @@ main (int argc, char **argv)
e63663
   bool ok;
e63663
   bool delim_specified = false;
e63663
   char *spec_list_string IF_LINT ( = NULL);
e63663
+  char mbdelim[MB_LEN_MAX + 1];
e63663
 
e63663
   initialize_main (&argc, &argv);
e63663
   set_program_name (argv[0]);
e63663
@@ -505,7 +859,6 @@ main (int argc, char **argv)
e63663
       switch (optc)
e63663
         {
e63663
         case 'b':
e63663
-        case 'c':
e63663
           /* Build the byte list. */
e63663
           if (operating_mode != undefined_mode)
e63663
             FATAL_ERROR (_("only one type of list may be specified"));
e63663
@@ -513,6 +866,14 @@ main (int argc, char **argv)
e63663
           spec_list_string = optarg;
e63663
           break;
e63663
 
e63663
+        case 'c':
e63663
+          /* Build the character list. */
e63663
+          if (operating_mode != undefined_mode)
e63663
+            FATAL_ERROR (_("only one type of list may be specified"));
e63663
+          operating_mode = character_mode;
e63663
+          spec_list_string = optarg;
e63663
+          break;
e63663
+
e63663
         case 'f':
e63663
           /* Build the field list. */
e63663
           if (operating_mode != undefined_mode)
e63663
@@ -524,10 +885,38 @@ main (int argc, char **argv)
e63663
         case 'd':
e63663
           /* New delimiter. */
e63663
           /* Interpret -d '' to mean 'use the NUL byte as the delimiter.'  */
e63663
-          if (optarg[0] != '\0' && optarg[1] != '\0')
e63663
-            FATAL_ERROR (_("the delimiter must be a single character"));
e63663
-          delim = optarg[0];
e63663
-          delim_specified = true;
e63663
+            {
e63663
+#if HAVE_MBRTOWC
e63663
+              if(MB_CUR_MAX > 1)
e63663
+                {
e63663
+                  mbstate_t state;
e63663
+
e63663
+                  memset (&state, '\0', sizeof(mbstate_t));
e63663
+                  delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state);
e63663
+
e63663
+                  if (delimlen == (size_t)-1 || delimlen == (size_t)-2)
e63663
+                    ++force_singlebyte_mode;
e63663
+                  else
e63663
+                    {
e63663
+                      delimlen = (delimlen < 1) ? 1 : delimlen;
e63663
+                      if (wcdelim != L'\0' && *(optarg + delimlen) != '\0')
e63663
+                        FATAL_ERROR (_("the delimiter must be a single character"));
e63663
+                      memcpy (mbdelim, optarg, delimlen);
e63663
+                      mbdelim[delimlen] = '\0';
e63663
+                      if (delimlen == 1)
e63663
+                        delim = *optarg;
e63663
+                    }
e63663
+                }
e63663
+
e63663
+              if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
e63663
+#endif
e63663
+                {
e63663
+                  if (optarg[0] != '\0' && optarg[1] != '\0')
e63663
+                    FATAL_ERROR (_("the delimiter must be a single character"));
e63663
+                  delim = (unsigned char) optarg[0];
e63663
+                }
e63663
+            delim_specified = true;
e63663
+          }
e63663
           break;
e63663
 
e63663
         case OUTPUT_DELIMITER_OPTION:
e63663
@@ -540,6 +929,7 @@ main (int argc, char **argv)
e63663
           break;
e63663
 
e63663
         case 'n':
e63663
+          byte_mode_character_aware = 1;
e63663
           break;
e63663
 
e63663
         case 's':
e63663
@@ -579,15 +969,34 @@ main (int argc, char **argv)
e63663
               | (complement ? SETFLD_COMPLEMENT : 0) );
e63663
 
e63663
   if (!delim_specified)
e63663
-    delim = '\t';
e63663
+    {
e63663
+      delim = '\t';
e63663
+#ifdef HAVE_MBRTOWC
e63663
+      wcdelim = L'\t';
e63663
+      mbdelim[0] = '\t';
e63663
+      mbdelim[1] = '\0';
e63663
+      delimlen = 1;
e63663
+#endif
e63663
+    }
e63663
 
e63663
   if (output_delimiter_string == NULL)
e63663
     {
e63663
-      static char dummy[2];
e63663
-      dummy[0] = delim;
e63663
-      dummy[1] = '\0';
e63663
-      output_delimiter_string = dummy;
e63663
-      output_delimiter_length = 1;
e63663
+#ifdef HAVE_MBRTOWC
e63663
+      if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
e63663
+        {
e63663
+          output_delimiter_string = xstrdup(mbdelim);
e63663
+          output_delimiter_length = delimlen;
e63663
+        }
e63663
+
e63663
+      if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
e63663
+#endif
e63663
+        {
e63663
+          static char dummy[2];
e63663
+          dummy[0] = delim;
e63663
+          dummy[1] = '\0';
e63663
+          output_delimiter_string = dummy;
e63663
+          output_delimiter_length = 1;
e63663
+        }
e63663
     }
e63663
 
e63663
   if (optind == argc)