Blob Blame History Raw
diff --git a/src/cut.c b/src/cut.c
index 7ab6be4..022d0ad 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -28,6 +28,11 @@
 #include <assert.h>
 #include <getopt.h>
 #include <sys/types.h>
+
+/* Get mbstate_t, mbrtowc().  */
+#if HAVE_WCHAR_H
+# include <wchar.h>
+#endif
 #include "system.h"
 
 #include "error.h"
@@ -38,6 +43,18 @@
 
 #include "set-fields.h"
 
+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
+   installation; work around this configuration error.        */
+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
+# undef MB_LEN_MAX
+# define MB_LEN_MAX 16
+#endif
+
+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t.  */
+#if HAVE_MBRTOWC && defined mbstate_t
+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
+#endif
+
 /* The official name of this program (e.g., no 'g' prefix).  */
 #define PROGRAM_NAME "cut"
 
@@ -54,6 +71,52 @@
     }									\
   while (0)
 
+/* Refill the buffer BUF to get a multibyte character. */
+#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM)                        \
+  do                                                                        \
+    {                                                                        \
+      if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM))        \
+        {                                                                \
+          memmove (BUF, BUFPOS, BUFLEN);                                \
+          BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \
+          BUFPOS = BUF;                                                        \
+        }                                                                \
+    }                                                                        \
+  while (0)
+
+/* Get wide character on BUFPOS. BUFPOS is not included after that.
+   If byte sequence is not valid as a character, CONVFAIL is true. Otherwise false. */
+#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \
+  do                                                                        \
+    {                                                                        \
+      mbstate_t state_bak;                                                \
+                                                                        \
+      if (BUFLEN < 1)                                                        \
+        {                                                                \
+          WC = WEOF;                                                        \
+          break;                                                        \
+        }                                                                \
+                                                                        \
+      /* Get a wide character. */                                        \
+      CONVFAIL = false;                                                        \
+      state_bak = STATE;                                                \
+      MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE);        \
+                                                                        \
+      switch (MBLENGTH)                                                        \
+        {                                                                \
+        case (size_t)-1:                                                \
+        case (size_t)-2:                                                \
+          CONVFAIL = true;                                                        \
+          STATE = state_bak;                                                \
+          /* Fall througn. */                                                \
+                                                                        \
+        case 0:                                                                \
+          MBLENGTH = 1;                                                        \
+          break;                                                        \
+        }                                                                \
+    }                                                                        \
+  while (0)
+
 
 /* Pointer inside RP.  When checking if a byte or field is selected
    by a finite range, we check if it is between CURRENT_RP.LO
@@ -61,6 +124,9 @@
    CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */
 static struct field_range_pair *current_rp;
 
+/* Length of the delimiter given as argument to -d.  */
+size_t delimlen;
+
 /* This buffer is used to support the semantics of the -s option
    (or lack of same) when the specified field list includes (does
    not include) the first field.  In both of those cases, the entire
@@ -77,15 +143,25 @@ enum operating_mode
   {
     undefined_mode,
 
-    /* Output characters that are in the given bytes. */
+    /* Output bytes that are at the given positions. */
     byte_mode,
 
+    /* Output characters that are at the given positions. */
+    character_mode,
+
     /* Output the given delimiter-separated fields. */
     field_mode
   };
 
 static enum operating_mode operating_mode;
 
+/* If nonzero, when in byte mode, don't split multibyte characters.  */
+static int byte_mode_character_aware;
+
+/* If nonzero, the function for single byte locale is work
+   if this program runs on multibyte locale. */
+static int force_singlebyte_mode;
+
 /* If true do not output lines containing no delimiter characters.
    Otherwise, all such lines are printed.  This option is valid only
    with field mode.  */
@@ -97,6 +173,9 @@ static bool complement;
 
 /* The delimiter character for field mode. */
 static unsigned char delim;
+#if HAVE_WCHAR_H
+static wchar_t wcdelim;
+#endif
 
 /* The delimiter for each line/record. */
 static unsigned char line_delim = '\n';
@@ -164,7 +243,7 @@ Print selected parts of lines from each FILE to standard output.\n\
   -f, --fields=LIST       select only these fields;  also print any line\n\
                             that contains no delimiter character, unless\n\
                             the -s option is specified\n\
-  -n                      (ignored)\n\
+  -n                      with -b: don't split multibyte characters\n\
 "), stdout);
       fputs (_("\
       --complement        complement the set of selected bytes, characters\n\
@@ -280,6 +359,82 @@ cut_bytes (FILE *stream)
     }
 }
 
+#if HAVE_MBRTOWC
+/* This function is in use for the following case.
+
+   1. Read from the stream STREAM, printing to standard output any selected
+   characters.
+
+   2. Read from stream STREAM, printing to standard output any selected bytes,
+   without splitting multibyte characters.  */
+
+static void
+cut_characters_or_cut_bytes_no_split (FILE *stream)
+{
+  uintmax_t idx;             /* number of bytes or characters in the line so far. */
+  char buf[MB_LEN_MAX + BUFSIZ];  /* For spooling a read byte sequence. */
+  char *bufpos;                /* Next read position of BUF. */
+  size_t buflen;        /* The length of the byte sequence in buf. */
+  wint_t wc;                /* A gotten wide character. */
+  size_t mblength;        /* The byte size of a multibyte character which shows
+                           as same character as WC. */
+  mbstate_t state;        /* State of the stream. */
+  bool convfail = false;  /* true, when conversion failed. Otherwise false. */
+  /* Whether to begin printing delimiters between ranges for the current line.
+     Set after we've begun printing data corresponding to the first range.  */
+  bool print_delimiter = false;
+
+  idx = 0;
+  buflen = 0;
+  bufpos = buf;
+  memset (&state, '\0', sizeof(mbstate_t));
+
+  current_rp = frp;
+
+  while (1)
+    {
+      REFILL_BUFFER (buf, bufpos, buflen, stream);
+
+      GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail);
+      (void) convfail;  /* ignore unused */
+
+      if (wc == WEOF)
+        {
+          if (idx > 0)
+            putchar (line_delim);
+          break;
+        }
+      else if (wc == line_delim)
+        {
+          putchar (line_delim);
+          idx = 0;
+          print_delimiter = false;
+          current_rp = frp;
+        }
+      else
+        {
+          next_item (&idx);
+          if (print_kth (idx))
+            {
+              if (output_delimiter_specified)
+                {
+                  if (print_delimiter && is_range_start_index (idx))
+                    {
+                      fwrite (output_delimiter_string, sizeof (char),
+                              output_delimiter_length, stdout);
+                    }
+                  print_delimiter = true;
+                }
+              fwrite (bufpos, mblength, sizeof(char), stdout);
+            }
+        }
+
+      buflen -= mblength;
+      bufpos += mblength;
+    }
+}
+#endif
+
 /* Read from stream STREAM, printing to standard output any selected fields.  */
 
 static void
@@ -425,13 +580,211 @@ cut_fields (FILE *stream)
     }
 }
 
+#if HAVE_MBRTOWC
+static void
+cut_fields_mb (FILE *stream)
+{
+  int c;
+  uintmax_t field_idx;
+  int found_any_selected_field;
+  int buffer_first_field;
+  int empty_input;
+  char buf[MB_LEN_MAX + BUFSIZ];  /* For spooling a read byte sequence. */
+  char *bufpos;                /* Next read position of BUF. */
+  size_t buflen;        /* The length of the byte sequence in buf. */
+  wint_t wc = 0;        /* A gotten wide character. */
+  size_t mblength;        /* The byte size of a multibyte character which shows
+                           as same character as WC. */
+  mbstate_t state;        /* State of the stream. */
+  bool convfail = false;  /* true, when conversion failed. Otherwise false. */
+
+  current_rp = frp;
+
+  found_any_selected_field = 0;
+  field_idx = 1;
+  bufpos = buf;
+  buflen = 0;
+  memset (&state, '\0', sizeof(mbstate_t));
+
+  c = getc (stream);
+  empty_input = (c == EOF);
+  if (c != EOF)
+  {
+    ungetc (c, stream);
+    wc = 0;
+  }
+  else
+    wc = WEOF;
+
+  /* To support the semantics of the -s flag, we may have to buffer
+     all of the first field to determine whether it is `delimited.'
+     But that is unnecessary if all non-delimited lines must be printed
+     and the first field has been selected, or if non-delimited lines
+     must be suppressed and the first field has *not* been selected.
+     That is because a non-delimited line has exactly one field.  */
+  buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
+
+  while (1)
+    {
+      if (field_idx == 1 && buffer_first_field)
+        {
+          int len = 0;
+
+          while (1)
+            {
+              REFILL_BUFFER (buf, bufpos, buflen, stream);
+
+              GET_NEXT_WC_FROM_BUFFER
+                (wc, bufpos, buflen, mblength, state, convfail);
+
+              if (wc == WEOF)
+                break;
+
+              field_1_buffer = xrealloc (field_1_buffer, len + mblength);
+              memcpy (field_1_buffer + len, bufpos, mblength);
+              len += mblength;
+              buflen -= mblength;
+              bufpos += mblength;
+
+              if (!convfail && (wc == line_delim || wc == wcdelim))
+                break;
+            }
+
+          if (len <= 0 && wc == WEOF)
+            break;
+
+          /* If the first field extends to the end of line (it is not
+             delimited) and we are printing all non-delimited lines,
+             print this one.  */
+          if (convfail || (!convfail && wc != wcdelim))
+            {
+              if (suppress_non_delimited)
+                {
+                  /* Empty.        */
+                }
+              else
+                {
+                  fwrite (field_1_buffer, sizeof (char), len, stdout);
+                  /* Make sure the output line is newline terminated.  */
+                  if (convfail || (!convfail && wc != line_delim))
+                    putchar (line_delim);
+                }
+              continue;
+            }
+
+          if (print_kth (1))
+            {
+              /* Print the field, but not the trailing delimiter.  */
+              fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
+              found_any_selected_field = 1;
+            }
+          next_item (&field_idx);
+        }
+
+      if (wc != WEOF)
+        {
+          if (print_kth (field_idx))
+            {
+              if (found_any_selected_field)
+                {
+                  fwrite (output_delimiter_string, sizeof (char),
+                          output_delimiter_length, stdout);
+                }
+              found_any_selected_field = 1;
+            }
+
+          while (1)
+            {
+              REFILL_BUFFER (buf, bufpos, buflen, stream);
+
+              GET_NEXT_WC_FROM_BUFFER
+                (wc, bufpos, buflen, mblength, state, convfail);
+
+              if (wc == WEOF)
+                break;
+              else if (!convfail && (wc == wcdelim || wc == line_delim))
+                {
+                  buflen -= mblength;
+                  bufpos += mblength;
+                  break;
+                }
+
+              if (print_kth (field_idx))
+                fwrite (bufpos, mblength, sizeof(char), stdout);
+
+              buflen -= mblength;
+              bufpos += mblength;
+            }
+        }
+
+      if ((!convfail || wc == line_delim) && buflen < 1)
+        wc = WEOF;
+
+      if (!convfail && wc == wcdelim)
+        next_item (&field_idx);
+      else if (wc == WEOF || (!convfail && wc == line_delim))
+        {
+          if (found_any_selected_field
+              || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
+            putchar (line_delim);
+          if (wc == WEOF)
+            break;
+          field_idx = 1;
+          current_rp = frp;
+          found_any_selected_field = 0;
+        }
+    }
+}
+#endif
+
 static void
 cut_stream (FILE *stream)
 {
-  if (operating_mode == byte_mode)
-    cut_bytes (stream);
+#if HAVE_MBRTOWC
+  if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
+    {
+      switch (operating_mode)
+        {
+        case byte_mode:
+          if (byte_mode_character_aware)
+            cut_characters_or_cut_bytes_no_split (stream);
+          else
+            cut_bytes (stream);
+          break;
+
+        case character_mode:
+          cut_characters_or_cut_bytes_no_split (stream);
+          break;
+
+        case field_mode:
+          if (delimlen == 1)
+            {
+              /* Check if we have utf8 multibyte locale, so we can use this
+                 optimization because of uniqueness of characters, which is
+                 not true for e.g. SJIS */
+              char * loc = setlocale(LC_CTYPE, NULL);
+              if (loc && (strstr (loc, "UTF-8") || strstr (loc, "utf-8") ||
+                  strstr (loc, "UTF8") || strstr (loc, "utf8")))
+                {
+                  cut_fields (stream);
+                  break;
+                }
+            }
+          cut_fields_mb (stream);
+          break;
+
+        default:
+          abort ();
+        }
+    }
   else
-    cut_fields (stream);
+#endif
+    {
+      if (operating_mode == field_mode)
+        cut_fields (stream);
+      else
+        cut_bytes (stream);
+    }
 }
 
 /* Process file FILE to standard output.
@@ -483,6 +836,7 @@ main (int argc, char **argv)
   bool ok;
   bool delim_specified = false;
   char *spec_list_string IF_LINT ( = NULL);
+  char mbdelim[MB_LEN_MAX + 1];
 
   initialize_main (&argc, &argv);
   set_program_name (argv[0]);
@@ -505,7 +859,6 @@ main (int argc, char **argv)
       switch (optc)
         {
         case 'b':
-        case 'c':
           /* Build the byte list. */
           if (operating_mode != undefined_mode)
             FATAL_ERROR (_("only one type of list may be specified"));
@@ -513,6 +866,14 @@ main (int argc, char **argv)
           spec_list_string = optarg;
           break;
 
+        case 'c':
+          /* Build the character list. */
+          if (operating_mode != undefined_mode)
+            FATAL_ERROR (_("only one type of list may be specified"));
+          operating_mode = character_mode;
+          spec_list_string = optarg;
+          break;
+
         case 'f':
           /* Build the field list. */
           if (operating_mode != undefined_mode)
@@ -524,10 +885,38 @@ main (int argc, char **argv)
         case 'd':
           /* New delimiter. */
           /* Interpret -d '' to mean 'use the NUL byte as the delimiter.'  */
-          if (optarg[0] != '\0' && optarg[1] != '\0')
-            FATAL_ERROR (_("the delimiter must be a single character"));
-          delim = optarg[0];
-          delim_specified = true;
+            {
+#if HAVE_MBRTOWC
+              if(MB_CUR_MAX > 1)
+                {
+                  mbstate_t state;
+
+                  memset (&state, '\0', sizeof(mbstate_t));
+                  delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state);
+
+                  if (delimlen == (size_t)-1 || delimlen == (size_t)-2)
+                    ++force_singlebyte_mode;
+                  else
+                    {
+                      delimlen = (delimlen < 1) ? 1 : delimlen;
+                      if (wcdelim != L'\0' && *(optarg + delimlen) != '\0')
+                        FATAL_ERROR (_("the delimiter must be a single character"));
+                      memcpy (mbdelim, optarg, delimlen);
+                      mbdelim[delimlen] = '\0';
+                      if (delimlen == 1)
+                        delim = *optarg;
+                    }
+                }
+
+              if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
+#endif
+                {
+                  if (optarg[0] != '\0' && optarg[1] != '\0')
+                    FATAL_ERROR (_("the delimiter must be a single character"));
+                  delim = (unsigned char) optarg[0];
+                }
+            delim_specified = true;
+          }
           break;
 
         case OUTPUT_DELIMITER_OPTION:
@@ -540,6 +929,7 @@ main (int argc, char **argv)
           break;
 
         case 'n':
+          byte_mode_character_aware = 1;
           break;
 
         case 's':
@@ -579,15 +969,34 @@ main (int argc, char **argv)
               | (complement ? SETFLD_COMPLEMENT : 0) );
 
   if (!delim_specified)
-    delim = '\t';
+    {
+      delim = '\t';
+#ifdef HAVE_MBRTOWC
+      wcdelim = L'\t';
+      mbdelim[0] = '\t';
+      mbdelim[1] = '\0';
+      delimlen = 1;
+#endif
+    }
 
   if (output_delimiter_string == NULL)
     {
-      static char dummy[2];
-      dummy[0] = delim;
-      dummy[1] = '\0';
-      output_delimiter_string = dummy;
-      output_delimiter_length = 1;
+#ifdef HAVE_MBRTOWC
+      if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
+        {
+          output_delimiter_string = xstrdup(mbdelim);
+          output_delimiter_length = delimlen;
+        }
+
+      if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
+#endif
+        {
+          static char dummy[2];
+          dummy[0] = delim;
+          dummy[1] = '\0';
+          output_delimiter_string = dummy;
+          output_delimiter_length = 1;
+        }
     }
 
   if (optind == argc)