diff --git a/src/cut.c b/src/cut.c
index 7ab6be4..022d0ad 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -28,6 +28,11 @@
#include <assert.h>
#include <getopt.h>
#include <sys/types.h>
+
+/* Get mbstate_t, mbrtowc(). */
+#if HAVE_WCHAR_H
+# include <wchar.h>
+#endif
#include "system.h"
#include "error.h"
@@ -38,6 +43,18 @@
#include "set-fields.h"
+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
+ installation; work around this configuration error. */
+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
+# undef MB_LEN_MAX
+# define MB_LEN_MAX 16
+#endif
+
+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
+#if HAVE_MBRTOWC && defined mbstate_t
+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
+#endif
+
/* The official name of this program (e.g., no 'g' prefix). */
#define PROGRAM_NAME "cut"
@@ -54,6 +71,52 @@
} \
while (0)
+/* Refill the buffer BUF to get a multibyte character. */
+#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM) \
+ do \
+ { \
+ if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM)) \
+ { \
+ memmove (BUF, BUFPOS, BUFLEN); \
+ BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \
+ BUFPOS = BUF; \
+ } \
+ } \
+ while (0)
+
+/* Get wide character on BUFPOS. BUFPOS is not included after that.
+ If byte sequence is not valid as a character, CONVFAIL is true. Otherwise false. */
+#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \
+ do \
+ { \
+ mbstate_t state_bak; \
+ \
+ if (BUFLEN < 1) \
+ { \
+ WC = WEOF; \
+ break; \
+ } \
+ \
+ /* Get a wide character. */ \
+ CONVFAIL = false; \
+ state_bak = STATE; \
+ MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE); \
+ \
+ switch (MBLENGTH) \
+ { \
+ case (size_t)-1: \
+ case (size_t)-2: \
+ CONVFAIL = true; \
+ STATE = state_bak; \
+ /* Fall througn. */ \
+ \
+ case 0: \
+ MBLENGTH = 1; \
+ break; \
+ } \
+ } \
+ while (0)
+
/* Pointer inside RP. When checking if a byte or field is selected
by a finite range, we check if it is between CURRENT_RP.LO
@@ -61,6 +124,9 @@
CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */
static struct field_range_pair *current_rp;
+/* Length of the delimiter given as argument to -d. */
+size_t delimlen;
+
/* This buffer is used to support the semantics of the -s option
(or lack of same) when the specified field list includes (does
not include) the first field. In both of those cases, the entire
@@ -77,15 +143,25 @@ enum operating_mode
{
undefined_mode,
- /* Output characters that are in the given bytes. */
+ /* Output bytes that are at the given positions. */
byte_mode,
+ /* Output characters that are at the given positions. */
+ character_mode,
+
/* Output the given delimiter-separated fields. */
field_mode
};
static enum operating_mode operating_mode;
+/* If nonzero, when in byte mode, don't split multibyte characters. */
+static int byte_mode_character_aware;
+
+/* If nonzero, the function for single byte locale is work
+ if this program runs on multibyte locale. */
+static int force_singlebyte_mode;
+
/* If true do not output lines containing no delimiter characters.
Otherwise, all such lines are printed. This option is valid only
with field mode. */
@@ -97,6 +173,9 @@ static bool complement;
/* The delimiter character for field mode. */
static unsigned char delim;
+#if HAVE_WCHAR_H
+static wchar_t wcdelim;
+#endif
/* The delimiter for each line/record. */
static unsigned char line_delim = '\n';
@@ -164,7 +243,7 @@ Print selected parts of lines from each FILE to standard output.\n\
-f, --fields=LIST select only these fields; also print any line\n\
that contains no delimiter character, unless\n\
the -s option is specified\n\
- -n (ignored)\n\
+ -n with -b: don't split multibyte characters\n\
"), stdout);
fputs (_("\
--complement complement the set of selected bytes, characters\n\
@@ -280,6 +359,82 @@ cut_bytes (FILE *stream)
}
}
+#if HAVE_MBRTOWC
+/* This function is in use for the following case.
+
+ 1. Read from the stream STREAM, printing to standard output any selected
+ characters.
+
+ 2. Read from stream STREAM, printing to standard output any selected bytes,
+ without splitting multibyte characters. */
+
+static void
+cut_characters_or_cut_bytes_no_split (FILE *stream)
+{
+ uintmax_t idx; /* number of bytes or characters in the line so far. */
+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
+ char *bufpos; /* Next read position of BUF. */
+ size_t buflen; /* The length of the byte sequence in buf. */
+ wint_t wc; /* A gotten wide character. */
+ size_t mblength; /* The byte size of a multibyte character which shows
+ as same character as WC. */
+ mbstate_t state; /* State of the stream. */
+ bool convfail = false; /* true, when conversion failed. Otherwise false. */
+ /* Whether to begin printing delimiters between ranges for the current line.
+ Set after we've begun printing data corresponding to the first range. */
+ bool print_delimiter = false;
+
+ idx = 0;
+ buflen = 0;
+ bufpos = buf;
+ memset (&state, '\0', sizeof(mbstate_t));
+
+ current_rp = frp;
+
+ while (1)
+ {
+ REFILL_BUFFER (buf, bufpos, buflen, stream);
+
+ GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail);
+ (void) convfail; /* ignore unused */
+
+ if (wc == WEOF)
+ {
+ if (idx > 0)
+ putchar (line_delim);
+ break;
+ }
+ else if (wc == line_delim)
+ {
+ putchar (line_delim);
+ idx = 0;
+ print_delimiter = false;
+ current_rp = frp;
+ }
+ else
+ {
+ next_item (&idx);
+ if (print_kth (idx))
+ {
+ if (output_delimiter_specified)
+ {
+ if (print_delimiter && is_range_start_index (idx))
+ {
+ fwrite (output_delimiter_string, sizeof (char),
+ output_delimiter_length, stdout);
+ }
+ print_delimiter = true;
+ }
+ fwrite (bufpos, mblength, sizeof(char), stdout);
+ }
+ }
+
+ buflen -= mblength;
+ bufpos += mblength;
+ }
+}
+#endif
+
/* Read from stream STREAM, printing to standard output any selected fields. */
static void
@@ -425,13 +580,211 @@ cut_fields (FILE *stream)
}
}
+#if HAVE_MBRTOWC
+static void
+cut_fields_mb (FILE *stream)
+{
+ int c;
+ uintmax_t field_idx;
+ int found_any_selected_field;
+ int buffer_first_field;
+ int empty_input;
+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
+ char *bufpos; /* Next read position of BUF. */
+ size_t buflen; /* The length of the byte sequence in buf. */
+ wint_t wc = 0; /* A gotten wide character. */
+ size_t mblength; /* The byte size of a multibyte character which shows
+ as same character as WC. */
+ mbstate_t state; /* State of the stream. */
+ bool convfail = false; /* true, when conversion failed. Otherwise false. */
+
+ current_rp = frp;
+
+ found_any_selected_field = 0;
+ field_idx = 1;
+ bufpos = buf;
+ buflen = 0;
+ memset (&state, '\0', sizeof(mbstate_t));
+
+ c = getc (stream);
+ empty_input = (c == EOF);
+ if (c != EOF)
+ {
+ ungetc (c, stream);
+ wc = 0;
+ }
+ else
+ wc = WEOF;
+
+ /* To support the semantics of the -s flag, we may have to buffer
+ all of the first field to determine whether it is `delimited.'
+ But that is unnecessary if all non-delimited lines must be printed
+ and the first field has been selected, or if non-delimited lines
+ must be suppressed and the first field has *not* been selected.
+ That is because a non-delimited line has exactly one field. */
+ buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
+
+ while (1)
+ {
+ if (field_idx == 1 && buffer_first_field)
+ {
+ int len = 0;
+
+ while (1)
+ {
+ REFILL_BUFFER (buf, bufpos, buflen, stream);
+
+ GET_NEXT_WC_FROM_BUFFER
+ (wc, bufpos, buflen, mblength, state, convfail);
+
+ if (wc == WEOF)
+ break;
+
+ field_1_buffer = xrealloc (field_1_buffer, len + mblength);
+ memcpy (field_1_buffer + len, bufpos, mblength);
+ len += mblength;
+ buflen -= mblength;
+ bufpos += mblength;
+
+ if (!convfail && (wc == line_delim || wc == wcdelim))
+ break;
+ }
+
+ if (len <= 0 && wc == WEOF)
+ break;
+
+ /* If the first field extends to the end of line (it is not
+ delimited) and we are printing all non-delimited lines,
+ print this one. */
+ if (convfail || (!convfail && wc != wcdelim))
+ {
+ if (suppress_non_delimited)
+ {
+ /* Empty. */
+ }
+ else
+ {
+ fwrite (field_1_buffer, sizeof (char), len, stdout);
+ /* Make sure the output line is newline terminated. */
+ if (convfail || (!convfail && wc != line_delim))
+ putchar (line_delim);
+ }
+ continue;
+ }
+
+ if (print_kth (1))
+ {
+ /* Print the field, but not the trailing delimiter. */
+ fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
+ found_any_selected_field = 1;
+ }
+ next_item (&field_idx);
+ }
+
+ if (wc != WEOF)
+ {
+ if (print_kth (field_idx))
+ {
+ if (found_any_selected_field)
+ {
+ fwrite (output_delimiter_string, sizeof (char),
+ output_delimiter_length, stdout);
+ }
+ found_any_selected_field = 1;
+ }
+
+ while (1)
+ {
+ REFILL_BUFFER (buf, bufpos, buflen, stream);
+
+ GET_NEXT_WC_FROM_BUFFER
+ (wc, bufpos, buflen, mblength, state, convfail);
+
+ if (wc == WEOF)
+ break;
+ else if (!convfail && (wc == wcdelim || wc == line_delim))
+ {
+ buflen -= mblength;
+ bufpos += mblength;
+ break;
+ }
+
+ if (print_kth (field_idx))
+ fwrite (bufpos, mblength, sizeof(char), stdout);
+
+ buflen -= mblength;
+ bufpos += mblength;
+ }
+ }
+
+ if ((!convfail || wc == line_delim) && buflen < 1)
+ wc = WEOF;
+
+ if (!convfail && wc == wcdelim)
+ next_item (&field_idx);
+ else if (wc == WEOF || (!convfail && wc == line_delim))
+ {
+ if (found_any_selected_field
+ || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
+ putchar (line_delim);
+ if (wc == WEOF)
+ break;
+ field_idx = 1;
+ current_rp = frp;
+ found_any_selected_field = 0;
+ }
+ }
+}
+#endif
+
static void
cut_stream (FILE *stream)
{
- if (operating_mode == byte_mode)
- cut_bytes (stream);
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
+ {
+ switch (operating_mode)
+ {
+ case byte_mode:
+ if (byte_mode_character_aware)
+ cut_characters_or_cut_bytes_no_split (stream);
+ else
+ cut_bytes (stream);
+ break;
+
+ case character_mode:
+ cut_characters_or_cut_bytes_no_split (stream);
+ break;
+
+ case field_mode:
+ if (delimlen == 1)
+ {
+ /* Check if we have utf8 multibyte locale, so we can use this
+ optimization because of uniqueness of characters, which is
+ not true for e.g. SJIS */
+ char * loc = setlocale(LC_CTYPE, NULL);
+ if (loc && (strstr (loc, "UTF-8") || strstr (loc, "utf-8") ||
+ strstr (loc, "UTF8") || strstr (loc, "utf8")))
+ {
+ cut_fields (stream);
+ break;
+ }
+ }
+ cut_fields_mb (stream);
+ break;
+
+ default:
+ abort ();
+ }
+ }
else
- cut_fields (stream);
+#endif
+ {
+ if (operating_mode == field_mode)
+ cut_fields (stream);
+ else
+ cut_bytes (stream);
+ }
}
/* Process file FILE to standard output.
@@ -483,6 +836,7 @@ main (int argc, char **argv)
bool ok;
bool delim_specified = false;
char *spec_list_string IF_LINT ( = NULL);
+ char mbdelim[MB_LEN_MAX + 1];
initialize_main (&argc, &argv);
set_program_name (argv[0]);
@@ -505,7 +859,6 @@ main (int argc, char **argv)
switch (optc)
{
case 'b':
- case 'c':
/* Build the byte list. */
if (operating_mode != undefined_mode)
FATAL_ERROR (_("only one type of list may be specified"));
@@ -513,6 +866,14 @@ main (int argc, char **argv)
spec_list_string = optarg;
break;
+ case 'c':
+ /* Build the character list. */
+ if (operating_mode != undefined_mode)
+ FATAL_ERROR (_("only one type of list may be specified"));
+ operating_mode = character_mode;
+ spec_list_string = optarg;
+ break;
+
case 'f':
/* Build the field list. */
if (operating_mode != undefined_mode)
@@ -524,10 +885,38 @@ main (int argc, char **argv)
case 'd':
/* New delimiter. */
/* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */
- if (optarg[0] != '\0' && optarg[1] != '\0')
- FATAL_ERROR (_("the delimiter must be a single character"));
- delim = optarg[0];
- delim_specified = true;
+ {
+#if HAVE_MBRTOWC
+ if(MB_CUR_MAX > 1)
+ {
+ mbstate_t state;
+
+ memset (&state, '\0', sizeof(mbstate_t));
+ delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state);
+
+ if (delimlen == (size_t)-1 || delimlen == (size_t)-2)
+ ++force_singlebyte_mode;
+ else
+ {
+ delimlen = (delimlen < 1) ? 1 : delimlen;
+ if (wcdelim != L'\0' && *(optarg + delimlen) != '\0')
+ FATAL_ERROR (_("the delimiter must be a single character"));
+ memcpy (mbdelim, optarg, delimlen);
+ mbdelim[delimlen] = '\0';
+ if (delimlen == 1)
+ delim = *optarg;
+ }
+ }
+
+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
+#endif
+ {
+ if (optarg[0] != '\0' && optarg[1] != '\0')
+ FATAL_ERROR (_("the delimiter must be a single character"));
+ delim = (unsigned char) optarg[0];
+ }
+ delim_specified = true;
+ }
break;
case OUTPUT_DELIMITER_OPTION:
@@ -540,6 +929,7 @@ main (int argc, char **argv)
break;
case 'n':
+ byte_mode_character_aware = 1;
break;
case 's':
@@ -579,15 +969,34 @@ main (int argc, char **argv)
| (complement ? SETFLD_COMPLEMENT : 0) );
if (!delim_specified)
- delim = '\t';
+ {
+ delim = '\t';
+#ifdef HAVE_MBRTOWC
+ wcdelim = L'\t';
+ mbdelim[0] = '\t';
+ mbdelim[1] = '\0';
+ delimlen = 1;
+#endif
+ }
if (output_delimiter_string == NULL)
{
- static char dummy[2];
- dummy[0] = delim;
- dummy[1] = '\0';
- output_delimiter_string = dummy;
- output_delimiter_length = 1;
+#ifdef HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
+ {
+ output_delimiter_string = xstrdup(mbdelim);
+ output_delimiter_length = delimlen;
+ }
+
+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
+#endif
+ {
+ static char dummy[2];
+ dummy[0] = delim;
+ dummy[1] = '\0';
+ output_delimiter_string = dummy;
+ output_delimiter_length = 1;
+ }
}
if (optind == argc)