Blame SOURCES/coreutils-i18n-expand-unexpand.patch

f5e30c
From e87ab5b991b08092a7e07af82b3ec822a8604151 Mon Sep 17 00:00:00 2001
f5e30c
From: Ondrej Oprala <ooprala@redhat.com>
f5e30c
Date: Wed, 5 Aug 2015 09:15:09 +0200
f5e30c
Subject: [PATCH] expand,unexpand: add multibyte support
f5e30c
MIME-Version: 1.0
f5e30c
Content-Type: text/plain; charset=UTF-8
f5e30c
Content-Transfer-Encoding: 8bit
f5e30c
f5e30c
* NEWS: Mention the changes.
f5e30c
* bootstrap.conf: Add mbfile to the list of modules.
f5e30c
* configure.ac: Properly initialize mbfile.
f5e30c
* src/expand.c (expand): Iterate over multibyte characters properly.
f5e30c
* src/unexpand.c (unexpand): Iterate over multibyte characters
f5e30c
properly.
f5e30c
* tests/local.mk: Add new tests.
f5e30c
* tests/{expand,unexpand}/mb.sh: New tests.
f5e30c
f5e30c
Co-authored-by: Pádraig Brady <pbrady@redhat.com>
f5e30c
---
f5e30c
 bootstrap.conf       |   1 +
f5e30c
 configure.ac         |   2 +
f5e30c
 lib/mbfile.c         |   3 +
f5e30c
 lib/mbfile.h         | 255 +++++++++++++++++++++++++++++++++++++++++++++++++++
f5e30c
 m4/mbfile.m4         |  14 +++
f5e30c
 src/expand.c         |  43 +++++----
f5e30c
 src/unexpand.c       |  54 +++++++----
f5e30c
 tests/expand/mb.sh   |  98 ++++++++++++++++++++
f5e30c
 tests/local.mk       |   2 +
f5e30c
 tests/unexpand/mb.sh |  97 ++++++++++++++++++++
f5e30c
 10 files changed, 535 insertions(+), 34 deletions(-)
f5e30c
 create mode 100644 lib/mbfile.c
f5e30c
 create mode 100644 lib/mbfile.h
f5e30c
 create mode 100644 m4/mbfile.m4
f5e30c
 create mode 100755 tests/expand/mb.sh
f5e30c
 create mode 100755 tests/unexpand/mb.sh
f5e30c
f5e30c
diff --git a/bootstrap.conf b/bootstrap.conf
f5e30c
index 8a0ff31..a1c78b2 100644
f5e30c
--- a/bootstrap.conf
f5e30c
+++ b/bootstrap.conf
f5e30c
@@ -152,6 +152,7 @@ gnulib_modules="
f5e30c
   maintainer-makefile
f5e30c
   malloc-gnu
f5e30c
   manywarnings
f5e30c
+  mbfile
f5e30c
   mbrlen
f5e30c
   mbrtowc
f5e30c
   mbsalign
f5e30c
diff --git a/configure.ac b/configure.ac
f5e30c
index 1e74b36..24c9725 100644
f5e30c
--- a/configure.ac
f5e30c
+++ b/configure.ac
f5e30c
@@ -427,6 +427,8 @@ fi
f5e30c
 # I'm leaving it here for now.  This whole thing needs to be modernized...
f5e30c
 gl_WINSIZE_IN_PTEM
f5e30c
 
f5e30c
+gl_MBFILE
f5e30c
+
f5e30c
 gl_HEADER_TIOCGWINSZ_IN_TERMIOS_H
f5e30c
 
f5e30c
 if test $gl_cv_sys_tiocgwinsz_needs_termios_h = no && \
f5e30c
diff --git a/lib/mbfile.c b/lib/mbfile.c
f5e30c
new file mode 100644
f5e30c
index 0000000..b0a468e
f5e30c
--- /dev/null
f5e30c
+++ b/lib/mbfile.c
f5e30c
@@ -0,0 +1,3 @@
f5e30c
+#include <config.h>
f5e30c
+#define MBFILE_INLINE _GL_EXTERN_INLINE
f5e30c
+#include "mbfile.h"
f5e30c
diff --git a/lib/mbfile.h b/lib/mbfile.h
f5e30c
new file mode 100644
f5e30c
index 0000000..11f1b12
f5e30c
--- /dev/null
f5e30c
+++ b/lib/mbfile.h
f5e30c
@@ -0,0 +1,255 @@
f5e30c
+/* Multibyte character I/O: macros for multi-byte encodings.
f5e30c
+   Copyright (C) 2001, 2005, 2009-2015 Free Software Foundation, Inc.
f5e30c
+
f5e30c
+   This program is free software: you can redistribute it and/or modify
f5e30c
+   it under the terms of the GNU General Public License as published by
f5e30c
+   the Free Software Foundation; either version 3 of the License, or
f5e30c
+   (at your option) any later version.
f5e30c
+
f5e30c
+   This program is distributed in the hope that it will be useful,
f5e30c
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
f5e30c
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
f5e30c
+   GNU General Public License for more details.
f5e30c
+
f5e30c
+   You should have received a copy of the GNU General Public License
f5e30c
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
f5e30c
+
f5e30c
+/* Written by Mitsuru Chinen <mchinen@yamato.ibm.com>
f5e30c
+   and Bruno Haible <bruno@clisp.org>.  */
f5e30c
+
f5e30c
+/* The macros in this file implement multi-byte character input from a
f5e30c
+   stream.
f5e30c
+
f5e30c
+   mb_file_t
f5e30c
+     is the type for multibyte character input stream, usable for variable
f5e30c
+     declarations.
f5e30c
+
f5e30c
+   mbf_char_t
f5e30c
+     is the type for multibyte character or EOF, usable for variable
f5e30c
+     declarations.
f5e30c
+
f5e30c
+   mbf_init (mbf, stream)
f5e30c
+     initializes the MB_FILE for reading from stream.
f5e30c
+
f5e30c
+   mbf_getc (mbc, mbf)
f5e30c
+     reads the next multibyte character from mbf and stores it in mbc.
f5e30c
+
f5e30c
+   mb_iseof (mbc)
f5e30c
+     returns true if mbc represents the EOF value.
f5e30c
+
f5e30c
+   Here are the function prototypes of the macros.
f5e30c
+
f5e30c
+   extern void          mbf_init (mb_file_t mbf, FILE *stream);
f5e30c
+   extern void          mbf_getc (mbf_char_t mbc, mb_file_t mbf);
f5e30c
+   extern bool          mb_iseof (const mbf_char_t mbc);
f5e30c
+ */
f5e30c
+
f5e30c
+#ifndef _MBFILE_H
f5e30c
+#define _MBFILE_H 1
f5e30c
+
f5e30c
+#include <assert.h>
f5e30c
+#include <stdbool.h>
f5e30c
+#include <stdio.h>
f5e30c
+#include <string.h>
f5e30c
+
f5e30c
+/* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
f5e30c
+   <wchar.h>.
f5e30c
+   BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
f5e30c
+   <wchar.h>.  */
f5e30c
+#include <stdio.h>
f5e30c
+#include <time.h>
f5e30c
+#include <wchar.h>
f5e30c
+
f5e30c
+#include "mbchar.h"
f5e30c
+
f5e30c
+#ifndef _GL_INLINE_HEADER_BEGIN
f5e30c
+ #error "Please include config.h first."
f5e30c
+#endif
f5e30c
+_GL_INLINE_HEADER_BEGIN
f5e30c
+#ifndef MBFILE_INLINE
f5e30c
+# define MBFILE_INLINE _GL_INLINE
f5e30c
+#endif
f5e30c
+
f5e30c
+struct mbfile_multi {
f5e30c
+  FILE *fp;
f5e30c
+  bool eof_seen;
f5e30c
+  bool have_pushback;
f5e30c
+  mbstate_t state;
f5e30c
+  unsigned int bufcount;
f5e30c
+  char buf[MBCHAR_BUF_SIZE];
f5e30c
+  struct mbchar pushback;
f5e30c
+};
f5e30c
+
f5e30c
+MBFILE_INLINE void
f5e30c
+mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf)
f5e30c
+{
f5e30c
+  size_t bytes;
f5e30c
+
f5e30c
+  /* If EOF has already been seen, don't use getc.  This matters if
f5e30c
+     mbf->fp is connected to an interactive tty.  */
f5e30c
+  if (mbf->eof_seen)
f5e30c
+    goto eof;
f5e30c
+
f5e30c
+  /* Return character pushed back, if there is one.  */
f5e30c
+  if (mbf->have_pushback)
f5e30c
+    {
f5e30c
+      mb_copy (mbc, &mbf->pushback);
f5e30c
+      mbf->have_pushback = false;
f5e30c
+      return;
f5e30c
+    }
f5e30c
+
f5e30c
+  /* Before using mbrtowc, we need at least one byte.  */
f5e30c
+  if (mbf->bufcount == 0)
f5e30c
+    {
f5e30c
+      int c = getc (mbf->fp);
f5e30c
+      if (c == EOF)
f5e30c
+        {
f5e30c
+          mbf->eof_seen = true;
f5e30c
+          goto eof;
f5e30c
+        }
f5e30c
+      mbf->buf[0] = (unsigned char) c;
f5e30c
+      mbf->bufcount++;
f5e30c
+    }
f5e30c
+
f5e30c
+  /* Handle most ASCII characters quickly, without calling mbrtowc().  */
f5e30c
+  if (mbf->bufcount == 1 && mbsinit (&mbf->state) && is_basic (mbf->buf[0]))
f5e30c
+    {
f5e30c
+      /* These characters are part of the basic character set.  ISO C 99
f5e30c
+         guarantees that their wide character code is identical to their
f5e30c
+         char code.  */
f5e30c
+      mbc->wc = mbc->buf[0] = mbf->buf[0];
f5e30c
+      mbc->wc_valid = true;
f5e30c
+      mbc->ptr = &mbc->buf[0];
f5e30c
+      mbc->bytes = 1;
f5e30c
+      mbf->bufcount = 0;
f5e30c
+      return;
f5e30c
+    }
f5e30c
+
f5e30c
+  /* Use mbrtowc on an increasing number of bytes.  Read only as many bytes
f5e30c
+     from mbf->fp as needed.  This is needed to give reasonable interactive
f5e30c
+     behaviour when mbf->fp is connected to an interactive tty.  */
f5e30c
+  for (;;)
f5e30c
+    {
f5e30c
+      /* We don't know whether the 'mbrtowc' function updates the state when
f5e30c
+         it returns -2, - this is the ISO C 99 and glibc-2.2 behaviour - or
f5e30c
+         not - amended ANSI C, glibc-2.1 and Solaris 2.7 behaviour.  We
f5e30c
+         don't have an autoconf test for this, yet.
f5e30c
+         The new behaviour would allow us to feed the bytes one by one into
f5e30c
+         mbrtowc.  But the old behaviour forces us to feed all bytes since
f5e30c
+         the end of the last character into mbrtowc.  Since we want to retry
f5e30c
+         with more bytes when mbrtowc returns -2, we must backup the state
f5e30c
+         before calling mbrtowc, because implementations with the new
f5e30c
+         behaviour will clobber it.  */
f5e30c
+      mbstate_t backup_state = mbf->state;
f5e30c
+
f5e30c
+      bytes = mbrtowc (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state);
f5e30c
+
f5e30c
+      if (bytes == (size_t) -1)
f5e30c
+        {
f5e30c
+          /* An invalid multibyte sequence was encountered.  */
f5e30c
+          /* Return a single byte.  */
f5e30c
+          bytes = 1;
f5e30c
+          mbc->wc_valid = false;
f5e30c
+          break;
f5e30c
+        }
f5e30c
+      else if (bytes == (size_t) -2)
f5e30c
+        {
f5e30c
+          /* An incomplete multibyte character.  */
f5e30c
+          mbf->state = backup_state;
f5e30c
+          if (mbf->bufcount == MBCHAR_BUF_SIZE)
f5e30c
+            {
f5e30c
+              /* An overlong incomplete multibyte sequence was encountered.  */
f5e30c
+              /* Return a single byte.  */
f5e30c
+              bytes = 1;
f5e30c
+              mbc->wc_valid = false;
f5e30c
+              break;
f5e30c
+            }
f5e30c
+          else
f5e30c
+            {
f5e30c
+              /* Read one more byte and retry mbrtowc.  */
f5e30c
+              int c = getc (mbf->fp);
f5e30c
+              if (c == EOF)
f5e30c
+                {
f5e30c
+                  /* An incomplete multibyte character at the end.  */
f5e30c
+                  mbf->eof_seen = true;
f5e30c
+                  bytes = mbf->bufcount;
f5e30c
+                  mbc->wc_valid = false;
f5e30c
+                  break;
f5e30c
+                }
f5e30c
+              mbf->buf[mbf->bufcount] = (unsigned char) c;
f5e30c
+              mbf->bufcount++;
f5e30c
+            }
f5e30c
+        }
f5e30c
+      else
f5e30c
+        {
f5e30c
+          if (bytes == 0)
f5e30c
+            {
f5e30c
+              /* A null wide character was encountered.  */
f5e30c
+              bytes = 1;
f5e30c
+              assert (mbf->buf[0] == '\0');
f5e30c
+              assert (mbc->wc == 0);
f5e30c
+            }
f5e30c
+          mbc->wc_valid = true;
f5e30c
+          break;
f5e30c
+        }
f5e30c
+    }
f5e30c
+
f5e30c
+  /* Return the multibyte sequence mbf->buf[0..bytes-1].  */
f5e30c
+  mbc->ptr = &mbc->buf[0];
f5e30c
+  memcpy (&mbc->buf[0], &mbf->buf[0], bytes);
f5e30c
+  mbc->bytes = bytes;
f5e30c
+
f5e30c
+  mbf->bufcount -= bytes;
f5e30c
+  if (mbf->bufcount > 0)
f5e30c
+    {
f5e30c
+      /* It's not worth calling memmove() for so few bytes.  */
f5e30c
+      unsigned int count = mbf->bufcount;
f5e30c
+      char *p = &mbf->buf[0];
f5e30c
+
f5e30c
+      do
f5e30c
+        {
f5e30c
+          *p = *(p + bytes);
f5e30c
+          p++;
f5e30c
+        }
f5e30c
+      while (--count > 0);
f5e30c
+    }
f5e30c
+  return;
f5e30c
+
f5e30c
+eof:
f5e30c
+  /* An mbchar_t with bytes == 0 is used to indicate EOF.  */
f5e30c
+  mbc->ptr = NULL;
f5e30c
+  mbc->bytes = 0;
f5e30c
+  mbc->wc_valid = false;
f5e30c
+  return;
f5e30c
+}
f5e30c
+
f5e30c
+MBFILE_INLINE void
f5e30c
+mbfile_multi_ungetc (const struct mbchar *mbc, struct mbfile_multi *mbf)
f5e30c
+{
f5e30c
+  mb_copy (&mbf->pushback, mbc);
f5e30c
+  mbf->have_pushback = true;
f5e30c
+}
f5e30c
+
f5e30c
+typedef struct mbfile_multi mb_file_t;
f5e30c
+
f5e30c
+typedef mbchar_t mbf_char_t;
f5e30c
+
f5e30c
+#define mbf_init(mbf, stream)                                           \
f5e30c
+  ((mbf).fp = (stream),                                                 \
f5e30c
+   (mbf).eof_seen = false,                                              \
f5e30c
+   (mbf).have_pushback = false,                                         \
f5e30c
+   memset (&(mbf).state, '\0', sizeof (mbstate_t)),                     \
f5e30c
+   (mbf).bufcount = 0)
f5e30c
+
f5e30c
+#define mbf_getc(mbc, mbf) mbfile_multi_getc (&(mbc), &(mbf))
f5e30c
+
f5e30c
+#define mbf_ungetc(mbc, mbf) mbfile_multi_ungetc (&(mbc), &(mbf))
f5e30c
+
f5e30c
+#define mb_iseof(mbc) ((mbc).bytes == 0)
f5e30c
+
f5e30c
+#ifndef _GL_INLINE_HEADER_BEGIN
f5e30c
+ #error "Please include config.h first."
f5e30c
+#endif
f5e30c
+_GL_INLINE_HEADER_END
f5e30c
+
f5e30c
+#endif /* _MBFILE_H */
f5e30c
diff --git a/m4/mbfile.m4 b/m4/mbfile.m4
f5e30c
new file mode 100644
f5e30c
index 0000000..8589902
f5e30c
--- /dev/null
f5e30c
+++ b/m4/mbfile.m4
f5e30c
@@ -0,0 +1,14 @@
f5e30c
+# mbfile.m4 serial 7
f5e30c
+dnl Copyright (C) 2005, 2008-2015 Free Software Foundation, Inc.
f5e30c
+dnl This file is free software; the Free Software Foundation
f5e30c
+dnl gives unlimited permission to copy and/or distribute it,
f5e30c
+dnl with or without modifications, as long as this notice is preserved.
f5e30c
+
f5e30c
+dnl autoconf tests required for use of mbfile.h
f5e30c
+dnl From Bruno Haible.
f5e30c
+
f5e30c
+AC_DEFUN([gl_MBFILE],
f5e30c
+[
f5e30c
+  AC_REQUIRE([AC_TYPE_MBSTATE_T])
f5e30c
+  :
f5e30c
+])
f5e30c
diff --git a/src/expand.c b/src/expand.c
f5e30c
index 9fa2e10..380e020 100644
f5e30c
--- a/src/expand.c
f5e30c
+++ b/src/expand.c
f5e30c
@@ -37,6 +37,9 @@
f5e30c
 #include <stdio.h>
f5e30c
 #include <getopt.h>
f5e30c
 #include <sys/types.h>
f5e30c
+
f5e30c
+#include <mbfile.h>
f5e30c
+
f5e30c
 #include "system.h"
f5e30c
 #include "die.h"
f5e30c
 #include "xstrndup.h"
f5e30c
@@ -100,19 +103,19 @@ expand (void)
f5e30c
 {
f5e30c
   /* Input stream.  */
f5e30c
   FILE *fp = next_file (NULL);
f5e30c
+  mb_file_t mbf;
f5e30c
+  mbf_char_t c;
f5e30c
 
f5e30c
   if (!fp)
f5e30c
     return;
f5e30c
 
f5e30c
+  mbf_init (mbf, fp);
f5e30c
+
f5e30c
   while (true)
f5e30c
     {
f5e30c
-      /* Input character, or EOF.  */
f5e30c
-      int c;
f5e30c
-
f5e30c
       /* If true, perform translations.  */
f5e30c
       bool convert = true;
f5e30c
 
f5e30c
-
f5e30c
       /* The following variables have valid values only when CONVERT
f5e30c
          is true:  */
f5e30c
 
f5e30c
@@ -122,17 +125,23 @@ expand (void)
f5e30c
       /* Index in TAB_LIST of next tab stop to examine.  */
f5e30c
       size_t tab_index = 0;
f5e30c
 
f5e30c
-
f5e30c
       /* Convert a line of text.  */
f5e30c
 
f5e30c
       do
f5e30c
         {
f5e30c
-          while ((c = getc (fp)) < 0 && (fp = next_file (fp)))
f5e30c
-            continue;
f5e30c
+          while (true) {
f5e30c
+            mbf_getc (c, mbf);
f5e30c
+            if (mb_iseof (c) && (fp = next_file (fp)))
f5e30c
+              {
f5e30c
+                mbf_init (mbf, fp);
f5e30c
+                continue;
f5e30c
+              }
f5e30c
+            break;
f5e30c
+          }
f5e30c
 
f5e30c
           if (convert)
f5e30c
             {
f5e30c
-              if (c == '\t')
f5e30c
+              if (mb_iseq (c, '\t'))
f5e30c
                 {
f5e30c
                   /* Column the next input tab stop is on.  */
f5e30c
                   uintmax_t next_tab_column;
f5e30c
@@ -151,32 +160,34 @@ expand (void)
f5e30c
                     if (putchar (' ') < 0)
f5e30c
                       die (EXIT_FAILURE, errno, _("write error"));
f5e30c
 
f5e30c
-                  c = ' ';
f5e30c
+                  mb_setascii (&c, ' ');
f5e30c
                 }
f5e30c
-              else if (c == '\b')
f5e30c
+              else if (mb_iseq (c, '\b'))
f5e30c
                 {
f5e30c
                   /* Go back one column, and force recalculation of the
f5e30c
                      next tab stop.  */
f5e30c
                   column -= !!column;
f5e30c
                   tab_index -= !!tab_index;
f5e30c
                 }
f5e30c
-              else
f5e30c
+              /* A leading control character must not trip the check below.  */
f5e30c
+              else if (!mb_iscntrl (c))
f5e30c
                 {
f5e30c
-                  column++;
f5e30c
+                  column += mb_width (c);
f5e30c
                   if (!column)
f5e30c
                     die (EXIT_FAILURE, 0, _("input line is too long"));
f5e30c
                 }
f5e30c
 
f5e30c
-              convert &= convert_entire_line || !! isblank (c);
f5e30c
+              convert &= convert_entire_line || mb_isblank (c);
f5e30c
             }
f5e30c
 
f5e30c
-          if (c < 0)
f5e30c
+          if (mb_iseof (c))
f5e30c
             return;
f5e30c
 
f5e30c
-          if (putchar (c) < 0)
f5e30c
+          mb_putc (c, stdout);
f5e30c
+          if (ferror (stdout))
f5e30c
             die (EXIT_FAILURE, errno, _("write error"));
f5e30c
         }
f5e30c
-      while (c != '\n');
f5e30c
+      while (!mb_iseq (c, '\n'));
f5e30c
     }
f5e30c
 }
f5e30c
 
f5e30c
diff --git a/src/unexpand.c b/src/unexpand.c
f5e30c
index 7801274..569a7ee 100644
f5e30c
--- a/src/unexpand.c
f5e30c
+++ b/src/unexpand.c
f5e30c
@@ -38,6 +38,9 @@
f5e30c
 #include <stdio.h>
f5e30c
 #include <getopt.h>
f5e30c
 #include <sys/types.h>
f5e30c
+
f5e30c
+#include <mbfile.h>
f5e30c
+
f5e30c
 #include "system.h"
f5e30c
 #include "die.h"
f5e30c
 #include "xstrndup.h"
f5e30c
@@ -107,11 +110,12 @@ unexpand (void)
f5e30c
 {
f5e30c
   /* Input stream.  */
f5e30c
   FILE *fp = next_file (NULL);
f5e30c
+  mb_file_t mbf;
f5e30c
 
f5e30c
   /* The array of pending blanks.  In non-POSIX locales, blanks can
f5e30c
      include characters other than spaces, so the blanks must be
f5e30c
      stored, not merely counted.  */
f5e30c
-  char *pending_blank;
f5e30c
+  mbf_char_t *pending_blank;
f5e30c
 
f5e30c
   if (!fp)
f5e30c
     return;
f5e30c
@@ -119,12 +123,14 @@ unexpand (void)
f5e30c
   /* The worst case is a non-blank character, then one blank, then a
f5e30c
      tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so
f5e30c
      allocate MAX_COLUMN_WIDTH bytes to store the blanks.  */
f5e30c
-  pending_blank = xmalloc (max_column_width);
f5e30c
+  pending_blank = xmalloc (max_column_width * sizeof (mbf_char_t));
f5e30c
+
f5e30c
+  mbf_init (mbf, fp);
f5e30c
 
f5e30c
   while (true)
f5e30c
     {
f5e30c
       /* Input character, or EOF.  */
f5e30c
-      int c;
f5e30c
+      mbf_char_t c;
f5e30c
 
f5e30c
       /* If true, perform translations.  */
f5e30c
       bool convert = true;
f5e30c
@@ -158,12 +164,19 @@ unexpand (void)
f5e30c
 
f5e30c
       do
f5e30c
         {
f5e30c
-          while ((c = getc (fp)) < 0 && (fp = next_file (fp)))
f5e30c
-            continue;
f5e30c
+          while (true) {
f5e30c
+            mbf_getc (c, mbf);
f5e30c
+            if (mb_iseof (c) && (fp = next_file (fp)))
f5e30c
+              {
f5e30c
+                mbf_init (mbf, fp);
f5e30c
+                continue;
f5e30c
+              }
f5e30c
+            break;
f5e30c
+          }
f5e30c
 
f5e30c
           if (convert)
f5e30c
             {
f5e30c
-              bool blank = !! isblank (c);
f5e30c
+              bool blank = mb_isblank (c);
f5e30c
 
f5e30c
               if (blank)
f5e30c
                 {
f5e30c
@@ -180,16 +193,16 @@ unexpand (void)
f5e30c
                       if (next_tab_column < column)
f5e30c
                         die (EXIT_FAILURE, 0, _("input line is too long"));
f5e30c
 
f5e30c
-                      if (c == '\t')
f5e30c
+                      if (mb_iseq (c, '\t'))
f5e30c
                         {
f5e30c
                           column = next_tab_column;
f5e30c
 
f5e30c
                           if (pending)
f5e30c
-                            pending_blank[0] = '\t';
f5e30c
+                            mb_setascii (&pending_blank[0], '\t');
f5e30c
                         }
f5e30c
                       else
f5e30c
                         {
f5e30c
-                          column++;
f5e30c
+                          column += mb_width (c);
f5e30c
 
f5e30c
                           if (! (prev_blank && column == next_tab_column))
f5e30c
                             {
f5e30c
@@ -197,13 +210,14 @@ unexpand (void)
f5e30c
                                  will be replaced by tabs.  */
f5e30c
                               if (column == next_tab_column)
f5e30c
                                 one_blank_before_tab_stop = true;
f5e30c
-                              pending_blank[pending++] = c;
f5e30c
+                              mb_copy (&pending_blank[pending++], &c);
f5e30c
                               prev_blank = true;
f5e30c
                               continue;
f5e30c
                             }
f5e30c
 
f5e30c
                           /* Replace the pending blanks by a tab or two.  */
f5e30c
-                          pending_blank[0] = c = '\t';
f5e30c
+                          mb_setascii (&c, '\t');
f5e30c
+                          mb_setascii (&pending_blank[0], '\t');
f5e30c
                         }
f5e30c
 
f5e30c
                       /* Discard pending blanks, unless it was a single
f5e30c
@@ -211,7 +225,7 @@ unexpand (void)
f5e30c
                       pending = one_blank_before_tab_stop;
f5e30c
                     }
f5e30c
                 }
f5e30c
-              else if (c == '\b')
f5e30c
+              else if (mb_iseq (c, '\b'))
f5e30c
                 {
f5e30c
                   /* Go back one column, and force recalculation of the
f5e30c
                      next tab stop.  */
ff1630
@@ -221,16 +235,20 @@ unexpand (void)
f5e30c
                 }
f5e30c
               else
f5e30c
                 {
f5e30c
-                  column++;
ff1630
-                  if (!column)
ff1630
+                  const uintmax_t orig_column = column;
f5e30c
+                  column += mb_width (c);
ff1630
+                  if (column < orig_column)
f5e30c
                     die (EXIT_FAILURE, 0, _("input line is too long"));
f5e30c
                 }
ff1630
 
f5e30c
               if (pending)
f5e30c
                 {
f5e30c
                   if (pending > 1 && one_blank_before_tab_stop)
f5e30c
-                    pending_blank[0] = '\t';
f5e30c
-                  if (fwrite (pending_blank, 1, pending, stdout) != pending)
f5e30c
+                    mb_setascii (&pending_blank[0], '\t');
f5e30c
+
f5e30c
+                  for (int n = 0; n < pending; ++n)
f5e30c
+                    mb_putc (pending_blank[n], stdout);
f5e30c
+                  if (ferror (stdout))
f5e30c
                     die (EXIT_FAILURE, errno, _("write error"));
f5e30c
                   pending = 0;
f5e30c
                   one_blank_before_tab_stop = false;
ff1630
@@ -240,16 +258,17 @@ unexpand (void)
f5e30c
               convert &= convert_entire_line || blank;
f5e30c
             }
f5e30c
 
f5e30c
-          if (c < 0)
f5e30c
+          if (mb_iseof (c))
f5e30c
             {
f5e30c
               free (pending_blank);
f5e30c
               return;
f5e30c
             }
f5e30c
 
f5e30c
-          if (putchar (c) < 0)
f5e30c
+          mb_putc (c, stdout);
f5e30c
+          if (ferror (stdout))
f5e30c
             die (EXIT_FAILURE, errno, _("write error"));
f5e30c
         }
f5e30c
-      while (c != '\n');
f5e30c
+      while (!mb_iseq (c, '\n'));
f5e30c
     }
f5e30c
 }
f5e30c
 
f5e30c
diff --git a/tests/expand/mb.sh b/tests/expand/mb.sh
f5e30c
new file mode 100755
f5e30c
index 0000000..7971e18
f5e30c
--- /dev/null
f5e30c
+++ b/tests/expand/mb.sh
f5e30c
@@ -0,0 +1,98 @@
f5e30c
+#!/bin/sh
f5e30c
+
f5e30c
+# Copyright (C) 2012-2015 Free Software Foundation, Inc.
f5e30c
+
f5e30c
+# This program is free software: you can redistribute it and/or modify
f5e30c
+# it under the terms of the GNU General Public License as published by
f5e30c
+# the Free Software Foundation, either version 3 of the License, or
f5e30c
+# (at your option) any later version.
f5e30c
+
f5e30c
+# This program is distributed in the hope that it will be useful,
f5e30c
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
f5e30c
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
f5e30c
+# GNU General Public License for more details.
f5e30c
+
f5e30c
+# You should have received a copy of the GNU General Public License
f5e30c
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
f5e30c
+
f5e30c
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
f5e30c
+print_ver_ expand
f5e30c
+
f5e30c
+export LC_ALL=en_US.UTF-8
f5e30c
+
f5e30c
+#input containing multibyte characters
f5e30c
+cat <<\EOF > in || framework_failure_
f5e30c
+1234567812345678123456781
f5e30c
+.       .       .       .
f5e30c
+a	b	c	d
f5e30c
+.       .       .       .
f5e30c
+ä	ö	ü	ß
f5e30c
+.       .       .       .
f5e30c
+EOF
f5e30c
+env printf '   äöü\t.    öüä.   \tä xx\n' >> in || framework_failure_
f5e30c
+
f5e30c
+cat <<\EOF > exp || framework_failure_
f5e30c
+1234567812345678123456781
f5e30c
+.       .       .       .
f5e30c
+a       b       c       d
f5e30c
+.       .       .       .
f5e30c
+ä       ö       ü       ß
f5e30c
+.       .       .       .
f5e30c
+   äöü  .    öüä.       ä xx
f5e30c
+EOF
f5e30c
+
f5e30c
+expand < in > out || fail=1
f5e30c
+compare exp out > /dev/null 2>&1 || fail=1
f5e30c
+
f5e30c
+#test characters with display widths != 1
f5e30c
+env printf '12345678
f5e30c
+e\t|ascii(1)
f5e30c
+\u00E9\t|composed(1)
f5e30c
+e\u0301\t|decomposed(1)
f5e30c
+\u3000\t|ideo-space(2)
f5e30c
+\uFF0D\t|full-hypen(2)
f5e30c
+' > in || framework_failure_
f5e30c
+
f5e30c
+env printf '12345678
f5e30c
+e       |ascii(1)
f5e30c
+\u00E9       |composed(1)
f5e30c
+e\u0301       |decomposed(1)
f5e30c
+\u3000      |ideo-space(2)
f5e30c
+\uFF0D      |full-hypen(2)
f5e30c
+' > exp || framework_failure_
f5e30c
+
f5e30c
+expand < in > out || fail=1
f5e30c
+compare exp out > /dev/null 2>&1 || fail=1
f5e30c
+
f5e30c
+#shouldn't fail with "input line too long"
f5e30c
+#when a line starts with a control character
f5e30c
+env printf '\n' > in || framework_failure_
f5e30c
+
f5e30c
+expand < in > out || fail=1
f5e30c
+compare in out > /dev/null 2>&1 || fail=1
f5e30c
+
f5e30c
+#non-Unicode characters interspersed between Unicode ones
f5e30c
+env printf '12345678
f5e30c
+\t\xFF|
f5e30c
+\xFF\t|
f5e30c
+\t\xFFä|
f5e30c
+ä\xFF\t|
f5e30c
+\tä\xFF|
f5e30c
+\xFF\tä|
f5e30c
+äbcdef\xFF\t|
f5e30c
+' > in || framework_failure_
f5e30c
+
f5e30c
+env printf '12345678
f5e30c
+        \xFF|
f5e30c
+\xFF       |
f5e30c
+        \xFFä|
f5e30c
+ä\xFF      |
f5e30c
+        ä\xFF|
f5e30c
+\xFF       ä|
f5e30c
+äbcdef\xFF |
f5e30c
+' > exp || framework_failure_
f5e30c
+
f5e30c
+expand < in > out || fail=1
f5e30c
+compare exp out > /dev/null 2>&1 || fail=1
f5e30c
+
f5e30c
+exit $fail
f5e30c
diff --git a/tests/local.mk b/tests/local.mk
f5e30c
index 192f776..8053397 100644
f5e30c
--- a/tests/local.mk
f5e30c
+++ b/tests/local.mk
f5e30c
@@ -544,6 +544,7 @@ all_tests =					\
f5e30c
   tests/du/threshold.sh				\
f5e30c
   tests/du/trailing-slash.sh			\
f5e30c
   tests/du/two-args.sh				\
f5e30c
+  tests/expand/mb.sh				\
f5e30c
   tests/id/gnu-zero-uids.sh			\
f5e30c
   tests/id/no-context.sh			\
f5e30c
   tests/id/context.sh				\
f5e30c
@@ -684,6 +685,7 @@ all_tests =					\
f5e30c
   tests/touch/read-only.sh			\
f5e30c
   tests/touch/relative.sh			\
f5e30c
   tests/touch/trailing-slash.sh			\
f5e30c
+  tests/unexpand/mb.sh				\
f5e30c
   $(all_root_tests)
f5e30c
 
f5e30c
 # See tests/factor/create-test.sh.
f5e30c
diff --git a/tests/unexpand/mb.sh b/tests/unexpand/mb.sh
f5e30c
new file mode 100755
f5e30c
index 0000000..60d4c1a
f5e30c
--- /dev/null
f5e30c
+++ b/tests/unexpand/mb.sh
f5e30c
@@ -0,0 +1,97 @@
f5e30c
+#!/bin/sh
f5e30c
+
f5e30c
+# Copyright (C) 2012-2015 Free Software Foundation, Inc.
f5e30c
+
f5e30c
+# This program is free software: you can redistribute it and/or modify
f5e30c
+# it under the terms of the GNU General Public License as published by
f5e30c
+# the Free Software Foundation, either version 3 of the License, or
f5e30c
+# (at your option) any later version.
f5e30c
+
f5e30c
+# This program is distributed in the hope that it will be useful,
f5e30c
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
f5e30c
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
f5e30c
+# GNU General Public License for more details.
f5e30c
+
f5e30c
+# You should have received a copy of the GNU General Public License
f5e30c
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
f5e30c
+
f5e30c
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
f5e30c
+print_ver_ unexpand
f5e30c
+
f5e30c
+export LC_ALL=en_US.UTF-8
f5e30c
+
f5e30c
+#input containing multibyte characters
f5e30c
+cat > in <<\EOF
f5e30c
+1234567812345678123456781
f5e30c
+.       .       .       .
f5e30c
+a       b       c       d
f5e30c
+.       .       .       .
f5e30c
+ä       ö       ü       ß
f5e30c
+.       .       .       .
f5e30c
+   äöü  .    öüä.       ä xx
f5e30c
+EOF
f5e30c
+
f5e30c
+cat > exp <<\EOF
f5e30c
+1234567812345678123456781
f5e30c
+.	.	.	.
f5e30c
+a	b	c	d
f5e30c
+.	.	.	.
f5e30c
+ä	ö	ü	ß
f5e30c
+.	.	.	.
f5e30c
+   äöü	.    öüä.	ä xx
f5e30c
+EOF
f5e30c
+
f5e30c
+unexpand -a < in > out || fail=1
f5e30c
+compare exp out > /dev/null 2>&1 || fail=1
f5e30c
+
f5e30c
+#test characters with a display width larger than 1
f5e30c
+
f5e30c
+env printf '12345678
f5e30c
+e       |ascii(1)
f5e30c
+\u00E9       |composed(1)
f5e30c
+e\u0301       |decomposed(1)
f5e30c
+\u3000      |ideo-space(2)
f5e30c
+\uFF0D      |full-hypen(2)
f5e30c
+' > in || framework_failure_
f5e30c
+
f5e30c
+env printf '12345678
f5e30c
+e\t|ascii(1)
f5e30c
+\u00E9\t|composed(1)
f5e30c
+e\u0301\t|decomposed(1)
f5e30c
+\u3000\t|ideo-space(2)
f5e30c
+\uFF0D\t|full-hypen(2)
f5e30c
+' > exp || framework_failure_
f5e30c
+
f5e30c
+unexpand -a < in > out || fail=1
f5e30c
+compare exp out > /dev/null 2>&1 || fail=1
f5e30c
+
f5e30c
+#test input where a blank of width > 1 is not being substituted
f5e30c
+in="$(LC_ALL=en_US.UTF-8 printf ' \u3000  ö       ü       ß')"
f5e30c
+exp='    ö	     ü	     ß'
f5e30c
+
f5e30c
+unexpand -a < in > out || fail=1
f5e30c
+compare exp out > /dev/null 2>&1 || fail=1
f5e30c
+
f5e30c
+#non-Unicode characters interspersed between Unicode ones
f5e30c
+env printf '12345678
f5e30c
+        \xFF|
f5e30c
+\xFF       |
f5e30c
+        \xFFä|
f5e30c
+ä\xFF      |
f5e30c
+        ä\xFF|
f5e30c
+\xFF       ä|
f5e30c
+äbcdef\xFF |
f5e30c
+' > in || framework_failure_
f5e30c
+
f5e30c
+env printf '12345678
f5e30c
+\t\xFF|
f5e30c
+\xFF\t|
f5e30c
+\t\xFFä|
f5e30c
+ä\xFF\t|
f5e30c
+\tä\xFF|
f5e30c
+\xFF\tä|
f5e30c
+äbcdef\xFF\t|
f5e30c
+' > exp || framework_failure_
f5e30c
+
f5e30c
+unexpand -a < in > out || fail=1
f5e30c
+compare exp out > /dev/null 2>&1 || fail=1
f5e30c
-- 
f5e30c
2.7.4
f5e30c