From e87ab5b991b08092a7e07af82b3ec822a8604151 Mon Sep 17 00:00:00 2001 From: Ondrej Oprala Date: Wed, 5 Aug 2015 09:15:09 +0200 Subject: [PATCH] expand,unexpand: add multibyte support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * NEWS: Mention the changes. * bootstrap.conf: Add mbfile to the list of modules. * configure.ac: Properly initialize mbfile. * src/expand.c (expand): Iterate over multibyte characters properly. * src/unexpand.c (unexpand): Iterate over multibyte characters properly. * tests/local.mk: Add new tests. * tests/{expand,unexpand}/mb.sh: New tests. Co-authored-by: Pádraig Brady --- bootstrap.conf | 1 + configure.ac | 2 + lib/mbfile.c | 3 + lib/mbfile.h | 255 +++++++++++++++++++++++++++++++++++++++++++++++++++ m4/mbfile.m4 | 14 +++ src/expand.c | 43 +++++---- src/unexpand.c | 54 +++++++---- tests/expand/mb.sh | 98 ++++++++++++++++++++ tests/local.mk | 2 + tests/unexpand/mb.sh | 97 ++++++++++++++++++++ 10 files changed, 535 insertions(+), 34 deletions(-) create mode 100644 lib/mbfile.c create mode 100644 lib/mbfile.h create mode 100644 m4/mbfile.m4 create mode 100755 tests/expand/mb.sh create mode 100755 tests/unexpand/mb.sh diff --git a/bootstrap.conf b/bootstrap.conf index 8a0ff31..a1c78b2 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -152,6 +152,7 @@ gnulib_modules=" maintainer-makefile malloc-gnu manywarnings + mbfile mbrlen mbrtowc mbsalign diff --git a/configure.ac b/configure.ac index 1e74b36..24c9725 100644 --- a/configure.ac +++ b/configure.ac @@ -427,6 +427,8 @@ fi # I'm leaving it here for now. This whole thing needs to be modernized... gl_WINSIZE_IN_PTEM +gl_MBFILE + gl_HEADER_TIOCGWINSZ_IN_TERMIOS_H if test $gl_cv_sys_tiocgwinsz_needs_termios_h = no && \ diff --git a/lib/mbfile.c b/lib/mbfile.c new file mode 100644 index 0000000..b0a468e --- /dev/null +++ b/lib/mbfile.c @@ -0,0 +1,3 @@ +#include +#define MBFILE_INLINE _GL_EXTERN_INLINE +#include "mbfile.h" diff --git a/lib/mbfile.h b/lib/mbfile.h new file mode 100644 index 0000000..11f1b12 --- /dev/null +++ b/lib/mbfile.h @@ -0,0 +1,255 @@ +/* Multibyte character I/O: macros for multi-byte encodings. + Copyright (C) 2001, 2005, 2009-2015 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +/* Written by Mitsuru Chinen + and Bruno Haible . */ + +/* The macros in this file implement multi-byte character input from a + stream. + + mb_file_t + is the type for multibyte character input stream, usable for variable + declarations. + + mbf_char_t + is the type for multibyte character or EOF, usable for variable + declarations. + + mbf_init (mbf, stream) + initializes the MB_FILE for reading from stream. + + mbf_getc (mbc, mbf) + reads the next multibyte character from mbf and stores it in mbc. + + mb_iseof (mbc) + returns true if mbc represents the EOF value. + + Here are the function prototypes of the macros. + + extern void mbf_init (mb_file_t mbf, FILE *stream); + extern void mbf_getc (mbf_char_t mbc, mb_file_t mbf); + extern bool mb_iseof (const mbf_char_t mbc); + */ + +#ifndef _MBFILE_H +#define _MBFILE_H 1 + +#include +#include +#include +#include + +/* Tru64 with Desktop Toolkit C has a bug: must be included before + . + BSD/OS 4.1 has a bug: and must be included before + . */ +#include +#include +#include + +#include "mbchar.h" + +#ifndef _GL_INLINE_HEADER_BEGIN + #error "Please include config.h first." +#endif +_GL_INLINE_HEADER_BEGIN +#ifndef MBFILE_INLINE +# define MBFILE_INLINE _GL_INLINE +#endif + +struct mbfile_multi { + FILE *fp; + bool eof_seen; + bool have_pushback; + mbstate_t state; + unsigned int bufcount; + char buf[MBCHAR_BUF_SIZE]; + struct mbchar pushback; +}; + +MBFILE_INLINE void +mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf) +{ + size_t bytes; + + /* If EOF has already been seen, don't use getc. This matters if + mbf->fp is connected to an interactive tty. */ + if (mbf->eof_seen) + goto eof; + + /* Return character pushed back, if there is one. */ + if (mbf->have_pushback) + { + mb_copy (mbc, &mbf->pushback); + mbf->have_pushback = false; + return; + } + + /* Before using mbrtowc, we need at least one byte. */ + if (mbf->bufcount == 0) + { + int c = getc (mbf->fp); + if (c == EOF) + { + mbf->eof_seen = true; + goto eof; + } + mbf->buf[0] = (unsigned char) c; + mbf->bufcount++; + } + + /* Handle most ASCII characters quickly, without calling mbrtowc(). */ + if (mbf->bufcount == 1 && mbsinit (&mbf->state) && is_basic (mbf->buf[0])) + { + /* These characters are part of the basic character set. ISO C 99 + guarantees that their wide character code is identical to their + char code. */ + mbc->wc = mbc->buf[0] = mbf->buf[0]; + mbc->wc_valid = true; + mbc->ptr = &mbc->buf[0]; + mbc->bytes = 1; + mbf->bufcount = 0; + return; + } + + /* Use mbrtowc on an increasing number of bytes. Read only as many bytes + from mbf->fp as needed. This is needed to give reasonable interactive + behaviour when mbf->fp is connected to an interactive tty. */ + for (;;) + { + /* We don't know whether the 'mbrtowc' function updates the state when + it returns -2, - this is the ISO C 99 and glibc-2.2 behaviour - or + not - amended ANSI C, glibc-2.1 and Solaris 2.7 behaviour. We + don't have an autoconf test for this, yet. + The new behaviour would allow us to feed the bytes one by one into + mbrtowc. But the old behaviour forces us to feed all bytes since + the end of the last character into mbrtowc. Since we want to retry + with more bytes when mbrtowc returns -2, we must backup the state + before calling mbrtowc, because implementations with the new + behaviour will clobber it. */ + mbstate_t backup_state = mbf->state; + + bytes = mbrtowc (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state); + + if (bytes == (size_t) -1) + { + /* An invalid multibyte sequence was encountered. */ + /* Return a single byte. */ + bytes = 1; + mbc->wc_valid = false; + break; + } + else if (bytes == (size_t) -2) + { + /* An incomplete multibyte character. */ + mbf->state = backup_state; + if (mbf->bufcount == MBCHAR_BUF_SIZE) + { + /* An overlong incomplete multibyte sequence was encountered. */ + /* Return a single byte. */ + bytes = 1; + mbc->wc_valid = false; + break; + } + else + { + /* Read one more byte and retry mbrtowc. */ + int c = getc (mbf->fp); + if (c == EOF) + { + /* An incomplete multibyte character at the end. */ + mbf->eof_seen = true; + bytes = mbf->bufcount; + mbc->wc_valid = false; + break; + } + mbf->buf[mbf->bufcount] = (unsigned char) c; + mbf->bufcount++; + } + } + else + { + if (bytes == 0) + { + /* A null wide character was encountered. */ + bytes = 1; + assert (mbf->buf[0] == '\0'); + assert (mbc->wc == 0); + } + mbc->wc_valid = true; + break; + } + } + + /* Return the multibyte sequence mbf->buf[0..bytes-1]. */ + mbc->ptr = &mbc->buf[0]; + memcpy (&mbc->buf[0], &mbf->buf[0], bytes); + mbc->bytes = bytes; + + mbf->bufcount -= bytes; + if (mbf->bufcount > 0) + { + /* It's not worth calling memmove() for so few bytes. */ + unsigned int count = mbf->bufcount; + char *p = &mbf->buf[0]; + + do + { + *p = *(p + bytes); + p++; + } + while (--count > 0); + } + return; + +eof: + /* An mbchar_t with bytes == 0 is used to indicate EOF. */ + mbc->ptr = NULL; + mbc->bytes = 0; + mbc->wc_valid = false; + return; +} + +MBFILE_INLINE void +mbfile_multi_ungetc (const struct mbchar *mbc, struct mbfile_multi *mbf) +{ + mb_copy (&mbf->pushback, mbc); + mbf->have_pushback = true; +} + +typedef struct mbfile_multi mb_file_t; + +typedef mbchar_t mbf_char_t; + +#define mbf_init(mbf, stream) \ + ((mbf).fp = (stream), \ + (mbf).eof_seen = false, \ + (mbf).have_pushback = false, \ + memset (&(mbf).state, '\0', sizeof (mbstate_t)), \ + (mbf).bufcount = 0) + +#define mbf_getc(mbc, mbf) mbfile_multi_getc (&(mbc), &(mbf)) + +#define mbf_ungetc(mbc, mbf) mbfile_multi_ungetc (&(mbc), &(mbf)) + +#define mb_iseof(mbc) ((mbc).bytes == 0) + +#ifndef _GL_INLINE_HEADER_BEGIN + #error "Please include config.h first." +#endif +_GL_INLINE_HEADER_BEGIN + +#endif /* _MBFILE_H */ diff --git a/m4/mbfile.m4 b/m4/mbfile.m4 new file mode 100644 index 0000000..8589902 --- /dev/null +++ b/m4/mbfile.m4 @@ -0,0 +1,14 @@ +# mbfile.m4 serial 7 +dnl Copyright (C) 2005, 2008-2015 Free Software Foundation, Inc. +dnl This file is free software; the Free Software Foundation +dnl gives unlimited permission to copy and/or distribute it, +dnl with or without modifications, as long as this notice is preserved. + +dnl autoconf tests required for use of mbfile.h +dnl From Bruno Haible. + +AC_DEFUN([gl_MBFILE], +[ + AC_REQUIRE([AC_TYPE_MBSTATE_T]) + : +]) diff --git a/src/expand.c b/src/expand.c index 9fa2e10..380e020 100644 --- a/src/expand.c +++ b/src/expand.c @@ -37,6 +37,9 @@ #include #include #include + +#include + #include "system.h" #include "die.h" #include "xstrndup.h" @@ -100,19 +103,19 @@ expand (void) { /* Input stream. */ FILE *fp = next_file (NULL); + mb_file_t mbf; + mbf_char_t c; if (!fp) return; + mbf_init (mbf, fp); + while (true) { - /* Input character, or EOF. */ - int c; - /* If true, perform translations. */ bool convert = true; - /* The following variables have valid values only when CONVERT is true: */ @@ -122,17 +125,23 @@ expand (void) /* Index in TAB_LIST of next tab stop to examine. */ size_t tab_index = 0; - /* Convert a line of text. */ do { - while ((c = getc (fp)) < 0 && (fp = next_file (fp))) - continue; + do { + mbf_getc (c, mbf); + if (mb_iseof (c)) + { + mbf_init (mbf, fp = next_file (fp)); + continue; + } + } + while (false); if (convert) { - if (c == '\t') + if (mb_iseq (c, '\t')) { /* Column the next input tab stop is on. */ uintmax_t next_tab_column; @@ -151,32 +160,34 @@ expand (void) if (putchar (' ') < 0) die (EXIT_FAILURE, errno, _("write error")); - c = ' '; + mb_setascii (&c, ' '); } - else if (c == '\b') + else if (mb_iseq (c, '\b')) { /* Go back one column, and force recalculation of the next tab stop. */ column -= !!column; tab_index -= !!tab_index; } - else + /* A leading control character could make us trip over. */ + else if (!mb_iscntrl (c)) { - column++; + column += mb_width (c); if (!column) die (EXIT_FAILURE, 0, _("input line is too long")); } - convert &= convert_entire_line || !! isblank (c); + convert &= convert_entire_line || mb_isblank (c); } - if (c < 0) + if (mb_iseof (c)) return; - if (putchar (c) < 0) + mb_putc (c, stdout); + if (ferror (stdout)) die (EXIT_FAILURE, errno, _("write error")); } - while (c != '\n'); + while (!mb_iseq (c, '\n')); } } diff --git a/src/unexpand.c b/src/unexpand.c index 7801274..569a7ee 100644 --- a/src/unexpand.c +++ b/src/unexpand.c @@ -38,6 +38,9 @@ #include #include #include + +#include + #include "system.h" #include "die.h" #include "xstrndup.h" @@ -107,11 +110,12 @@ unexpand (void) { /* Input stream. */ FILE *fp = next_file (NULL); + mb_file_t mbf; /* The array of pending blanks. In non-POSIX locales, blanks can include characters other than spaces, so the blanks must be stored, not merely counted. */ - char *pending_blank; + mbf_char_t *pending_blank; if (!fp) return; @@ -119,12 +123,14 @@ unexpand (void) /* The worst case is a non-blank character, then one blank, then a tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so allocate MAX_COLUMN_WIDTH bytes to store the blanks. */ - pending_blank = xmalloc (max_column_width); + pending_blank = xmalloc (max_column_width * sizeof (mbf_char_t)); + + mbf_init (mbf, fp); while (true) { /* Input character, or EOF. */ - int c; + mbf_char_t c; /* If true, perform translations. */ bool convert = true; @@ -158,12 +164,19 @@ unexpand (void) do { - while ((c = getc (fp)) < 0 && (fp = next_file (fp))) - continue; + do { + mbf_getc (c, mbf); + if (mb_iseof (c)) + { + mbf_init (mbf, fp = next_file (fp)); + continue; + } + } + while (false); if (convert) { - bool blank = !! isblank (c); + bool blank = mb_isblank (c); if (blank) { @@ -180,16 +193,16 @@ unexpand (void) if (next_tab_column < column) die (EXIT_FAILURE, 0, _("input line is too long")); - if (c == '\t') + if (mb_iseq (c, '\t')) { column = next_tab_column; if (pending) - pending_blank[0] = '\t'; + mb_setascii (&pending_blank[0], '\t'); } else { - column++; + column += mb_width (c); if (! (prev_blank && column == next_tab_column)) { @@ -197,13 +210,14 @@ unexpand (void) will be replaced by tabs. */ if (column == next_tab_column) one_blank_before_tab_stop = true; - pending_blank[pending++] = c; + mb_copy (&pending_blank[pending++], &c); prev_blank = true; continue; } /* Replace the pending blanks by a tab or two. */ - pending_blank[0] = c = '\t'; + mb_setascii (&c, '\t'); + mb_setascii (&pending_blank[0], '\t'); } /* Discard pending blanks, unless it was a single @@ -211,7 +225,7 @@ unexpand (void) pending = one_blank_before_tab_stop; } } - else if (c == '\b') + else if (mb_iseq (c, '\b')) { /* Go back one column, and force recalculation of the next tab stop. */ @@ -221,7 +235,7 @@ unexpand (void) } else { - column++; + column += mb_width (c); if (!column) die (EXIT_FAILURE, 0, _("input line is too long")); } @@ -229,8 +243,11 @@ unexpand (void) if (pending) { if (pending > 1 && one_blank_before_tab_stop) - pending_blank[0] = '\t'; - if (fwrite (pending_blank, 1, pending, stdout) != pending) + mb_setascii (&pending_blank[0], '\t'); + + for (int n = 0; n < pending; ++n) + mb_putc (pending_blank[n], stdout); + if (ferror (stdout)) die (EXIT_FAILURE, errno, _("write error")); pending = 0; one_blank_before_tab_stop = false; @@ -240,16 +257,17 @@ unexpand (void) convert &= convert_entire_line || blank; } - if (c < 0) + if (mb_iseof (c)) { free (pending_blank); return; } - if (putchar (c) < 0) + mb_putc (c, stdout); + if (ferror (stdout)) die (EXIT_FAILURE, errno, _("write error")); } - while (c != '\n'); + while (!mb_iseq (c, '\n')); } } diff --git a/tests/expand/mb.sh b/tests/expand/mb.sh new file mode 100755 index 0000000..7971e18 --- /dev/null +++ b/tests/expand/mb.sh @@ -0,0 +1,98 @@ +#!/bin/sh + +# Copyright (C) 2012-2015 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ expand + +export LC_ALL=en_US.UTF-8 + +#input containing multibyte characters +cat <<\EOF > in || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . +EOF +env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_ + +cat <<\EOF > exp || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +expand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#test characters with display widths != 1 +env printf '12345678 +e\t|ascii(1) +\u00E9\t|composed(1) +e\u0301\t|decomposed(1) +\u3000\t|ideo-space(2) +\uFF0D\t|full-hypen(2) +' > in || framework_failure_ + +env printf '12345678 +e |ascii(1) +\u00E9 |composed(1) +e\u0301 |decomposed(1) +\u3000 |ideo-space(2) +\uFF0D |full-hypen(2) +' > exp || framework_failure_ + +expand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#shouldn't fail with "input line too long" +#when a line starts with a control character +env printf '\n' > in || framework_failure_ + +expand < in > out || fail=1 +compare in out > /dev/null 2>&1 || fail=1 + +#non-Unicode characters interspersed between Unicode ones +env printf '12345678 +\t\xFF| +\xFF\t| +\t\xFFä| +ä\xFF\t| +\tä\xFF| +\xFF\tä| +äbcdef\xFF\t| +' > in || framework_failure_ + +env printf '12345678 + \xFF| +\xFF | + \xFFä| +ä\xFF | + ä\xFF| +\xFF ä| +äbcdef\xFF | +' > exp || framework_failure_ + +expand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +exit $fail diff --git a/tests/local.mk b/tests/local.mk index 192f776..8053397 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -544,6 +544,7 @@ all_tests = \ tests/du/threshold.sh \ tests/du/trailing-slash.sh \ tests/du/two-args.sh \ + tests/expand/mb.sh \ tests/id/gnu-zero-uids.sh \ tests/id/no-context.sh \ tests/id/context.sh \ @@ -684,6 +685,7 @@ all_tests = \ tests/touch/read-only.sh \ tests/touch/relative.sh \ tests/touch/trailing-slash.sh \ + tests/unexpand/mb.sh \ $(all_root_tests) # See tests/factor/create-test.sh. diff --git a/tests/unexpand/mb.sh b/tests/unexpand/mb.sh new file mode 100755 index 0000000..60d4c1a --- /dev/null +++ b/tests/unexpand/mb.sh @@ -0,0 +1,97 @@ +#!/bin/sh + +# Copyright (C) 2012-2015 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ unexpand + +export LC_ALL=en_US.UTF-8 + +#input containing multibyte characters +cat > in <<\EOF +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +cat > exp <<\EOF +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +unexpand -a < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#test characters with a display width larger than 1 + +env printf '12345678 +e |ascii(1) +\u00E9 |composed(1) +e\u0301 |decomposed(1) +\u3000 |ideo-space(2) +\uFF0D |full-hypen(2) +' > in || framework_failure_ + +env printf '12345678 +e\t|ascii(1) +\u00E9\t|composed(1) +e\u0301\t|decomposed(1) +\u3000\t|ideo-space(2) +\uFF0D\t|full-hypen(2) +' > exp || framework_failure_ + +unexpand -a < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#test input where a blank of width > 1 is not being substituted +in="$(LC_ALL=en_US.UTF-8 printf ' \u3000 ö ü ß')" +exp='   ö ü ß' + +unexpand -a < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#non-Unicode characters interspersed between Unicode ones +env printf '12345678 + \xFF| +\xFF | + \xFFä| +ä\xFF | + ä\xFF| +\xFF ä| +äbcdef\xFF | +' > in || framework_failure_ + +env printf '12345678 +\t\xFF| +\xFF\t| +\t\xFFä| +ä\xFF\t| +\tä\xFF| +\xFF\tä| +äbcdef\xFF\t| +' > exp || framework_failure_ + +unexpand -a < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 -- 2.7.4