8ae002
From 835c3bf23a119a7fcb8c70734d4fdf49461d8195 Mon Sep 17 00:00:00 2001
8ae002
From: Stefan Liebler <stli@linux.vnet.ibm.com>
8ae002
Date: Mon, 7 Nov 2016 16:14:07 +0100
8ae002
Subject: [PATCH 09/17] S390: Optimize utf8-utf32 module.
8ae002
8ae002
Upstream commit 421c5278d83e72740150259960a431706ac343f9
8ae002
8ae002
This patch reworks the s390 specific module to convert between utf8 and utf32.
8ae002
Now ifunc is used to choose either the c or etf3eh (with convert utf
8ae002
instruction) variants at runtime.
8ae002
Furthermore a new vector variant for z13 is introduced which will be built
8ae002
and chosen if vector support is available at build / runtime.
8ae002
The vector variants optimize input of 1byte utf8 characters. The convert utf
8ae002
instruction is used if a multibyte utf8 character is found.
8ae002
8ae002
This patch also fixes some whitespace errors. The c variants now reject
8ae002
UTF-16 surrogates and values above 0x10ffff.
8ae002
Furthermore, the etf3eh variants now handle the "UTF-xx//IGNORE" case.
8ae002
Previously they disregarded the IGNORE flag and always stopped at an error.
8ae002
8ae002
ChangeLog:
8ae002
8ae002
	* sysdeps/s390/s390-64/utf8-utf32-z9.c: Use ifunc to select c, etf3eh
8ae002
	or new vector loop-variant.
8ae002
---
8ae002
 sysdeps/s390/s390-64/utf8-utf32-z9.c | 664 +++++++++++++++++++++++++----------
8ae002
 1 file changed, 480 insertions(+), 184 deletions(-)
8ae002
8ae002
diff --git a/sysdeps/s390/s390-64/utf8-utf32-z9.c b/sysdeps/s390/s390-64/utf8-utf32-z9.c
8ae002
index 721279e..1ce5ac5 100644
8ae002
--- a/sysdeps/s390/s390-64/utf8-utf32-z9.c
8ae002
+++ b/sysdeps/s390/s390-64/utf8-utf32-z9.c
8ae002
@@ -30,35 +30,25 @@
8ae002
 #include <dl-procinfo.h>
8ae002
 #include <gconv.h>
8ae002
 
8ae002
-/* UTF-32 big endian byte order mark.  */
8ae002
-#define BOM	                0x0000feffu
8ae002
+#if defined HAVE_S390_VX_GCC_SUPPORT
8ae002
+# define ASM_CLOBBER_VR(NR) , NR
8ae002
+#else
8ae002
+# define ASM_CLOBBER_VR(NR)
8ae002
+#endif
8ae002
 
8ae002
+/* Defines for skeleton.c.  */
8ae002
 #define DEFINE_INIT		0
8ae002
 #define DEFINE_FINI		0
8ae002
-/* These definitions apply to the UTF-8 to UTF-32 direction.  The
8ae002
-   software implementation for UTF-8 still supports multibyte
8ae002
-   characters up to 6 bytes whereas the hardware variant does not.  */
8ae002
 #define MIN_NEEDED_FROM		1
8ae002
 #define MAX_NEEDED_FROM		6
8ae002
 #define MIN_NEEDED_TO		4
8ae002
-#define FROM_LOOP		from_utf8_loop
8ae002
-#define TO_LOOP			to_utf8_loop
8ae002
+#define FROM_LOOP		__from_utf8_loop
8ae002
+#define TO_LOOP			__to_utf8_loop
8ae002
 #define FROM_DIRECTION		(dir == from_utf8)
8ae002
 #define ONE_DIRECTION           0
8ae002
-#define PREPARE_LOOP							\
8ae002
-  enum direction dir = ((struct utf8_data *) step->__data)->dir;	\
8ae002
-  int emit_bom = ((struct utf8_data *) step->__data)->emit_bom;		\
8ae002
-									\
8ae002
-  if (emit_bom && !data->__internal_use					\
8ae002
-      && data->__invocation_counter == 0)				\
8ae002
-    {									\
8ae002
-      /* Emit the Byte Order Mark.  */					\
8ae002
-      if (__glibc_unlikely (outbuf + 4 > outend))			      \
8ae002
-	return __GCONV_FULL_OUTPUT;					\
8ae002
-									\
8ae002
-      put32u (outbuf, BOM);						\
8ae002
-      outbuf += 4;							\
8ae002
-    }
8ae002
+
8ae002
+/* UTF-32 big endian byte order mark.  */
8ae002
+#define BOM			0x0000feffu
8ae002
 
8ae002
 /* Direction of the transformation.  */
8ae002
 enum direction
8ae002
@@ -155,16 +145,16 @@ gconv_end (struct __gconv_step *data)
8ae002
     register unsigned long long outlen __asm__("11") = outend - outptr;	\
8ae002
     uint64_t cc = 0;							\
8ae002
 									\
8ae002
-    __asm__ volatile (".machine push       \n\t"			\
8ae002
-		      ".machine \"z9-109\" \n\t"			\
8ae002
-		      "0: " INSTRUCTION "  \n\t"			\
8ae002
-		      ".machine pop        \n\t"			\
8ae002
-		      "   jo     0b        \n\t"			\
8ae002
-		      "   ipm    %2        \n"				\
8ae002
-		      : "+a" (pOutput), "+a" (pInput), "+d" (cc),	\
8ae002
-		      "+d" (outlen), "+d" (inlen)			\
8ae002
-		      :							\
8ae002
-		      : "cc", "memory");				\
8ae002
+    __asm__ __volatile__ (".machine push       \n\t"			\
8ae002
+			  ".machine \"z9-109\" \n\t"			\
8ae002
+			  "0: " INSTRUCTION "  \n\t"			\
8ae002
+			  ".machine pop        \n\t"			\
8ae002
+			  "   jo     0b        \n\t"			\
8ae002
+			  "   ipm    %2        \n"			\
8ae002
+			  : "+a" (pOutput), "+a" (pInput), "+d" (cc),	\
8ae002
+			    "+d" (outlen), "+d" (inlen)			\
8ae002
+			  :						\
8ae002
+			  : "cc", "memory");				\
8ae002
 									\
8ae002
     inptr = pInput;							\
8ae002
     outptr = pOutput;							\
8ae002
@@ -173,49 +163,150 @@ gconv_end (struct __gconv_step *data)
8ae002
     if (cc == 1)							\
8ae002
       {									\
8ae002
 	result = __GCONV_FULL_OUTPUT;					\
8ae002
-	break;								\
8ae002
       }									\
8ae002
     else if (cc == 2)							\
8ae002
       {									\
8ae002
 	result = __GCONV_ILLEGAL_INPUT;					\
8ae002
-	break;								\
8ae002
       }									\
8ae002
   }
8ae002
 
8ae002
+#define PREPARE_LOOP							\
8ae002
+  enum direction dir = ((struct utf8_data *) step->__data)->dir;	\
8ae002
+  int emit_bom = ((struct utf8_data *) step->__data)->emit_bom;		\
8ae002
+									\
8ae002
+  if (emit_bom && !data->__internal_use					\
8ae002
+      && data->__invocation_counter == 0)				\
8ae002
+    {									\
8ae002
+      /* Emit the Byte Order Mark.  */					\
8ae002
+      if (__glibc_unlikely (outbuf + 4 > outend))			\
8ae002
+	return __GCONV_FULL_OUTPUT;					\
8ae002
+									\
8ae002
+      put32u (outbuf, BOM);						\
8ae002
+      outbuf += 4;							\
8ae002
+    }
8ae002
+
8ae002
 /* Conversion function from UTF-8 to UTF-32 internal/BE.  */
8ae002
 
8ae002
-#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
8ae002
-#define MAX_NEEDED_INPUT	MAX_NEEDED_FROM
8ae002
-#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
8ae002
-#define LOOPFCT			FROM_LOOP
8ae002
-/* The software routine is copied from gconv_simple.c.  */
8ae002
-#define BODY								\
8ae002
+#define STORE_REST_COMMON						      \
8ae002
+  {									      \
8ae002
+    /* We store the remaining bytes while converting them into the UCS4	      \
8ae002
+       format.  We can assume that the first byte in the buffer is	      \
8ae002
+       correct and that it requires a larger number of bytes than there	      \
8ae002
+       are in the input buffer.  */					      \
8ae002
+    wint_t ch = **inptrp;						      \
8ae002
+    size_t cnt, r;							      \
8ae002
+									      \
8ae002
+    state->__count = inend - *inptrp;					      \
8ae002
+									      \
8ae002
+    assert (ch != 0xc0 && ch != 0xc1);					      \
8ae002
+    if (ch >= 0xc2 && ch < 0xe0)					      \
8ae002
+      {									      \
8ae002
+	/* We expect two bytes.  The first byte cannot be 0xc0 or	      \
8ae002
+	   0xc1, otherwise the wide character could have been		      \
8ae002
+	   represented using a single byte.  */				      \
8ae002
+	cnt = 2;							      \
8ae002
+	ch &= 0x1f;							      \
8ae002
+      }									      \
8ae002
+    else if (__glibc_likely ((ch & 0xf0) == 0xe0))			      \
8ae002
+      {									      \
8ae002
+	/* We expect three bytes.  */					      \
8ae002
+	cnt = 3;							      \
8ae002
+	ch &= 0x0f;							      \
8ae002
+      }									      \
8ae002
+    else if (__glibc_likely ((ch & 0xf8) == 0xf0))			      \
8ae002
+      {									      \
8ae002
+	/* We expect four bytes.  */					      \
8ae002
+	cnt = 4;							      \
8ae002
+	ch &= 0x07;							      \
8ae002
+      }									      \
8ae002
+    else if (__glibc_likely ((ch & 0xfc) == 0xf8))			      \
8ae002
+      {									      \
8ae002
+	/* We expect five bytes.  */					      \
8ae002
+	cnt = 5;							      \
8ae002
+	ch &= 0x03;							      \
8ae002
+      }									      \
8ae002
+    else								      \
8ae002
+      {									      \
8ae002
+	/* We expect six bytes.  */					      \
8ae002
+	cnt = 6;							      \
8ae002
+	ch &= 0x01;							      \
8ae002
+      }									      \
8ae002
+									      \
8ae002
+    /* The first byte is already consumed.  */				      \
8ae002
+    r = cnt - 1;							      \
8ae002
+    while (++(*inptrp) < inend)						      \
8ae002
+      {									      \
8ae002
+	ch <<= 6;							      \
8ae002
+	ch |= **inptrp & 0x3f;						      \
8ae002
+	--r;								      \
8ae002
+      }									      \
8ae002
+									      \
8ae002
+    /* Shift for the so far missing bytes.  */				      \
8ae002
+    ch <<= r * 6;							      \
8ae002
+									      \
8ae002
+    /* Store the number of bytes expected for the entire sequence.  */	      \
8ae002
+    state->__count |= cnt << 8;						      \
8ae002
+									      \
8ae002
+    /* Store the value.  */						      \
8ae002
+    state->__value.__wch = ch;						      \
8ae002
+  }
8ae002
+
8ae002
+#define UNPACK_BYTES_COMMON \
8ae002
+  {									      \
8ae002
+    static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };  \
8ae002
+    wint_t wch = state->__value.__wch;					      \
8ae002
+    size_t ntotal = state->__count >> 8;				      \
8ae002
+									      \
8ae002
+    inlen = state->__count & 255;					      \
8ae002
+									      \
8ae002
+    bytebuf[0] = inmask[ntotal - 2];					      \
8ae002
+									      \
8ae002
+    do									      \
8ae002
+      {									      \
8ae002
+	if (--ntotal < inlen)						      \
8ae002
+	  bytebuf[ntotal] = 0x80 | (wch & 0x3f);			      \
8ae002
+	wch >>= 6;							      \
8ae002
+      }									      \
8ae002
+    while (ntotal > 1);							      \
8ae002
+									      \
8ae002
+    bytebuf[0] |= wch;							      \
8ae002
+  }
8ae002
+
8ae002
+#define CLEAR_STATE_COMMON \
8ae002
+  state->__count = 0
8ae002
+
8ae002
+#define BODY_FROM_HW(ASM)						\
8ae002
   {									\
8ae002
-    if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH)				\
8ae002
-      {									\
8ae002
-	HARDWARE_CONVERT ("cu14 %0, %1, 1");				\
8ae002
+    ASM;								\
8ae002
+    if (__glibc_likely (inptr == inend)					\
8ae002
+	|| result == __GCONV_FULL_OUTPUT)				\
8ae002
+      break;								\
8ae002
 									\
8ae002
-	if (inptr != inend)						\
8ae002
-	  {								\
8ae002
-	    int i;							\
8ae002
-	    for (i = 1; inptr + i < inend; ++i)				\
8ae002
-	      if ((inptr[i] & 0xc0) != 0x80)				\
8ae002
-		break;							\
8ae002
+    int i;								\
8ae002
+    for (i = 1; inptr + i < inend && i < 5; ++i)			\
8ae002
+      if ((inptr[i] & 0xc0) != 0x80)					\
8ae002
+	break;								\
8ae002
 									\
8ae002
-	    if (__glibc_likely (inptr + i == inend))			      \
8ae002
-	      {								\
8ae002
-		result = __GCONV_INCOMPLETE_INPUT;			\
8ae002
-		break;							\
8ae002
-	      }								\
8ae002
-	    STANDARD_FROM_LOOP_ERR_HANDLER (i);				\
8ae002
-	  }								\
8ae002
-	continue;							\
8ae002
+    if (__glibc_likely (inptr + i == inend				\
8ae002
+			&& result == __GCONV_EMPTY_INPUT))		\
8ae002
+      {									\
8ae002
+	result = __GCONV_INCOMPLETE_INPUT;				\
8ae002
+	break;								\
8ae002
       }									\
8ae002
-									\
8ae002
+    STANDARD_FROM_LOOP_ERR_HANDLER (i);					\
8ae002
+  }
8ae002
+
8ae002
+/* This hardware routine uses the Convert UTF8 to UTF32 (cu14) instruction.  */
8ae002
+#define BODY_FROM_ETF3EH BODY_FROM_HW (HARDWARE_CONVERT ("cu14 %0, %1, 1"))
8ae002
+
8ae002
+
8ae002
+/* The software routine is copied from gconv_simple.c.  */
8ae002
+#define BODY_FROM_C							\
8ae002
+  {									\
8ae002
     /* Next input byte.  */						\
8ae002
     uint32_t ch = *inptr;						\
8ae002
 									\
8ae002
-    if (__glibc_likely (ch < 0x80))					      \
8ae002
+    if (__glibc_likely (ch < 0x80))					\
8ae002
       {									\
8ae002
 	/* One byte sequence.  */					\
8ae002
 	++inptr;							\
8ae002
@@ -233,30 +324,18 @@ gconv_end (struct __gconv_step *data)
8ae002
 	    cnt = 2;							\
8ae002
 	    ch &= 0x1f;							\
8ae002
 	  }								\
8ae002
-        else if (__glibc_likely ((ch & 0xf0) == 0xe0))			      \
8ae002
+	else if (__glibc_likely ((ch & 0xf0) == 0xe0))			\
8ae002
 	  {								\
8ae002
 	    /* We expect three bytes.  */				\
8ae002
 	    cnt = 3;							\
8ae002
 	    ch &= 0x0f;							\
8ae002
 	  }								\
8ae002
-	else if (__glibc_likely ((ch & 0xf8) == 0xf0))			      \
8ae002
+	else if (__glibc_likely ((ch & 0xf8) == 0xf0))			\
8ae002
 	  {								\
8ae002
 	    /* We expect four bytes.  */				\
8ae002
 	    cnt = 4;							\
8ae002
 	    ch &= 0x07;							\
8ae002
 	  }								\
8ae002
-	else if (__glibc_likely ((ch & 0xfc) == 0xf8))			      \
8ae002
-	  {								\
8ae002
-	    /* We expect five bytes.  */				\
8ae002
-	    cnt = 5;							\
8ae002
-	    ch &= 0x03;							\
8ae002
-	  }								\
8ae002
-	else if (__glibc_likely ((ch & 0xfe) == 0xfc))			      \
8ae002
-	  {								\
8ae002
-	    /* We expect six bytes.  */					\
8ae002
-	    cnt = 6;							\
8ae002
-	    ch &= 0x01;							\
8ae002
-	  }								\
8ae002
 	else								\
8ae002
 	  {								\
8ae002
 	    /* Search the end of this ill-formed UTF-8 character.  This	\
8ae002
@@ -272,7 +351,7 @@ gconv_end (struct __gconv_step *data)
8ae002
 	    STANDARD_FROM_LOOP_ERR_HANDLER (i);				\
8ae002
 	  }								\
8ae002
 									\
8ae002
-	if (__glibc_unlikely (inptr + cnt > inend))			      \
8ae002
+	if (__glibc_unlikely (inptr + cnt > inend))			\
8ae002
 	  {								\
8ae002
 	    /* We don't have enough input.  But before we report	\
8ae002
 	       that check that all the bytes are correct.  */		\
8ae002
@@ -280,7 +359,7 @@ gconv_end (struct __gconv_step *data)
8ae002
 	      if ((inptr[i] & 0xc0) != 0x80)				\
8ae002
 		break;							\
8ae002
 									\
8ae002
-	    if (__glibc_likely (inptr + i == inend))			      \
8ae002
+	    if (__glibc_likely (inptr + i == inend))			\
8ae002
 	      {								\
8ae002
 		result = __GCONV_INCOMPLETE_INPUT;			\
8ae002
 		break;							\
8ae002
@@ -305,7 +384,10 @@ gconv_end (struct __gconv_step *data)
8ae002
 	/* If i < cnt, some trail byte was not >= 0x80, < 0xc0.		\
8ae002
 	   If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could	\
8ae002
 	   have been represented with fewer than cnt bytes.  */		\
8ae002
-	if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0))		\
8ae002
+	if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0)		\
8ae002
+	    /* Do not accept UTF-16 surrogates.  */			\
8ae002
+	    || (ch >= 0xd800 && ch <= 0xdfff)				\
8ae002
+	    || (ch > 0x10ffff))						\
8ae002
 	  {								\
8ae002
 	    /* This is an illegal encoding.  */				\
8ae002
 	    goto errout;						\
8ae002
@@ -318,137 +400,212 @@ gconv_end (struct __gconv_step *data)
8ae002
     *((uint32_t *) outptr) = ch;					\
8ae002
     outptr += sizeof (uint32_t);					\
8ae002
   }
8ae002
-#define LOOP_NEED_FLAGS
8ae002
 
8ae002
-#define STORE_REST							\
8ae002
-  {									      \
8ae002
-    /* We store the remaining bytes while converting them into the UCS4	      \
8ae002
-       format.  We can assume that the first byte in the buffer is	      \
8ae002
-       correct and that it requires a larger number of bytes than there	      \
8ae002
-       are in the input buffer.  */					      \
8ae002
-    wint_t ch = **inptrp;						      \
8ae002
-    size_t cnt, r;							      \
8ae002
-									      \
8ae002
-    state->__count = inend - *inptrp;					      \
8ae002
-									      \
8ae002
-    if (ch >= 0xc2 && ch < 0xe0)					      \
8ae002
-      {									      \
8ae002
-	/* We expect two bytes.  The first byte cannot be 0xc0 or	      \
8ae002
-	   0xc1, otherwise the wide character could have been		      \
8ae002
-	   represented using a single byte.  */				      \
8ae002
-	cnt = 2;							      \
8ae002
-	ch &= 0x1f;							      \
8ae002
-      }									      \
8ae002
-    else if (__glibc_likely ((ch & 0xf0) == 0xe0))			      \
8ae002
-      {									      \
8ae002
-	/* We expect three bytes.  */					      \
8ae002
-	cnt = 3;							      \
8ae002
-	ch &= 0x0f;							      \
8ae002
-      }									      \
8ae002
-    else if (__glibc_likely ((ch & 0xf8) == 0xf0))			      \
8ae002
-      {									      \
8ae002
-	/* We expect four bytes.  */					      \
8ae002
-	cnt = 4;							      \
8ae002
-	ch &= 0x07;							      \
8ae002
-      }									      \
8ae002
-    else if (__glibc_likely ((ch & 0xfc) == 0xf8))			      \
8ae002
-      {									      \
8ae002
-	/* We expect five bytes.  */					      \
8ae002
-	cnt = 5;							      \
8ae002
-	ch &= 0x03;							      \
8ae002
-      }									      \
8ae002
-    else								      \
8ae002
-      {									      \
8ae002
-	/* We expect six bytes.  */					      \
8ae002
-	cnt = 6;							      \
8ae002
-	ch &= 0x01;							      \
8ae002
-      }									      \
8ae002
-									      \
8ae002
-    /* The first byte is already consumed.  */				      \
8ae002
-    r = cnt - 1;							      \
8ae002
-    while (++(*inptrp) < inend)						      \
8ae002
-      {									      \
8ae002
-	ch <<= 6;							      \
8ae002
-	ch |= **inptrp & 0x3f;						      \
8ae002
-	--r;								      \
8ae002
-      }									      \
8ae002
-									      \
8ae002
-    /* Shift for the so far missing bytes.  */				      \
8ae002
-    ch <<= r * 6;							      \
8ae002
-									      \
8ae002
-    /* Store the number of bytes expected for the entire sequence.  */	      \
8ae002
-    state->__count |= cnt << 8;						      \
8ae002
-									      \
8ae002
-    /* Store the value.  */						      \
8ae002
-    state->__value.__wch = ch;						      \
8ae002
+#define HW_FROM_VX							\
8ae002
+  {									\
8ae002
+    register const unsigned char* pInput asm ("8") = inptr;		\
8ae002
+    register size_t inlen asm ("9") = inend - inptr;			\
8ae002
+    register unsigned char* pOutput asm ("10") = outptr;		\
8ae002
+    register size_t outlen asm("11") = outend - outptr;			\
8ae002
+    unsigned long tmp, tmp2, tmp3;					\
8ae002
+    asm volatile (".machine push\n\t"					\
8ae002
+		  ".machine \"z13\"\n\t"				\
8ae002
+		  ".machinemode \"zarch_nohighgprs\"\n\t"		\
8ae002
+		  "    vrepib %%v30,0x7f\n\t" /* For compare > 0x7f.  */ \
8ae002
+		  "    vrepib %%v31,0x20\n\t"				\
8ae002
+		  /* Loop which handles UTF-8 chars <=0x7f.  */		\
8ae002
+		  "0:  clgijl %[R_INLEN],16,20f\n\t"			\
8ae002
+		  "    clgijl %[R_OUTLEN],64,20f\n\t"			\
8ae002
+		  "1: vl %%v16,0(%[R_IN])\n\t"				\
8ae002
+		  "    vstrcbs %%v17,%%v16,%%v30,%%v31\n\t"		\
8ae002
+		  "    jno 10f\n\t" /* Jump away if not all bytes are 1byte \
8ae002
+				   UTF8 chars.  */			\
8ae002
+		  /* Enlarge to UCS4.  */				\
8ae002
+		  "    vuplhb %%v18,%%v16\n\t"				\
8ae002
+		  "    vupllb %%v19,%%v16\n\t"				\
8ae002
+		  "    la %[R_IN],16(%[R_IN])\n\t"			\
8ae002
+		  "    vuplhh %%v20,%%v18\n\t"				\
8ae002
+		  "    aghi %[R_INLEN],-16\n\t"				\
8ae002
+		  "    vupllh %%v21,%%v18\n\t"				\
8ae002
+		  "    aghi %[R_OUTLEN],-64\n\t"			\
8ae002
+		  "    vuplhh %%v22,%%v19\n\t"				\
8ae002
+		  "    vupllh %%v23,%%v19\n\t"				\
8ae002
+		  /* Store 64 bytes to buf_out.  */			\
8ae002
+		  "    vstm %%v20,%%v23,0(%[R_OUT])\n\t"		\
8ae002
+		  "    la %[R_OUT],64(%[R_OUT])\n\t"			\
8ae002
+		  "    clgijl %[R_INLEN],16,20f\n\t"			\
8ae002
+		  "    clgijl %[R_OUTLEN],64,20f\n\t"			\
8ae002
+		  "    j 1b\n\t"					\
8ae002
+		  "10: \n\t"						\
8ae002
+		  /* At least one byte is > 0x7f.			\
8ae002
+		     Store the preceding 1-byte chars.  */		\
8ae002
+		  "    vlgvb %[R_TMP],%%v17,7\n\t"			\
8ae002
+		  "    sllk %[R_TMP2],%[R_TMP],2\n\t" /* Compute highest \
8ae002
+						     index to store. */ \
8ae002
+		  "    llgfr %[R_TMP3],%[R_TMP2]\n\t"			\
8ae002
+		  "    ahi %[R_TMP2],-1\n\t"				\
8ae002
+		  "    jl 20f\n\t"					\
8ae002
+		  "    vuplhb %%v18,%%v16\n\t"				\
8ae002
+		  "    vuplhh %%v20,%%v18\n\t"				\
8ae002
+		  "    vstl %%v20,%[R_TMP2],0(%[R_OUT])\n\t"		\
8ae002
+		  "    ahi %[R_TMP2],-16\n\t"				\
8ae002
+		  "    jl 11f\n\t"					\
8ae002
+		  "    vupllh %%v21,%%v18\n\t"				\
8ae002
+		  "    vstl %%v21,%[R_TMP2],16(%[R_OUT])\n\t"		\
8ae002
+		  "    ahi %[R_TMP2],-16\n\t"				\
8ae002
+		  "    jl 11f\n\t"					\
8ae002
+		  "    vupllb %%v19,%%v16\n\t"				\
8ae002
+		  "    vuplhh %%v22,%%v19\n\t"				\
8ae002
+		  "    vstl %%v22,%[R_TMP2],32(%[R_OUT])\n\t"		\
8ae002
+		  "    ahi %[R_TMP2],-16\n\t"				\
8ae002
+		  "    jl 11f\n\t"					\
8ae002
+		  "    vupllh %%v23,%%v19\n\t"				\
8ae002
+		  "    vstl %%v23,%[R_TMP2],48(%[R_OUT])\n\t"		\
8ae002
+		  "11: \n\t"						\
8ae002
+		  /* Update pointers.  */				\
8ae002
+		  "    la %[R_IN],0(%[R_TMP],%[R_IN])\n\t"		\
8ae002
+		  "    slgr %[R_INLEN],%[R_TMP]\n\t"			\
8ae002
+		  "    la %[R_OUT],0(%[R_TMP3],%[R_OUT])\n\t"		\
8ae002
+		  "    slgr %[R_OUTLEN],%[R_TMP3]\n\t"			\
8ae002
+		  /* Handle multibyte utf8-char with convert instruction. */ \
8ae002
+		  "20: cu14 %[R_OUT],%[R_IN],1\n\t"			\
8ae002
+		  "    jo 0b\n\t" /* Try vector implementation again.  */ \
8ae002
+		  "    lochil %[R_RES],%[RES_OUT_FULL]\n\t" /* cc == 1.  */ \
8ae002
+		  "    lochih %[R_RES],%[RES_IN_ILL]\n\t" /* cc == 2.  */ \
8ae002
+		  ".machine pop"					\
8ae002
+		  : /* outputs */ [R_IN] "+a" (pInput)			\
8ae002
+		    , [R_INLEN] "+d" (inlen), [R_OUT] "+a" (pOutput)	\
8ae002
+		    , [R_OUTLEN] "+d" (outlen), [R_TMP] "=a" (tmp)	\
8ae002
+		    , [R_TMP2] "=d" (tmp2), [R_TMP3] "=a" (tmp3)	\
8ae002
+		    , [R_RES] "+d" (result)				\
8ae002
+		  : /* inputs */					\
8ae002
+		    [RES_OUT_FULL] "i" (__GCONV_FULL_OUTPUT)		\
8ae002
+		    , [RES_IN_ILL] "i" (__GCONV_ILLEGAL_INPUT)		\
8ae002
+		  : /* clobber list */ "memory", "cc"			\
8ae002
+		    ASM_CLOBBER_VR ("v16") ASM_CLOBBER_VR ("v17")	\
8ae002
+		    ASM_CLOBBER_VR ("v18") ASM_CLOBBER_VR ("v19")	\
8ae002
+		    ASM_CLOBBER_VR ("v20") ASM_CLOBBER_VR ("v21")	\
8ae002
+		    ASM_CLOBBER_VR ("v22") ASM_CLOBBER_VR ("v30")	\
8ae002
+		    ASM_CLOBBER_VR ("v31")				\
8ae002
+		  );							\
8ae002
+    inptr = pInput;							\
8ae002
+    outptr = pOutput;							\
8ae002
   }
8ae002
+#define BODY_FROM_VX BODY_FROM_HW (HW_FROM_VX)
8ae002
 
8ae002
-#define UNPACK_BYTES \
8ae002
-  {									      \
8ae002
-    static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };  \
8ae002
-    wint_t wch = state->__value.__wch;					      \
8ae002
-    size_t ntotal = state->__count >> 8;				      \
8ae002
-									      \
8ae002
-    inlen = state->__count & 255;					      \
8ae002
-									      \
8ae002
-    bytebuf[0] = inmask[ntotal - 2];					      \
8ae002
-									      \
8ae002
-    do									      \
8ae002
-      {									      \
8ae002
-	if (--ntotal < inlen)						      \
8ae002
-	  bytebuf[ntotal] = 0x80 | (wch & 0x3f);			      \
8ae002
-	wch >>= 6;							      \
8ae002
-      }									      \
8ae002
-    while (ntotal > 1);							      \
8ae002
-									      \
8ae002
-    bytebuf[0] |= wch;							      \
8ae002
-  }
8ae002
+/* These definitions apply to the UTF-8 to UTF-32 direction.  The
8ae002
+   software implementation for UTF-8 still supports multibyte
8ae002
+   characters up to 6 bytes whereas the hardware variant does not.  */
8ae002
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
8ae002
+#define MAX_NEEDED_INPUT	MAX_NEEDED_FROM
8ae002
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
8ae002
+#define LOOPFCT			__from_utf8_loop_c
8ae002
 
8ae002
-#define CLEAR_STATE \
8ae002
-  state->__count = 0
8ae002
+#define LOOP_NEED_FLAGS
8ae002
 
8ae002
+#define STORE_REST		STORE_REST_COMMON
8ae002
+#define UNPACK_BYTES		UNPACK_BYTES_COMMON
8ae002
+#define CLEAR_STATE		CLEAR_STATE_COMMON
8ae002
+#define BODY			BODY_FROM_C
8ae002
 #include <iconv/loop.c>
8ae002
 
8ae002
+
8ae002
+/* Generate loop-function with hardware utf-convert instruction.  */
8ae002
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
8ae002
+#define MAX_NEEDED_INPUT	MAX_NEEDED_FROM
8ae002
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
8ae002
+#define LOOPFCT			__from_utf8_loop_etf3eh
8ae002
+
8ae002
+#define LOOP_NEED_FLAGS
8ae002
+
8ae002
+#define STORE_REST		STORE_REST_COMMON
8ae002
+#define UNPACK_BYTES		UNPACK_BYTES_COMMON
8ae002
+#define CLEAR_STATE		CLEAR_STATE_COMMON
8ae002
+#define BODY			BODY_FROM_ETF3EH
8ae002
+#include <iconv/loop.c>
8ae002
+
8ae002
+#if defined HAVE_S390_VX_ASM_SUPPORT
8ae002
+/* Generate loop-function with hardware vector instructions.  */
8ae002
+# define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
8ae002
+# define MAX_NEEDED_INPUT	MAX_NEEDED_FROM
8ae002
+# define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
8ae002
+# define LOOPFCT		__from_utf8_loop_vx
8ae002
+
8ae002
+# define LOOP_NEED_FLAGS
8ae002
+
8ae002
+# define STORE_REST		STORE_REST_COMMON
8ae002
+# define UNPACK_BYTES		UNPACK_BYTES_COMMON
8ae002
+# define CLEAR_STATE		CLEAR_STATE_COMMON
8ae002
+# define BODY			BODY_FROM_VX
8ae002
+# include <iconv/loop.c>
8ae002
+#endif
8ae002
+
8ae002
+
8ae002
+/* Generate ifunc'ed loop function.  */
8ae002
+__typeof(__from_utf8_loop_c)
8ae002
+__attribute__ ((ifunc ("__from_utf8_loop_resolver")))
8ae002
+__from_utf8_loop;
8ae002
+
8ae002
+static void *
8ae002
+__from_utf8_loop_resolver (unsigned long int dl_hwcap)
8ae002
+{
8ae002
+#if defined HAVE_S390_VX_ASM_SUPPORT
8ae002
+  if (dl_hwcap & HWCAP_S390_VX)
8ae002
+    return __from_utf8_loop_vx;
8ae002
+  else
8ae002
+#endif
8ae002
+  if (dl_hwcap & HWCAP_S390_ETF3EH)
8ae002
+    return __from_utf8_loop_etf3eh;
8ae002
+  else
8ae002
+    return __from_utf8_loop_c;
8ae002
+}
8ae002
+
8ae002
+strong_alias (__from_utf8_loop_c_single, __from_utf8_loop_single)
8ae002
+
8ae002
+
8ae002
 /* Conversion from UTF-32 internal/BE to UTF-8.  */
8ae002
+#define BODY_TO_HW(ASM)							\
8ae002
+  {									\
8ae002
+    ASM;								\
8ae002
+    if (__glibc_likely (inptr == inend)					\
8ae002
+	|| result == __GCONV_FULL_OUTPUT)				\
8ae002
+      break;								\
8ae002
+    if (inptr + 4 > inend)						\
8ae002
+      {									\
8ae002
+	result = __GCONV_INCOMPLETE_INPUT;				\
8ae002
+	break;								\
8ae002
+      }									\
8ae002
+    STANDARD_TO_LOOP_ERR_HANDLER (4);					\
8ae002
+  }
8ae002
+
8ae002
+/* The hardware routine uses the S/390 cu41 instruction.  */
8ae002
+#define BODY_TO_ETF3EH BODY_TO_HW (HARDWARE_CONVERT ("cu41 %0, %1"))
8ae002
+
8ae002
+/* The hardware routine uses the S/390 vector and cu41 instructions.  */
8ae002
+#define BODY_TO_VX BODY_TO_HW (HW_TO_VX)
8ae002
 
8ae002
-#define MIN_NEEDED_INPUT	MIN_NEEDED_TO
8ae002
-#define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
8ae002
-#define MAX_NEEDED_OUTPUT	MAX_NEEDED_FROM
8ae002
-#define LOOPFCT			TO_LOOP
8ae002
 /* The software routine mimics the S/390 cu41 instruction.  */
8ae002
-#define BODY							\
8ae002
+#define BODY_TO_C						\
8ae002
   {								\
8ae002
-    if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH)			\
8ae002
-      {								\
8ae002
-	HARDWARE_CONVERT ("cu41 %0, %1");			\
8ae002
-								\
8ae002
-	if (inptr != inend)					\
8ae002
-	  {							\
8ae002
-	    result = __GCONV_INCOMPLETE_INPUT;			\
8ae002
-	    break;						\
8ae002
-	  }							\
8ae002
-	continue;						\
8ae002
-      }								\
8ae002
-								\
8ae002
     uint32_t wc = *((const uint32_t *) inptr);			\
8ae002
 								\
8ae002
-    if (__glibc_likely (wc <= 0x7f))					      \
8ae002
+    if (__glibc_likely (wc <= 0x7f))				\
8ae002
       {								\
8ae002
-        /* Single UTF-8 char.  */				\
8ae002
-        *outptr = (uint8_t)wc;					\
8ae002
+	/* Single UTF-8 char.  */				\
8ae002
+	*outptr = (uint8_t)wc;					\
8ae002
 	outptr++;						\
8ae002
       }								\
8ae002
     else if (wc <= 0x7ff)					\
8ae002
       {								\
8ae002
-        /* Two UTF-8 chars.  */					\
8ae002
-        if (__glibc_unlikely (outptr + 2 > outend))			      \
8ae002
+	/* Two UTF-8 chars.  */					\
8ae002
+	if (__glibc_unlikely (outptr + 2 > outend))		\
8ae002
 	  {							\
8ae002
 	    /* Overflow in the output buffer.  */		\
8ae002
 	    result = __GCONV_FULL_OUTPUT;			\
8ae002
 	    break;						\
8ae002
 	  }							\
8ae002
 								\
8ae002
-        outptr[0] = 0xc0;					\
8ae002
+	outptr[0] = 0xc0;					\
8ae002
 	outptr[0] |= wc >> 6;					\
8ae002
 								\
8ae002
 	outptr[1] = 0x80;					\
8ae002
@@ -459,12 +616,18 @@ gconv_end (struct __gconv_step *data)
8ae002
     else if (wc <= 0xffff)					\
8ae002
       {								\
8ae002
 	/* Three UTF-8 chars.  */				\
8ae002
-	if (__glibc_unlikely (outptr + 3 > outend))			      \
8ae002
+	if (__glibc_unlikely (outptr + 3 > outend))		\
8ae002
 	  {							\
8ae002
 	    /* Overflow in the output buffer.  */		\
8ae002
 	    result = __GCONV_FULL_OUTPUT;			\
8ae002
 	    break;						\
8ae002
 	  }							\
8ae002
+	if (wc >= 0xd800 && wc < 0xdc00)			\
8ae002
+	  {							\
8ae002
+	    /* Do not accept UTF-16 surrogates.   */		\
8ae002
+	    result = __GCONV_ILLEGAL_INPUT;			\
8ae002
+	    STANDARD_TO_LOOP_ERR_HANDLER (4);			\
8ae002
+	  }							\
8ae002
 	outptr[0] = 0xe0;					\
8ae002
 	outptr[0] |= wc >> 12;					\
8ae002
 								\
8ae002
@@ -479,7 +642,7 @@ gconv_end (struct __gconv_step *data)
8ae002
       else if (wc <= 0x10ffff)					\
8ae002
 	{							\
8ae002
 	  /* Four UTF-8 chars.  */				\
8ae002
-	  if (__glibc_unlikely (outptr + 4 > outend))			      \
8ae002
+	  if (__glibc_unlikely (outptr + 4 > outend))		\
8ae002
 	    {							\
8ae002
 	      /* Overflow in the output buffer.  */		\
8ae002
 	      result = __GCONV_FULL_OUTPUT;			\
8ae002
@@ -505,7 +668,140 @@ gconv_end (struct __gconv_step *data)
8ae002
 	}							\
8ae002
     inptr += 4;							\
8ae002
   }
8ae002
+
8ae002
+#define HW_TO_VX							\
8ae002
+  {									\
8ae002
+    register const unsigned char* pInput asm ("8") = inptr;		\
8ae002
+    register size_t inlen asm ("9") = inend - inptr;			\
8ae002
+    register unsigned char* pOutput asm ("10") = outptr;		\
8ae002
+    register size_t outlen asm("11") = outend - outptr;			\
8ae002
+    unsigned long tmp, tmp2;						\
8ae002
+    asm volatile (".machine push\n\t"					\
8ae002
+		  ".machine \"z13\"\n\t"				\
8ae002
+		  ".machinemode \"zarch_nohighgprs\"\n\t"		\
8ae002
+		  "    vleif %%v20,127,0\n\t"   /* element 0: 127  */	\
8ae002
+		  "    vzero %%v21\n\t"					\
8ae002
+		  "    vleih %%v21,8192,0\n\t"  /* element 0:   >  */	\
8ae002
+		  "    vleih %%v21,-8192,2\n\t" /* element 1: =<>  */	\
8ae002
+		  /* Loop which handles UTF-32 chars <=0x7f.  */	\
8ae002
+		  "0:  clgijl %[R_INLEN],64,20f\n\t"			\
8ae002
+		  "    clgijl %[R_OUTLEN],16,20f\n\t"			\
8ae002
+		  "1:  vlm %%v16,%%v19,0(%[R_IN])\n\t"			\
8ae002
+		  "    lghi %[R_TMP],0\n\t"				\
8ae002
+		  /* Shorten to byte values.  */			\
8ae002
+		  "    vpkf %%v23,%%v16,%%v17\n\t"			\
8ae002
+		  "    vpkf %%v24,%%v18,%%v19\n\t"			\
8ae002
+		  "    vpkh %%v23,%%v23,%%v24\n\t"			\
8ae002
+		  /* Checking for values > 0x7f.  */			\
8ae002
+		  "    vstrcfs %%v22,%%v16,%%v20,%%v21\n\t"		\
8ae002
+		  "    jno 10f\n\t"					\
8ae002
+		  "    vstrcfs %%v22,%%v17,%%v20,%%v21\n\t"		\
8ae002
+		  "    jno 11f\n\t"					\
8ae002
+		  "    vstrcfs %%v22,%%v18,%%v20,%%v21\n\t"		\
8ae002
+		  "    jno 12f\n\t"					\
8ae002
+		  "    vstrcfs %%v22,%%v19,%%v20,%%v21\n\t"		\
8ae002
+		  "    jno 13f\n\t"					\
8ae002
+		  /* Store 16bytes to outptr.  */			\
8ae002
+		  "    vst %%v23,0(%[R_OUT])\n\t"			\
8ae002
+		  "    aghi %[R_INLEN],-64\n\t"				\
8ae002
+		  "    aghi %[R_OUTLEN],-16\n\t"			\
8ae002
+		  "    la %[R_IN],64(%[R_IN])\n\t"			\
8ae002
+		  "    la %[R_OUT],16(%[R_OUT])\n\t"			\
8ae002
+		  "    clgijl %[R_INLEN],64,20f\n\t"			\
8ae002
+		  "    clgijl %[R_OUTLEN],16,20f\n\t"			\
8ae002
+		  "    j 1b\n\t"					\
8ae002
+		  /* Found a value > 0x7f.  */				\
8ae002
+		  "13: ahi %[R_TMP],4\n\t"				\
8ae002
+		  "12: ahi %[R_TMP],4\n\t"				\
8ae002
+		  "11: ahi %[R_TMP],4\n\t"				\
8ae002
+		  "10: vlgvb %[R_I],%%v22,7\n\t"			\
8ae002
+		  "    srlg %[R_I],%[R_I],2\n\t"			\
8ae002
+		  "    agr %[R_I],%[R_TMP]\n\t"				\
8ae002
+		  "    je 20f\n\t"					\
8ae002
+		  /* Store characters before invalid one...  */		\
8ae002
+		  "    slgr %[R_OUTLEN],%[R_I]\n\t"			\
8ae002
+		  "15: aghi %[R_I],-1\n\t"				\
8ae002
+		  "    vstl %%v23,%[R_I],0(%[R_OUT])\n\t"		\
8ae002
+		  /* ... and update pointers.  */			\
8ae002
+		  "    aghi %[R_I],1\n\t"				\
8ae002
+		  "    la %[R_OUT],0(%[R_I],%[R_OUT])\n\t"		\
8ae002
+		  "    sllg %[R_I],%[R_I],2\n\t"			\
8ae002
+		  "    la %[R_IN],0(%[R_I],%[R_IN])\n\t"		\
8ae002
+		  "    slgr %[R_INLEN],%[R_I]\n\t"			\
8ae002
+		  /* Handle UTF-32 char > 0x7f with convert instruction.  */ \
8ae002
+		  "20: cu41 %[R_OUT],%[R_IN]\n\t"			\
8ae002
+		  "    jo 0b\n\t" /* Try vector implementation again.  */ \
8ae002
+		  "    lochil %[R_RES],%[RES_OUT_FULL]\n\t" /* cc == 1.  */ \
8ae002
+		  "    lochih %[R_RES],%[RES_IN_ILL]\n\t" /* cc == 2.  */ \
8ae002
+		  ".machine pop"					\
8ae002
+		  : /* outputs */ [R_IN] "+a" (pInput)			\
8ae002
+		    , [R_INLEN] "+d" (inlen), [R_OUT] "+a" (pOutput)	\
8ae002
+		    , [R_OUTLEN] "+d" (outlen), [R_TMP] "=d" (tmp)	\
8ae002
+		    , [R_I] "=a" (tmp2)					\
8ae002
+		    , [R_RES] "+d" (result)				\
8ae002
+		  : /* inputs */					\
8ae002
+		    [RES_OUT_FULL] "i" (__GCONV_FULL_OUTPUT)		\
8ae002
+		    , [RES_IN_ILL] "i" (__GCONV_ILLEGAL_INPUT)		\
8ae002
+		  : /* clobber list */ "memory", "cc"			\
8ae002
+		    ASM_CLOBBER_VR ("v16") ASM_CLOBBER_VR ("v17")	\
8ae002
+		    ASM_CLOBBER_VR ("v18") ASM_CLOBBER_VR ("v19")	\
8ae002
+		    ASM_CLOBBER_VR ("v20") ASM_CLOBBER_VR ("v21")	\
8ae002
+		    ASM_CLOBBER_VR ("v22") ASM_CLOBBER_VR ("v23")	\
8ae002
+		    ASM_CLOBBER_VR ("v24")				\
8ae002
+		  );							\
8ae002
+    inptr = pInput;							\
8ae002
+    outptr = pOutput;							\
8ae002
+  }
8ae002
+
8ae002
+/* Generate loop-function with software routine.  */
8ae002
+#define MIN_NEEDED_INPUT	MIN_NEEDED_TO
8ae002
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
8ae002
+#define MAX_NEEDED_OUTPUT	MAX_NEEDED_FROM
8ae002
+#define LOOPFCT			__to_utf8_loop_c
8ae002
+#define BODY			BODY_TO_C
8ae002
+#define LOOP_NEED_FLAGS
8ae002
+#include <iconv/loop.c>
8ae002
+
8ae002
+/* Generate loop-function with hardware utf-convert instruction.  */
8ae002
+#define MIN_NEEDED_INPUT	MIN_NEEDED_TO
8ae002
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
8ae002
+#define MAX_NEEDED_OUTPUT	MAX_NEEDED_FROM
8ae002
+#define LOOPFCT			__to_utf8_loop_etf3eh
8ae002
 #define LOOP_NEED_FLAGS
8ae002
+#define BODY			BODY_TO_ETF3EH
8ae002
 #include <iconv/loop.c>
8ae002
 
8ae002
+#if defined HAVE_S390_VX_ASM_SUPPORT
8ae002
+/* Generate loop-function with hardware vector and utf-convert instructions.  */
8ae002
+# define MIN_NEEDED_INPUT	MIN_NEEDED_TO
8ae002
+# define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
8ae002
+# define MAX_NEEDED_OUTPUT	MAX_NEEDED_FROM
8ae002
+# define LOOPFCT		__to_utf8_loop_vx
8ae002
+# define BODY			BODY_TO_VX
8ae002
+# define LOOP_NEED_FLAGS
8ae002
+# include <iconv/loop.c>
8ae002
+#endif
8ae002
+
8ae002
+/* Generate ifunc'ed loop function.  */
8ae002
+__typeof(__to_utf8_loop_c)
8ae002
+__attribute__ ((ifunc ("__to_utf8_loop_resolver")))
8ae002
+__to_utf8_loop;
8ae002
+
8ae002
+static void *
8ae002
+__to_utf8_loop_resolver (unsigned long int dl_hwcap)
8ae002
+{
8ae002
+#if defined HAVE_S390_VX_ASM_SUPPORT
8ae002
+  if (dl_hwcap & HWCAP_S390_VX)
8ae002
+    return __to_utf8_loop_vx;
8ae002
+  else
8ae002
+#endif
8ae002
+  if (dl_hwcap & HWCAP_S390_ETF3EH)
8ae002
+    return __to_utf8_loop_etf3eh;
8ae002
+  else
8ae002
+    return __to_utf8_loop_c;
8ae002
+}
8ae002
+
8ae002
+strong_alias (__to_utf8_loop_c_single, __to_utf8_loop_single)
8ae002
+
8ae002
+
8ae002
 #include <iconv/skeleton.c>
8ae002
-- 
8ae002
1.8.3.1
8ae002