51f0aa
From 5855d3b7d0fc85b2827755bbb3b4dacf6a08dae7 Mon Sep 17 00:00:00 2001
51f0aa
From: Stefan Liebler <stli@linux.vnet.ibm.com>
51f0aa
Date: Thu, 27 Jul 2017 10:53:59 +0200
51f0aa
Subject: [PATCH 10/10] S390: Use cu21 instruction for converting from utf16 to
51f0aa
 utf8.
51f0aa
51f0aa
upstream-commit a37b5daa6bc7fbcbbc229b2549a161fa15023f41
51f0aa
51f0aa
This patch adds an ifunc variant to use the cu21 instruction on arch12 CPUs.
51f0aa
This new ifunc variant can be built if binutils support z13 vector
51f0aa
instructions.  At runtime, HWCAP_S390_VXE decides if we can use the
51f0aa
cu21 instruction.
51f0aa
51f0aa
ChangeLog:
51f0aa
51f0aa
	* sysdeps/s390/utf8-utf16-z9.c (__to_utf8_loop_vx_cu):
51f0aa
	Use vector and cu21 instruction.
51f0aa
	* sysdeps/s390/multiarch/utf8-utf16-z9.c:
51f0aa
	Add __to_utf8_loop_vx_cu in ifunc resolver.
51f0aa
---
51f0aa
 sysdeps/s390/multiarch/utf8-utf16-z9.c |   8 ++-
51f0aa
 sysdeps/s390/utf8-utf16-z9.c           | 117 +++++++++++++++++++++++++++++++++
51f0aa
 2 files changed, 122 insertions(+), 3 deletions(-)
51f0aa
51f0aa
diff --git a/sysdeps/s390/multiarch/utf8-utf16-z9.c b/sysdeps/s390/multiarch/utf8-utf16-z9.c
51f0aa
index b55ef1a..1252281 100644
51f0aa
--- a/sysdeps/s390/multiarch/utf8-utf16-z9.c
51f0aa
+++ b/sysdeps/s390/multiarch/utf8-utf16-z9.c
51f0aa
@@ -41,8 +41,10 @@ s390_libc_ifunc_expr (FROM_LOOP_DEFAULT, FROM_LOOP,
51f0aa
 			: FROM_LOOP_DEFAULT);
51f0aa
 
51f0aa
 s390_libc_ifunc_expr (TO_LOOP_DEFAULT, TO_LOOP,
51f0aa
-		      (HAVE_TO_VX && (hwcap & HWCAP_S390_VX))
51f0aa
-		      ? TO_LOOP_VX
51f0aa
-		      : TO_LOOP_DEFAULT);
51f0aa
+		      (HAVE_TO_VX_CU && (hwcap & HWCAP_S390_VXE))
51f0aa
+		      ? TO_LOOP_VX_CU
51f0aa
+		      : (HAVE_TO_VX && (hwcap & HWCAP_S390_VX))
51f0aa
+			? TO_LOOP_VX
51f0aa
+			: TO_LOOP_DEFAULT);
51f0aa
 
51f0aa
 #include <iconv/skeleton.c>
51f0aa
diff --git a/sysdeps/s390/utf8-utf16-z9.c b/sysdeps/s390/utf8-utf16-z9.c
51f0aa
index d870a29..76e463a 100644
51f0aa
--- a/sysdeps/s390/utf8-utf16-z9.c
51f0aa
+++ b/sysdeps/s390/utf8-utf16-z9.c
51f0aa
@@ -52,9 +52,11 @@
51f0aa
 #if defined HAVE_S390_VX_ASM_SUPPORT && defined USE_MULTIARCH
51f0aa
 # define HAVE_FROM_VX		1
51f0aa
 # define HAVE_TO_VX		1
51f0aa
+# define HAVE_TO_VX_CU		1
51f0aa
 #else
51f0aa
 # define HAVE_FROM_VX		0
51f0aa
 # define HAVE_TO_VX		0
51f0aa
+# define HAVE_TO_VX_CU		0
51f0aa
 #endif
51f0aa
 
51f0aa
 #if defined HAVE_S390_VX_GCC_SUPPORT
51f0aa
@@ -817,6 +819,121 @@ gconv_end (struct __gconv_step *data)
51f0aa
 # define TO_LOOP_VX		NULL
51f0aa
 #endif /* HAVE_TO_VX != 1  */
51f0aa
 
51f0aa
+#if HAVE_TO_VX_CU == 1
51f0aa
+#define BODY_TO_VX_CU							\
51f0aa
+  { /* UTF-16 to UTF-8 loop body using vector and cu21 instructions.  */ \
51f0aa
+    register const unsigned char* pInput asm ("8") = inptr;		\
51f0aa
+    register size_t inlen asm ("9") = inend - inptr;			\
51f0aa
+    register unsigned char* pOutput asm ("10") = outptr;		\
51f0aa
+    register size_t outlen asm ("11") = outend - outptr;		\
51f0aa
+    unsigned long tmp, tmp2, tmp3;					\
51f0aa
+    asm volatile (".machine push\n\t"					\
51f0aa
+		  ".machine \"z13\"\n\t"				\
51f0aa
+		  ".machinemode \"zarch_nohighgprs\"\n\t"		\
51f0aa
+		  /* Load v30/v31 from 9f to find UTF-16 chars > 0x7f.  */		\
51f0aa
+		  "    larl %[R_TMP],9f\n\t"				\
51f0aa
+		  "    vlm %%v30,%%v31,0(%[R_TMP])\n\t"			\
51f0aa
+		  CONVERT_32BIT_SIZE_T ([R_INLEN])			\
51f0aa
+		  CONVERT_32BIT_SIZE_T ([R_OUTLEN])			\
51f0aa
+		  /* Vector loop which handles UTF-16 chars <= 0x7f.  */	\
51f0aa
+		  "0:  clgijl %[R_INLEN],32,20f\n\t"			\
51f0aa
+		  "    clgijl %[R_OUTLEN],16,20f\n\t"			\
51f0aa
+		  "1:  vlm %%v16,%%v17,0(%[R_IN])\n\t"			\
51f0aa
+		  "    lghi %[R_TMP2],0\n\t"				\
51f0aa
+		  /* Check for > 1byte UTF-8 chars.  */			\
51f0aa
+		  "    vstrchs %%v19,%%v16,%%v30,%%v31\n\t"		\
51f0aa
+		  "    jno 10f\n\t" /* Jump away if not all bytes are 1byte \
51f0aa
+				       UTF8 chars.  */			\
51f0aa
+		  "    vstrchs %%v19,%%v17,%%v30,%%v31\n\t"		\
51f0aa
+		  "    jno 11f\n\t" /* Jump away if not all bytes are 1byte \
51f0aa
+				       UTF8 chars.  */			\
51f0aa
+		  /* Shorten to UTF-8.  */				\
51f0aa
+		  "    vpkh %%v18,%%v16,%%v17\n\t"			\
51f0aa
+		  "    la %[R_IN],32(%[R_IN])\n\t"			\
51f0aa
+		  "    aghi %[R_INLEN],-32\n\t"				\
51f0aa
+		  /* Store 16 bytes to buf_out.  */			\
51f0aa
+		  "    vst %%v18,0(%[R_OUT])\n\t"			\
51f0aa
+		  "    aghi %[R_OUTLEN],-16\n\t"			\
51f0aa
+		  "    la %[R_OUT],16(%[R_OUT])\n\t"			\
51f0aa
+		  "    clgijl %[R_INLEN],32,20f\n\t"			\
51f0aa
+		  "    clgijl %[R_OUTLEN],16,20f\n\t"			\
51f0aa
+		  "    j 1b\n\t"					\
51f0aa
+		  /* Setup to check for ch > 0x7f. (v30, v31)  */	\
51f0aa
+		  "9:  .short 0x7f,0x7f,0x0,0x0,0x0,0x0,0x0,0x0\n\t"	\
51f0aa
+		  "    .short 0x2000,0x2000,0x0,0x0,0x0,0x0,0x0,0x0\n\t" \
51f0aa
+		  /* At least one UTF-16 char is > 0x7f.			\
51f0aa
+		     Store the preceding 1-byte chars.  */		\
51f0aa
+		  "11: lghi %[R_TMP2],16\n\t" /* match was found in v17.  */ \
51f0aa
+		  "10: vlgvb %[R_TMP],%%v19,7\n\t"			\
51f0aa
+		  /* Shorten to UTF-8.  */				\
51f0aa
+		  "    vpkh %%v18,%%v16,%%v17\n\t"			\
51f0aa
+		  "    ar %[R_TMP],%[R_TMP2]\n\t" /* Number of in bytes.  */ \
51f0aa
+		  "    srlg %[R_TMP3],%[R_TMP],1\n\t" /* Number of out bytes.  */ \
51f0aa
+		  "    ahik %[R_TMP2],%[R_TMP3],-1\n\t" /* Highest index to store.  */ \
51f0aa
+		  "    jl 20f\n\t"					\
51f0aa
+		  "    vstl %%v18,%[R_TMP2],0(%[R_OUT])\n\t"		\
51f0aa
+		  /* Update pointers.  */				\
51f0aa
+		  "    la %[R_IN],0(%[R_TMP],%[R_IN])\n\t"		\
51f0aa
+		  "    slgr %[R_INLEN],%[R_TMP]\n\t"			\
51f0aa
+		  "    la %[R_OUT],0(%[R_TMP3],%[R_OUT])\n\t"		\
51f0aa
+		  "    slgr %[R_OUTLEN],%[R_TMP3]\n\t"			\
51f0aa
+		  /* Handles remaining chars (e.g. UTF16 surrogates) with cu21.  */ \
51f0aa
+		  "20: cu21 %[R_OUT],%[R_IN],1\n\t"			\
51f0aa
+		  "    jo 0b\n\t" /* Try vector implementation again.  */ \
51f0aa
+		  "    lochil %[R_RES],%[RES_OUT_FULL]\n\t" /* cc == 1.  */ \
51f0aa
+		  "    lochih %[R_RES],%[RES_IN_ILL]\n\t" /* cc == 2.  */ \
51f0aa
+		  ".machine pop"					\
51f0aa
+		  : /* outputs */ [R_IN] "+a" (pInput)			\
51f0aa
+		    , [R_INLEN] "+d" (inlen), [R_OUT] "+a" (pOutput)	\
51f0aa
+		    , [R_OUTLEN] "+d" (outlen), [R_TMP] "=a" (tmp)	\
51f0aa
+		    , [R_TMP2] "=d" (tmp2), [R_TMP3] "=a" (tmp3)	\
51f0aa
+		    , [R_RES] "+d" (result)				\
51f0aa
+		  : /* inputs */					\
51f0aa
+		    [RES_OUT_FULL] "i" (__GCONV_FULL_OUTPUT)		\
51f0aa
+		    , [RES_IN_ILL] "i" (__GCONV_ILLEGAL_INPUT)		\
51f0aa
+		  : /* clobber list */ "memory", "cc"			\
51f0aa
+		    ASM_CLOBBER_VR ("v16") ASM_CLOBBER_VR ("v17")	\
51f0aa
+		    ASM_CLOBBER_VR ("v18") ASM_CLOBBER_VR ("v19")	\
51f0aa
+		    ASM_CLOBBER_VR ("v30") ASM_CLOBBER_VR ("v31")	\
51f0aa
+		  );							\
51f0aa
+    inptr = pInput;							\
51f0aa
+    outptr = pOutput;							\
51f0aa
+									\
51f0aa
+    if (__glibc_likely (inlen == 0)					\
51f0aa
+	|| result == __GCONV_FULL_OUTPUT)				\
51f0aa
+      break;								\
51f0aa
+    if (inlen == 1)							\
51f0aa
+      {									\
51f0aa
+	/* Input does not contain a complete utf16 character.  */	\
51f0aa
+	result = __GCONV_INCOMPLETE_INPUT;				\
51f0aa
+	break;								\
51f0aa
+      }									\
51f0aa
+    else if (result != __GCONV_ILLEGAL_INPUT)				\
51f0aa
+      {									\
51f0aa
+	/* Input is >= 2 and < 4 bytes (as cu21 would have processed	\
51f0aa
+	   a possible next utf16 character) and not illegal.		\
51f0aa
+	   => we have a single high surrogate at end of input.  */	\
51f0aa
+	result = __GCONV_INCOMPLETE_INPUT;				\
51f0aa
+	break;								\
51f0aa
+      }									\
51f0aa
+									\
51f0aa
+    STANDARD_TO_LOOP_ERR_HANDLER (2);					\
51f0aa
+  }
51f0aa
+
51f0aa
+/* Generate __to_utf8_loop_vx_cu using vector and cu21 instructions.  */
51f0aa
+# define MIN_NEEDED_INPUT	MIN_NEEDED_TO
51f0aa
+# define MAX_NEEDED_INPUT	MAX_NEEDED_TO
51f0aa
+# define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
51f0aa
+# define MAX_NEEDED_OUTPUT	MAX_NEEDED_FROM
51f0aa
+# define TO_LOOP_VX_CU		__to_utf8_loop_vx_cu
51f0aa
+# define LOOPFCT		TO_LOOP_VX_CU
51f0aa
+# define BODY                   BODY_TO_VX_CU
51f0aa
+# define LOOP_NEED_FLAGS
51f0aa
+# include <iconv/loop.c>
51f0aa
+#else
51f0aa
+# define TO_LOOP_VX_CU		NULL
51f0aa
+#endif /* HAVE_TO_VX_CU != 1  */
51f0aa
+
51f0aa
 /* This file also exists in sysdeps/s390/multiarch/ which
51f0aa
    generates ifunc resolvers for FROM/TO_LOOP functions
51f0aa
    and includes iconv/skeleton.c afterwards.  */
51f0aa
-- 
51f0aa
1.8.3.1
51f0aa