00db10
From 9b9fd1ec26b5386072fa967bac0771df5e86a284 Mon Sep 17 00:00:00 2001
00db10
From: Stefan Liebler <stli@linux.vnet.ibm.com>
00db10
Date: Thu, 27 Jul 2017 10:53:58 +0200
00db10
Subject: [PATCH 07/10] S390: Use cu41 instruction for converting from utf32 to
00db10
 utf8.
00db10
00db10
upstream-commit 23ea69a9d6e9ab28c66a232b767a800b04eaa938
00db10
00db10
This patch adds an ifunc variant to use the cu41 instruction on arch12 CPUs.
00db10
This new ifunc variant can be built if binutils support z13 vector
00db10
instructions.  At runtime, HWCAP_S390_VXE decides if we can use the
00db10
cu41 instruction.
00db10
00db10
ChangeLog:
00db10
00db10
	* sysdeps/s390/utf8-utf32-z9.c (__to_utf8_loop_vx_cu):
00db10
	Use vector and cu41 instruction.
00db10
	* sysdeps/s390/multiarch/utf8-utf32-z9.c: Add __to_utf8_loop_vx_cu
00db10
	in ifunc resolver.
00db10
---
00db10
 sysdeps/s390/multiarch/utf8-utf32-z9.c |   8 ++-
00db10
 sysdeps/s390/utf8-utf32-z9.c           | 112 +++++++++++++++++++++++++++++++++
00db10
 2 files changed, 117 insertions(+), 3 deletions(-)
00db10
00db10
diff --git a/sysdeps/s390/multiarch/utf8-utf32-z9.c b/sysdeps/s390/multiarch/utf8-utf32-z9.c
00db10
index faf1f46..0c6d9e9 100644
00db10
--- a/sysdeps/s390/multiarch/utf8-utf32-z9.c
00db10
+++ b/sysdeps/s390/multiarch/utf8-utf32-z9.c
00db10
@@ -41,8 +41,10 @@ s390_libc_ifunc_expr (FROM_LOOP_DEFAULT, FROM_LOOP,
00db10
 			: FROM_LOOP_DEFAULT);
00db10
 
00db10
 s390_libc_ifunc_expr (TO_LOOP_DEFAULT, TO_LOOP,
00db10
-		      (HAVE_TO_VX && (hwcap & HWCAP_S390_VX))
00db10
-		      ? TO_LOOP_VX
00db10
-		      : TO_LOOP_DEFAULT);
00db10
+		      (HAVE_TO_VX_CU && (hwcap & HWCAP_S390_VXE))
00db10
+		      ? TO_LOOP_VX_CU
00db10
+		      : (HAVE_TO_VX && (hwcap & HWCAP_S390_VX))
00db10
+			? TO_LOOP_VX
00db10
+			: TO_LOOP_DEFAULT);
00db10
 
00db10
 #include <iconv/skeleton.c>
00db10
diff --git a/sysdeps/s390/utf8-utf32-z9.c b/sysdeps/s390/utf8-utf32-z9.c
00db10
index e06d11e..e4f2e0c 100644
00db10
--- a/sysdeps/s390/utf8-utf32-z9.c
00db10
+++ b/sysdeps/s390/utf8-utf32-z9.c
00db10
@@ -52,9 +52,11 @@
00db10
 #if defined HAVE_S390_VX_ASM_SUPPORT && defined USE_MULTIARCH
00db10
 # define HAVE_FROM_VX		1
00db10
 # define HAVE_TO_VX		1
00db10
+# define HAVE_TO_VX_CU		1
00db10
 #else
00db10
 # define HAVE_FROM_VX		0
00db10
 # define HAVE_TO_VX		0
00db10
+# define HAVE_TO_VX_CU		0
00db10
 #endif
00db10
 
00db10
 #if defined HAVE_S390_VX_GCC_SUPPORT
00db10
@@ -863,6 +865,116 @@ gconv_end (struct __gconv_step *data)
00db10
 # define TO_LOOP_VX		NULL
00db10
 #endif /* HAVE_TO_VX != 1  */
00db10
 
00db10
+#if HAVE_TO_VX_CU == 1
00db10
+#define BODY_TO_VX_CU							\
00db10
+  {									\
00db10
+    register const unsigned char* pInput asm ("8") = inptr;		\
00db10
+    register size_t inlen asm ("9") = inend - inptr;			\
00db10
+    register unsigned char* pOutput asm ("10") = outptr;		\
00db10
+    register size_t outlen asm ("11") = outend - outptr;		\
00db10
+    unsigned long tmp, tmp2;						\
00db10
+    asm volatile (".machine push\n\t"					\
00db10
+		  ".machine \"z13\"\n\t"				\
00db10
+		  ".machinemode \"zarch_nohighgprs\"\n\t"		\
00db10
+		  "    vleif %%v20,127,0\n\t"   /* element 0: 127  */	\
00db10
+		  "    vzero %%v21\n\t"					\
00db10
+		  "    vleih %%v21,8192,0\n\t"  /* element 0:   >  */	\
00db10
+		  "    vleih %%v21,-8192,2\n\t" /* element 1: =<>  */	\
00db10
+		  CONVERT_32BIT_SIZE_T ([R_INLEN])			\
00db10
+		  CONVERT_32BIT_SIZE_T ([R_OUTLEN])			\
00db10
+		  /* Loop which handles UTF-32 chars <= 0x7f.  */	\
00db10
+		  "0:  clgijl %[R_INLEN],64,20f\n\t"			\
00db10
+		  "    clgijl %[R_OUTLEN],16,20f\n\t"			\
00db10
+		  "1:  vlm %%v16,%%v19,0(%[R_IN])\n\t"			\
00db10
+		  "    lghi %[R_TMP],0\n\t"				\
00db10
+		  /* Shorten to byte values.  */			\
00db10
+		  "    vpkf %%v23,%%v16,%%v17\n\t"			\
00db10
+		  "    vpkf %%v24,%%v18,%%v19\n\t"			\
00db10
+		  "    vpkh %%v23,%%v23,%%v24\n\t"			\
00db10
+		  /* Checking for values > 0x7f.  */			\
00db10
+		  "    vstrcfs %%v22,%%v16,%%v20,%%v21\n\t"		\
00db10
+		  "    jno 10f\n\t"					\
00db10
+		  "    vstrcfs %%v22,%%v17,%%v20,%%v21\n\t"		\
00db10
+		  "    jno 11f\n\t"					\
00db10
+		  "    vstrcfs %%v22,%%v18,%%v20,%%v21\n\t"		\
00db10
+		  "    jno 12f\n\t"					\
00db10
+		  "    vstrcfs %%v22,%%v19,%%v20,%%v21\n\t"		\
00db10
+		  "    jno 13f\n\t"					\
00db10
+		  /* Store 16bytes to outptr.  */			\
00db10
+		  "    vst %%v23,0(%[R_OUT])\n\t"			\
00db10
+		  "    aghi %[R_INLEN],-64\n\t"				\
00db10
+		  "    aghi %[R_OUTLEN],-16\n\t"			\
00db10
+		  "    la %[R_IN],64(%[R_IN])\n\t"			\
00db10
+		  "    la %[R_OUT],16(%[R_OUT])\n\t"			\
00db10
+		  "    clgijl %[R_INLEN],64,20f\n\t"			\
00db10
+		  "    clgijl %[R_OUTLEN],16,20f\n\t"			\
00db10
+		  "    j 1b\n\t"					\
00db10
+		  /* Found a value > 0x7f.  */				\
00db10
+		  "13: ahi %[R_TMP],4\n\t"				\
00db10
+		  "12: ahi %[R_TMP],4\n\t"				\
00db10
+		  "11: ahi %[R_TMP],4\n\t"				\
00db10
+		  "10: vlgvb %[R_I],%%v22,7\n\t"			\
00db10
+		  "    srlg %[R_I],%[R_I],2\n\t"			\
00db10
+		  "    agr %[R_I],%[R_TMP]\n\t"				\
00db10
+		  "    je 20f\n\t"					\
00db10
+		  /* Store characters before invalid one...  */		\
00db10
+		  "    slgr %[R_OUTLEN],%[R_I]\n\t"			\
00db10
+		  "15: aghi %[R_I],-1\n\t"				\
00db10
+		  "    vstl %%v23,%[R_I],0(%[R_OUT])\n\t"		\
00db10
+		  /* ... and update pointers.  */			\
00db10
+		  "    aghi %[R_I],1\n\t"				\
00db10
+		  "    la %[R_OUT],0(%[R_I],%[R_OUT])\n\t"		\
00db10
+		  "    sllg %[R_I],%[R_I],2\n\t"			\
00db10
+		  "    la %[R_IN],0(%[R_I],%[R_IN])\n\t"		\
00db10
+		  "    slgr %[R_INLEN],%[R_I]\n\t"			\
00db10
+		  /* Handle multibyte utf8-char with convert instruction. */ \
00db10
+		  "20: cu41 %[R_OUT],%[R_IN]\n\t"			\
00db10
+		  "    jo 0b\n\t" /* Try vector implementation again.  */ \
00db10
+		  "    lochil %[R_RES],%[RES_OUT_FULL]\n\t" /* cc == 1.  */ \
00db10
+		  "    lochih %[R_RES],%[RES_IN_ILL]\n\t" /* cc == 2.  */ \
00db10
+		  ".machine pop"					\
00db10
+		  : /* outputs */ [R_IN] "+a" (pInput)			\
00db10
+		    , [R_INLEN] "+d" (inlen), [R_OUT] "+a" (pOutput)	\
00db10
+		    , [R_OUTLEN] "+d" (outlen), [R_TMP] "=d" (tmp)	\
00db10
+		    , [R_I] "=a" (tmp2)					\
00db10
+		    , [R_RES] "+d" (result)				\
00db10
+		  : /* inputs */					\
00db10
+		    [RES_OUT_FULL] "i" (__GCONV_FULL_OUTPUT)		\
00db10
+		    , [RES_IN_ILL] "i" (__GCONV_ILLEGAL_INPUT)		\
00db10
+		  : /* clobber list */ "memory", "cc"			\
00db10
+		    ASM_CLOBBER_VR ("v16") ASM_CLOBBER_VR ("v17")	\
00db10
+		    ASM_CLOBBER_VR ("v18") ASM_CLOBBER_VR ("v19")	\
00db10
+		    ASM_CLOBBER_VR ("v20") ASM_CLOBBER_VR ("v21")	\
00db10
+		    ASM_CLOBBER_VR ("v22") ASM_CLOBBER_VR ("v23")	\
00db10
+		    ASM_CLOBBER_VR ("v24")				\
00db10
+		  );							\
00db10
+    inptr = pInput;							\
00db10
+    outptr = pOutput;							\
00db10
+									\
00db10
+    if (__glibc_likely (inptr == inend)					\
00db10
+	|| result == __GCONV_FULL_OUTPUT)				\
00db10
+      break;								\
00db10
+    if (inptr + 4 > inend)						\
00db10
+      {									\
00db10
+	result = __GCONV_INCOMPLETE_INPUT;				\
00db10
+	break;								\
00db10
+      }									\
00db10
+    STANDARD_TO_LOOP_ERR_HANDLER (4);					\
00db10
+  }
00db10
+
00db10
+/* Generate loop-function with hardware vector and utf-convert instructions.  */
00db10
+# define MIN_NEEDED_INPUT	MIN_NEEDED_TO
00db10
+# define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
00db10
+# define MAX_NEEDED_OUTPUT	MAX_NEEDED_FROM
00db10
+# define TO_LOOP_VX_CU		__to_utf8_loop_vx_cu
00db10
+# define LOOPFCT		TO_LOOP_VX_CU
00db10
+# define BODY			BODY_TO_VX_CU
00db10
+# define LOOP_NEED_FLAGS
00db10
+# include <iconv/loop.c>
00db10
+#else
00db10
+# define TO_LOOP_VX_CU		NULL
00db10
+#endif /* HAVE_TO_VX_CU != 1  */
00db10
+
00db10
 /* This file also exists in sysdeps/s390/multiarch/ which
00db10
    generates ifunc resolvers for FROM/TO_LOOP functions
00db10
    and includes iconv/skeleton.c afterwards.  */
00db10
-- 
00db10
1.8.3.1
00db10