ce426f
From 7dd60718b327b3eb6112ec3900750007b0259189 Mon Sep 17 00:00:00 2001
ce426f
From: raji <raji@oc4354787705.ibm.com>
ce426f
Date: Tue, 14 Jun 2016 14:51:16 +0530
ce426f
Subject: [PATCH] powerpc: strcasecmp/strncasecmp optimization for power8
ce426f
ce426f
This implementation utilizes vectors to improve performance
ce426f
compared to current byte by byte implementation for POWER7.
ce426f
The performance improvement is up to 4x.  This patch is tested
ce426f
on powerpc64 and powerpc64le.
ce426f
ce426f
(cherry picked from commit c8376f3e07602aaef9cb843bb73cb5f2b860634a)
ce426f
ce426f
Conflicts:
ce426f
	sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power7.S
ce426f
	sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
ce426f
---
ce426f
 ChangeLog                                          |  22 +
ce426f
 sysdeps/powerpc/powerpc64/multiarch/Makefile       |   4 +-
ce426f
 .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |   6 +
ce426f
 .../powerpc64/multiarch/strcasecmp-power7.S        |  20 +-
ce426f
 .../powerpc64/multiarch/strcasecmp-power8.S        |  28 ++
ce426f
 .../powerpc/powerpc64/multiarch/strcasecmp-ppc64.c |  21 +
ce426f
 sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c   |  32 +-
ce426f
 .../powerpc/powerpc64/multiarch/strncase-power8.S  |  28 ++
ce426f
 .../powerpc/powerpc64/multiarch/strncase-ppc64.c   |  21 +
ce426f
 sysdeps/powerpc/powerpc64/multiarch/strncase.c     |  25 +-
ce426f
 sysdeps/powerpc/powerpc64/power8/strcasecmp.S      | 446 +++++++++++++++++++++
ce426f
 sysdeps/powerpc/powerpc64/power8/strncase.S        |  20 +
ce426f
 12 files changed, 622 insertions(+), 51 deletions(-)
ce426f
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power8.S
ce426f
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcasecmp-ppc64.c
ce426f
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncase-power8.S
ce426f
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncase-ppc64.c
ce426f
 create mode 100644 sysdeps/powerpc/powerpc64/power8/strcasecmp.S
ce426f
 create mode 100644 sysdeps/powerpc/powerpc64/power8/strncase.S
ce426f
ce426f
diff --git a/ChangeLog b/ChangeLog
ce426f
index c01d1a0..9385bd0 100644
ce426f
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
ce426f
index 9ee9bc2..e3ac285 100644
ce426f
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
ce426f
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
ce426f
@@ -21,6 +21,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
ce426f
                   mempcpy-power7 mempcpy-ppc64 memchr-power7 memchr-ppc64 \
ce426f
                   memrchr-power7 memrchr-ppc64 rawmemchr-power7 \
ce426f
                   stpcpy-power8 stpcpy-power7 stpcpy-ppc64 \
ce426f
+                  strcasecmp-ppc64 strcasecmp-power8 \
ce426f
+                  strncase-ppc64 strncase-power8 \
ce426f
                   strcasestr-power8 strcasestr-ppc64 \
ce426f
                   strcat-power8 strcat-power7 strcat-ppc64 \
ce426f
                   strcmp-power8 strcmp-power7 strcmp-ppc64 \
ce426f
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
ce426f
index 228891f..aabd7bc 100644
ce426f
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
ce426f
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
ce426f
@@ -204,6 +204,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
ce426f
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c.  */
ce426f
   IFUNC_IMPL (i, name, strcasecmp,
ce426f
              IFUNC_IMPL_ADD (array, i, strcasecmp,
ce426f
+                             hwcap2 & PPC_FEATURE2_ARCH_2_07,
ce426f
+                              __strcasecmp_power8)
ce426f
+             IFUNC_IMPL_ADD (array, i, strcasecmp,
ce426f
                              hwcap & PPC_FEATURE_HAS_VSX,
ce426f
                              __strcasecmp_power7)
ce426f
              IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_ppc))
ce426f
@@ -219,6 +222,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
ce426f
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncase.c.  */
ce426f
   IFUNC_IMPL (i, name, strncasecmp,
ce426f
              IFUNC_IMPL_ADD (array, i, strncasecmp,
ce426f
+                             hwcap2 & PPC_FEATURE2_ARCH_2_07,
ce426f
+                             __strncasecmp_power8)
ce426f
+             IFUNC_IMPL_ADD (array, i, strncasecmp,
ce426f
                              hwcap & PPC_FEATURE_HAS_VSX,
ce426f
                              __strncasecmp_power7)
ce426f
              IFUNC_IMPL_ADD (array, i, strncasecmp, 1, __strncasecmp_ppc))
ce426f
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power7.S
ce426f
index 56eed9a..99cd7bd 100644
ce426f
--- a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power7.S
ce426f
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power7.S
ce426f
@@ -1,5 +1,5 @@
ce426f
-/* Optimized strcasecmp implementation foOWER7.
ce426f
-   Copyright (C) 2013-2014 Free Software Foundation, Inc.
ce426f
+/* Optimized strcasecmp implementation for POWER7.
ce426f
+   Copyright (C) 2013-2016 Free Software Foundation, Inc.
ce426f
    This file is part of the GNU C Library.
ce426f
 
ce426f
    The GNU C Library is free software; you can redistribute it and/or
ce426f
@@ -18,21 +18,7 @@
ce426f
 
ce426f
 #include <sysdep.h>
ce426f
 
ce426f
-#undef ENTRY
ce426f
-#define ENTRY(name)						\
ce426f
-  .section ".text";						\
ce426f
-  ENTRY_2(__strcasecmp_power7)					\
ce426f
-  .align ALIGNARG(2);						\
ce426f
-  BODY_LABEL(__strcasecmp_power7):				\
ce426f
-  cfi_startproc;						\
ce426f
-  LOCALENTRY(__strcasecmp_power7)
ce426f
-
ce426f
-#undef END
ce426f
-#define END(name)						\
ce426f
-  cfi_endproc;							\
ce426f
-  TRACEBACK(__strcasecmp_power7)				\
ce426f
-  END_2(__strcasecmp_power7)
ce426f
-
ce426f
+#define __strcasecmp __strcasecmp_power7
ce426f
 #undef weak_alias
ce426f
 #define weak_alias(name, alias)
ce426f
 
ce426f
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power8.S
ce426f
new file mode 100644
ce426f
index 0000000..492047a
ce426f
--- /dev/null
ce426f
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power8.S
ce426f
@@ -0,0 +1,28 @@
ce426f
+/* Optimized strcasecmp implementation for POWER8.
ce426f
+   Copyright (C) 2016 Free Software Foundation, Inc.
ce426f
+   This file is part of the GNU C Library.
ce426f
+
ce426f
+   The GNU C Library is free software; you can redistribute it and/or
ce426f
+   modify it under the terms of the GNU Lesser General Public
ce426f
+   License as published by the Free Software Foundation; either
ce426f
+   version 2.1 of the License, or (at your option) any later version.
ce426f
+
ce426f
+   The GNU C Library is distributed in the hope that it will be useful,
ce426f
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
ce426f
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
ce426f
+   Lesser General Public License for more details.
ce426f
+
ce426f
+   You should have received a copy of the GNU Lesser General Public
ce426f
+   License along with the GNU C Library; if not, see
ce426f
+   <http://www.gnu.org/licenses/>.  */
ce426f
+
ce426f
+#include <sysdep.h>
ce426f
+
ce426f
+#define __strcasecmp __strcasecmp_power8
ce426f
+#undef weak_alias
ce426f
+#define weak_alias(name, alias)
ce426f
+
ce426f
+#undef libc_hidden_builtin_def
ce426f
+#define libc_hidden_builtin_def(name)
ce426f
+
ce426f
+#include <sysdeps/powerpc/powerpc64/power8/strcasecmp.S>
ce426f
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-ppc64.c
ce426f
new file mode 100644
ce426f
index 0000000..6318b4a
ce426f
--- /dev/null
ce426f
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-ppc64.c
ce426f
@@ -0,0 +1,21 @@
ce426f
+/* Multiarch strcasecmp for PPC64.
ce426f
+   Copyright (C) 2016 Free Software Foundation, Inc.
ce426f
+   This file is part of the GNU C Library.
ce426f
+
ce426f
+   The GNU C Library is free software; you can redistribute it and/or
ce426f
+   modify it under the terms of the GNU Lesser General Public
ce426f
+   License as published by the Free Software Foundation; either
ce426f
+   version 2.1 of the License, or (at your option) any later version.
ce426f
+
ce426f
+   The GNU C Library is distributed in the hope that it will be useful,
ce426f
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
ce426f
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
ce426f
+   Lesser General Public License for more details.
ce426f
+
ce426f
+   You should have received a copy of the GNU Lesser General Public
ce426f
+   License along with the GNU C Library; if not, see
ce426f
+   <http://www.gnu.org/licenses/>.  */
ce426f
+
ce426f
+#define strcasecmp __strcasecmp_ppc
ce426f
+
ce426f
+#include <string/strcasecmp.c>
ce426f
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
ce426f
index 979e9f1..5ec6885 100644
ce426f
--- a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
ce426f
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
ce426f
@@ -1,5 +1,5 @@
ce426f
-/* Multiple versions of strcasecmp.
ce426f
-   Copyright (C) 2013-2014 Free Software Foundation, Inc.
ce426f
+/* Multiple versions of strcasecmp.
ce426f
+   Copyright (C) 2013-2016 Free Software Foundation, Inc.
ce426f
    This file is part of the GNU C Library.
ce426f
 
ce426f
    The GNU C Library is free software; you can redistribute it and/or
ce426f
@@ -16,25 +16,21 @@
ce426f
    License along with the GNU C Library; if not, see
ce426f
    <http://www.gnu.org/licenses/>.  */
ce426f
 
ce426f
-#if IS_IN (libc)
ce426f
-# include <string.h>
ce426f
-# define strcasecmp __strcasecmp_ppc
ce426f
-extern __typeof (__strcasecmp) __strcasecmp_ppc attribute_hidden;
ce426f
-extern __typeof (__strcasecmp) __strcasecmp_power7 attribute_hidden;
ce426f
-#endif
ce426f
+#include <string.h>
ce426f
+#include <shlib-compat.h>
ce426f
+#include "init-arch.h"
ce426f
 
ce426f
-#include <string/strcasecmp.c>
ce426f
-#undef strcasecmp
ce426f
+extern __typeof (__strcasecmp) __libc_strcasecmp;
ce426f
 
ce426f
-#if IS_IN (libc)
ce426f
-# include <shlib-compat.h>
ce426f
-# include "init-arch.h"
ce426f
+extern __typeof (__strcasecmp) __strcasecmp_ppc attribute_hidden;
ce426f
+extern __typeof (__strcasecmp) __strcasecmp_power7 attribute_hidden;
ce426f
+extern __typeof (__strcasecmp) __strcasecmp_power8 attribute_hidden;
ce426f
 
ce426f
-extern __typeof (__strcasecmp) __libc_strcasecmp;
ce426f
 libc_ifunc (__libc_strcasecmp,
ce426f
-	    (hwcap & PPC_FEATURE_HAS_VSX)
ce426f
-            ? __strcasecmp_power7
ce426f
-            : __strcasecmp_ppc);
ce426f
+	     (hwcap2 & PPC_FEATURE2_ARCH_2_07)
ce426f
+             ? __strcasecmp_power8:
ce426f
+	     (hwcap & PPC_FEATURE_HAS_VSX)
ce426f
+             ? __strcasecmp_power7
ce426f
+             : __strcasecmp_ppc);
ce426f
 
ce426f
 weak_alias (__libc_strcasecmp, strcasecmp)
ce426f
-#endif
ce426f
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strncase-power8.S
ce426f
new file mode 100644
ce426f
index 0000000..01a63b5
ce426f
--- /dev/null
ce426f
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncase-power8.S
ce426f
@@ -0,0 +1,28 @@
ce426f
+/* Optimized strncasecmp implementation for POWER8.
ce426f
+   Copyright (C) 2016 Free Software Foundation, Inc.
ce426f
+   This file is part of the GNU C Library.
ce426f
+
ce426f
+   The GNU C Library is free software; you can redistribute it and/or
ce426f
+   modify it under the terms of the GNU Lesser General Public
ce426f
+   License as published by the Free Software Foundation; either
ce426f
+   version 2.1 of the License, or (at your option) any later version.
ce426f
+
ce426f
+   The GNU C Library is distributed in the hope that it will be useful,
ce426f
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
ce426f
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
ce426f
+   Lesser General Public License for more details.
ce426f
+
ce426f
+   You should have received a copy of the GNU Lesser General Public
ce426f
+   License along with the GNU C Library; if not, see
ce426f
+   <http://www.gnu.org/licenses/>.  */
ce426f
+
ce426f
+#include <sysdep.h>
ce426f
+
ce426f
+#define __strncasecmp __strncasecmp_power8
ce426f
+#undef weak_alias
ce426f
+#define weak_alias(name, alias)
ce426f
+
ce426f
+#undef libc_hidden_builtin_def
ce426f
+#define libc_hidden_builtin_def(name)
ce426f
+
ce426f
+#include <sysdeps/powerpc/powerpc64/power8/strncase.S>
ce426f
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strncase-ppc64.c
ce426f
new file mode 100644
ce426f
index 0000000..c245d77
ce426f
--- /dev/null
ce426f
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncase-ppc64.c
ce426f
@@ -0,0 +1,21 @@
ce426f
+/* Multiarch strncasecmp for PPC64.
ce426f
+   Copyright (C) 2016 Free Software Foundation, Inc.
ce426f
+   This file is part of the GNU C Library.
ce426f
+
ce426f
+   The GNU C Library is free software; you can redistribute it and/or
ce426f
+   modify it under the terms of the GNU Lesser General Public
ce426f
+   License as published by the Free Software Foundation; either
ce426f
+   version 2.1 of the License, or (at your option) any later version.
ce426f
+
ce426f
+   The GNU C Library is distributed in the hope that it will be useful,
ce426f
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
ce426f
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
ce426f
+   Lesser General Public License for more details.
ce426f
+
ce426f
+   You should have received a copy of the GNU Lesser General Public
ce426f
+   License along with the GNU C Library; if not, see
ce426f
+   <http://www.gnu.org/licenses/>.  */
ce426f
+
ce426f
+#define strncasecmp __strncasecmp_ppc
ce426f
+
ce426f
+#include <string/strncase.c>
ce426f
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase.c b/sysdeps/powerpc/powerpc64/multiarch/strncase.c
ce426f
index 4339f3a..5bfaf65 100644
ce426f
--- a/sysdeps/powerpc/powerpc64/multiarch/strncase.c
ce426f
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncase.c
ce426f
@@ -16,26 +16,21 @@
ce426f
    License along with the GNU C Library; if not, see
ce426f
    <http://www.gnu.org/licenses/>.  */
ce426f
 
ce426f
-#if IS_IN (libc)
ce426f
-# include <string.h>
ce426f
-# define strncasecmp __strncasecmp_ppc
ce426f
-extern __typeof (__strncasecmp) __strncasecmp_ppc attribute_hidden;
ce426f
-extern __typeof (__strncasecmp) __strncasecmp_power7 attribute_hidden;
ce426f
-#endif
ce426f
+#include <string.h>
ce426f
+#include <shlib-compat.h>
ce426f
+#include "init-arch.h"
ce426f
 
ce426f
-#include <string/strncase.c>
ce426f
-#undef strncasecmp
ce426f
+extern __typeof (__strncasecmp) __libc_strncasecmp;
ce426f
 
ce426f
-#if IS_IN (libc)
ce426f
-# include <shlib-compat.h>
ce426f
-# include "init-arch.h"
ce426f
+extern __typeof (__strncasecmp) __strncasecmp_ppc attribute_hidden;
ce426f
+extern __typeof (__strncasecmp) __strncasecmp_power7 attribute_hidden;
ce426f
+extern __typeof (__strncasecmp) __strncasecmp_power8 attribute_hidden;
ce426f
 
ce426f
-/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ce426f
-   ifunc symbol properly.  */
ce426f
-extern __typeof (__strncasecmp) __libc_strncasecmp;
ce426f
 libc_ifunc (__libc_strncasecmp,
ce426f
+	     (hwcap2 & PPC_FEATURE2_ARCH_2_07)
ce426f
+             ? __strncasecmp_power8:
ce426f
 	     (hwcap & PPC_FEATURE_HAS_VSX)
ce426f
              ? __strncasecmp_power7
ce426f
              : __strncasecmp_ppc);
ce426f
+
ce426f
 weak_alias (__libc_strncasecmp, strncasecmp)
ce426f
-#endif
ce426f
diff --git a/sysdeps/powerpc/powerpc64/power8/strcasecmp.S b/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
ce426f
new file mode 100644
ce426f
index 0000000..63f6217
ce426f
--- /dev/null
ce426f
+++ b/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
ce426f
@@ -0,0 +1,446 @@
ce426f
+/* Optimized strcasecmp implementation for POWER8.
ce426f
+   Copyright (C) 2016 Free Software Foundation, Inc.
ce426f
+   This file is part of the GNU C Library.
ce426f
+
ce426f
+   The GNU C Library is free software; you can redistribute it and/or
ce426f
+   modify it under the terms of the GNU Lesser General Public
ce426f
+   License as published by the Free Software Foundation; either
ce426f
+   version 2.1 of the License, or (at your option) any later version.
ce426f
+
ce426f
+   The GNU C Library is distributed in the hope that it will be useful,
ce426f
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
ce426f
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
ce426f
+   Lesser General Public License for more details.
ce426f
+
ce426f
+   You should have received a copy of the GNU Lesser General Public
ce426f
+   License along with the GNU C Library; if not, see
ce426f
+   <http://www.gnu.org/licenses/>.  */
ce426f
+
ce426f
+#include <sysdep.h>
ce426f
+#include <locale-defines.h>
ce426f
+
ce426f
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] ) */
ce426f
+
ce426f
+#ifndef USE_AS_STRNCASECMP
ce426f
+#  define __STRCASECMP __strcasecmp
ce426f
+#  define STRCASECMP   strcasecmp
ce426f
+#else
ce426f
+#  define __STRCASECMP __strncasecmp
ce426f
+#  define STRCASECMP   strncasecmp
ce426f
+#endif
ce426f
+/* Convert the 16 bytes in v4 and in v5 to lowercase, then compare them.  */
ce426f
+#define TOLOWER()     \
ce426f
+	vaddubm	v8, v4, v1; \
ce426f
+	vaddubm	v7, v4, v3; \
ce426f
+	vcmpgtub	v8, v8, v2; \
ce426f
+	vsel	v4, v7, v4, v8; \
ce426f
+	vaddubm	v8, v5, v1; \
ce426f
+	vaddubm	v7, v5, v3; \
ce426f
+	vcmpgtub	v8, v8, v2; \
ce426f
+	vsel	v5, v7, v5, v8; \
ce426f
+	vcmpequb.	v7, v5, v4;
ce426f
+
ce426f
+/* Get 16 bytes for unaligned case.  */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+#define GET16BYTES(reg1, reg2, reg3) \
ce426f
+	lvx	reg1, 0, reg2; \
ce426f
+	vcmpequb.	v8, v0, reg1; \
ce426f
+	beq	cr6, 1f; \
ce426f
+	vspltisb	v9, 0; \
ce426f
+	b	2f; \
ce426f
+	.align 4; \
ce426f
+1: \
ce426f
+	addi	r6, reg2, 16; \
ce426f
+	lvx	v9, 0, r6; \
ce426f
+2: \
ce426f
+	vperm	reg1, v9, reg1, reg3;
ce426f
+#else
ce426f
+#define GET16BYTES(reg1, reg2, reg3) \
ce426f
+	lvx	reg1, 0, reg2; \
ce426f
+	vcmpequb.	v8, v0, reg1; \
ce426f
+	beq	cr6, 1f; \
ce426f
+	vspltisb	v9, 0; \
ce426f
+	b	2f; \
ce426f
+	.align 4; \
ce426f
+1: \
ce426f
+	addi	r6, reg2, 16; \
ce426f
+	lvx	v9, 0, r6; \
ce426f
+2: \
ce426f
+	vperm	reg1, reg1, v9, reg3;
ce426f
+#endif
ce426f
+
ce426f
+/* Check null in v4, v5 and convert to lower.  */
ce426f
+#define CHECKNULLANDCONVERT() \
ce426f
+	vcmpequb.	v7, v0, v5; \
ce426f
+	beq	cr6, 3f; \
ce426f
+	vcmpequb.	v7, v0, v4; \
ce426f
+	beq	cr6, 3f; \
ce426f
+	b	L(null_found); \
ce426f
+	.align  4; \
ce426f
+3: \
ce426f
+	TOLOWER()
ce426f
+
ce426f
+#ifdef _ARCH_PWR8
ce426f
+#  define VCLZD_V8_v7	vclzd	v8, v7;
ce426f
+#  define MFVRD_R3_V1	mfvrd	r3, v1;
ce426f
+#  define VSUBUDM_V9_V8	vsubudm	v9, v9, v8;
ce426f
+#  define VPOPCNTD_V8_V8	vpopcntd v8, v8;
ce426f
+#  define VADDUQM_V7_V8	vadduqm	v9, v7, v8;
ce426f
+#else
ce426f
+#  define VCLZD_V8_v7	.long	0x11003fc2
ce426f
+#  define MFVRD_R3_V1	.long	0x7c230067
ce426f
+#  define VSUBUDM_V9_V8	.long	0x112944c0
ce426f
+#  define VPOPCNTD_V8_V8	.long	0x110047c3
ce426f
+#  define VADDUQM_V7_V8	.long	0x11274100
ce426f
+#endif
ce426f
+
ce426f
+	.machine  power7
ce426f
+
ce426f
+ENTRY (__STRCASECMP)
ce426f
+#ifdef USE_AS_STRNCASECMP
ce426f
+	CALL_MCOUNT 3
ce426f
+#else
ce426f
+	CALL_MCOUNT 2
ce426f
+#endif
ce426f
+#define rRTN	r3	/* Return value */
ce426f
+#define rSTR1	r10	/* 1st string */
ce426f
+#define rSTR2	r4	/* 2nd string */
ce426f
+#define rCHAR1	r6	/* Byte read from 1st string */
ce426f
+#define rCHAR2	r7	/* Byte read from 2nd string */
ce426f
+#define rADDR1	r8	/* Address of tolower(rCHAR1) */
ce426f
+#define rADDR2	r12	/* Address of tolower(rCHAR2) */
ce426f
+#define rLWR1	r8	/* Word tolower(rCHAR1) */
ce426f
+#define rLWR2	r12	/* Word tolower(rCHAR2) */
ce426f
+#define rTMP	r9
ce426f
+#define rLOC	r11	/* Default locale address */
ce426f
+
ce426f
+	cmpd	cr7, rRTN, rSTR2
ce426f
+
ce426f
+	/* Get locale address.  */
ce426f
+	ld 	rTMP, __libc_tsd_LOCALE@got@tprel(r2)
ce426f
+	add 	rLOC, rTMP, __libc_tsd_LOCALE@tls
ce426f
+	ld	rLOC, 0(rLOC)
ce426f
+
ce426f
+	mr	rSTR1, rRTN
ce426f
+	li	rRTN, 0
ce426f
+	beqlr	cr7
ce426f
+#ifdef USE_AS_STRNCASECMP
ce426f
+	cmpdi	cr7, r5, 0
ce426f
+	beq	cr7, L(retnull)
ce426f
+	cmpdi	cr7, r5, 16
ce426f
+	blt	cr7, L(bytebybyte)
ce426f
+#endif
ce426f
+	vspltisb	v0, 0
ce426f
+	vspltisb	v8, -1
ce426f
+	/* Check for null in initial characters.
ce426f
+	   Check max of 16 char depending on the alignment.
ce426f
+	   If null is present, proceed byte by byte.  */
ce426f
+	lvx	v4, 0, rSTR1
ce426f
+#ifdef  __LITTLE_ENDIAN__
ce426f
+	lvsr	v10, 0, rSTR1	/* Compute mask.  */
ce426f
+	vperm	v9, v8, v4, v10	/* Mask bits that are not part of string.  */
ce426f
+#else
ce426f
+	lvsl	v10, 0, rSTR1
ce426f
+	vperm	v9, v4, v8, v10
ce426f
+#endif
ce426f
+	vcmpequb.	v9, v0, v9	/* Check for null bytes.  */
ce426f
+	bne	cr6, L(bytebybyte)
ce426f
+	lvx	v5, 0, rSTR2
ce426f
+	/* Calculate alignment.  */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	lvsr	v6, 0, rSTR2
ce426f
+	vperm	v9, v8, v5, v6	/* Mask bits that are not part of string.  */
ce426f
+#else
ce426f
+	lvsl	v6, 0, rSTR2
ce426f
+	vperm	v9, v5, v8, v6
ce426f
+#endif
ce426f
+	vcmpequb.	v9, v0, v9	/* Check for null bytes.  */
ce426f
+	bne	cr6, L(bytebybyte)
ce426f
+	/* Check if locale has non ascii characters.  */
ce426f
+	ld	rTMP, 0(rLOC)
ce426f
+	addi r6, rTMP,LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
ce426f
+	lwz	rTMP, 0(r6)
ce426f
+	cmpdi	cr7, rTMP, 1
ce426f
+	beq	cr7, L(bytebybyte)
ce426f
+
ce426f
+	/* Load vector registers with values used for TOLOWER.  */
ce426f
+	/* Load v1 = 0xbf, v2 = 0x19, v3 = 0x20 in each byte.  */
ce426f
+	vspltisb	v3, 2
ce426f
+	vspltisb	v9, 4
ce426f
+	vsl	v3, v3, v9
ce426f
+	vaddubm	v1, v3, v3
ce426f
+	vnor	v1, v1, v1
ce426f
+	vspltisb	v2, 7
ce426f
+	vsububm	v2, v3, v2
ce426f
+
ce426f
+	andi.	rADDR1, rSTR1, 0xF
ce426f
+	beq	cr0, L(align)
ce426f
+	addi	r6, rSTR1, 16
ce426f
+	lvx	v9, 0, r6
ce426f
+	/* Compute 16 bytes from previous two loads.  */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	vperm	v4, v9, v4, v10
ce426f
+#else
ce426f
+	vperm	v4, v4, v9, v10
ce426f
+#endif
ce426f
+L(align):
ce426f
+	andi.	rADDR2, rSTR2, 0xF
ce426f
+	beq	cr0, L(align1)
ce426f
+	addi	r6, rSTR2, 16
ce426f
+	lvx	v9, 0, r6
ce426f
+	/* Compute 16 bytes from previous two loads.  */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	vperm	v5, v9, v5, v6
ce426f
+#else
ce426f
+	vperm	v5, v5, v9, v6
ce426f
+#endif
ce426f
+L(align1):
ce426f
+	CHECKNULLANDCONVERT()
ce426f
+	blt	cr6, L(match)
ce426f
+	b	L(different)
ce426f
+	.align 	4
ce426f
+L(match):
ce426f
+	clrldi	r6, rSTR1, 60
ce426f
+	subfic	r7, r6, 16
ce426f
+#ifdef USE_AS_STRNCASECMP
ce426f
+	sub	r5, r5, r7
ce426f
+#endif
ce426f
+	add	rSTR1, rSTR1, r7
ce426f
+	add	rSTR2, rSTR2, r7
ce426f
+	andi.	rADDR2, rSTR2, 0xF
ce426f
+	addi	rSTR1, rSTR1, -16
ce426f
+	addi	rSTR2, rSTR2, -16
ce426f
+	beq	cr0, L(aligned)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	lvsr	v6, 0, rSTR2
ce426f
+#else
ce426f
+	lvsl	v6, 0, rSTR2
ce426f
+#endif
ce426f
+	/* There are 2 loops depending on the input alignment.
ce426f
+	   Each loop gets 16 bytes from s1 and s2, check for null,
ce426f
+	   convert to lowercase and compare. Loop till difference
ce426f
+	   or null occurs. */
ce426f
+L(s1_align):
ce426f
+	addi	rSTR1, rSTR1, 16
ce426f
+	addi	rSTR2, rSTR2, 16
ce426f
+#ifdef USE_AS_STRNCASECMP
ce426f
+	cmpdi	cr7, r5, 16
ce426f
+	blt	cr7, L(bytebybyte)
ce426f
+	addi	r5, r5, -16
ce426f
+#endif
ce426f
+	lvx	v4, 0, rSTR1
ce426f
+	GET16BYTES(v5, rSTR2, v6)
ce426f
+	CHECKNULLANDCONVERT()
ce426f
+	blt	cr6, L(s1_align)
ce426f
+	b	L(different)
ce426f
+	.align 	4
ce426f
+L(aligned):
ce426f
+	addi	rSTR1, rSTR1, 16
ce426f
+	addi	rSTR2, rSTR2, 16
ce426f
+#ifdef USE_AS_STRNCASECMP
ce426f
+	cmpdi	cr7, r5, 16
ce426f
+	blt	cr7, L(bytebybyte)
ce426f
+	addi	r5, r5, -16
ce426f
+#endif
ce426f
+	lvx	v4, 0, rSTR1
ce426f
+	lvx	v5, 0, rSTR2
ce426f
+	CHECKNULLANDCONVERT()
ce426f
+	blt	cr6, L(aligned)
ce426f
+
ce426f
+	/* Calculate and return the difference. */
ce426f
+L(different):
ce426f
+	vaddubm	v1, v3, v3
ce426f
+	vcmpequb	v7, v0, v7
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	/* Count trailing zero.  */
ce426f
+	vspltisb	v8, -1
ce426f
+	VADDUQM_V7_V8
ce426f
+	vandc	v8, v9, v7
ce426f
+	VPOPCNTD_V8_V8
ce426f
+	vspltb	v6, v8, 15
ce426f
+	vcmpequb.	v6, v6, v1
ce426f
+	blt	cr6, L(shift8)
ce426f
+#else
ce426f
+	/* Count leading zero.  */
ce426f
+	VCLZD_V8_v7
ce426f
+	vspltb	v6, v8, 7
ce426f
+	vcmpequb.	v6, v6, v1
ce426f
+	blt	cr6, L(shift8)
ce426f
+	vsro	v8, v8, v1
ce426f
+#endif
ce426f
+	b	L(skipsum)
ce426f
+	.align  4
ce426f
+L(shift8):
ce426f
+	vsumsws		v8, v8, v0
ce426f
+L(skipsum):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	/* Shift registers based on trailing zero count.  */
ce426f
+	vsro	v6, v5, v8
ce426f
+	vsro	v7, v4, v8
ce426f
+	/* Merge and move to GPR.  */
ce426f
+	vmrglb	v6, v6, v7
ce426f
+	vslo	v1, v6, v1
ce426f
+	MFVRD_R3_V1
ce426f
+	/* Place the characters that are different in first position.  */
ce426f
+	sldi	rSTR2, rRTN, 56
ce426f
+	srdi	rSTR2, rSTR2, 56
ce426f
+	sldi	rSTR1, rRTN, 48
ce426f
+	srdi	rSTR1, rSTR1, 56
ce426f
+#else
ce426f
+	vslo	v6, v5, v8
ce426f
+	vslo	v7, v4, v8
ce426f
+	vmrghb	v1, v6, v7
ce426f
+	MFVRD_R3_V1
ce426f
+	srdi	rSTR2, rRTN, 48
ce426f
+	sldi	rSTR2, rSTR2, 56
ce426f
+	srdi	rSTR2, rSTR2, 56
ce426f
+	srdi	rSTR1, rRTN, 56
ce426f
+#endif
ce426f
+	subf  	rRTN, rSTR1, rSTR2
ce426f
+	extsw 	rRTN, rRTN
ce426f
+	blr
ce426f
+
ce426f
+	.align  4
ce426f
+	/* OK. We've hit the end of the string. We need to be careful that
ce426f
+	   we don't compare two strings as different because of junk beyond
ce426f
+	   the end of the strings...  */
ce426f
+L(null_found):
ce426f
+	vaddubm	v10, v3, v3
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	/* Count trailing zero.  */
ce426f
+	vspltisb	v8, -1
ce426f
+	VADDUQM_V7_V8
ce426f
+	vandc	v8, v9, v7
ce426f
+	VPOPCNTD_V8_V8
ce426f
+	vspltb	v6, v8, 15
ce426f
+	vcmpequb.	v6, v6, v10
ce426f
+	blt	cr6, L(shift_8)
ce426f
+#else
ce426f
+	/* Count leading zero.  */
ce426f
+	VCLZD_V8_v7
ce426f
+	vspltb	v6, v8, 7
ce426f
+	vcmpequb.	v6, v6, v10
ce426f
+	blt	cr6, L(shift_8)
ce426f
+	vsro	v8, v8, v10
ce426f
+#endif
ce426f
+	b	L(skipsum1)
ce426f
+	.align  4
ce426f
+L(shift_8):
ce426f
+	vsumsws	v8, v8, v0
ce426f
+L(skipsum1):
ce426f
+	/* Calculate shift count based on count of zero.  */
ce426f
+	vspltisb	v10, 7
ce426f
+	vslb	v10, v10, v10
ce426f
+	vsldoi	v9, v0, v10, 1
ce426f
+	VSUBUDM_V9_V8
ce426f
+	vspltisb	v8, 8
ce426f
+	vsldoi	v8, v0, v8, 1
ce426f
+	VSUBUDM_V9_V8
ce426f
+	/* Shift and remove junk after null character.  */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	vslo	v5, v5, v9
ce426f
+	vslo	v4, v4, v9
ce426f
+#else
ce426f
+	vsro	v5, v5, v9
ce426f
+	vsro	v4, v4, v9
ce426f
+#endif
ce426f
+	/* Convert and compare 16 bytes.  */
ce426f
+	TOLOWER()
ce426f
+	blt	cr6, L(retnull)
ce426f
+	b	L(different)
ce426f
+	.align  4
ce426f
+L(retnull):
ce426f
+	li	rRTN, 0
ce426f
+	blr
ce426f
+	.align  4
ce426f
+L(bytebybyte):
ce426f
+	/* Unrolling loop for POWER: loads are done with 'lbz' plus
ce426f
+	offset and string descriptors are only updated in the end
ce426f
+	of loop unrolling. */
ce426f
+	ld	rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
ce426f
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
ce426f
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
ce426f
+#ifdef USE_AS_STRNCASECMP
ce426f
+	rldicl	rTMP, r5, 62, 2
ce426f
+	cmpdi	cr7, rTMP, 0
ce426f
+	beq	cr7, L(lessthan4)
ce426f
+	mtctr	rTMP
ce426f
+#endif
ce426f
+L(loop):
ce426f
+	cmpdi	rCHAR1, 0		/* *s1 == '\0' ? */
ce426f
+	sldi	rADDR1, rCHAR1, 2	/* Calculate address for tolower(*s1) */
ce426f
+	sldi	rADDR2, rCHAR2, 2	/* Calculate address for tolower(*s2) */
ce426f
+	lwzx	rLWR1, rLOC, rADDR1	/* Load tolower(*s1) */
ce426f
+	lwzx	rLWR2, rLOC, rADDR2	/* Load tolower(*s2) */
ce426f
+	cmpw	cr1, rLWR1, rLWR2	/* r = tolower(*s1) == tolower(*s2) ? */
ce426f
+	crorc	4*cr1+eq,eq,4*cr1+eq	/* (*s1 == '\0') || (r == 0) */
ce426f
+	beq	cr1, L(done)
ce426f
+	lbz	rCHAR1, 1(rSTR1)
ce426f
+	lbz	rCHAR2, 1(rSTR2)
ce426f
+	cmpdi	rCHAR1, 0
ce426f
+	sldi	rADDR1, rCHAR1, 2
ce426f
+	sldi	rADDR2, rCHAR2, 2
ce426f
+	lwzx	rLWR1, rLOC, rADDR1
ce426f
+	lwzx	rLWR2, rLOC, rADDR2
ce426f
+	cmpw	cr1, rLWR1, rLWR2
ce426f
+	crorc	4*cr1+eq,eq,4*cr1+eq
ce426f
+	beq	cr1, L(done)
ce426f
+	lbz	rCHAR1, 2(rSTR1)
ce426f
+	lbz	rCHAR2, 2(rSTR2)
ce426f
+	cmpdi	rCHAR1, 0
ce426f
+	sldi	rADDR1, rCHAR1, 2
ce426f
+	sldi	rADDR2, rCHAR2, 2
ce426f
+	lwzx	rLWR1, rLOC, rADDR1
ce426f
+	lwzx	rLWR2, rLOC, rADDR2
ce426f
+	cmpw	cr1, rLWR1, rLWR2
ce426f
+	crorc	4*cr1+eq,eq,4*cr1+eq
ce426f
+	beq	cr1, L(done)
ce426f
+	lbz	rCHAR1, 3(rSTR1)
ce426f
+	lbz	rCHAR2, 3(rSTR2)
ce426f
+	cmpdi	rCHAR1, 0
ce426f
+	/* Increment both string descriptors */
ce426f
+	addi	rSTR1, rSTR1, 4
ce426f
+	addi	rSTR2, rSTR2, 4
ce426f
+	sldi	rADDR1, rCHAR1, 2
ce426f
+	sldi	rADDR2, rCHAR2, 2
ce426f
+	lwzx	rLWR1, rLOC, rADDR1
ce426f
+	lwzx	rLWR2, rLOC, rADDR2
ce426f
+	cmpw	cr1, rLWR1, rLWR2
ce426f
+	crorc	4*cr1+eq,eq,4*cr1+eq
ce426f
+	beq     cr1, L(done)
ce426f
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
ce426f
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
ce426f
+#ifdef USE_AS_STRNCASECMP
ce426f
+	bdnz	L(loop)
ce426f
+#else
ce426f
+	b	L(loop)
ce426f
+#endif
ce426f
+#ifdef USE_AS_STRNCASECMP
ce426f
+L(lessthan4):
ce426f
+	clrldi	r5, r5, 62
ce426f
+	cmpdi	cr7, r5, 0
ce426f
+	beq	cr7, L(retnull)
ce426f
+	mtctr	r5
ce426f
+L(loop1):
ce426f
+	cmpdi	rCHAR1, 0
ce426f
+	sldi	rADDR1, rCHAR1, 2
ce426f
+	sldi	rADDR2, rCHAR2, 2
ce426f
+	lwzx	rLWR1, rLOC, rADDR1
ce426f
+	lwzx	rLWR2, rLOC, rADDR2
ce426f
+	cmpw	cr1, rLWR1, rLWR2
ce426f
+	crorc	4*cr1+eq,eq,4*cr1+eq
ce426f
+	beq	cr1, L(done)
ce426f
+	addi	rSTR1, rSTR1, 1
ce426f
+	addi	rSTR2, rSTR2, 1
ce426f
+	lbz	rCHAR1, 0(rSTR1)
ce426f
+	lbz	rCHAR2, 0(rSTR2)
ce426f
+	bdnz	L(loop1)
ce426f
+#endif
ce426f
+L(done):
ce426f
+	subf	r0, rLWR2, rLWR1
ce426f
+	extsw	rRTN, r0
ce426f
+	blr
ce426f
+END (__STRCASECMP)
ce426f
+
ce426f
+weak_alias (__STRCASECMP, STRCASECMP)
ce426f
+libc_hidden_builtin_def (__STRCASECMP)
ce426f
diff --git a/sysdeps/powerpc/powerpc64/power8/strncase.S b/sysdeps/powerpc/powerpc64/power8/strncase.S
ce426f
new file mode 100644
ce426f
index 0000000..7ce2ed0
ce426f
--- /dev/null
ce426f
+++ b/sysdeps/powerpc/powerpc64/power8/strncase.S
ce426f
@@ -0,0 +1,20 @@
ce426f
+/* Optimized strncasecmp implementation for POWER8.
ce426f
+   Copyright (C) 2016 Free Software Foundation, Inc.
ce426f
+   This file is part of the GNU C Library.
ce426f
+
ce426f
+   The GNU C Library is free software; you can redistribute it and/or
ce426f
+   modify it under the terms of the GNU Lesser General Public
ce426f
+   License as published by the Free Software Foundation; either
ce426f
+   version 2.1 of the License, or (at your option) any later version.
ce426f
+
ce426f
+   The GNU C Library is distributed in the hope that it will be useful,
ce426f
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
ce426f
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
ce426f
+   Lesser General Public License for more details.
ce426f
+
ce426f
+   You should have received a copy of the GNU Lesser General Public
ce426f
+   License along with the GNU C Library; if not, see
ce426f
+   <http://www.gnu.org/licenses/>.  */
ce426f
+
ce426f
+#define USE_AS_STRNCASECMP 1
ce426f
+#include <sysdeps/powerpc/powerpc64/power8/strcasecmp.S>
ce426f
-- 
ce426f
2.1.0
ce426f