8ae002
From 7dd60718b327b3eb6112ec3900750007b0259189 Mon Sep 17 00:00:00 2001
8ae002
From: raji <raji@oc4354787705.ibm.com>
8ae002
Date: Tue, 14 Jun 2016 14:51:16 +0530
8ae002
Subject: [PATCH] powerpc: strcasecmp/strncasecmp optimization for power8
8ae002
8ae002
This implementation utilizes vectors to improve performance
8ae002
compared to current byte by byte implementation for POWER7.
8ae002
The performance improvement is up to 4x.  This patch is tested
8ae002
on powerpc64 and powerpc64le.
8ae002
8ae002
(cherry picked from commit c8376f3e07602aaef9cb843bb73cb5f2b860634a)
8ae002
8ae002
Conflicts:
8ae002
	sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power7.S
8ae002
	sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
8ae002
---
8ae002
 ChangeLog                                          |  22 +
8ae002
 sysdeps/powerpc/powerpc64/multiarch/Makefile       |   4 +-
8ae002
 .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |   6 +
8ae002
 .../powerpc64/multiarch/strcasecmp-power7.S        |  20 +-
8ae002
 .../powerpc64/multiarch/strcasecmp-power8.S        |  28 ++
8ae002
 .../powerpc/powerpc64/multiarch/strcasecmp-ppc64.c |  21 +
8ae002
 sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c   |  32 +-
8ae002
 .../powerpc/powerpc64/multiarch/strncase-power8.S  |  28 ++
8ae002
 .../powerpc/powerpc64/multiarch/strncase-ppc64.c   |  21 +
8ae002
 sysdeps/powerpc/powerpc64/multiarch/strncase.c     |  25 +-
8ae002
 sysdeps/powerpc/powerpc64/power8/strcasecmp.S      | 446 +++++++++++++++++++++
8ae002
 sysdeps/powerpc/powerpc64/power8/strncase.S        |  20 +
8ae002
 12 files changed, 622 insertions(+), 51 deletions(-)
8ae002
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power8.S
8ae002
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcasecmp-ppc64.c
8ae002
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncase-power8.S
8ae002
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncase-ppc64.c
8ae002
 create mode 100644 sysdeps/powerpc/powerpc64/power8/strcasecmp.S
8ae002
 create mode 100644 sysdeps/powerpc/powerpc64/power8/strncase.S
8ae002
8ae002
diff --git a/ChangeLog b/ChangeLog
8ae002
index c01d1a0..9385bd0 100644
8ae002
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
8ae002
index 9ee9bc2..e3ac285 100644
8ae002
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
8ae002
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
8ae002
@@ -21,6 +21,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
8ae002
                   mempcpy-power7 mempcpy-ppc64 memchr-power7 memchr-ppc64 \
8ae002
                   memrchr-power7 memrchr-ppc64 rawmemchr-power7 \
8ae002
                   stpcpy-power8 stpcpy-power7 stpcpy-ppc64 \
8ae002
+                  strcasecmp-ppc64 strcasecmp-power8 \
8ae002
+                  strncase-ppc64 strncase-power8 \
8ae002
                   strcasestr-power8 strcasestr-ppc64 \
8ae002
                   strcat-power8 strcat-power7 strcat-ppc64 \
8ae002
                   strcmp-power8 strcmp-power7 strcmp-ppc64 \
8ae002
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
8ae002
index 228891f..aabd7bc 100644
8ae002
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
8ae002
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
8ae002
@@ -204,6 +204,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
8ae002
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c.  */
8ae002
   IFUNC_IMPL (i, name, strcasecmp,
8ae002
              IFUNC_IMPL_ADD (array, i, strcasecmp,
8ae002
+                             hwcap2 & PPC_FEATURE2_ARCH_2_07,
8ae002
+                              __strcasecmp_power8)
8ae002
+             IFUNC_IMPL_ADD (array, i, strcasecmp,
8ae002
                              hwcap & PPC_FEATURE_HAS_VSX,
8ae002
                              __strcasecmp_power7)
8ae002
              IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_ppc))
8ae002
@@ -219,6 +222,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
8ae002
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncase.c.  */
8ae002
   IFUNC_IMPL (i, name, strncasecmp,
8ae002
              IFUNC_IMPL_ADD (array, i, strncasecmp,
8ae002
+                             hwcap2 & PPC_FEATURE2_ARCH_2_07,
8ae002
+                             __strncasecmp_power8)
8ae002
+             IFUNC_IMPL_ADD (array, i, strncasecmp,
8ae002
                              hwcap & PPC_FEATURE_HAS_VSX,
8ae002
                              __strncasecmp_power7)
8ae002
              IFUNC_IMPL_ADD (array, i, strncasecmp, 1, __strncasecmp_ppc))
8ae002
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power7.S
8ae002
index 56eed9a..99cd7bd 100644
8ae002
--- a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power7.S
8ae002
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power7.S
8ae002
@@ -1,5 +1,5 @@
8ae002
-/* Optimized strcasecmp implementation foOWER7.
8ae002
-   Copyright (C) 2013-2014 Free Software Foundation, Inc.
8ae002
+/* Optimized strcasecmp implementation for POWER7.
8ae002
+   Copyright (C) 2013-2016 Free Software Foundation, Inc.
8ae002
    This file is part of the GNU C Library.
8ae002
 
8ae002
    The GNU C Library is free software; you can redistribute it and/or
8ae002
@@ -18,21 +18,7 @@
8ae002
 
8ae002
 #include <sysdep.h>
8ae002
 
8ae002
-#undef ENTRY
8ae002
-#define ENTRY(name)						\
8ae002
-  .section ".text";						\
8ae002
-  ENTRY_2(__strcasecmp_power7)					\
8ae002
-  .align ALIGNARG(2);						\
8ae002
-  BODY_LABEL(__strcasecmp_power7):				\
8ae002
-  cfi_startproc;						\
8ae002
-  LOCALENTRY(__strcasecmp_power7)
8ae002
-
8ae002
-#undef END
8ae002
-#define END(name)						\
8ae002
-  cfi_endproc;							\
8ae002
-  TRACEBACK(__strcasecmp_power7)				\
8ae002
-  END_2(__strcasecmp_power7)
8ae002
-
8ae002
+#define __strcasecmp __strcasecmp_power7
8ae002
 #undef weak_alias
8ae002
 #define weak_alias(name, alias)
8ae002
 
8ae002
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power8.S
8ae002
new file mode 100644
8ae002
index 0000000..492047a
8ae002
--- /dev/null
8ae002
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power8.S
8ae002
@@ -0,0 +1,28 @@
8ae002
+/* Optimized strcasecmp implementation for POWER8.
8ae002
+   Copyright (C) 2016 Free Software Foundation, Inc.
8ae002
+   This file is part of the GNU C Library.
8ae002
+
8ae002
+   The GNU C Library is free software; you can redistribute it and/or
8ae002
+   modify it under the terms of the GNU Lesser General Public
8ae002
+   License as published by the Free Software Foundation; either
8ae002
+   version 2.1 of the License, or (at your option) any later version.
8ae002
+
8ae002
+   The GNU C Library is distributed in the hope that it will be useful,
8ae002
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
8ae002
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
8ae002
+   Lesser General Public License for more details.
8ae002
+
8ae002
+   You should have received a copy of the GNU Lesser General Public
8ae002
+   License along with the GNU C Library; if not, see
8ae002
+   <http://www.gnu.org/licenses/>.  */
8ae002
+
8ae002
+#include <sysdep.h>
8ae002
+
8ae002
+#define __strcasecmp __strcasecmp_power8
8ae002
+#undef weak_alias
8ae002
+#define weak_alias(name, alias)
8ae002
+
8ae002
+#undef libc_hidden_builtin_def
8ae002
+#define libc_hidden_builtin_def(name)
8ae002
+
8ae002
+#include <sysdeps/powerpc/powerpc64/power8/strcasecmp.S>
8ae002
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-ppc64.c
8ae002
new file mode 100644
8ae002
index 0000000..6318b4a
8ae002
--- /dev/null
8ae002
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-ppc64.c
8ae002
@@ -0,0 +1,21 @@
8ae002
+/* Multiarch strcasecmp for PPC64.
8ae002
+   Copyright (C) 2016 Free Software Foundation, Inc.
8ae002
+   This file is part of the GNU C Library.
8ae002
+
8ae002
+   The GNU C Library is free software; you can redistribute it and/or
8ae002
+   modify it under the terms of the GNU Lesser General Public
8ae002
+   License as published by the Free Software Foundation; either
8ae002
+   version 2.1 of the License, or (at your option) any later version.
8ae002
+
8ae002
+   The GNU C Library is distributed in the hope that it will be useful,
8ae002
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
8ae002
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
8ae002
+   Lesser General Public License for more details.
8ae002
+
8ae002
+   You should have received a copy of the GNU Lesser General Public
8ae002
+   License along with the GNU C Library; if not, see
8ae002
+   <http://www.gnu.org/licenses/>.  */
8ae002
+
8ae002
+#define strcasecmp __strcasecmp_ppc
8ae002
+
8ae002
+#include <string/strcasecmp.c>
8ae002
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
8ae002
index 979e9f1..5ec6885 100644
8ae002
--- a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
8ae002
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
8ae002
@@ -1,5 +1,5 @@
8ae002
-/* Multiple versions of strcasecmp.
8ae002
-   Copyright (C) 2013-2014 Free Software Foundation, Inc.
8ae002
+/* Multiple versions of strcasecmp.
8ae002
+   Copyright (C) 2013-2016 Free Software Foundation, Inc.
8ae002
    This file is part of the GNU C Library.
8ae002
 
8ae002
    The GNU C Library is free software; you can redistribute it and/or
8ae002
@@ -16,25 +16,21 @@
8ae002
    License along with the GNU C Library; if not, see
8ae002
    <http://www.gnu.org/licenses/>.  */
8ae002
 
8ae002
-#if IS_IN (libc)
8ae002
-# include <string.h>
8ae002
-# define strcasecmp __strcasecmp_ppc
8ae002
-extern __typeof (__strcasecmp) __strcasecmp_ppc attribute_hidden;
8ae002
-extern __typeof (__strcasecmp) __strcasecmp_power7 attribute_hidden;
8ae002
-#endif
8ae002
+#include <string.h>
8ae002
+#include <shlib-compat.h>
8ae002
+#include "init-arch.h"
8ae002
 
8ae002
-#include <string/strcasecmp.c>
8ae002
-#undef strcasecmp
8ae002
+extern __typeof (__strcasecmp) __libc_strcasecmp;
8ae002
 
8ae002
-#if IS_IN (libc)
8ae002
-# include <shlib-compat.h>
8ae002
-# include "init-arch.h"
8ae002
+extern __typeof (__strcasecmp) __strcasecmp_ppc attribute_hidden;
8ae002
+extern __typeof (__strcasecmp) __strcasecmp_power7 attribute_hidden;
8ae002
+extern __typeof (__strcasecmp) __strcasecmp_power8 attribute_hidden;
8ae002
 
8ae002
-extern __typeof (__strcasecmp) __libc_strcasecmp;
8ae002
 libc_ifunc (__libc_strcasecmp,
8ae002
-	    (hwcap & PPC_FEATURE_HAS_VSX)
8ae002
-            ? __strcasecmp_power7
8ae002
-            : __strcasecmp_ppc);
8ae002
+	     (hwcap2 & PPC_FEATURE2_ARCH_2_07)
8ae002
+             ? __strcasecmp_power8:
8ae002
+	     (hwcap & PPC_FEATURE_HAS_VSX)
8ae002
+             ? __strcasecmp_power7
8ae002
+             : __strcasecmp_ppc);
8ae002
 
8ae002
 weak_alias (__libc_strcasecmp, strcasecmp)
8ae002
-#endif
8ae002
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strncase-power8.S
8ae002
new file mode 100644
8ae002
index 0000000..01a63b5
8ae002
--- /dev/null
8ae002
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncase-power8.S
8ae002
@@ -0,0 +1,28 @@
8ae002
+/* Optimized strncasecmp implementation for POWER8.
8ae002
+   Copyright (C) 2016 Free Software Foundation, Inc.
8ae002
+   This file is part of the GNU C Library.
8ae002
+
8ae002
+   The GNU C Library is free software; you can redistribute it and/or
8ae002
+   modify it under the terms of the GNU Lesser General Public
8ae002
+   License as published by the Free Software Foundation; either
8ae002
+   version 2.1 of the License, or (at your option) any later version.
8ae002
+
8ae002
+   The GNU C Library is distributed in the hope that it will be useful,
8ae002
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
8ae002
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
8ae002
+   Lesser General Public License for more details.
8ae002
+
8ae002
+   You should have received a copy of the GNU Lesser General Public
8ae002
+   License along with the GNU C Library; if not, see
8ae002
+   <http://www.gnu.org/licenses/>.  */
8ae002
+
8ae002
+#include <sysdep.h>
8ae002
+
8ae002
+#define __strncasecmp __strncasecmp_power8
8ae002
+#undef weak_alias
8ae002
+#define weak_alias(name, alias)
8ae002
+
8ae002
+#undef libc_hidden_builtin_def
8ae002
+#define libc_hidden_builtin_def(name)
8ae002
+
8ae002
+#include <sysdeps/powerpc/powerpc64/power8/strncase.S>
8ae002
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strncase-ppc64.c
8ae002
new file mode 100644
8ae002
index 0000000..c245d77
8ae002
--- /dev/null
8ae002
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncase-ppc64.c
8ae002
@@ -0,0 +1,21 @@
8ae002
+/* Multiarch strncasecmp for PPC64.
8ae002
+   Copyright (C) 2016 Free Software Foundation, Inc.
8ae002
+   This file is part of the GNU C Library.
8ae002
+
8ae002
+   The GNU C Library is free software; you can redistribute it and/or
8ae002
+   modify it under the terms of the GNU Lesser General Public
8ae002
+   License as published by the Free Software Foundation; either
8ae002
+   version 2.1 of the License, or (at your option) any later version.
8ae002
+
8ae002
+   The GNU C Library is distributed in the hope that it will be useful,
8ae002
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
8ae002
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
8ae002
+   Lesser General Public License for more details.
8ae002
+
8ae002
+   You should have received a copy of the GNU Lesser General Public
8ae002
+   License along with the GNU C Library; if not, see
8ae002
+   <http://www.gnu.org/licenses/>.  */
8ae002
+
8ae002
+#define strncasecmp __strncasecmp_ppc
8ae002
+
8ae002
+#include <string/strncase.c>
8ae002
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase.c b/sysdeps/powerpc/powerpc64/multiarch/strncase.c
8ae002
index 4339f3a..5bfaf65 100644
8ae002
--- a/sysdeps/powerpc/powerpc64/multiarch/strncase.c
8ae002
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncase.c
8ae002
@@ -16,26 +16,21 @@
8ae002
    License along with the GNU C Library; if not, see
8ae002
    <http://www.gnu.org/licenses/>.  */
8ae002
 
8ae002
-#if IS_IN (libc)
8ae002
-# include <string.h>
8ae002
-# define strncasecmp __strncasecmp_ppc
8ae002
-extern __typeof (__strncasecmp) __strncasecmp_ppc attribute_hidden;
8ae002
-extern __typeof (__strncasecmp) __strncasecmp_power7 attribute_hidden;
8ae002
-#endif
8ae002
+#include <string.h>
8ae002
+#include <shlib-compat.h>
8ae002
+#include "init-arch.h"
8ae002
 
8ae002
-#include <string/strncase.c>
8ae002
-#undef strncasecmp
8ae002
+extern __typeof (__strncasecmp) __libc_strncasecmp;
8ae002
 
8ae002
-#if IS_IN (libc)
8ae002
-# include <shlib-compat.h>
8ae002
-# include "init-arch.h"
8ae002
+extern __typeof (__strncasecmp) __strncasecmp_ppc attribute_hidden;
8ae002
+extern __typeof (__strncasecmp) __strncasecmp_power7 attribute_hidden;
8ae002
+extern __typeof (__strncasecmp) __strncasecmp_power8 attribute_hidden;
8ae002
 
8ae002
-/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
8ae002
-   ifunc symbol properly.  */
8ae002
-extern __typeof (__strncasecmp) __libc_strncasecmp;
8ae002
 libc_ifunc (__libc_strncasecmp,
8ae002
+	     (hwcap2 & PPC_FEATURE2_ARCH_2_07)
8ae002
+             ? __strncasecmp_power8:
8ae002
 	     (hwcap & PPC_FEATURE_HAS_VSX)
8ae002
              ? __strncasecmp_power7
8ae002
              : __strncasecmp_ppc);
8ae002
+
8ae002
 weak_alias (__libc_strncasecmp, strncasecmp)
8ae002
-#endif
8ae002
diff --git a/sysdeps/powerpc/powerpc64/power8/strcasecmp.S b/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
8ae002
new file mode 100644
8ae002
index 0000000..63f6217
8ae002
--- /dev/null
8ae002
+++ b/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
8ae002
@@ -0,0 +1,446 @@
8ae002
+/* Optimized strcasecmp implementation for POWER8.
8ae002
+   Copyright (C) 2016 Free Software Foundation, Inc.
8ae002
+   This file is part of the GNU C Library.
8ae002
+
8ae002
+   The GNU C Library is free software; you can redistribute it and/or
8ae002
+   modify it under the terms of the GNU Lesser General Public
8ae002
+   License as published by the Free Software Foundation; either
8ae002
+   version 2.1 of the License, or (at your option) any later version.
8ae002
+
8ae002
+   The GNU C Library is distributed in the hope that it will be useful,
8ae002
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
8ae002
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
8ae002
+   Lesser General Public License for more details.
8ae002
+
8ae002
+   You should have received a copy of the GNU Lesser General Public
8ae002
+   License along with the GNU C Library; if not, see
8ae002
+   <http://www.gnu.org/licenses/>.  */
8ae002
+
8ae002
+#include <sysdep.h>
8ae002
+#include <locale-defines.h>
8ae002
+
8ae002
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] ) */
8ae002
+
8ae002
+#ifndef USE_AS_STRNCASECMP
8ae002
+#  define __STRCASECMP __strcasecmp
8ae002
+#  define STRCASECMP   strcasecmp
8ae002
+#else
8ae002
+#  define __STRCASECMP __strncasecmp
8ae002
+#  define STRCASECMP   strncasecmp
8ae002
+#endif
8ae002
+/* Convert the 16 bytes in v4 and v5 to lowercase and compare them.  */
8ae002
+#define TOLOWER()     \
8ae002
+	vaddubm	v8, v4, v1; \
8ae002
+	vaddubm	v7, v4, v3; \
8ae002
+	vcmpgtub	v8, v8, v2; \
8ae002
+	vsel	v4, v7, v4, v8; \
8ae002
+	vaddubm	v8, v5, v1; \
8ae002
+	vaddubm	v7, v5, v3; \
8ae002
+	vcmpgtub	v8, v8, v2; \
8ae002
+	vsel	v5, v7, v5, v8; \
8ae002
+	vcmpequb.	v7, v5, v4;
8ae002
+
8ae002
+/* Get 16 bytes for unaligned case.  */
8ae002
+#ifdef __LITTLE_ENDIAN__
8ae002
+#define GET16BYTES(reg1, reg2, reg3) \
8ae002
+	lvx	reg1, 0, reg2; \
8ae002
+	vcmpequb.	v8, v0, reg1; \
8ae002
+	beq	cr6, 1f; \
8ae002
+	vspltisb	v9, 0; \
8ae002
+	b	2f; \
8ae002
+	.align 4; \
8ae002
+1: \
8ae002
+	addi	r6, reg2, 16; \
8ae002
+	lvx	v9, 0, r6; \
8ae002
+2: \
8ae002
+	vperm	reg1, v9, reg1, reg3;
8ae002
+#else
8ae002
+#define GET16BYTES(reg1, reg2, reg3) \
8ae002
+	lvx	reg1, 0, reg2; \
8ae002
+	vcmpequb.	v8, v0, reg1; \
8ae002
+	beq	cr6, 1f; \
8ae002
+	vspltisb	v9, 0; \
8ae002
+	b	2f; \
8ae002
+	.align 4; \
8ae002
+1: \
8ae002
+	addi	r6, reg2, 16; \
8ae002
+	lvx	v9, 0, r6; \
8ae002
+2: \
8ae002
+	vperm	reg1, reg1, v9, reg3;
8ae002
+#endif
8ae002
+
8ae002
+/* Check null in v4, v5 and convert to lower.  */
8ae002
+#define CHECKNULLANDCONVERT() \
8ae002
+	vcmpequb.	v7, v0, v5; \
8ae002
+	beq	cr6, 3f; \
8ae002
+	vcmpequb.	v7, v0, v4; \
8ae002
+	beq	cr6, 3f; \
8ae002
+	b	L(null_found); \
8ae002
+	.align  4; \
8ae002
+3: \
8ae002
+	TOLOWER()
8ae002
+
8ae002
+#ifdef _ARCH_PWR8
8ae002
+#  define VCLZD_V8_v7	vclzd	v8, v7;
8ae002
+#  define MFVRD_R3_V1	mfvrd	r3, v1;
8ae002
+#  define VSUBUDM_V9_V8	vsubudm	v9, v9, v8;
8ae002
+#  define VPOPCNTD_V8_V8	vpopcntd v8, v8;
8ae002
+#  define VADDUQM_V7_V8	vadduqm	v9, v7, v8;
8ae002
+#else
8ae002
+#  define VCLZD_V8_v7	.long	0x11003fc2
8ae002
+#  define MFVRD_R3_V1	.long	0x7c230067
8ae002
+#  define VSUBUDM_V9_V8	.long	0x112944c0
8ae002
+#  define VPOPCNTD_V8_V8	.long	0x110047c3
8ae002
+#  define VADDUQM_V7_V8	.long	0x11274100
8ae002
+#endif
8ae002
+
8ae002
+	.machine  power7
8ae002
+
8ae002
+ENTRY (__STRCASECMP)
8ae002
+#ifdef USE_AS_STRNCASECMP
8ae002
+	CALL_MCOUNT 3
8ae002
+#else
8ae002
+	CALL_MCOUNT 2
8ae002
+#endif
8ae002
+#define rRTN	r3	/* Return value */
8ae002
+#define rSTR1	r10	/* 1st string */
8ae002
+#define rSTR2	r4	/* 2nd string */
8ae002
+#define rCHAR1	r6	/* Byte read from 1st string */
8ae002
+#define rCHAR2	r7	/* Byte read from 2nd string */
8ae002
+#define rADDR1	r8	/* Address of tolower(rCHAR1) */
8ae002
+#define rADDR2	r12	/* Address of tolower(rCHAR2) */
8ae002
+#define rLWR1	r8	/* Word tolower(rCHAR1) */
8ae002
+#define rLWR2	r12	/* Word tolower(rCHAR2) */
8ae002
+#define rTMP	r9
8ae002
+#define rLOC	r11	/* Default locale address */
8ae002
+
8ae002
+	cmpd	cr7, rRTN, rSTR2
8ae002
+
8ae002
+	/* Get locale address.  */
8ae002
+	ld 	rTMP, __libc_tsd_LOCALE@got@tprel(r2)
8ae002
+	add 	rLOC, rTMP, __libc_tsd_LOCALE@tls
8ae002
+	ld	rLOC, 0(rLOC)
8ae002
+
8ae002
+	mr	rSTR1, rRTN
8ae002
+	li	rRTN, 0
8ae002
+	beqlr	cr7
8ae002
+#ifdef USE_AS_STRNCASECMP
8ae002
+	cmpdi	cr7, r5, 0
8ae002
+	beq	cr7, L(retnull)
8ae002
+	cmpdi	cr7, r5, 16
8ae002
+	blt	cr7, L(bytebybyte)
8ae002
+#endif
8ae002
+	vspltisb	v0, 0
8ae002
+	vspltisb	v8, -1
8ae002
+	/* Check for null in initial characters.
8ae002
+	   Check max of 16 char depending on the alignment.
8ae002
+	   If null is present, proceed byte by byte.  */
8ae002
+	lvx	v4, 0, rSTR1
8ae002
+#ifdef  __LITTLE_ENDIAN__
8ae002
+	lvsr	v10, 0, rSTR1	/* Compute mask.  */
8ae002
+	vperm	v9, v8, v4, v10	/* Mask bits that are not part of string.  */
8ae002
+#else
8ae002
+	lvsl	v10, 0, rSTR1
8ae002
+	vperm	v9, v4, v8, v10
8ae002
+#endif
8ae002
+	vcmpequb.	v9, v0, v9	/* Check for null bytes.  */
8ae002
+	bne	cr6, L(bytebybyte)
8ae002
+	lvx	v5, 0, rSTR2
8ae002
+	/* Calculate alignment.  */
8ae002
+#ifdef __LITTLE_ENDIAN__
8ae002
+	lvsr	v6, 0, rSTR2
8ae002
+	vperm	v9, v8, v5, v6	/* Mask bits that are not part of string.  */
8ae002
+#else
8ae002
+	lvsl	v6, 0, rSTR2
8ae002
+	vperm	v9, v5, v8, v6
8ae002
+#endif
8ae002
+	vcmpequb.	v9, v0, v9	/* Check for null bytes.  */
8ae002
+	bne	cr6, L(bytebybyte)
8ae002
+	/* Check if locale has non ascii characters.  */
8ae002
+	ld	rTMP, 0(rLOC)
8ae002
+	addi r6, rTMP,LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
8ae002
+	lwz	rTMP, 0(r6)
8ae002
+	cmpdi	cr7, rTMP, 1
8ae002
+	beq	cr7, L(bytebybyte)
8ae002
+
8ae002
+	/* Load vector registers with values used for TOLOWER.  */
8ae002
+	/* Load v1 = 0xbf, v2 = 0x19, v3 = 0x20 in each byte.  */
8ae002
+	vspltisb	v3, 2
8ae002
+	vspltisb	v9, 4
8ae002
+	vsl	v3, v3, v9
8ae002
+	vaddubm	v1, v3, v3
8ae002
+	vnor	v1, v1, v1
8ae002
+	vspltisb	v2, 7
8ae002
+	vsububm	v2, v3, v2
8ae002
+
8ae002
+	andi.	rADDR1, rSTR1, 0xF
8ae002
+	beq	cr0, L(align)
8ae002
+	addi	r6, rSTR1, 16
8ae002
+	lvx	v9, 0, r6
8ae002
+	/* Compute 16 bytes from previous two loads.  */
8ae002
+#ifdef __LITTLE_ENDIAN__
8ae002
+	vperm	v4, v9, v4, v10
8ae002
+#else
8ae002
+	vperm	v4, v4, v9, v10
8ae002
+#endif
8ae002
+L(align):
8ae002
+	andi.	rADDR2, rSTR2, 0xF
8ae002
+	beq	cr0, L(align1)
8ae002
+	addi	r6, rSTR2, 16
8ae002
+	lvx	v9, 0, r6
8ae002
+	/* Compute 16 bytes from previous two loads.  */
8ae002
+#ifdef __LITTLE_ENDIAN__
8ae002
+	vperm	v5, v9, v5, v6
8ae002
+#else
8ae002
+	vperm	v5, v5, v9, v6
8ae002
+#endif
8ae002
+L(align1):
8ae002
+	CHECKNULLANDCONVERT()
8ae002
+	blt	cr6, L(match)
8ae002
+	b	L(different)
8ae002
+	.align 	4
8ae002
+L(match):
8ae002
+	clrldi	r6, rSTR1, 60
8ae002
+	subfic	r7, r6, 16
8ae002
+#ifdef USE_AS_STRNCASECMP
8ae002
+	sub	r5, r5, r7
8ae002
+#endif
8ae002
+	add	rSTR1, rSTR1, r7
8ae002
+	add	rSTR2, rSTR2, r7
8ae002
+	andi.	rADDR2, rSTR2, 0xF
8ae002
+	addi	rSTR1, rSTR1, -16
8ae002
+	addi	rSTR2, rSTR2, -16
8ae002
+	beq	cr0, L(aligned)
8ae002
+#ifdef __LITTLE_ENDIAN__
8ae002
+	lvsr	v6, 0, rSTR2
8ae002
+#else
8ae002
+	lvsl	v6, 0, rSTR2
8ae002
+#endif
8ae002
+	/* There are 2 loops depending on the input alignment.
8ae002
+	   Each loop gets 16 bytes from s1 and s2, check for null,
8ae002
+	   convert to lowercase and compare. Loop till difference
8ae002
+	   or null occurs. */
8ae002
+L(s1_align):
8ae002
+	addi	rSTR1, rSTR1, 16
8ae002
+	addi	rSTR2, rSTR2, 16
8ae002
+#ifdef USE_AS_STRNCASECMP
8ae002
+	cmpdi	cr7, r5, 16
8ae002
+	blt	cr7, L(bytebybyte)
8ae002
+	addi	r5, r5, -16
8ae002
+#endif
8ae002
+	lvx	v4, 0, rSTR1
8ae002
+	GET16BYTES(v5, rSTR2, v6)
8ae002
+	CHECKNULLANDCONVERT()
8ae002
+	blt	cr6, L(s1_align)
8ae002
+	b	L(different)
8ae002
+	.align 	4
8ae002
+L(aligned):
8ae002
+	addi	rSTR1, rSTR1, 16
8ae002
+	addi	rSTR2, rSTR2, 16
8ae002
+#ifdef USE_AS_STRNCASECMP
8ae002
+	cmpdi	cr7, r5, 16
8ae002
+	blt	cr7, L(bytebybyte)
8ae002
+	addi	r5, r5, -16
8ae002
+#endif
8ae002
+	lvx	v4, 0, rSTR1
8ae002
+	lvx	v5, 0, rSTR2
8ae002
+	CHECKNULLANDCONVERT()
8ae002
+	blt	cr6, L(aligned)
8ae002
+
8ae002
+	/* Calculate and return the difference. */
8ae002
+L(different):
8ae002
+	vaddubm	v1, v3, v3
8ae002
+	vcmpequb	v7, v0, v7
8ae002
+#ifdef __LITTLE_ENDIAN__
8ae002
+	/* Count trailing zero.  */
8ae002
+	vspltisb	v8, -1
8ae002
+	VADDUQM_V7_V8
8ae002
+	vandc	v8, v9, v7
8ae002
+	VPOPCNTD_V8_V8
8ae002
+	vspltb	v6, v8, 15
8ae002
+	vcmpequb.	v6, v6, v1
8ae002
+	blt	cr6, L(shift8)
8ae002
+#else
8ae002
+	/* Count leading zero.  */
8ae002
+	VCLZD_V8_v7
8ae002
+	vspltb	v6, v8, 7
8ae002
+	vcmpequb.	v6, v6, v1
8ae002
+	blt	cr6, L(shift8)
8ae002
+	vsro	v8, v8, v1
8ae002
+#endif
8ae002
+	b	L(skipsum)
8ae002
+	.align  4
8ae002
+L(shift8):
8ae002
+	vsumsws		v8, v8, v0
8ae002
+L(skipsum):
8ae002
+#ifdef __LITTLE_ENDIAN__
8ae002
+	/* Shift registers based on trailing zero count.  */
8ae002
+	vsro	v6, v5, v8
8ae002
+	vsro	v7, v4, v8
8ae002
+	/* Merge and move to GPR.  */
8ae002
+	vmrglb	v6, v6, v7
8ae002
+	vslo	v1, v6, v1
8ae002
+	MFVRD_R3_V1
8ae002
+	/* Place the characters that are different in first position.  */
8ae002
+	sldi	rSTR2, rRTN, 56
8ae002
+	srdi	rSTR2, rSTR2, 56
8ae002
+	sldi	rSTR1, rRTN, 48
8ae002
+	srdi	rSTR1, rSTR1, 56
8ae002
+#else
8ae002
+	vslo	v6, v5, v8
8ae002
+	vslo	v7, v4, v8
8ae002
+	vmrghb	v1, v6, v7
8ae002
+	MFVRD_R3_V1
8ae002
+	srdi	rSTR2, rRTN, 48
8ae002
+	sldi	rSTR2, rSTR2, 56
8ae002
+	srdi	rSTR2, rSTR2, 56
8ae002
+	srdi	rSTR1, rRTN, 56
8ae002
+#endif
8ae002
+	subf  	rRTN, rSTR1, rSTR2
8ae002
+	extsw 	rRTN, rRTN
8ae002
+	blr
8ae002
+
8ae002
+	.align  4
8ae002
+	/* OK. We've hit the end of the string. We need to be careful that
8ae002
+	   we don't compare two strings as different because of junk beyond
8ae002
+	   the end of the strings...  */
8ae002
+L(null_found):
8ae002
+	vaddubm	v10, v3, v3
8ae002
+#ifdef __LITTLE_ENDIAN__
8ae002
+	/* Count trailing zero.  */
8ae002
+	vspltisb	v8, -1
8ae002
+	VADDUQM_V7_V8
8ae002
+	vandc	v8, v9, v7
8ae002
+	VPOPCNTD_V8_V8
8ae002
+	vspltb	v6, v8, 15
8ae002
+	vcmpequb.	v6, v6, v10
8ae002
+	blt	cr6, L(shift_8)
8ae002
+#else
8ae002
+	/* Count leading zero.  */
8ae002
+	VCLZD_V8_v7
8ae002
+	vspltb	v6, v8, 7
8ae002
+	vcmpequb.	v6, v6, v10
8ae002
+	blt	cr6, L(shift_8)
8ae002
+	vsro	v8, v8, v10
8ae002
+#endif
8ae002
+	b	L(skipsum1)
8ae002
+	.align  4
8ae002
+L(shift_8):
8ae002
+	vsumsws	v8, v8, v0
8ae002
+L(skipsum1):
8ae002
+	/* Calculate shift count based on count of zero.  */
8ae002
+	vspltisb	v10, 7
8ae002
+	vslb	v10, v10, v10
8ae002
+	vsldoi	v9, v0, v10, 1
8ae002
+	VSUBUDM_V9_V8
8ae002
+	vspltisb	v8, 8
8ae002
+	vsldoi	v8, v0, v8, 1
8ae002
+	VSUBUDM_V9_V8
8ae002
+	/* Shift and remove junk after null character.  */
8ae002
+#ifdef __LITTLE_ENDIAN__
8ae002
+	vslo	v5, v5, v9
8ae002
+	vslo	v4, v4, v9
8ae002
+#else
8ae002
+	vsro	v5, v5, v9
8ae002
+	vsro	v4, v4, v9
8ae002
+#endif
8ae002
+	/* Convert and compare 16 bytes.  */
8ae002
+	TOLOWER()
8ae002
+	blt	cr6, L(retnull)
8ae002
+	b	L(different)
8ae002
+	.align  4
8ae002
+L(retnull):
8ae002
+	li	rRTN, 0
8ae002
+	blr
8ae002
+	.align  4
8ae002
+L(bytebybyte):
8ae002
+	/* Unrolling loop for POWER: loads are done with 'lbz' plus
8ae002
+	offset and string descriptors are only updated in the end
8ae002
+	of loop unrolling. */
8ae002
+	ld	rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
8ae002
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
8ae002
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
8ae002
+#ifdef USE_AS_STRNCASECMP
8ae002
+	rldicl	rTMP, r5, 62, 2
8ae002
+	cmpdi	cr7, rTMP, 0
8ae002
+	beq	cr7, L(lessthan4)
8ae002
+	mtctr	rTMP
8ae002
+#endif
8ae002
+L(loop):
8ae002
+	cmpdi	rCHAR1, 0		/* *s1 == '\0' ? */
8ae002
+	sldi	rADDR1, rCHAR1, 2	/* Calculate address for tolower(*s1) */
8ae002
+	sldi	rADDR2, rCHAR2, 2	/* Calculate address for tolower(*s2) */
8ae002
+	lwzx	rLWR1, rLOC, rADDR1	/* Load tolower(*s1) */
8ae002
+	lwzx	rLWR2, rLOC, rADDR2	/* Load tolower(*s2) */
8ae002
+	cmpw	cr1, rLWR1, rLWR2	/* r = tolower(*s1) == tolower(*s2) ? */
8ae002
+	crorc	4*cr1+eq,eq,4*cr1+eq	/* (*s1 == '\0') || (r == 0) */
8ae002
+	beq	cr1, L(done)
8ae002
+	lbz	rCHAR1, 1(rSTR1)
8ae002
+	lbz	rCHAR2, 1(rSTR2)
8ae002
+	cmpdi	rCHAR1, 0
8ae002
+	sldi	rADDR1, rCHAR1, 2
8ae002
+	sldi	rADDR2, rCHAR2, 2
8ae002
+	lwzx	rLWR1, rLOC, rADDR1
8ae002
+	lwzx	rLWR2, rLOC, rADDR2
8ae002
+	cmpw	cr1, rLWR1, rLWR2
8ae002
+	crorc	4*cr1+eq,eq,4*cr1+eq
8ae002
+	beq	cr1, L(done)
8ae002
+	lbz	rCHAR1, 2(rSTR1)
8ae002
+	lbz	rCHAR2, 2(rSTR2)
8ae002
+	cmpdi	rCHAR1, 0
8ae002
+	sldi	rADDR1, rCHAR1, 2
8ae002
+	sldi	rADDR2, rCHAR2, 2
8ae002
+	lwzx	rLWR1, rLOC, rADDR1
8ae002
+	lwzx	rLWR2, rLOC, rADDR2
8ae002
+	cmpw	cr1, rLWR1, rLWR2
8ae002
+	crorc	4*cr1+eq,eq,4*cr1+eq
8ae002
+	beq	cr1, L(done)
8ae002
+	lbz	rCHAR1, 3(rSTR1)
8ae002
+	lbz	rCHAR2, 3(rSTR2)
8ae002
+	cmpdi	rCHAR1, 0
8ae002
+	/* Increment both string descriptors */
8ae002
+	addi	rSTR1, rSTR1, 4
8ae002
+	addi	rSTR2, rSTR2, 4
8ae002
+	sldi	rADDR1, rCHAR1, 2
8ae002
+	sldi	rADDR2, rCHAR2, 2
8ae002
+	lwzx	rLWR1, rLOC, rADDR1
8ae002
+	lwzx	rLWR2, rLOC, rADDR2
8ae002
+	cmpw	cr1, rLWR1, rLWR2
8ae002
+	crorc	4*cr1+eq,eq,4*cr1+eq
8ae002
+	beq     cr1, L(done)
8ae002
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
8ae002
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
8ae002
+#ifdef USE_AS_STRNCASECMP
8ae002
+	bdnz	L(loop)
8ae002
+#else
8ae002
+	b	L(loop)
8ae002
+#endif
8ae002
+#ifdef USE_AS_STRNCASECMP
8ae002
+L(lessthan4):
8ae002
+	clrldi	r5, r5, 62
8ae002
+	cmpdi	cr7, r5, 0
8ae002
+	beq	cr7, L(retnull)
8ae002
+	mtctr	r5
8ae002
+L(loop1):
8ae002
+	cmpdi	rCHAR1, 0
8ae002
+	sldi	rADDR1, rCHAR1, 2
8ae002
+	sldi	rADDR2, rCHAR2, 2
8ae002
+	lwzx	rLWR1, rLOC, rADDR1
8ae002
+	lwzx	rLWR2, rLOC, rADDR2
8ae002
+	cmpw	cr1, rLWR1, rLWR2
8ae002
+	crorc	4*cr1+eq,eq,4*cr1+eq
8ae002
+	beq	cr1, L(done)
8ae002
+	addi	rSTR1, rSTR1, 1
8ae002
+	addi	rSTR2, rSTR2, 1
8ae002
+	lbz	rCHAR1, 0(rSTR1)
8ae002
+	lbz	rCHAR2, 0(rSTR2)
8ae002
+	bdnz	L(loop1)
8ae002
+#endif
8ae002
+L(done):
8ae002
+	subf	r0, rLWR2, rLWR1
8ae002
+	extsw	rRTN, r0
8ae002
+	blr
8ae002
+END (__STRCASECMP)
8ae002
+
8ae002
+weak_alias (__STRCASECMP, STRCASECMP)
8ae002
+libc_hidden_builtin_def (__STRCASECMP)
8ae002
diff --git a/sysdeps/powerpc/powerpc64/power8/strncase.S b/sysdeps/powerpc/powerpc64/power8/strncase.S
8ae002
new file mode 100644
8ae002
index 0000000..7ce2ed0
8ae002
--- /dev/null
8ae002
+++ b/sysdeps/powerpc/powerpc64/power8/strncase.S
8ae002
@@ -0,0 +1,20 @@
8ae002
+/* Optimized strncasecmp implementation for POWER8.
8ae002
+   Copyright (C) 2016 Free Software Foundation, Inc.
8ae002
+   This file is part of the GNU C Library.
8ae002
+
8ae002
+   The GNU C Library is free software; you can redistribute it and/or
8ae002
+   modify it under the terms of the GNU Lesser General Public
8ae002
+   License as published by the Free Software Foundation; either
8ae002
+   version 2.1 of the License, or (at your option) any later version.
8ae002
+
8ae002
+   The GNU C Library is distributed in the hope that it will be useful,
8ae002
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
8ae002
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
8ae002
+   Lesser General Public License for more details.
8ae002
+
8ae002
+   You should have received a copy of the GNU Lesser General Public
8ae002
+   License along with the GNU C Library; if not, see
8ae002
+   <http://www.gnu.org/licenses/>.  */
8ae002
+
8ae002
+#define USE_AS_STRNCASECMP 1
8ae002
+#include <sysdeps/powerpc/powerpc64/power8/strcasecmp.S>
8ae002
-- 
8ae002
2.1.0
8ae002