olga / rpms / glibc

Forked from rpms/glibc 5 years ago
Clone
00db10
From 7dd60718b327b3eb6112ec3900750007b0259189 Mon Sep 17 00:00:00 2001
00db10
From: raji <raji@oc4354787705.ibm.com>
00db10
Date: Tue, 14 Jun 2016 14:51:16 +0530
00db10
Subject: [PATCH] powerpc: strcasecmp/strncasecmp optimization for power8
00db10
00db10
This implementation utilizes vectors to improve performance
00db10
compared to current byte by byte implementation for POWER7.
00db10
The performance improvement is up to 4x.  This patch is tested
00db10
on powerpc64 and powerpc64le.
00db10
00db10
(cherry picked from commit c8376f3e07602aaef9cb843bb73cb5f2b860634a)
00db10
00db10
Conflicts:
00db10
	sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power7.S
00db10
	sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
00db10
---
00db10
 ChangeLog                                          |  22 +
00db10
 sysdeps/powerpc/powerpc64/multiarch/Makefile       |   4 +-
00db10
 .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |   6 +
00db10
 .../powerpc64/multiarch/strcasecmp-power7.S        |  20 +-
00db10
 .../powerpc64/multiarch/strcasecmp-power8.S        |  28 ++
00db10
 .../powerpc/powerpc64/multiarch/strcasecmp-ppc64.c |  21 +
00db10
 sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c   |  32 +-
00db10
 .../powerpc/powerpc64/multiarch/strncase-power8.S  |  28 ++
00db10
 .../powerpc/powerpc64/multiarch/strncase-ppc64.c   |  21 +
00db10
 sysdeps/powerpc/powerpc64/multiarch/strncase.c     |  25 +-
00db10
 sysdeps/powerpc/powerpc64/power8/strcasecmp.S      | 446 +++++++++++++++++++++
00db10
 sysdeps/powerpc/powerpc64/power8/strncase.S        |  20 +
00db10
 12 files changed, 622 insertions(+), 51 deletions(-)
00db10
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power8.S
00db10
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcasecmp-ppc64.c
00db10
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncase-power8.S
00db10
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncase-ppc64.c
00db10
 create mode 100644 sysdeps/powerpc/powerpc64/power8/strcasecmp.S
00db10
 create mode 100644 sysdeps/powerpc/powerpc64/power8/strncase.S
00db10
00db10
diff --git a/ChangeLog b/ChangeLog
00db10
index c01d1a0..9385bd0 100644
00db10
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
00db10
index 9ee9bc2..e3ac285 100644
00db10
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
00db10
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
00db10
@@ -21,6 +21,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
00db10
                   mempcpy-power7 mempcpy-ppc64 memchr-power7 memchr-ppc64 \
00db10
                   memrchr-power7 memrchr-ppc64 rawmemchr-power7 \
00db10
                   stpcpy-power8 stpcpy-power7 stpcpy-ppc64 \
00db10
+                  strcasecmp-ppc64 strcasecmp-power8 \
00db10
+                  strncase-ppc64 strncase-power8 \
00db10
                   strcasestr-power8 strcasestr-ppc64 \
00db10
                   strcat-power8 strcat-power7 strcat-ppc64 \
00db10
                   strcmp-power8 strcmp-power7 strcmp-ppc64 \
00db10
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
00db10
index 228891f..aabd7bc 100644
00db10
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
00db10
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
00db10
@@ -204,6 +204,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
00db10
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c.  */
00db10
   IFUNC_IMPL (i, name, strcasecmp,
00db10
              IFUNC_IMPL_ADD (array, i, strcasecmp,
00db10
+                             hwcap2 & PPC_FEATURE2_ARCH_2_07,
00db10
+                              __strcasecmp_power8)
00db10
+             IFUNC_IMPL_ADD (array, i, strcasecmp,
00db10
                              hwcap & PPC_FEATURE_HAS_VSX,
00db10
                              __strcasecmp_power7)
00db10
              IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_ppc))
00db10
@@ -219,6 +222,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
00db10
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncase.c.  */
00db10
   IFUNC_IMPL (i, name, strncasecmp,
00db10
              IFUNC_IMPL_ADD (array, i, strncasecmp,
00db10
+                             hwcap2 & PPC_FEATURE2_ARCH_2_07,
00db10
+                             __strncasecmp_power8)
00db10
+             IFUNC_IMPL_ADD (array, i, strncasecmp,
00db10
                              hwcap & PPC_FEATURE_HAS_VSX,
00db10
                              __strncasecmp_power7)
00db10
              IFUNC_IMPL_ADD (array, i, strncasecmp, 1, __strncasecmp_ppc))
00db10
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power7.S
00db10
index 56eed9a..99cd7bd 100644
00db10
--- a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power7.S
00db10
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power7.S
00db10
@@ -1,5 +1,5 @@
00db10
-/* Optimized strcasecmp implementation foOWER7.
00db10
-   Copyright (C) 2013-2014 Free Software Foundation, Inc.
00db10
+/* Optimized strcasecmp implementation for POWER7.
00db10
+   Copyright (C) 2013-2016 Free Software Foundation, Inc.
00db10
    This file is part of the GNU C Library.
00db10
 
00db10
    The GNU C Library is free software; you can redistribute it and/or
00db10
@@ -18,21 +18,7 @@
00db10
 
00db10
 #include <sysdep.h>
00db10
 
00db10
-#undef ENTRY
00db10
-#define ENTRY(name)						\
00db10
-  .section ".text";						\
00db10
-  ENTRY_2(__strcasecmp_power7)					\
00db10
-  .align ALIGNARG(2);						\
00db10
-  BODY_LABEL(__strcasecmp_power7):				\
00db10
-  cfi_startproc;						\
00db10
-  LOCALENTRY(__strcasecmp_power7)
00db10
-
00db10
-#undef END
00db10
-#define END(name)						\
00db10
-  cfi_endproc;							\
00db10
-  TRACEBACK(__strcasecmp_power7)				\
00db10
-  END_2(__strcasecmp_power7)
00db10
-
00db10
+#define __strcasecmp __strcasecmp_power7
00db10
 #undef weak_alias
00db10
 #define weak_alias(name, alias)
00db10
 
00db10
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power8.S
00db10
new file mode 100644
00db10
index 0000000..492047a
00db10
--- /dev/null
00db10
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power8.S
00db10
@@ -0,0 +1,28 @@
00db10
+/* Optimized strcasecmp implementation for POWER8.
00db10
+   Copyright (C) 2016 Free Software Foundation, Inc.
00db10
+   This file is part of the GNU C Library.
00db10
+
00db10
+   The GNU C Library is free software; you can redistribute it and/or
00db10
+   modify it under the terms of the GNU Lesser General Public
00db10
+   License as published by the Free Software Foundation; either
00db10
+   version 2.1 of the License, or (at your option) any later version.
00db10
+
00db10
+   The GNU C Library is distributed in the hope that it will be useful,
00db10
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
00db10
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00db10
+   Lesser General Public License for more details.
00db10
+
00db10
+   You should have received a copy of the GNU Lesser General Public
00db10
+   License along with the GNU C Library; if not, see
00db10
+   <http://www.gnu.org/licenses/>.  */
00db10
+
00db10
+#include <sysdep.h>
00db10
+
00db10
+#define __strcasecmp __strcasecmp_power8
00db10
+#undef weak_alias
00db10
+#define weak_alias(name, alias)
00db10
+
00db10
+#undef libc_hidden_builtin_def
00db10
+#define libc_hidden_builtin_def(name)
00db10
+
00db10
+#include <sysdeps/powerpc/powerpc64/power8/strcasecmp.S>
00db10
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-ppc64.c
00db10
new file mode 100644
00db10
index 0000000..6318b4a
00db10
--- /dev/null
00db10
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-ppc64.c
00db10
@@ -0,0 +1,21 @@
00db10
+/* Multiarch strcasecmp for PPC64.
00db10
+   Copyright (C) 2016 Free Software Foundation, Inc.
00db10
+   This file is part of the GNU C Library.
00db10
+
00db10
+   The GNU C Library is free software; you can redistribute it and/or
00db10
+   modify it under the terms of the GNU Lesser General Public
00db10
+   License as published by the Free Software Foundation; either
00db10
+   version 2.1 of the License, or (at your option) any later version.
00db10
+
00db10
+   The GNU C Library is distributed in the hope that it will be useful,
00db10
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
00db10
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00db10
+   Lesser General Public License for more details.
00db10
+
00db10
+   You should have received a copy of the GNU Lesser General Public
00db10
+   License along with the GNU C Library; if not, see
00db10
+   <http://www.gnu.org/licenses/>.  */
00db10
+
00db10
+#define strcasecmp __strcasecmp_ppc
00db10
+
00db10
+#include <string/strcasecmp.c>
00db10
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
00db10
index 979e9f1..5ec6885 100644
00db10
--- a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
00db10
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
00db10
@@ -1,5 +1,5 @@
00db10
-/* Multiple versions of strcasecmp.
00db10
-   Copyright (C) 2013-2014 Free Software Foundation, Inc.
00db10
+/* Multiple versions of strcasecmp.
00db10
+   Copyright (C) 2013-2016 Free Software Foundation, Inc.
00db10
    This file is part of the GNU C Library.
00db10
 
00db10
    The GNU C Library is free software; you can redistribute it and/or
00db10
@@ -16,25 +16,21 @@
00db10
    License along with the GNU C Library; if not, see
00db10
    <http://www.gnu.org/licenses/>.  */
00db10
 
00db10
-#if IS_IN (libc)
00db10
-# include <string.h>
00db10
-# define strcasecmp __strcasecmp_ppc
00db10
-extern __typeof (__strcasecmp) __strcasecmp_ppc attribute_hidden;
00db10
-extern __typeof (__strcasecmp) __strcasecmp_power7 attribute_hidden;
00db10
-#endif
00db10
+#include <string.h>
00db10
+#include <shlib-compat.h>
00db10
+#include "init-arch.h"
00db10
 
00db10
-#include <string/strcasecmp.c>
00db10
-#undef strcasecmp
00db10
+extern __typeof (__strcasecmp) __libc_strcasecmp;
00db10
 
00db10
-#if IS_IN (libc)
00db10
-# include <shlib-compat.h>
00db10
-# include "init-arch.h"
00db10
+extern __typeof (__strcasecmp) __strcasecmp_ppc attribute_hidden;
00db10
+extern __typeof (__strcasecmp) __strcasecmp_power7 attribute_hidden;
00db10
+extern __typeof (__strcasecmp) __strcasecmp_power8 attribute_hidden;
00db10
 
00db10
-extern __typeof (__strcasecmp) __libc_strcasecmp;
00db10
 libc_ifunc (__libc_strcasecmp,
00db10
-	    (hwcap & PPC_FEATURE_HAS_VSX)
00db10
-            ? __strcasecmp_power7
00db10
-            : __strcasecmp_ppc);
00db10
+	     (hwcap2 & PPC_FEATURE2_ARCH_2_07)
00db10
+             ? __strcasecmp_power8:
00db10
+	     (hwcap & PPC_FEATURE_HAS_VSX)
00db10
+             ? __strcasecmp_power7
00db10
+             : __strcasecmp_ppc);
00db10
 
00db10
 weak_alias (__libc_strcasecmp, strcasecmp)
00db10
-#endif
00db10
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strncase-power8.S
00db10
new file mode 100644
00db10
index 0000000..01a63b5
00db10
--- /dev/null
00db10
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncase-power8.S
00db10
@@ -0,0 +1,28 @@
00db10
+/* Optimized strncasecmp implementation for POWER8.
00db10
+   Copyright (C) 2016 Free Software Foundation, Inc.
00db10
+   This file is part of the GNU C Library.
00db10
+
00db10
+   The GNU C Library is free software; you can redistribute it and/or
00db10
+   modify it under the terms of the GNU Lesser General Public
00db10
+   License as published by the Free Software Foundation; either
00db10
+   version 2.1 of the License, or (at your option) any later version.
00db10
+
00db10
+   The GNU C Library is distributed in the hope that it will be useful,
00db10
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
00db10
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00db10
+   Lesser General Public License for more details.
00db10
+
00db10
+   You should have received a copy of the GNU Lesser General Public
00db10
+   License along with the GNU C Library; if not, see
00db10
+   <http://www.gnu.org/licenses/>.  */
00db10
+
00db10
+#include <sysdep.h>
00db10
+
00db10
+#define __strncasecmp __strncasecmp_power8
00db10
+#undef weak_alias
00db10
+#define weak_alias(name, alias)
00db10
+
00db10
+#undef libc_hidden_builtin_def
00db10
+#define libc_hidden_builtin_def(name)
00db10
+
00db10
+#include <sysdeps/powerpc/powerpc64/power8/strncase.S>
00db10
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strncase-ppc64.c
00db10
new file mode 100644
00db10
index 0000000..c245d77
00db10
--- /dev/null
00db10
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncase-ppc64.c
00db10
@@ -0,0 +1,21 @@
00db10
+/* Multiarch strncasecmp for PPC64.
00db10
+   Copyright (C) 2016 Free Software Foundation, Inc.
00db10
+   This file is part of the GNU C Library.
00db10
+
00db10
+   The GNU C Library is free software; you can redistribute it and/or
00db10
+   modify it under the terms of the GNU Lesser General Public
00db10
+   License as published by the Free Software Foundation; either
00db10
+   version 2.1 of the License, or (at your option) any later version.
00db10
+
00db10
+   The GNU C Library is distributed in the hope that it will be useful,
00db10
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
00db10
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00db10
+   Lesser General Public License for more details.
00db10
+
00db10
+   You should have received a copy of the GNU Lesser General Public
00db10
+   License along with the GNU C Library; if not, see
00db10
+   <http://www.gnu.org/licenses/>.  */
00db10
+
00db10
+#define strncasecmp __strncasecmp_ppc
00db10
+
00db10
+#include <string/strncase.c>
00db10
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase.c b/sysdeps/powerpc/powerpc64/multiarch/strncase.c
00db10
index 4339f3a..5bfaf65 100644
00db10
--- a/sysdeps/powerpc/powerpc64/multiarch/strncase.c
00db10
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncase.c
00db10
@@ -16,26 +16,21 @@
00db10
    License along with the GNU C Library; if not, see
00db10
    <http://www.gnu.org/licenses/>.  */
00db10
 
00db10
-#if IS_IN (libc)
00db10
-# include <string.h>
00db10
-# define strncasecmp __strncasecmp_ppc
00db10
-extern __typeof (__strncasecmp) __strncasecmp_ppc attribute_hidden;
00db10
-extern __typeof (__strncasecmp) __strncasecmp_power7 attribute_hidden;
00db10
-#endif
00db10
+#include <string.h>
00db10
+#include <shlib-compat.h>
00db10
+#include "init-arch.h"
00db10
 
00db10
-#include <string/strncase.c>
00db10
-#undef strncasecmp
00db10
+extern __typeof (__strncasecmp) __libc_strncasecmp;
00db10
 
00db10
-#if IS_IN (libc)
00db10
-# include <shlib-compat.h>
00db10
-# include "init-arch.h"
00db10
+extern __typeof (__strncasecmp) __strncasecmp_ppc attribute_hidden;
00db10
+extern __typeof (__strncasecmp) __strncasecmp_power7 attribute_hidden;
00db10
+extern __typeof (__strncasecmp) __strncasecmp_power8 attribute_hidden;
00db10
 
00db10
-/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
00db10
-   ifunc symbol properly.  */
00db10
-extern __typeof (__strncasecmp) __libc_strncasecmp;
00db10
 libc_ifunc (__libc_strncasecmp,
00db10
+	     (hwcap2 & PPC_FEATURE2_ARCH_2_07)
00db10
+             ? __strncasecmp_power8:
00db10
 	     (hwcap & PPC_FEATURE_HAS_VSX)
00db10
              ? __strncasecmp_power7
00db10
              : __strncasecmp_ppc);
00db10
+
00db10
 weak_alias (__libc_strncasecmp, strncasecmp)
00db10
-#endif
00db10
diff --git a/sysdeps/powerpc/powerpc64/power8/strcasecmp.S b/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
00db10
new file mode 100644
00db10
index 0000000..63f6217
00db10
--- /dev/null
00db10
+++ b/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
00db10
@@ -0,0 +1,446 @@
00db10
+/* Optimized strcasecmp/strncasecmp implementation for POWER8.
00db10
+   Copyright (C) 2016 Free Software Foundation, Inc.
00db10
+   This file is part of the GNU C Library.
00db10
+
00db10
+   The GNU C Library is free software; you can redistribute it and/or
00db10
+   modify it under the terms of the GNU Lesser General Public
00db10
+   License as published by the Free Software Foundation; either
00db10
+   version 2.1 of the License, or (at your option) any later version.
00db10
+
00db10
+   The GNU C Library is distributed in the hope that it will be useful,
00db10
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
00db10
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00db10
+   Lesser General Public License for more details.
00db10
+
00db10
+   You should have received a copy of the GNU Lesser General Public
00db10
+   License along with the GNU C Library; if not, see
00db10
+   <http://www.gnu.org/licenses/>.  */
00db10
+
00db10
+#include <sysdep.h>
00db10
+#include <locale-defines.h>
00db10
+
00db10
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] ) */
00db10
+
00db10
+#ifndef USE_AS_STRNCASECMP
00db10
+#  define __STRCASECMP __strcasecmp
00db10
+#  define STRCASECMP   strcasecmp
00db10
+#else
00db10
+#  define __STRCASECMP __strncasecmp
00db10
+#  define STRCASECMP   strncasecmp
00db10
+#endif
00db10
+/* Convert the 16 bytes in v4 and in v5 to lowercase and compare them.  */
00db10
+#define TOLOWER()     \
00db10
+	vaddubm	v8, v4, v1; \
00db10
+	vaddubm	v7, v4, v3; \
00db10
+	vcmpgtub	v8, v8, v2; \
00db10
+	vsel	v4, v7, v4, v8; \
00db10
+	vaddubm	v8, v5, v1; \
00db10
+	vaddubm	v7, v5, v3; \
00db10
+	vcmpgtub	v8, v8, v2; \
00db10
+	vsel	v5, v7, v5, v8; \
00db10
+	vcmpequb.	v7, v5, v4;
00db10
+
00db10
+/* Get 16 bytes for unaligned case.  */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+#define GET16BYTES(reg1, reg2, reg3) \
00db10
+	lvx	reg1, 0, reg2; \
00db10
+	vcmpequb.	v8, v0, reg1; \
00db10
+	beq	cr6, 1f; \
00db10
+	vspltisb	v9, 0; \
00db10
+	b	2f; \
00db10
+	.align 4; \
00db10
+1: \
00db10
+	addi	r6, reg2, 16; \
00db10
+	lvx	v9, 0, r6; \
00db10
+2: \
00db10
+	vperm	reg1, v9, reg1, reg3;
00db10
+#else
00db10
+#define GET16BYTES(reg1, reg2, reg3) \
00db10
+	lvx	reg1, 0, reg2; \
00db10
+	vcmpequb.	v8, v0, reg1; \
00db10
+	beq	cr6, 1f; \
00db10
+	vspltisb	v9, 0; \
00db10
+	b	2f; \
00db10
+	.align 4; \
00db10
+1: \
00db10
+	addi	r6, reg2, 16; \
00db10
+	lvx	v9, 0, r6; \
00db10
+2: \
00db10
+	vperm	reg1, reg1, v9, reg3;
00db10
+#endif
00db10
+
00db10
+/* Check null in v4, v5 and convert to lower.  */
00db10
+#define CHECKNULLANDCONVERT() \
00db10
+	vcmpequb.	v7, v0, v5; \
00db10
+	beq	cr6, 3f; \
00db10
+	vcmpequb.	v7, v0, v4; \
00db10
+	beq	cr6, 3f; \
00db10
+	b	L(null_found); \
00db10
+	.align  4; \
00db10
+3: \
00db10
+	TOLOWER()
00db10
+
00db10
+#ifdef _ARCH_PWR8
00db10
+#  define VCLZD_V8_v7	vclzd	v8, v7;
00db10
+#  define MFVRD_R3_V1	mfvrd	r3, v1;
00db10
+#  define VSUBUDM_V9_V8	vsubudm	v9, v9, v8;
00db10
+#  define VPOPCNTD_V8_V8	vpopcntd v8, v8;
00db10
+#  define VADDUQM_V7_V8	vadduqm	v9, v7, v8;
00db10
+#else
00db10
+#  define VCLZD_V8_v7	.long	0x11003fc2
00db10
+#  define MFVRD_R3_V1	.long	0x7c230067
00db10
+#  define VSUBUDM_V9_V8	.long	0x112944c0
00db10
+#  define VPOPCNTD_V8_V8	.long	0x110047c3
00db10
+#  define VADDUQM_V7_V8	.long	0x11274100
00db10
+#endif
00db10
+
00db10
+	.machine  power7
00db10
+
00db10
+ENTRY (__STRCASECMP)
00db10
+#ifdef USE_AS_STRNCASECMP
00db10
+	CALL_MCOUNT 3
00db10
+#else
00db10
+	CALL_MCOUNT 2
00db10
+#endif
00db10
+#define rRTN	r3	/* Return value */
00db10
+#define rSTR1	r10	/* 1st string */
00db10
+#define rSTR2	r4	/* 2nd string */
00db10
+#define rCHAR1	r6	/* Byte read from 1st string */
00db10
+#define rCHAR2	r7	/* Byte read from 2nd string */
00db10
+#define rADDR1	r8	/* Address of tolower(rCHAR1) */
00db10
+#define rADDR2	r12	/* Address of tolower(rCHAR2) */
00db10
+#define rLWR1	r8	/* Word tolower(rCHAR1) */
00db10
+#define rLWR2	r12	/* Word tolower(rCHAR2) */
00db10
+#define rTMP	r9
00db10
+#define rLOC	r11	/* Default locale address */
00db10
+
00db10
+	cmpd	cr7, rRTN, rSTR2
00db10
+
00db10
+	/* Get locale address.  */
00db10
+	ld 	rTMP, __libc_tsd_LOCALE@got@tprel(r2)
00db10
+	add 	rLOC, rTMP, __libc_tsd_LOCALE@tls
00db10
+	ld	rLOC, 0(rLOC)
00db10
+
00db10
+	mr	rSTR1, rRTN
00db10
+	li	rRTN, 0
00db10
+	beqlr	cr7
00db10
+#ifdef USE_AS_STRNCASECMP
00db10
+	cmpdi	cr7, r5, 0
00db10
+	beq	cr7, L(retnull)
00db10
+	cmpdi	cr7, r5, 16
00db10
+	blt	cr7, L(bytebybyte)
00db10
+#endif
00db10
+	vspltisb	v0, 0
00db10
+	vspltisb	v8, -1
00db10
+	/* Check for null in initial characters.
00db10
+	   Check max of 16 char depending on the alignment.
00db10
+	   If null is present, proceed byte by byte.  */
00db10
+	lvx	v4, 0, rSTR1
00db10
+#ifdef  __LITTLE_ENDIAN__
00db10
+	lvsr	v10, 0, rSTR1	/* Compute mask.  */
00db10
+	vperm	v9, v8, v4, v10	/* Mask bits that are not part of string.  */
00db10
+#else
00db10
+	lvsl	v10, 0, rSTR1
00db10
+	vperm	v9, v4, v8, v10
00db10
+#endif
00db10
+	vcmpequb.	v9, v0, v9	/* Check for null bytes.  */
00db10
+	bne	cr6, L(bytebybyte)
00db10
+	lvx	v5, 0, rSTR2
00db10
+	/* Calculate alignment.  */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	lvsr	v6, 0, rSTR2
00db10
+	vperm	v9, v8, v5, v6	/* Mask bits that are not part of string.  */
00db10
+#else
00db10
+	lvsl	v6, 0, rSTR2
00db10
+	vperm	v9, v5, v8, v6
00db10
+#endif
00db10
+	vcmpequb.	v9, v0, v9	/* Check for null bytes.  */
00db10
+	bne	cr6, L(bytebybyte)
00db10
+	/* Check if locale has non ascii characters.  */
00db10
+	ld	rTMP, 0(rLOC)
00db10
+	addi r6, rTMP,LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
00db10
+	lwz	rTMP, 0(r6)
00db10
+	cmpdi	cr7, rTMP, 1
00db10
+	beq	cr7, L(bytebybyte)
00db10
+
00db10
+	/* Load vector registers with values used for TOLOWER.  */
00db10
+	/* Load v1 = 0xbf, v2 = 0x19, v3 = 0x20 in each byte.  */
00db10
+	vspltisb	v3, 2
00db10
+	vspltisb	v9, 4
00db10
+	vsl	v3, v3, v9
00db10
+	vaddubm	v1, v3, v3
00db10
+	vnor	v1, v1, v1
00db10
+	vspltisb	v2, 7
00db10
+	vsububm	v2, v3, v2
00db10
+
00db10
+	andi.	rADDR1, rSTR1, 0xF
00db10
+	beq	cr0, L(align)
00db10
+	addi	r6, rSTR1, 16
00db10
+	lvx	v9, 0, r6
00db10
+	/* Compute 16 bytes from previous two loads.  */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	vperm	v4, v9, v4, v10
00db10
+#else
00db10
+	vperm	v4, v4, v9, v10
00db10
+#endif
00db10
+L(align):
00db10
+	andi.	rADDR2, rSTR2, 0xF
00db10
+	beq	cr0, L(align1)
00db10
+	addi	r6, rSTR2, 16
00db10
+	lvx	v9, 0, r6
00db10
+	/* Compute 16 bytes from previous two loads.  */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	vperm	v5, v9, v5, v6
00db10
+#else
00db10
+	vperm	v5, v5, v9, v6
00db10
+#endif
00db10
+L(align1):
00db10
+	CHECKNULLANDCONVERT()
00db10
+	blt	cr6, L(match)
00db10
+	b	L(different)
00db10
+	.align 	4
00db10
+L(match):
00db10
+	clrldi	r6, rSTR1, 60
00db10
+	subfic	r7, r6, 16
00db10
+#ifdef USE_AS_STRNCASECMP
00db10
+	sub	r5, r5, r7
00db10
+#endif
00db10
+	add	rSTR1, rSTR1, r7
00db10
+	add	rSTR2, rSTR2, r7
00db10
+	andi.	rADDR2, rSTR2, 0xF
00db10
+	addi	rSTR1, rSTR1, -16
00db10
+	addi	rSTR2, rSTR2, -16
00db10
+	beq	cr0, L(aligned)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	lvsr	v6, 0, rSTR2
00db10
+#else
00db10
+	lvsl	v6, 0, rSTR2
00db10
+#endif
00db10
+	/* There are 2 loops depending on the input alignment.
00db10
+	   Each loop gets 16 bytes from s1 and s2, check for null,
00db10
+	   convert to lowercase and compare. Loop till difference
00db10
+	   or null occurs. */
00db10
+L(s1_align):
00db10
+	addi	rSTR1, rSTR1, 16
00db10
+	addi	rSTR2, rSTR2, 16
00db10
+#ifdef USE_AS_STRNCASECMP
00db10
+	cmpdi	cr7, r5, 16
00db10
+	blt	cr7, L(bytebybyte)
00db10
+	addi	r5, r5, -16
00db10
+#endif
00db10
+	lvx	v4, 0, rSTR1
00db10
+	GET16BYTES(v5, rSTR2, v6)
00db10
+	CHECKNULLANDCONVERT()
00db10
+	blt	cr6, L(s1_align)
00db10
+	b	L(different)
00db10
+	.align 	4
00db10
+L(aligned):
00db10
+	addi	rSTR1, rSTR1, 16
00db10
+	addi	rSTR2, rSTR2, 16
00db10
+#ifdef USE_AS_STRNCASECMP
00db10
+	cmpdi	cr7, r5, 16
00db10
+	blt	cr7, L(bytebybyte)
00db10
+	addi	r5, r5, -16
00db10
+#endif
00db10
+	lvx	v4, 0, rSTR1
00db10
+	lvx	v5, 0, rSTR2
00db10
+	CHECKNULLANDCONVERT()
00db10
+	blt	cr6, L(aligned)
00db10
+
00db10
+	/* Calculate and return the difference. */
00db10
+L(different):
00db10
+	vaddubm	v1, v3, v3
00db10
+	vcmpequb	v7, v0, v7
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	/* Count trailing zero.  */
00db10
+	vspltisb	v8, -1
00db10
+	VADDUQM_V7_V8
00db10
+	vandc	v8, v9, v7
00db10
+	VPOPCNTD_V8_V8
00db10
+	vspltb	v6, v8, 15
00db10
+	vcmpequb.	v6, v6, v1
00db10
+	blt	cr6, L(shift8)
00db10
+#else
00db10
+	/* Count leading zero.  */
00db10
+	VCLZD_V8_v7
00db10
+	vspltb	v6, v8, 7
00db10
+	vcmpequb.	v6, v6, v1
00db10
+	blt	cr6, L(shift8)
00db10
+	vsro	v8, v8, v1
00db10
+#endif
00db10
+	b	L(skipsum)
00db10
+	.align  4
00db10
+L(shift8):
00db10
+	vsumsws		v8, v8, v0
00db10
+L(skipsum):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	/* Shift registers based on trailing zero count.  */
00db10
+	vsro	v6, v5, v8
00db10
+	vsro	v7, v4, v8
00db10
+	/* Merge and move to GPR.  */
00db10
+	vmrglb	v6, v6, v7
00db10
+	vslo	v1, v6, v1
00db10
+	MFVRD_R3_V1
00db10
+	/* Place the characters that are different in first position.  */
00db10
+	sldi	rSTR2, rRTN, 56
00db10
+	srdi	rSTR2, rSTR2, 56
00db10
+	sldi	rSTR1, rRTN, 48
00db10
+	srdi	rSTR1, rSTR1, 56
00db10
+#else
00db10
+	vslo	v6, v5, v8
00db10
+	vslo	v7, v4, v8
00db10
+	vmrghb	v1, v6, v7
00db10
+	MFVRD_R3_V1
00db10
+	srdi	rSTR2, rRTN, 48
00db10
+	sldi	rSTR2, rSTR2, 56
00db10
+	srdi	rSTR2, rSTR2, 56
00db10
+	srdi	rSTR1, rRTN, 56
00db10
+#endif
00db10
+	subf  	rRTN, rSTR1, rSTR2
00db10
+	extsw 	rRTN, rRTN
00db10
+	blr
00db10
+
00db10
+	.align  4
00db10
+	/* OK. We've hit the end of the string. We need to be careful that
00db10
+	   we don't compare two strings as different because of junk beyond
00db10
+	   the end of the strings...  */
00db10
+L(null_found):
00db10
+	vaddubm	v10, v3, v3
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	/* Count trailing zero.  */
00db10
+	vspltisb	v8, -1
00db10
+	VADDUQM_V7_V8
00db10
+	vandc	v8, v9, v7
00db10
+	VPOPCNTD_V8_V8
00db10
+	vspltb	v6, v8, 15
00db10
+	vcmpequb.	v6, v6, v10
00db10
+	blt	cr6, L(shift_8)
00db10
+#else
00db10
+	/* Count leading zero.  */
00db10
+	VCLZD_V8_v7
00db10
+	vspltb	v6, v8, 7
00db10
+	vcmpequb.	v6, v6, v10
00db10
+	blt	cr6, L(shift_8)
00db10
+	vsro	v8, v8, v10
00db10
+#endif
00db10
+	b	L(skipsum1)
00db10
+	.align  4
00db10
+L(shift_8):
00db10
+	vsumsws	v8, v8, v0
00db10
+L(skipsum1):
00db10
+	/* Calculate shift count based on count of zero.  */
00db10
+	vspltisb	v10, 7
00db10
+	vslb	v10, v10, v10
00db10
+	vsldoi	v9, v0, v10, 1
00db10
+	VSUBUDM_V9_V8
00db10
+	vspltisb	v8, 8
00db10
+	vsldoi	v8, v0, v8, 1
00db10
+	VSUBUDM_V9_V8
00db10
+	/* Shift and remove junk after null character.  */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	vslo	v5, v5, v9
00db10
+	vslo	v4, v4, v9
00db10
+#else
00db10
+	vsro	v5, v5, v9
00db10
+	vsro	v4, v4, v9
00db10
+#endif
00db10
+	/* Convert and compare 16 bytes.  */
00db10
+	TOLOWER()
00db10
+	blt	cr6, L(retnull)
00db10
+	b	L(different)
00db10
+	.align  4
00db10
+L(retnull):
00db10
+	li	rRTN, 0
00db10
+	blr
00db10
+	.align  4
00db10
+L(bytebybyte):
00db10
+	/* Unrolling loop for POWER: loads are done with 'lbz' plus
00db10
+	offset and string descriptors are only updated at the end
00db10
+	of loop unrolling. */
00db10
+	ld	rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
00db10
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
00db10
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
00db10
+#ifdef USE_AS_STRNCASECMP
00db10
+	rldicl	rTMP, r5, 62, 2
00db10
+	cmpdi	cr7, rTMP, 0
00db10
+	beq	cr7, L(lessthan4)
00db10
+	mtctr	rTMP
00db10
+#endif
00db10
+L(loop):
00db10
+	cmpdi	rCHAR1, 0		/* *s1 == '\0' ? */
00db10
+	sldi	rADDR1, rCHAR1, 2	/* Calculate address for tolower(*s1) */
00db10
+	sldi	rADDR2, rCHAR2, 2	/* Calculate address for tolower(*s2) */
00db10
+	lwzx	rLWR1, rLOC, rADDR1	/* Load tolower(*s1) */
00db10
+	lwzx	rLWR2, rLOC, rADDR2	/* Load tolower(*s2) */
00db10
+	cmpw	cr1, rLWR1, rLWR2	/* r = tolower(*s1) == tolower(*s2) ? */
00db10
+	crorc	4*cr1+eq,eq,4*cr1+eq	/* (*s1 == '\0') || (r == 0) */
00db10
+	beq	cr1, L(done)
00db10
+	lbz	rCHAR1, 1(rSTR1)
00db10
+	lbz	rCHAR2, 1(rSTR2)
00db10
+	cmpdi	rCHAR1, 0
00db10
+	sldi	rADDR1, rCHAR1, 2
00db10
+	sldi	rADDR2, rCHAR2, 2
00db10
+	lwzx	rLWR1, rLOC, rADDR1
00db10
+	lwzx	rLWR2, rLOC, rADDR2
00db10
+	cmpw	cr1, rLWR1, rLWR2
00db10
+	crorc	4*cr1+eq,eq,4*cr1+eq
00db10
+	beq	cr1, L(done)
00db10
+	lbz	rCHAR1, 2(rSTR1)
00db10
+	lbz	rCHAR2, 2(rSTR2)
00db10
+	cmpdi	rCHAR1, 0
00db10
+	sldi	rADDR1, rCHAR1, 2
00db10
+	sldi	rADDR2, rCHAR2, 2
00db10
+	lwzx	rLWR1, rLOC, rADDR1
00db10
+	lwzx	rLWR2, rLOC, rADDR2
00db10
+	cmpw	cr1, rLWR1, rLWR2
00db10
+	crorc	4*cr1+eq,eq,4*cr1+eq
00db10
+	beq	cr1, L(done)
00db10
+	lbz	rCHAR1, 3(rSTR1)
00db10
+	lbz	rCHAR2, 3(rSTR2)
00db10
+	cmpdi	rCHAR1, 0
00db10
+	/* Increment both string descriptors */
00db10
+	addi	rSTR1, rSTR1, 4
00db10
+	addi	rSTR2, rSTR2, 4
00db10
+	sldi	rADDR1, rCHAR1, 2
00db10
+	sldi	rADDR2, rCHAR2, 2
00db10
+	lwzx	rLWR1, rLOC, rADDR1
00db10
+	lwzx	rLWR2, rLOC, rADDR2
00db10
+	cmpw	cr1, rLWR1, rLWR2
00db10
+	crorc	4*cr1+eq,eq,4*cr1+eq
00db10
+	beq     cr1, L(done)
00db10
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
00db10
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
00db10
+#ifdef USE_AS_STRNCASECMP
00db10
+	bdnz	L(loop)
00db10
+#else
00db10
+	b	L(loop)
00db10
+#endif
00db10
+#ifdef USE_AS_STRNCASECMP
00db10
+L(lessthan4):
00db10
+	clrldi	r5, r5, 62
00db10
+	cmpdi	cr7, r5, 0
00db10
+	beq	cr7, L(retnull)
00db10
+	mtctr	r5
00db10
+L(loop1):
00db10
+	cmpdi	rCHAR1, 0
00db10
+	sldi	rADDR1, rCHAR1, 2
00db10
+	sldi	rADDR2, rCHAR2, 2
00db10
+	lwzx	rLWR1, rLOC, rADDR1
00db10
+	lwzx	rLWR2, rLOC, rADDR2
00db10
+	cmpw	cr1, rLWR1, rLWR2
00db10
+	crorc	4*cr1+eq,eq,4*cr1+eq
00db10
+	beq	cr1, L(done)
00db10
+	addi	rSTR1, rSTR1, 1
00db10
+	addi	rSTR2, rSTR2, 1
00db10
+	lbz	rCHAR1, 0(rSTR1)
00db10
+	lbz	rCHAR2, 0(rSTR2)
00db10
+	bdnz	L(loop1)
00db10
+#endif
00db10
+L(done):
00db10
+	subf	r0, rLWR2, rLWR1
00db10
+	extsw	rRTN, r0
00db10
+	blr
00db10
+END (__STRCASECMP)
00db10
+
00db10
+weak_alias (__STRCASECMP, STRCASECMP)
00db10
+libc_hidden_builtin_def (__STRCASECMP)
00db10
diff --git a/sysdeps/powerpc/powerpc64/power8/strncase.S b/sysdeps/powerpc/powerpc64/power8/strncase.S
00db10
new file mode 100644
00db10
index 0000000..7ce2ed0
00db10
--- /dev/null
00db10
+++ b/sysdeps/powerpc/powerpc64/power8/strncase.S
00db10
@@ -0,0 +1,20 @@
00db10
+/* Optimized strncasecmp implementation for POWER8.
00db10
+   Copyright (C) 2016 Free Software Foundation, Inc.
00db10
+   This file is part of the GNU C Library.
00db10
+
00db10
+   The GNU C Library is free software; you can redistribute it and/or
00db10
+   modify it under the terms of the GNU Lesser General Public
00db10
+   License as published by the Free Software Foundation; either
00db10
+   version 2.1 of the License, or (at your option) any later version.
00db10
+
00db10
+   The GNU C Library is distributed in the hope that it will be useful,
00db10
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
00db10
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00db10
+   Lesser General Public License for more details.
00db10
+
00db10
+   You should have received a copy of the GNU Lesser General Public
00db10
+   License along with the GNU C Library; if not, see
00db10
+   <http://www.gnu.org/licenses/>.  */
00db10
+
00db10
+#define USE_AS_STRNCASECMP 1
00db10
+#include <sysdeps/powerpc/powerpc64/power8/strcasecmp.S>
00db10
-- 
00db10
2.1.0
00db10