Tree - rpms/glibc - CentOS Git server

rpms / glibc

Blame SOURCES/glibc-rh1385004-12.patch

Blob History Raw

		8ae002	`From 0d3555b9b4d5cefe116c32bfa38ac70f1d6c25cb Mon Sep 17 00:00:00 2001`
		8ae002	`From: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>`
		8ae002	`Date: Wed, 11 Nov 2015 17:31:28 -0200`
		8ae002	`Subject: [PATCH] powerpc: Optimization for strlen for POWER8.`
		8ae002
		8ae002	`This implementation takes advantage of vectorization to improve performance of`
		8ae002	`the loop over the current strlen implementation for POWER7.`
		8ae002
		8ae002	`(cherry picked from commit 1b045ee53e0b8bed75745b931b33f27d21c9ed22)`
		8ae002	`---`
		8ae002	`ChangeLog | 13 +`
		8ae002	`sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-`
		8ae002	`.../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 2 +`
		8ae002	`.../powerpc/powerpc64/multiarch/strlen-power8.S | 39 +++`
		8ae002	`sysdeps/powerpc/powerpc64/multiarch/strlen.c | 9 +-`
		8ae002	`sysdeps/powerpc/powerpc64/power8/strlen.S | 297 +++++++++++++++++++++`
		8ae002	`6 files changed, 358 insertions(+), 4 deletions(-)`
		8ae002	`create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S`
		8ae002	`create mode 100644 sysdeps/powerpc/powerpc64/power8/strlen.S`
		8ae002
		8ae002	`diff --git a/ChangeLog b/ChangeLog`
		8ae002	`index f030b68..e7ea58a 100644`
		8ae002	`diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile`
		8ae002	`index 7ed56bf..57abe8f 100644`
		8ae002	`--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile`
		8ae002	`+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile`
		8ae002	`@@ -20,7 +20,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \`
		8ae002	`strncpy-power8 strncpy-power7 strncpy-ppc64 \`
		8ae002	`strncat-power7 \`
		8ae002	`strstr-power7 strstr-ppc64 \`
		8ae002	`- strspn-power8 strspn-ppc64 \`
		8ae002	`+ strspn-power8 strspn-ppc64 strlen-power8 \`
		8ae002	`rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \`
		8ae002	`strnlen-ppc64 strcasecmp-power7 strcasecmp_l-power7 \`
		8ae002	`strncase-power7 strncase_l-power7 \`
		8ae002	`diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c`
		8ae002	`index f6c70ba..583885c 100644`
		8ae002	`--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c`
		8ae002	`+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c`
		8ae002	`@@ -101,6 +101,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,`
		8ae002
		8ae002	`/* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c. */`
		8ae002	`IFUNC_IMPL (i, name, strlen,`
		8ae002	`+ IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,`
		8ae002	`+ __strlen_power8)`
		8ae002	`IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX,`
		8ae002	`__strlen_power7)`
		8ae002	`IFUNC_IMPL_ADD (array, i, strlen, 1,`
		8ae002	`diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S`
		8ae002	`new file mode 100644`
		8ae002	`index 0000000..686dc3d`
		8ae002	`--- /dev/null`
		8ae002	`+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S`
		8ae002	`@@ -0,0 +1,39 @@`
		8ae002	`+/* Optimized strlen implementation for POWER8.`
		8ae002	`+ Copyright (C) 2016 Free Software Foundation, Inc.`
		8ae002	`+ This file is part of the GNU C Library.`
		8ae002	`+`
		8ae002	`+ The GNU C Library is free software; you can redistribute it and/or`
		8ae002	`+ modify it under the terms of the GNU Lesser General Public`
		8ae002	`+ License as published by the Free Software Foundation; either`
		8ae002	`+ version 2.1 of the License, or (at your option) any later version.`
		8ae002	`+`
		8ae002	`+ The GNU C Library is distributed in the hope that it will be useful,`
		8ae002	`+ but WITHOUT ANY WARRANTY; without even the implied warranty of`
		8ae002	`+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
		8ae002	`+ Lesser General Public License for more details.`
		8ae002	`+`
		8ae002	`+ You should have received a copy of the GNU Lesser General Public`
		8ae002	`+ License along with the GNU C Library; if not, see`
		8ae002	`+ <http://www.gnu.org/licenses/>. */`
		8ae002	`+`
		8ae002	`+#include <sysdep.h>`
		8ae002	`+`
		8ae002	`+#undef EALIGN`
		8ae002	`+#define EALIGN(name, alignt, words) \`
		8ae002	`+ .section ".text"; \`
		8ae002	`+ ENTRY_2(__strlen_power8) \`
		8ae002	`+ .align ALIGNARG(alignt); \`
		8ae002	`+ EALIGN_W_##words; \`
		8ae002	`+ BODY_LABEL(__strlen_power8): \`
		8ae002	`+ cfi_startproc; \`
		8ae002	`+ LOCALENTRY(__strlen_power8)`
		8ae002	`+#undef END`
		8ae002	`+#define END(name) \`
		8ae002	`+ cfi_endproc; \`
		8ae002	`+ TRACEBACK(__strlen_power8) \`
		8ae002	`+ END_2(__strlen_power8)`
		8ae002	`+`
		8ae002	`+#undef libc_hidden_builtin_def`
		8ae002	`+#define libc_hidden_builtin_def(name)`
		8ae002	`+`
		8ae002	`+#include <sysdeps/powerpc/powerpc64/power8/strlen.S>`
		8ae002	`diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c`
		8ae002	`index 79a53d9..4b400a5 100644`
		8ae002	`--- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c`
		8ae002	`+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c`
		8ae002	`@@ -29,11 +29,14 @@ extern __typeof (__redirect_strlen) __libc_strlen;`
		8ae002
		8ae002	`extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden;`
		8ae002	`extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden;`
		8ae002	`+extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden;`
		8ae002
		8ae002	`libc_ifunc (__libc_strlen,`
		8ae002	`- (hwcap & PPC_FEATURE_HAS_VSX)`
		8ae002	`- ? __strlen_power7`
		8ae002	`- : __strlen_ppc);`
		8ae002	`+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)`
		8ae002	`+ ? __strlen_power8 :`
		8ae002	`+ (hwcap & PPC_FEATURE_HAS_VSX)`
		8ae002	`+ ? __strlen_power7`
		8ae002	`+ : __strlen_ppc);`
		8ae002
		8ae002	`#undef strlen`
		8ae002	`strong_alias (__libc_strlen, strlen)`
		8ae002	`diff --git a/sysdeps/powerpc/powerpc64/power8/strlen.S b/sysdeps/powerpc/powerpc64/power8/strlen.S`
		8ae002	`new file mode 100644`
		8ae002	`index 0000000..0142747`
		8ae002	`--- /dev/null`
		8ae002	`+++ b/sysdeps/powerpc/powerpc64/power8/strlen.S`
		8ae002	`@@ -0,0 +1,297 @@`
		8ae002	`+/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized`
		8ae002	`+ loop.`
		8ae002	`+ Copyright (C) 2016 Free Software Foundation, Inc.`
		8ae002	`+ This file is part of the GNU C Library.`
		8ae002	`+`
		8ae002	`+ The GNU C Library is free software; you can redistribute it and/or`
		8ae002	`+ modify it under the terms of the GNU Lesser General Public`
		8ae002	`+ License as published by the Free Software Foundation; either`
		8ae002	`+ version 2.1 of the License, or (at your option) any later version.`
		8ae002	`+`
		8ae002	`+ The GNU C Library is distributed in the hope that it will be useful,`
		8ae002	`+ but WITHOUT ANY WARRANTY; without even the implied warranty of`
		8ae002	`+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
		8ae002	`+ Lesser General Public License for more details.`
		8ae002	`+`
		8ae002	`+ You should have received a copy of the GNU Lesser General Public`
		8ae002	`+ License along with the GNU C Library; if not, see`
		8ae002	`+ <http://www.gnu.org/licenses/>. */`
		8ae002	`+`
		8ae002	`+#include <sysdep.h>`
		8ae002	`+`
		8ae002	`+/* TODO: change these to the actual instructions when the minimum required`
		8ae002	`+ binutils allows it. */`
		8ae002	`+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))`
		8ae002	`+#define VBPERMQ(t,a,b) .long (0x1000054c \`
		8ae002	`+ | ((t)<<(32-11)) \`
		8ae002	`+ | ((a)<<(32-16)) \`
		8ae002	`+ | ((b)<<(32-21)) )`
		8ae002	`+`
		8ae002	`+/* int [r3] strlen (char *s [r3]) */`
		8ae002	`+`
		8ae002	`+/* TODO: change this to .machine power8 when the minimum required binutils`
		8ae002	`+ allows it. */`
		8ae002	`+ .machine power7`
		8ae002	`+EALIGN (strlen, 4, 0)`
		8ae002	`+ CALL_MCOUNT 1`
		8ae002	`+ dcbt 0,r3`
		8ae002	`+ clrrdi r4,r3,3 /* Align the address to doubleword boundary. */`
		8ae002	`+ rlwinm r6,r3,3,26,28 /* Calculate padding. */`
		8ae002	`+ li r0,0 /* Doubleword with null chars to use`
		8ae002	`+ with cmpb. */`
		8ae002	`+ li r5,-1 /* MASK = 0xffffffffffffffff. */`
		8ae002	`+ ld r12,0(r4) /* Load doubleword from memory. */`
		8ae002	`+#ifdef __LITTLE_ENDIAN__`
		8ae002	`+ sld r5,r5,r6`
		8ae002	`+#else`
		8ae002	`+ srd r5,r5,r6 /* MASK = MASK >> padding. */`
		8ae002	`+#endif`
		8ae002	`+ orc r9,r12,r5 /* Mask bits that are not part of the string. */`
		8ae002	`+ cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */`
		8ae002	`+ cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */`
		8ae002	`+ bne cr7,L(done)`
		8ae002	`+`
		8ae002	`+ /* For shorter strings (< 64 bytes), we will not use vector registers,`
		8ae002	`+ as the overhead isn't worth it. So, let's use GPRs instead. This`
		8ae002	`+ will be done the same way as we do in the POWER7 implementation.`
		8ae002	`+ Let's see if we are aligned to a quadword boundary. If so, we can`
		8ae002	`+ jump to the first (non-vectorized) loop. Otherwise, we have to`
		8ae002	`+ handle the next DWORD first. */`
		8ae002	`+ mtcrf 0x01,r4`
		8ae002	`+ mr r9,r4`
		8ae002	`+ addi r9,r9,8`
		8ae002	`+ bt 28,L(align64)`
		8ae002	`+`
		8ae002	`+ /* Handle the next 8 bytes so we are aligned to a quadword`
		8ae002	`+ boundary. */`
		8ae002	`+ ldu r5,8(r4)`
		8ae002	`+ cmpb r10,r5,r0`
		8ae002	`+ cmpdi cr7,r10,0`
		8ae002	`+ addi r9,r9,8`
		8ae002	`+ bne cr7,L(done)`
		8ae002	`+`
		8ae002	`+L(align64):`
		8ae002	`+ /* Proceed to the old (POWER7) implementation, checking two doublewords`
		8ae002	`+ per iteration. For the first 56 bytes, we will just check for null`
		8ae002	`+ characters. After that, we will also check if we are 64-byte aligned`
		8ae002	`+ so we can jump to the vectorized implementation. We will unroll`
		8ae002	`+ these loops to avoid excessive branching. */`
		8ae002	`+ ld r6,8(r4)`
		8ae002	`+ ldu r5,16(r4)`
		8ae002	`+ cmpb r10,r6,r0`
		8ae002	`+ cmpb r11,r5,r0`
		8ae002	`+ or r5,r10,r11`
		8ae002	`+ cmpdi cr7,r5,0`
		8ae002	`+ addi r9,r9,16`
		8ae002	`+ bne cr7,L(dword_zero)`
		8ae002	`+`
		8ae002	`+ ld r6,8(r4)`
		8ae002	`+ ldu r5,16(r4)`
		8ae002	`+ cmpb r10,r6,r0`
		8ae002	`+ cmpb r11,r5,r0`
		8ae002	`+ or r5,r10,r11`
		8ae002	`+ cmpdi cr7,r5,0`
		8ae002	`+ addi r9,r9,16`
		8ae002	`+ bne cr7,L(dword_zero)`
		8ae002	`+`
		8ae002	`+ ld r6,8(r4)`
		8ae002	`+ ldu r5,16(r4)`
		8ae002	`+ cmpb r10,r6,r0`
		8ae002	`+ cmpb r11,r5,r0`
		8ae002	`+ or r5,r10,r11`
		8ae002	`+ cmpdi cr7,r5,0`
		8ae002	`+ addi r9,r9,16`
		8ae002	`+ bne cr7,L(dword_zero)`
		8ae002	`+`
		8ae002	`+ /* Are we 64-byte aligned? If so, jump to the vectorized loop.`
		8ae002	`+ Note: aligning to 64-byte will necessarily slow down performance for`
		8ae002	`+ strings around 64 bytes in length due to the extra comparisons`
		8ae002	`+ required to check alignment for the vectorized loop. This is a`
		8ae002	`+ necessary tradeoff we are willing to take in order to speed up the`
		8ae002	`+ calculation for larger strings. */`
		8ae002	`+ andi. r10,r9,63`
		8ae002	`+ beq cr0,L(preloop)`
		8ae002	`+ ld r6,8(r4)`
		8ae002	`+ ldu r5,16(r4)`
		8ae002	`+ cmpb r10,r6,r0`
		8ae002	`+ cmpb r11,r5,r0`
		8ae002	`+ or r5,r10,r11`
		8ae002	`+ cmpdi cr7,r5,0`
		8ae002	`+ addi r9,r9,16`
		8ae002	`+ bne cr7,L(dword_zero)`
		8ae002	`+`
		8ae002	`+ andi. r10,r9,63`
		8ae002	`+ beq cr0,L(preloop)`
		8ae002	`+ ld r6,8(r4)`
		8ae002	`+ ldu r5,16(r4)`
		8ae002	`+ cmpb r10,r6,r0`
		8ae002	`+ cmpb r11,r5,r0`
		8ae002	`+ or r5,r10,r11`
		8ae002	`+ cmpdi cr7,r5,0`
		8ae002	`+ addi r9,r9,16`
		8ae002	`+ bne cr7,L(dword_zero)`
		8ae002	`+`
		8ae002	`+ andi. r10,r9,63`
		8ae002	`+ beq cr0,L(preloop)`
		8ae002	`+ ld r6,8(r4)`
		8ae002	`+ ldu r5,16(r4)`
		8ae002	`+ cmpb r10,r6,r0`
		8ae002	`+ cmpb r11,r5,r0`
		8ae002	`+ or r5,r10,r11`
		8ae002	`+ cmpdi cr7,r5,0`
		8ae002	`+ addi r9,r9,16`
		8ae002	`+ bne cr7,L(dword_zero)`
		8ae002	`+`
		8ae002	`+ andi. r10,r9,63`
		8ae002	`+ beq cr0,L(preloop)`
		8ae002	`+ ld r6,8(r4)`
		8ae002	`+ ldu r5,16(r4)`
		8ae002	`+ cmpb r10,r6,r0`
		8ae002	`+ cmpb r11,r5,r0`
		8ae002	`+ or r5,r10,r11`
		8ae002	`+ cmpdi cr7,r5,0`
		8ae002	`+ addi r9,r9,16`
		8ae002	`+`
		8ae002	`+ /* At this point, we are necessarily 64-byte aligned. If no zeroes were`
		8ae002	`+ found, jump to the vectorized loop. */`
		8ae002	`+ beq cr7,L(preloop)`
		8ae002	`+`
		8ae002	`+L(dword_zero):`
		8ae002	`+ /* OK, one (or both) of the doublewords contains a null byte. Check`
		8ae002	`+ the first doubleword and decrement the address in case the first`
		8ae002	`+ doubleword really contains a null byte. */`
		8ae002	`+`
		8ae002	`+ cmpdi cr6,r10,0`
		8ae002	`+ addi r4,r4,-8`
		8ae002	`+ bne cr6,L(done)`
		8ae002	`+`
		8ae002	`+ /* The null byte must be in the second doubleword. Adjust the address`
		8ae002	`+ again and move the result of cmpb to r10 so we can calculate the`
		8ae002	`+ length. */`
		8ae002	`+`
		8ae002	`+ mr r10,r11`
		8ae002	`+ addi r4,r4,8`
		8ae002	`+`
		8ae002	`+ /* If the null byte was found in the non-vectorized code, compute the`
		8ae002	`+ final length. r10 has the output of the cmpb instruction, that is,`
		8ae002	`+ it contains 0xff in the same position as the null byte in the`
		8ae002	`+ original doubleword from the string. Use that to calculate the`
		8ae002	`+ length. */`
		8ae002	`+L(done):`
		8ae002	`+#ifdef __LITTLE_ENDIAN__`
		8ae002	`+ addi r9, r10,-1 /* Form a mask from trailing zeros. */`
		8ae002	`+ andc r9, r9,r10`
		8ae002	`+ popcntd r0, r9 /* Count the bits in the mask. */`
		8ae002	`+#else`
		8ae002	`+ cntlzd r0,r10 /* Count leading zeros before the match. */`
		8ae002	`+#endif`
		8ae002	`+ subf r5,r3,r4`
		8ae002	`+ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */`
		8ae002	`+ add r3,r5,r0 /* Compute final length. */`
		8ae002	`+ blr`
		8ae002	`+`
		8ae002	`+ /* Vectorized implementation starts here. */`
		8ae002	`+ .p2align 4`
		8ae002	`+L(preloop):`
		8ae002	`+ /* Set up for the loop. */`
		8ae002	`+ mr r4,r9`
		8ae002	`+ li r7, 16 /* Load required offsets. */`
		8ae002	`+ li r8, 32`
		8ae002	`+ li r9, 48`
		8ae002	`+ li r12, 8`
		8ae002	`+ vxor v0,v0,v0 /* VR with null chars to use with`
		8ae002	`+ vcmpequb. */`
		8ae002	`+`
		8ae002	`+ /* Main loop to look for the end of the string. We will read in`
		8ae002	`+ 64-byte chunks. Align it to 32 bytes and unroll it 3 times to`
		8ae002	`+ leverage the icache performance. */`
		8ae002	`+ .p2align 5`
		8ae002	`+L(loop):`
		8ae002	`+ lvx v1,r4,r0 /* Load 4 quadwords. */`
		8ae002	`+ lvx v2,r4,r7`
		8ae002	`+ lvx v3,r4,r8`
		8ae002	`+ lvx v4,r4,r9`
		8ae002	`+ vminub v5,v1,v2 /* Compare and merge into one VR for speed. */`
		8ae002	`+ vminub v6,v3,v4`
		8ae002	`+ vminub v7,v5,v6`
		8ae002	`+ vcmpequb. v7,v7,v0 /* Check for NULLs. */`
		8ae002	`+ addi r4,r4,64 /* Adjust address for the next iteration. */`
		8ae002	`+ bne cr6,L(vmx_zero)`
		8ae002	`+`
		8ae002	`+ lvx v1,r4,r0 /* Load 4 quadwords. */`
		8ae002	`+ lvx v2,r4,r7`
		8ae002	`+ lvx v3,r4,r8`
		8ae002	`+ lvx v4,r4,r9`
		8ae002	`+ vminub v5,v1,v2 /* Compare and merge into one VR for speed. */`
		8ae002	`+ vminub v6,v3,v4`
		8ae002	`+ vminub v7,v5,v6`
		8ae002	`+ vcmpequb. v7,v7,v0 /* Check for NULLs. */`
		8ae002	`+ addi r4,r4,64 /* Adjust address for the next iteration. */`
		8ae002	`+ bne cr6,L(vmx_zero)`
		8ae002	`+`
		8ae002	`+ lvx v1,r4,r0 /* Load 4 quadwords. */`
		8ae002	`+ lvx v2,r4,r7`
		8ae002	`+ lvx v3,r4,r8`
		8ae002	`+ lvx v4,r4,r9`
		8ae002	`+ vminub v5,v1,v2 /* Compare and merge into one VR for speed. */`
		8ae002	`+ vminub v6,v3,v4`
		8ae002	`+ vminub v7,v5,v6`
		8ae002	`+ vcmpequb. v7,v7,v0 /* Check for NULLs. */`
		8ae002	`+ addi r4,r4,64 /* Adjust address for the next iteration. */`
		8ae002	`+ beq cr6,L(loop)`
		8ae002	`+`
		8ae002	`+L(vmx_zero):`
		8ae002	`+ /* OK, we found a null byte. Let's look for it in the current 64-byte`
		8ae002	`+ block and mark it in its corresponding VR. */`
		8ae002	`+ vcmpequb v1,v1,v0`
		8ae002	`+ vcmpequb v2,v2,v0`
		8ae002	`+ vcmpequb v3,v3,v0`
		8ae002	`+ vcmpequb v4,v4,v0`
		8ae002	`+`
		8ae002	`+ /* We will now 'compress' the result into a single doubleword, so it`
		8ae002	`+ can be moved to a GPR for the final calculation. First, we`
		8ae002	`+ generate an appropriate mask for vbpermq, so we can permute bits into`
		8ae002	`+ the first halfword. */`
		8ae002	`+ vspltisb v10,3`
		8ae002	`+ lvsl v11,r0,r0`
		8ae002	`+ vslb v10,v11,v10`
		8ae002	`+`
		8ae002	`+ /* Permute the first bit of each byte into bits 48-63. */`
		8ae002	`+ VBPERMQ(v1,v1,v10)`
		8ae002	`+ VBPERMQ(v2,v2,v10)`
		8ae002	`+ VBPERMQ(v3,v3,v10)`
		8ae002	`+ VBPERMQ(v4,v4,v10)`
		8ae002	`+`
		8ae002	`+ /* Shift each component into its correct position for merging. */`
		8ae002	`+#ifdef __LITTLE_ENDIAN__`
		8ae002	`+ vsldoi v2,v2,v2,2`
		8ae002	`+ vsldoi v3,v3,v3,4`
		8ae002	`+ vsldoi v4,v4,v4,6`
		8ae002	`+#else`
		8ae002	`+ vsldoi v1,v1,v1,6`
		8ae002	`+ vsldoi v2,v2,v2,4`
		8ae002	`+ vsldoi v3,v3,v3,2`
		8ae002	`+#endif`
		8ae002	`+`
		8ae002	`+ /* Merge the results and move to a GPR. */`
		8ae002	`+ vor v1,v2,v1`
		8ae002	`+ vor v2,v3,v4`
		8ae002	`+ vor v4,v1,v2`
		8ae002	`+ MFVRD(r10,v4)`
		8ae002	`+`
		8ae002	`+ /* Adjust address to the beginning of the current 64-byte block. */`
		8ae002	`+ addi r4,r4,-64`
		8ae002	`+`
		8ae002	`+#ifdef __LITTLE_ENDIAN__`
		8ae002	`+ addi r9, r10,-1 /* Form a mask from trailing zeros. */`
		8ae002	`+ andc r9, r9,r10`
		8ae002	`+ popcntd r0, r9 /* Count the bits in the mask. */`
		8ae002	`+#else`
		8ae002	`+ cntlzd r0,r10 /* Count leading zeros before the match. */`
		8ae002	`+#endif`
		8ae002	`+ subf r5,r3,r4`
		8ae002	`+ add r3,r5,r0 /* Compute final length. */`
		8ae002	`+ blr`
		8ae002	`+`
		8ae002	`+END (strlen)`
		8ae002	`+libc_hidden_builtin_def (strlen)`
		8ae002	`--`
		8ae002	`2.1.0`
		8ae002

rpms / glibc

Source Code

Blame SOURCES/glibc-rh1385004-12.patch