5de29b
# commit 664318c3eb07032e2bfcf47cb2aa3c89280c19e7
5de29b
# Author: Alan Modra <amodra@gmail.com>
5de29b
# Date:   Sat Aug 17 18:46:05 2013 +0930
5de29b
# 
5de29b
#     PowerPC LE strchr
5de29b
#     http://sourceware.org/ml/libc-alpha/2013-08/msg00101.html
5de29b
#     
5de29b
#     Adds little-endian support to optimised strchr assembly.  I've also
5de29b
#     tweaked the big-endian code a little.  In power7/strchr.S there's a
5de29b
#     check in the tail of the function that we didn't match 0 before
5de29b
#     finding a c match, done by comparing leading zero counts.  It's just
5de29b
#     as valid, and quicker, to compare the raw output from cmpb.
5de29b
#     
5de29b
#     Another little tweak is to use rldimi/insrdi in place of rlwimi for
5de29b
#     the power7 strchr functions.  Since rlwimi is cracked, it is a few
5de29b
#     cycles slower.  rldimi can be used on the 32-bit power7 functions
5de29b
#     too.
5de29b
#     
5de29b
#         * sysdeps/powerpc/powerpc64/power7/strchr.S (strchr): Add little-endian
5de29b
#         support.  Correct typos, formatting.  Optimize tail.  Use insrdi
5de29b
#         rather than rlwimi.
5de29b
#         * sysdeps/powerpc/powerpc32/power7/strchr.S: Likewise.
5de29b
#         * sysdeps/powerpc/powerpc64/power7/strchrnul.S (__strchrnul): Add
5de29b
#         little-endian support.  Correct typos.
5de29b
#         * sysdeps/powerpc/powerpc32/power7/strchrnul.S: Likewise.  Use insrdi
5de29b
#         rather than rlwimi.
5de29b
#         * sysdeps/powerpc/powerpc64/strchr.S (rTMP4, rTMP5): Define.  Use
5de29b
#         in loop and entry code to keep "and." results.
5de29b
#         (strchr): Add little-endian support.  Comment.  Move cntlzd
5de29b
#         earlier in tail.
5de29b
#         * sysdeps/powerpc/powerpc32/strchr.S: Likewise.
5de29b
# 
12745e
Index: glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strchr.S
12745e
===================================================================
12745e
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strchr.S.orig
12745e
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strchr.S
12745e
@@ -37,8 +37,8 @@ ENTRY (BP_SYM(strchr))
5de29b
 	beq	cr7,L(null_match)
5de29b
 
5de29b
 	/* Replicate byte to word.  */
5de29b
-	rlwimi	r4,r4,8,16,23
5de29b
-	rlwimi	r4,r4,16,0,15
5de29b
+	insrdi	r4,r4,8,48
5de29b
+	insrdi	r4,r4,16,32
5de29b
 
5de29b
 	/* Now r4 has a word of c bytes and r0 has
5de29b
 	   a word of null bytes.  */
12745e
@@ -48,11 +48,17 @@ ENTRY (BP_SYM(strchr))
5de29b
 
5de29b
 	/* Move the words left and right to discard the bits that are
5de29b
 	   not part of the string and to bring them back as zeros.  */
5de29b
-
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	srw	r10,r10,r6
5de29b
+	srw	r11,r11,r6
5de29b
+	slw	r10,r10,r6
5de29b
+	slw	r11,r11,r6
5de29b
+#else
5de29b
 	slw	r10,r10,r6
5de29b
 	slw	r11,r11,r6
5de29b
 	srw	r10,r10,r6
5de29b
 	srw	r11,r11,r6
5de29b
+#endif
5de29b
 	or	r5,r10,r11    /* OR the results to speed things up.  */
5de29b
 	cmpwi	cr7,r5,0      /* If r5 == 0, no c or null bytes
5de29b
 				 have been found.  */
12745e
@@ -67,7 +73,7 @@ ENTRY (BP_SYM(strchr))
5de29b
 
5de29b
 	/* Handle WORD2 of pair.  */
5de29b
 	lwzu	r12,4(r8)
5de29b
-	cmpb    r10,r12,r4
5de29b
+	cmpb	r10,r12,r4
5de29b
 	cmpb	r11,r12,r0
5de29b
 	or	r5,r10,r11
5de29b
 	cmpwi	cr7,r5,0
12745e
@@ -102,22 +108,31 @@ L(loop):
5de29b
 	bne	cr6,L(done)
5de29b
 
5de29b
 	/* The c/null byte must be in the second word.  Adjust the address
5de29b
-	   again and move the result of cmpb to r10 so we can calculate the
5de29b
-	   pointer.  */
5de29b
+	   again and move the result of cmpb to r10/r11 so we can calculate
5de29b
+	   the pointer.  */
5de29b
 
5de29b
 	mr	r10,r6
5de29b
 	mr	r11,r7
5de29b
 	addi	r8,r8,4
5de29b
 
5de29b
-	/* r5 has the output of the cmpb instruction, that is, it contains
5de29b
+	/* r10/r11 have the output of the cmpb instructions, that is,
5de29b
 	   0xff in the same position as the c/null byte in the original
5de29b
 	   word from the string.  Use that to calculate the pointer.  */
5de29b
 L(done):
5de29b
-	cntlzw	r4,r10	      /* Count leading zeroes before c matches.  */
5de29b
-	cntlzw	r0,r11	      /* Count leading zeroes before null matches.  */
5de29b
-	cmplw	cr7,r4,r0
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	addi    r3,r10,-1
5de29b
+	andc    r3,r3,r10
5de29b
+	popcntw	r0,r3
5de29b
+	addi    r4,r11,-1
5de29b
+	andc    r4,r4,r11
5de29b
+	cmplw	cr7,r3,r4
5de29b
+	bgt	cr7,L(no_match)
5de29b
+#else
5de29b
+	cntlzw	r0,r10	      /* Count leading zeros before c matches.  */
5de29b
+	cmplw	cr7,r11,r10
5de29b
 	bgt	cr7,L(no_match)
5de29b
-	srwi	r0,r4,3	      /* Convert leading zeroes to bytes.  */
5de29b
+#endif
5de29b
+	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
5de29b
 	add	r3,r8,r0      /* Return address of the matching c byte
5de29b
 				 or null in case c was not found.  */
5de29b
 	blr
12745e
@@ -135,10 +150,14 @@ L(null_match):
5de29b
 	cmpb	r5,r12,r0     /* Compare each byte against null bytes.  */
5de29b
 
5de29b
 	/* Move the words left and right to discard the bits that are
5de29b
-	   not part of the string and to bring them back as zeros.  */
5de29b
-
5de29b
+	   not part of the string and bring them back as zeros.  */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	srw	r5,r5,r6
5de29b
+	slw	r5,r5,r6
5de29b
+#else
5de29b
 	slw	r5,r5,r6
5de29b
 	srw	r5,r5,r6
5de29b
+#endif
5de29b
 	cmpwi	cr7,r5,0      /* If r10 == 0, no c or null bytes
5de29b
 				 have been found.  */
5de29b
 	bne	cr7,L(done_null)
12745e
@@ -193,7 +212,13 @@ L(loop_null):
5de29b
 	   0xff in the same position as the null byte in the original
5de29b
 	   word from the string.  Use that to calculate the pointer.  */
5de29b
 L(done_null):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	addi    r0,r5,-1
5de29b
+	andc    r0,r0,r5
5de29b
+	popcntw	r0,r0
5de29b
+#else
5de29b
 	cntlzw	r0,r5	      /* Count leading zeros before the match.  */
5de29b
+#endif
5de29b
 	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
5de29b
 	add	r3,r8,r0      /* Return address of the matching null byte.  */
5de29b
 	blr
12745e
Index: glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strchrnul.S
12745e
===================================================================
12745e
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strchrnul.S.orig
12745e
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strchrnul.S
12745e
@@ -29,8 +29,8 @@ ENTRY (BP_SYM(__strchrnul))
5de29b
 	clrrwi	r8,r3,2	      /* Align the address to word boundary.  */
5de29b
 
5de29b
 	/* Replicate byte to word.  */
5de29b
-	rlwimi	r4,r4,8,16,23
5de29b
-	rlwimi	r4,r4,16,0,15
5de29b
+	insrdi	r4,r4,8,48
5de29b
+	insrdi	r4,r4,16,32
5de29b
 
5de29b
 	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
5de29b
 	lwz	r12,0(r8)     /* Load word from memory.  */
12745e
@@ -45,10 +45,17 @@ ENTRY (BP_SYM(__strchrnul))
5de29b
 
5de29b
 	/* Move the words left and right to discard the bits that are
5de29b
 	   not part of the string and bring them back as zeros.  */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	srw	r10,r10,r6
5de29b
+	srw	r9,r9,r6
5de29b
+	slw	r10,r10,r6
5de29b
+	slw	r9,r9,r6
5de29b
+#else
5de29b
 	slw	r10,r10,r6
5de29b
 	slw	r9,r9,r6
5de29b
 	srw	r10,r10,r6
5de29b
 	srw	r9,r9,r6
5de29b
+#endif
5de29b
 	or	r5,r9,r10     /* OR the results to speed things up.  */
5de29b
 	cmpwi	cr7,r5,0      /* If r5 == 0, no c or null bytes
5de29b
 				 have been found.  */
12745e
@@ -56,7 +63,7 @@ ENTRY (BP_SYM(__strchrnul))
5de29b
 
5de29b
 	mtcrf   0x01,r8
5de29b
 
5de29b
-	/* Are we now aligned to a quadword boundary?  If so, skip to
5de29b
+	/* Are we now aligned to a doubleword boundary?  If so, skip to
5de29b
 	   the main loop.  Otherwise, go through the alignment code.  */
5de29b
 
5de29b
 	bt	29,L(loop)
12745e
@@ -78,7 +85,7 @@ L(loop):
5de29b
 	   single register for speed.  This is an attempt
5de29b
 	   to speed up the null-checking process for bigger strings.  */
5de29b
 	lwz	r12,4(r8)
5de29b
-	lwzu     r11,8(r8)
5de29b
+	lwzu	r11,8(r8)
5de29b
 	cmpb	r10,r12,r0
5de29b
 	cmpb	r9,r12,r4
5de29b
 	cmpb	r6,r11,r0
12745e
@@ -97,9 +104,9 @@ L(loop):
5de29b
 	addi	r8,r8,-4
5de29b
 	bne	cr6,L(done)
5de29b
 
5de29b
-	/* The c/null byte must be in the second word.  Adjust the
5de29b
-	   address again and move the result of cmpb to r10 so we can calculate
5de29b
-	   the pointer.  */
5de29b
+	/* The c/null byte must be in the second word.  Adjust the address
5de29b
+	   again and move the result of cmpb to r5 so we can calculate the
5de29b
+	   pointer.  */
5de29b
 	mr	r5,r10
5de29b
 	addi	r8,r8,4
5de29b
 
12745e
@@ -107,7 +114,13 @@ L(loop):
5de29b
 	   0xff in the same position as the c/null byte in the original
5de29b
 	   word from the string.  Use that to calculate the pointer.  */
5de29b
 L(done):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	addi    r0,r5,-1
5de29b
+	andc    r0,r0,r5
5de29b
+	popcntw	r0,r0
5de29b
+#else
5de29b
 	cntlzw	r0,r5	      /* Count leading zeros before the match.  */
5de29b
+#endif
5de29b
 	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
5de29b
 	add	r3,r8,r0      /* Return address of matching c/null byte.  */
5de29b
 	blr
12745e
Index: glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strchr.S
12745e
===================================================================
12745e
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strchr.S.orig
12745e
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strchr.S
12745e
@@ -44,6 +44,8 @@ ENTRY (BP_SYM (strchr))
5de29b
 #define rIGN	r10	/* number of bits we should ignore in the first word */
5de29b
 #define rMASK	r11	/* mask with the bits to ignore set to 0 */
5de29b
 #define rTMP3	r12
5de29b
+#define rTMP4	rIGN
5de29b
+#define rTMP5	rMASK
5de29b
 
5de29b
 	CHECK_BOUNDS_LOW (rSTR, rTMP1, rTMP2)
5de29b
 	STORE_RETURN_BOUNDS (rTMP1, rTMP2)
12745e
@@ -59,53 +61,74 @@ ENTRY (BP_SYM (strchr))
5de29b
 	addi	r7F7F, r7F7F, 0x7f7f
5de29b
 /* Test the first (partial?) word.  */
5de29b
 	lwz	rWORD, 0(rSTR)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	slw	rMASK, rMASK, rIGN
5de29b
+#else
5de29b
 	srw	rMASK, rMASK, rIGN
5de29b
+#endif
5de29b
 	orc	rWORD, rWORD, rMASK
5de29b
 	add	rTMP1, rFEFE, rWORD
5de29b
 	nor	rTMP2, r7F7F, rWORD
5de29b
-	and.	rTMP1, rTMP1, rTMP2
5de29b
+	and.	rTMP4, rTMP1, rTMP2
5de29b
 	xor	rTMP3, rCHR, rWORD
5de29b
 	orc	rTMP3, rTMP3, rMASK
5de29b
 	b	L(loopentry)
5de29b
 
5de29b
 /* The loop.  */
5de29b
 
5de29b
-L(loop):lwzu rWORD, 4(rSTR)
5de29b
-	and.	rTMP1, rTMP1, rTMP2
5de29b
+L(loop):
5de29b
+	lwzu	rWORD, 4(rSTR)
5de29b
+	and.	rTMP5, rTMP1, rTMP2
5de29b
 /* Test for 0.	*/
5de29b
-	add	rTMP1, rFEFE, rWORD
5de29b
-	nor	rTMP2, r7F7F, rWORD
5de29b
+	add	rTMP1, rFEFE, rWORD /* x - 0x01010101.  */
5de29b
+	nor	rTMP2, r7F7F, rWORD /* ~(x | 0x7f7f7f7f) == ~x & 0x80808080.  */
5de29b
 	bne	L(foundit)
5de29b
-	and.	rTMP1, rTMP1, rTMP2
5de29b
+	and.	rTMP4, rTMP1, rTMP2 /* (x - 0x01010101) & ~x & 0x80808080.  */
5de29b
 /* Start test for the bytes we're looking for.  */
5de29b
 	xor	rTMP3, rCHR, rWORD
5de29b
 L(loopentry):
5de29b
 	add	rTMP1, rFEFE, rTMP3
5de29b
 	nor	rTMP2, r7F7F, rTMP3
5de29b
 	beq	L(loop)
5de29b
+
5de29b
 /* There is a zero byte in the word, but may also be a matching byte (either
5de29b
    before or after the zero byte).  In fact, we may be looking for a
5de29b
-   zero byte, in which case we return a match.  We guess that this hasn't
5de29b
-   happened, though.  */
5de29b
-L(missed):
5de29b
-	and.	rTMP1, rTMP1, rTMP2
5de29b
+   zero byte, in which case we return a match.  */
5de29b
+	and.	rTMP5, rTMP1, rTMP2
5de29b
 	li	rRTN, 0
5de29b
 	STORE_RETURN_VALUE (rSTR)
5de29b
 	beqlr
5de29b
-/* It did happen. Decide which one was first...
5de29b
-   I'm not sure if this is actually faster than a sequence of
5de29b
-   rotates, compares, and branches (we use it anyway because it's shorter).  */
5de29b
+/* At this point:
5de29b
+   rTMP5 bytes are 0x80 for each match of c, 0 otherwise.
5de29b
+   rTMP4 bytes are 0x80 for each match of 0, 0 otherwise.
5de29b
+   But there may be false matches in the next most significant byte from
5de29b
+   a true match due to carries.  This means we need to recalculate the
5de29b
+   matches using a longer method for big-endian.  */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	addi	rTMP1, rTMP5, -1
5de29b
+	andc	rTMP1, rTMP1, rTMP5
5de29b
+	cntlzw	rCLZB, rTMP1
5de29b
+	addi	rTMP2, rTMP4, -1
5de29b
+	andc	rTMP2, rTMP2, rTMP4
5de29b
+	cmplw	rTMP1, rTMP2
5de29b
+	bgtlr
5de29b
+	subfic	rCLZB, rCLZB, 32-7
5de29b
+#else
5de29b
+/* I think we could reduce this by two instructions by keeping the "nor"
5de29b
+   results from the loop for reuse here.  See strlen.S tail.  Similarly
5de29b
+   one instruction could be pruned from L(foundit).  */
5de29b
 	and	rFEFE, r7F7F, rWORD
5de29b
-	or	rMASK, r7F7F, rWORD
5de29b
+	or	rTMP5, r7F7F, rWORD
5de29b
 	and	rTMP1, r7F7F, rTMP3
5de29b
-	or	rIGN, r7F7F, rTMP3
5de29b
+	or	rTMP4, r7F7F, rTMP3
5de29b
 	add	rFEFE, rFEFE, r7F7F
5de29b
 	add	rTMP1, rTMP1, r7F7F
5de29b
-	nor	rWORD, rMASK, rFEFE
5de29b
-	nor	rTMP2, rIGN, rTMP1
5de29b
+	nor	rWORD, rTMP5, rFEFE
5de29b
+	nor	rTMP2, rTMP4, rTMP1
5de29b
+	cntlzw	rCLZB, rTMP2
5de29b
 	cmplw	rWORD, rTMP2
5de29b
 	bgtlr
5de29b
-	cntlzw	rCLZB, rTMP2
5de29b
+#endif
5de29b
 	srwi	rCLZB, rCLZB, 3
5de29b
 	add	rRTN, rSTR, rCLZB
5de29b
 	CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, twlge)
12745e
@@ -113,13 +136,21 @@ L(missed):
5de29b
 	blr
5de29b
 
5de29b
 L(foundit):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	addi	rTMP1, rTMP5, -1
5de29b
+	andc	rTMP1, rTMP1, rTMP5
5de29b
+	cntlzw	rCLZB, rTMP1
5de29b
+	subfic	rCLZB, rCLZB, 32-7-32
5de29b
+	srawi	rCLZB, rCLZB, 3
5de29b
+#else
5de29b
 	and	rTMP1, r7F7F, rTMP3
5de29b
-	or	rIGN, r7F7F, rTMP3
5de29b
+	or	rTMP4, r7F7F, rTMP3
5de29b
 	add	rTMP1, rTMP1, r7F7F
5de29b
-	nor	rTMP2, rIGN, rTMP1
5de29b
+	nor	rTMP2, rTMP4, rTMP1
5de29b
 	cntlzw	rCLZB, rTMP2
5de29b
 	subi	rSTR, rSTR, 4
5de29b
 	srwi	rCLZB, rCLZB, 3
5de29b
+#endif
5de29b
 	add	rRTN, rSTR, rCLZB
5de29b
 	CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, twlge)
5de29b
 	STORE_RETURN_VALUE (rSTR)
12745e
Index: glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strchr.S
12745e
===================================================================
12745e
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strchr.S.orig
12745e
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strchr.S
12745e
@@ -37,8 +37,8 @@ ENTRY (BP_SYM(strchr))
5de29b
 	beq	cr7,L(null_match)
5de29b
 
5de29b
 	/* Replicate byte to doubleword.  */
5de29b
-	rlwimi	r4,r4,8,16,23
5de29b
-	rlwimi	r4,r4,16,0,15
5de29b
+	insrdi	r4,r4,8,48
5de29b
+	insrdi	r4,r4,16,32
5de29b
 	insrdi  r4,r4,32,0
5de29b
 
5de29b
 	/* Now r4 has a doubleword of c bytes and r0 has
12745e
@@ -49,11 +49,17 @@ ENTRY (BP_SYM(strchr))
5de29b
 
5de29b
 	/* Move the doublewords left and right to discard the bits that are
5de29b
 	   not part of the string and bring them back as zeros.  */
5de29b
-
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	srd	r10,r10,r6
5de29b
+	srd	r11,r11,r6
5de29b
+	sld	r10,r10,r6
5de29b
+	sld	r11,r11,r6
5de29b
+#else
5de29b
 	sld	r10,r10,r6
5de29b
 	sld	r11,r11,r6
5de29b
 	srd	r10,r10,r6
5de29b
 	srd	r11,r11,r6
5de29b
+#endif
5de29b
 	or	r5,r10,r11    /* OR the results to speed things up.  */
5de29b
 	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
5de29b
 				 have been found.  */
12745e
@@ -110,15 +116,24 @@ L(loop):
5de29b
 	mr	r11,r7
5de29b
 	addi	r8,r8,8
5de29b
 
5de29b
-	/* r5 has the output of the cmpb instruction, that is, it contains
5de29b
+	/* r10/r11 have the output of the cmpb instructions, that is,
5de29b
 	   0xff in the same position as the c/null byte in the original
5de29b
 	   doubleword from the string.  Use that to calculate the pointer.  */
5de29b
 L(done):
5de29b
-	cntlzd	r4,r10	      /* Count leading zeroes before c matches.  */
5de29b
-	cntlzd	r0,r11	      /* Count leading zeroes before null matches.  */
5de29b
-	cmpld	cr7,r4,r0
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	addi    r3,r10,-1
5de29b
+	andc    r3,r3,r10
5de29b
+	popcntd	r0,r3
5de29b
+	addi    r4,r11,-1
5de29b
+	andc    r4,r4,r11
5de29b
+	cmpld	cr7,r3,r4
5de29b
 	bgt	cr7,L(no_match)
5de29b
-	srdi	r0,r4,3	      /* Convert leading zeroes to bytes.  */
5de29b
+#else
5de29b
+	cntlzd	r0,r10	      /* Count leading zeros before c matches.  */
5de29b
+	cmpld	cr7,r11,r10
5de29b
+	bgt	cr7,L(no_match)
5de29b
+#endif
5de29b
+	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
5de29b
 	add	r3,r8,r0      /* Return address of the matching c byte
5de29b
 				 or null in case c was not found.  */
5de29b
 	blr
12745e
@@ -137,9 +152,13 @@ L(null_match):
5de29b
 
5de29b
 	/* Move the doublewords left and right to discard the bits that are
5de29b
 	   not part of the string and bring them back as zeros.  */
5de29b
-
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	srd	r5,r5,r6
5de29b
+	sld	r5,r5,r6
5de29b
+#else
5de29b
 	sld	r5,r5,r6
5de29b
 	srd	r5,r5,r6
5de29b
+#endif
5de29b
 	cmpdi	cr7,r5,0      /* If r10 == 0, no c or null bytes
5de29b
 				 have been found.  */
5de29b
 	bne	cr7,L(done_null)
12745e
@@ -194,7 +213,13 @@ L(loop_null):
5de29b
 	   0xff in the same position as the null byte in the original
5de29b
 	   doubleword from the string.  Use that to calculate the pointer.  */
5de29b
 L(done_null):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	addi    r0,r5,-1
5de29b
+	andc    r0,r0,r5
5de29b
+	popcntd	r0,r0
5de29b
+#else
5de29b
 	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
5de29b
+#endif
5de29b
 	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
5de29b
 	add	r3,r8,r0      /* Return address of the matching null byte.  */
5de29b
 	blr
12745e
Index: glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strchrnul.S
12745e
===================================================================
12745e
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strchrnul.S.orig
12745e
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strchrnul.S
12745e
@@ -29,8 +29,8 @@ ENTRY (BP_SYM(__strchrnul))
5de29b
 	clrrdi	r8,r3,3	      /* Align the address to doubleword boundary.  */
5de29b
 
5de29b
 	/* Replicate byte to doubleword.  */
5de29b
-	rlwimi	r4,r4,8,16,23
5de29b
-	rlwimi	r4,r4,16,0,15
5de29b
+	insrdi	r4,r4,8,48
5de29b
+	insrdi	r4,r4,16,32
5de29b
 	insrdi	r4,r4,32,0
5de29b
 
5de29b
 	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
12745e
@@ -46,10 +46,17 @@ ENTRY (BP_SYM(__strchrnul))
5de29b
 
5de29b
 	/* Move the doublewords left and right to discard the bits that are
5de29b
 	   not part of the string and to bring them back as zeros.  */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	srd	r10,r10,r6
5de29b
+	srd	r9,r9,r6
5de29b
+	sld	r10,r10,r6
5de29b
+	sld	r9,r9,r6
5de29b
+#else
5de29b
 	sld	r10,r10,r6
5de29b
 	sld	r9,r9,r6
5de29b
 	srd	r10,r10,r6
5de29b
 	srd	r9,r9,r6
5de29b
+#endif
5de29b
 	or	r5,r9,r10     /* OR the results to speed things up.  */
5de29b
 	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
5de29b
 				 have been found.  */
12745e
@@ -99,7 +106,7 @@ L(loop):
5de29b
 	bne	cr6,L(done)
5de29b
 
5de29b
 	/* The c/null byte must be in the second doubleword.  Adjust the
5de29b
-	   address again and move the result of cmpb to r10 so we can calculate
5de29b
+	   address again and move the result of cmpb to r5 so we can calculate
5de29b
 	   the pointer.  */
5de29b
 	mr	r5,r10
5de29b
 	addi	r8,r8,8
12745e
@@ -108,7 +115,13 @@ L(loop):
5de29b
 	   0xff in the same position as the c/null byte in the original
5de29b
 	   doubleword from the string.  Use that to calculate the pointer.  */
5de29b
 L(done):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	addi    r0,r5,-1
5de29b
+	andc    r0,r0,r5
5de29b
+	popcntd	r0,r0
5de29b
+#else
5de29b
 	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
5de29b
+#endif
5de29b
 	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
5de29b
 	add	r3,r8,r0      /* Return address of matching c/null byte.  */
5de29b
 	blr
12745e
Index: glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strchr.S
12745e
===================================================================
12745e
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strchr.S.orig
12745e
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strchr.S
12745e
@@ -50,14 +50,16 @@ ENTRY (BP_SYM (strchr))
5de29b
 #define rIGN	r10	/* number of bits we should ignore in the first word */
5de29b
 #define rMASK	r11	/* mask with the bits to ignore set to 0 */
5de29b
 #define rTMP3	r12
5de29b
+#define rTMP4	rIGN
5de29b
+#define rTMP5	rMASK
5de29b
 
5de29b
 	CHECK_BOUNDS_LOW (rSTR, rTMP1, rTMP2)
5de29b
 	STORE_RETURN_BOUNDS (rTMP1, rTMP2)
5de29b
 
5de29b
 	dcbt	0,rRTN
5de29b
-	rlwimi	rCHR, rCHR, 8, 16, 23
5de29b
+	insrdi	rCHR, rCHR, 8, 48
5de29b
 	li	rMASK, -1
5de29b
-	rlwimi	rCHR, rCHR, 16, 0, 15
5de29b
+	insrdi	rCHR, rCHR, 16, 32
5de29b
 	rlwinm	rIGN, rRTN, 3, 26, 28
5de29b
 	insrdi	rCHR, rCHR, 32, 0
5de29b
 	lis	rFEFE, -0x101
12745e
@@ -70,53 +72,74 @@ ENTRY (BP_SYM (strchr))
5de29b
 	add	rFEFE, rFEFE, rTMP1
5de29b
 /* Test the first (partial?) word.  */
5de29b
 	ld	rWORD, 0(rSTR)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	sld	rMASK, rMASK, rIGN
5de29b
+#else
5de29b
 	srd	rMASK, rMASK, rIGN
5de29b
+#endif
5de29b
 	orc	rWORD, rWORD, rMASK
5de29b
 	add	rTMP1, rFEFE, rWORD
5de29b
 	nor	rTMP2, r7F7F, rWORD
5de29b
-	and.	rTMP1, rTMP1, rTMP2
5de29b
+	and.	rTMP4, rTMP1, rTMP2
5de29b
 	xor	rTMP3, rCHR, rWORD
5de29b
 	orc	rTMP3, rTMP3, rMASK
5de29b
 	b	L(loopentry)
5de29b
 
5de29b
 /* The loop.  */
5de29b
 
5de29b
-L(loop):ldu rWORD, 8(rSTR)
5de29b
-	and.	rTMP1, rTMP1, rTMP2
5de29b
+L(loop):
5de29b
+	ldu	rWORD, 8(rSTR)
5de29b
+	and.	rTMP5, rTMP1, rTMP2
5de29b
 /* Test for 0.	*/
5de29b
-	add	rTMP1, rFEFE, rWORD
5de29b
-	nor	rTMP2, r7F7F, rWORD
5de29b
+	add	rTMP1, rFEFE, rWORD /* x - 0x01010101.  */
5de29b
+	nor	rTMP2, r7F7F, rWORD /* ~(x | 0x7f7f7f7f) == ~x & 0x80808080.  */
5de29b
 	bne	L(foundit)
5de29b
-	and.	rTMP1, rTMP1, rTMP2
5de29b
+	and.	rTMP4, rTMP1, rTMP2 /* (x - 0x01010101) & ~x & 0x80808080.  */
5de29b
 /* Start test for the bytes we're looking for.  */
5de29b
 	xor	rTMP3, rCHR, rWORD
5de29b
 L(loopentry):
5de29b
 	add	rTMP1, rFEFE, rTMP3
5de29b
 	nor	rTMP2, r7F7F, rTMP3
5de29b
 	beq	L(loop)
5de29b
+
5de29b
 /* There is a zero byte in the word, but may also be a matching byte (either
5de29b
    before or after the zero byte).  In fact, we may be looking for a
5de29b
-   zero byte, in which case we return a match.  We guess that this hasn't
5de29b
-   happened, though.  */
5de29b
-L(missed):
5de29b
-	and.	rTMP1, rTMP1, rTMP2
5de29b
+   zero byte, in which case we return a match.  */
5de29b
+	and.	rTMP5, rTMP1, rTMP2
5de29b
 	li	rRTN, 0
5de29b
 	STORE_RETURN_VALUE (rSTR)
5de29b
 	beqlr
5de29b
-/* It did happen. Decide which one was first...
5de29b
-   I'm not sure if this is actually faster than a sequence of
5de29b
-   rotates, compares, and branches (we use it anyway because it's shorter).  */
5de29b
+/* At this point:
5de29b
+   rTMP5 bytes are 0x80 for each match of c, 0 otherwise.
5de29b
+   rTMP4 bytes are 0x80 for each match of 0, 0 otherwise.
5de29b
+   But there may be false matches in the next most significant byte from
5de29b
+   a true match due to carries.  This means we need to recalculate the
5de29b
+   matches using a longer method for big-endian.  */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	addi	rTMP1, rTMP5, -1
5de29b
+	andc	rTMP1, rTMP1, rTMP5
5de29b
+	cntlzd	rCLZB, rTMP1
5de29b
+	addi	rTMP2, rTMP4, -1
5de29b
+	andc	rTMP2, rTMP2, rTMP4
5de29b
+	cmpld	rTMP1, rTMP2
5de29b
+	bgtlr
5de29b
+	subfic	rCLZB, rCLZB, 64-7
5de29b
+#else
5de29b
+/* I think we could reduce this by two instructions by keeping the "nor"
5de29b
+   results from the loop for reuse here.  See strlen.S tail.  Similarly
5de29b
+   one instruction could be pruned from L(foundit).  */
5de29b
 	and	rFEFE, r7F7F, rWORD
5de29b
-	or	rMASK, r7F7F, rWORD
5de29b
+	or	rTMP5, r7F7F, rWORD
5de29b
 	and	rTMP1, r7F7F, rTMP3
5de29b
-	or	rIGN, r7F7F, rTMP3
5de29b
+	or	rTMP4, r7F7F, rTMP3
5de29b
 	add	rFEFE, rFEFE, r7F7F
5de29b
 	add	rTMP1, rTMP1, r7F7F
5de29b
-	nor	rWORD, rMASK, rFEFE
5de29b
-	nor	rTMP2, rIGN, rTMP1
5de29b
+	nor	rWORD, rTMP5, rFEFE
5de29b
+	nor	rTMP2, rTMP4, rTMP1
5de29b
+	cntlzd	rCLZB, rTMP2
5de29b
 	cmpld	rWORD, rTMP2
5de29b
 	bgtlr
5de29b
-	cntlzd	rCLZB, rTMP2
5de29b
+#endif
5de29b
 	srdi	rCLZB, rCLZB, 3
5de29b
 	add	rRTN, rSTR, rCLZB
5de29b
 	CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, tdlge)
12745e
@@ -124,13 +147,21 @@ L(missed):
5de29b
 	blr
5de29b
 
5de29b
 L(foundit):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	addi	rTMP1, rTMP5, -1
5de29b
+	andc	rTMP1, rTMP1, rTMP5
5de29b
+	cntlzd	rCLZB, rTMP1
5de29b
+	subfic	rCLZB, rCLZB, 64-7-64
5de29b
+	sradi	rCLZB, rCLZB, 3
5de29b
+#else
5de29b
 	and	rTMP1, r7F7F, rTMP3
5de29b
-	or	rIGN, r7F7F, rTMP3
5de29b
+	or	rTMP4, r7F7F, rTMP3
5de29b
 	add	rTMP1, rTMP1, r7F7F
5de29b
-	nor	rTMP2, rIGN, rTMP1
5de29b
+	nor	rTMP2, rTMP4, rTMP1
5de29b
 	cntlzd	rCLZB, rTMP2
5de29b
 	subi	rSTR, rSTR, 8
5de29b
 	srdi	rCLZB, rCLZB, 3
5de29b
+#endif
5de29b
 	add	rRTN, rSTR, rCLZB
5de29b
 	CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, tdlge)
5de29b
 	STORE_RETURN_VALUE (rSTR)