ce426f
# commit 664318c3eb07032e2bfcf47cb2aa3c89280c19e7
ce426f
# Author: Alan Modra <amodra@gmail.com>
ce426f
# Date:   Sat Aug 17 18:46:05 2013 +0930
ce426f
# 
ce426f
#     PowerPC LE strchr
ce426f
#     http://sourceware.org/ml/libc-alpha/2013-08/msg00101.html
ce426f
#     
ce426f
#     Adds little-endian support to optimised strchr assembly.  I've also
ce426f
#     tweaked the big-endian code a little.  In power7/strchr.S there's a
ce426f
#     check in the tail of the function that we didn't match 0 before
ce426f
#     finding a c match, done by comparing leading zero counts.  It's just
ce426f
#     as valid, and quicker, to compare the raw output from cmpb.
ce426f
#     
ce426f
#     Another little tweak is to use rldimi/insrdi in place of rlwimi for
ce426f
#     the power7 strchr functions.  Since rlwimi is cracked, it is a few
ce426f
#     cycles slower.  rldimi can be used on the 32-bit power7 functions
ce426f
#     too.
ce426f
#     
ce426f
#         * sysdeps/powerpc/powerpc64/power7/strchr.S (strchr): Add little-endian
ce426f
#         support.  Correct typos, formatting.  Optimize tail.  Use insrdi
ce426f
#         rather than rlwimi.
ce426f
#         * sysdeps/powerpc/powerpc32/power7/strchr.S: Likewise.
ce426f
#         * sysdeps/powerpc/powerpc64/power7/strchrnul.S (__strchrnul): Add
ce426f
#         little-endian support.  Correct typos.
ce426f
#         * sysdeps/powerpc/powerpc32/power7/strchrnul.S: Likewise.  Use insrdi
ce426f
#         rather than rlwimi.
ce426f
#         * sysdeps/powerpc/powerpc64/strchr.S (rTMP4, rTMP5): Define.  Use
ce426f
#         in loop and entry code to keep "and." results.
ce426f
#         (strchr): Add little-endian support.  Comment.  Move cntlzd
ce426f
#         earlier in tail.
ce426f
#         * sysdeps/powerpc/powerpc32/strchr.S: Likewise.
ce426f
# 
ce426f
Index: glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strchr.S
ce426f
===================================================================
ce426f
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strchr.S.orig
ce426f
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strchr.S
ce426f
@@ -37,8 +37,8 @@ ENTRY (BP_SYM(strchr))
ce426f
 	beq	cr7,L(null_match)
ce426f
 
ce426f
 	/* Replicate byte to word.  */
ce426f
-	rlwimi	r4,r4,8,16,23
ce426f
-	rlwimi	r4,r4,16,0,15
ce426f
+	insrdi	r4,r4,8,48
ce426f
+	insrdi	r4,r4,16,32
ce426f
 
ce426f
 	/* Now r4 has a word of c bytes and r0 has
ce426f
 	   a word of null bytes.  */
ce426f
@@ -48,11 +48,17 @@ ENTRY (BP_SYM(strchr))
ce426f
 
ce426f
 	/* Move the words left and right to discard the bits that are
ce426f
 	   not part of the string and to bring them back as zeros.  */
ce426f
-
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	srw	r10,r10,r6
ce426f
+	srw	r11,r11,r6
ce426f
+	slw	r10,r10,r6
ce426f
+	slw	r11,r11,r6
ce426f
+#else
ce426f
 	slw	r10,r10,r6
ce426f
 	slw	r11,r11,r6
ce426f
 	srw	r10,r10,r6
ce426f
 	srw	r11,r11,r6
ce426f
+#endif
ce426f
 	or	r5,r10,r11    /* OR the results to speed things up.  */
ce426f
 	cmpwi	cr7,r5,0      /* If r5 == 0, no c or null bytes
ce426f
 				 have been found.  */
ce426f
@@ -67,7 +73,7 @@ ENTRY (BP_SYM(strchr))
ce426f
 
ce426f
 	/* Handle WORD2 of pair.  */
ce426f
 	lwzu	r12,4(r8)
ce426f
-	cmpb    r10,r12,r4
ce426f
+	cmpb	r10,r12,r4
ce426f
 	cmpb	r11,r12,r0
ce426f
 	or	r5,r10,r11
ce426f
 	cmpwi	cr7,r5,0
ce426f
@@ -102,22 +108,31 @@ L(loop):
ce426f
 	bne	cr6,L(done)
ce426f
 
ce426f
 	/* The c/null byte must be in the second word.  Adjust the address
ce426f
-	   again and move the result of cmpb to r10 so we can calculate the
ce426f
-	   pointer.  */
ce426f
+	   again and move the result of cmpb to r10/r11 so we can calculate
ce426f
+	   the pointer.  */
ce426f
 
ce426f
 	mr	r10,r6
ce426f
 	mr	r11,r7
ce426f
 	addi	r8,r8,4
ce426f
 
ce426f
-	/* r5 has the output of the cmpb instruction, that is, it contains
ce426f
+	/* r10/r11 have the output of the cmpb instructions, that is,
ce426f
 	   0xff in the same position as the c/null byte in the original
ce426f
 	   word from the string.  Use that to calculate the pointer.  */
ce426f
 L(done):
ce426f
-	cntlzw	r4,r10	      /* Count leading zeroes before c matches.  */
ce426f
-	cntlzw	r0,r11	      /* Count leading zeroes before null matches.  */
ce426f
-	cmplw	cr7,r4,r0
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	addi    r3,r10,-1
ce426f
+	andc    r3,r3,r10
ce426f
+	popcntw	r0,r3
ce426f
+	addi    r4,r11,-1
ce426f
+	andc    r4,r4,r11
ce426f
+	cmplw	cr7,r3,r4
ce426f
+	bgt	cr7,L(no_match)
ce426f
+#else
ce426f
+	cntlzw	r0,r10	      /* Count leading zeros before c matches.  */
ce426f
+	cmplw	cr7,r11,r10
ce426f
 	bgt	cr7,L(no_match)
ce426f
-	srwi	r0,r4,3	      /* Convert leading zeroes to bytes.  */
ce426f
+#endif
ce426f
+	srwi	r0,r0,3	      /* Convert the zero-bit count to bytes.  */
ce426f
 	add	r3,r8,r0      /* Return address of the matching c byte
ce426f
 				 or null in case c was not found.  */
ce426f
 	blr
ce426f
@@ -135,10 +150,14 @@ L(null_match):
ce426f
 	cmpb	r5,r12,r0     /* Compare each byte against null bytes.  */
ce426f
 
ce426f
 	/* Move the words left and right to discard the bits that are
ce426f
-	   not part of the string and to bring them back as zeros.  */
ce426f
-
ce426f
+	   not part of the string and bring them back as zeros.  */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	srw	r5,r5,r6
ce426f
+	slw	r5,r5,r6
ce426f
+#else
ce426f
 	slw	r5,r5,r6
ce426f
 	srw	r5,r5,r6
ce426f
+#endif
ce426f
 	cmpwi	cr7,r5,0      /* If r10 == 0, no c or null bytes
ce426f
 				 have been found.  */
ce426f
 	bne	cr7,L(done_null)
ce426f
@@ -193,7 +212,13 @@ L(loop_null):
ce426f
 	   0xff in the same position as the null byte in the original
ce426f
 	   word from the string.  Use that to calculate the pointer.  */
ce426f
 L(done_null):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	addi    r0,r5,-1
ce426f
+	andc    r0,r0,r5
ce426f
+	popcntw	r0,r0
ce426f
+#else
ce426f
 	cntlzw	r0,r5	      /* Count leading zeros before the match.  */
ce426f
+#endif
ce426f
 	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
ce426f
 	add	r3,r8,r0      /* Return address of the matching null byte.  */
ce426f
 	blr
ce426f
Index: glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strchrnul.S
ce426f
===================================================================
ce426f
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strchrnul.S.orig
ce426f
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strchrnul.S
ce426f
@@ -29,8 +29,8 @@ ENTRY (BP_SYM(__strchrnul))
ce426f
 	clrrwi	r8,r3,2	      /* Align the address to word boundary.  */
ce426f
 
ce426f
 	/* Replicate byte to word.  */
ce426f
-	rlwimi	r4,r4,8,16,23
ce426f
-	rlwimi	r4,r4,16,0,15
ce426f
+	insrdi	r4,r4,8,48
ce426f
+	insrdi	r4,r4,16,32
ce426f
 
ce426f
 	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
ce426f
 	lwz	r12,0(r8)     /* Load word from memory.  */
ce426f
@@ -45,10 +45,17 @@ ENTRY (BP_SYM(__strchrnul))
ce426f
 
ce426f
 	/* Move the words left and right to discard the bits that are
ce426f
 	   not part of the string and bring them back as zeros.  */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	srw	r10,r10,r6
ce426f
+	srw	r9,r9,r6
ce426f
+	slw	r10,r10,r6
ce426f
+	slw	r9,r9,r6
ce426f
+#else
ce426f
 	slw	r10,r10,r6
ce426f
 	slw	r9,r9,r6
ce426f
 	srw	r10,r10,r6
ce426f
 	srw	r9,r9,r6
ce426f
+#endif
ce426f
 	or	r5,r9,r10     /* OR the results to speed things up.  */
ce426f
 	cmpwi	cr7,r5,0      /* If r5 == 0, no c or null bytes
ce426f
 				 have been found.  */
ce426f
@@ -56,7 +63,7 @@ ENTRY (BP_SYM(__strchrnul))
ce426f
 
ce426f
 	mtcrf   0x01,r8
ce426f
 
ce426f
-	/* Are we now aligned to a quadword boundary?  If so, skip to
ce426f
+	/* Are we now aligned to a doubleword boundary?  If so, skip to
ce426f
 	   the main loop.  Otherwise, go through the alignment code.  */
ce426f
 
ce426f
 	bt	29,L(loop)
ce426f
@@ -78,7 +85,7 @@ L(loop):
ce426f
 	   single register for speed.  This is an attempt
ce426f
 	   to speed up the null-checking process for bigger strings.  */
ce426f
 	lwz	r12,4(r8)
ce426f
-	lwzu     r11,8(r8)
ce426f
+	lwzu	r11,8(r8)
ce426f
 	cmpb	r10,r12,r0
ce426f
 	cmpb	r9,r12,r4
ce426f
 	cmpb	r6,r11,r0
ce426f
@@ -97,9 +104,9 @@ L(loop):
ce426f
 	addi	r8,r8,-4
ce426f
 	bne	cr6,L(done)
ce426f
 
ce426f
-	/* The c/null byte must be in the second word.  Adjust the
ce426f
-	   address again and move the result of cmpb to r10 so we can calculate
ce426f
-	   the pointer.  */
ce426f
+	/* The c/null byte must be in the second word.  Adjust the address
ce426f
+	   again and move the result of cmpb to r5 so we can calculate the
ce426f
+	   pointer.  */
ce426f
 	mr	r5,r10
ce426f
 	addi	r8,r8,4
ce426f
 
ce426f
@@ -107,7 +114,13 @@ L(loop):
ce426f
 	   0xff in the same position as the c/null byte in the original
ce426f
 	   word from the string.  Use that to calculate the pointer.  */
ce426f
 L(done):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	addi    r0,r5,-1
ce426f
+	andc    r0,r0,r5
ce426f
+	popcntw	r0,r0
ce426f
+#else
ce426f
 	cntlzw	r0,r5	      /* Count leading zeros before the match.  */
ce426f
+#endif
ce426f
 	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
ce426f
 	add	r3,r8,r0      /* Return address of matching c/null byte.  */
ce426f
 	blr
ce426f
Index: glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strchr.S
ce426f
===================================================================
ce426f
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strchr.S.orig
ce426f
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strchr.S
ce426f
@@ -44,6 +44,8 @@ ENTRY (BP_SYM (strchr))
ce426f
 #define rIGN	r10	/* number of bits we should ignore in the first word */
ce426f
 #define rMASK	r11	/* mask with the bits to ignore set to 0 */
ce426f
 #define rTMP3	r12
ce426f
+#define rTMP4	rIGN
ce426f
+#define rTMP5	rMASK
ce426f
 
ce426f
 	CHECK_BOUNDS_LOW (rSTR, rTMP1, rTMP2)
ce426f
 	STORE_RETURN_BOUNDS (rTMP1, rTMP2)
ce426f
@@ -59,53 +61,74 @@ ENTRY (BP_SYM (strchr))
ce426f
 	addi	r7F7F, r7F7F, 0x7f7f
ce426f
 /* Test the first (partial?) word.  */
ce426f
 	lwz	rWORD, 0(rSTR)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	slw	rMASK, rMASK, rIGN
ce426f
+#else
ce426f
 	srw	rMASK, rMASK, rIGN
ce426f
+#endif
ce426f
 	orc	rWORD, rWORD, rMASK
ce426f
 	add	rTMP1, rFEFE, rWORD
ce426f
 	nor	rTMP2, r7F7F, rWORD
ce426f
-	and.	rTMP1, rTMP1, rTMP2
ce426f
+	and.	rTMP4, rTMP1, rTMP2
ce426f
 	xor	rTMP3, rCHR, rWORD
ce426f
 	orc	rTMP3, rTMP3, rMASK
ce426f
 	b	L(loopentry)
ce426f
 
ce426f
 /* The loop.  */
ce426f
 
ce426f
-L(loop):lwzu rWORD, 4(rSTR)
ce426f
-	and.	rTMP1, rTMP1, rTMP2
ce426f
+L(loop):
ce426f
+	lwzu	rWORD, 4(rSTR)
ce426f
+	and.	rTMP5, rTMP1, rTMP2
ce426f
 /* Test for 0.	*/
ce426f
-	add	rTMP1, rFEFE, rWORD
ce426f
-	nor	rTMP2, r7F7F, rWORD
ce426f
+	add	rTMP1, rFEFE, rWORD /* x - 0x01010101.  */
ce426f
+	nor	rTMP2, r7F7F, rWORD /* ~(x | 0x7f7f7f7f) == ~x & 0x80808080.  */
ce426f
 	bne	L(foundit)
ce426f
-	and.	rTMP1, rTMP1, rTMP2
ce426f
+	and.	rTMP4, rTMP1, rTMP2 /* (x - 0x01010101) & ~x & 0x80808080.  */
ce426f
 /* Start test for the bytes we're looking for.  */
ce426f
 	xor	rTMP3, rCHR, rWORD
ce426f
 L(loopentry):
ce426f
 	add	rTMP1, rFEFE, rTMP3
ce426f
 	nor	rTMP2, r7F7F, rTMP3
ce426f
 	beq	L(loop)
ce426f
+
ce426f
 /* There is a zero byte in the word, but may also be a matching byte (either
ce426f
    before or after the zero byte).  In fact, we may be looking for a
ce426f
-   zero byte, in which case we return a match.  We guess that this hasn't
ce426f
-   happened, though.  */
ce426f
-L(missed):
ce426f
-	and.	rTMP1, rTMP1, rTMP2
ce426f
+   zero byte, in which case we return a match.  */
ce426f
+	and.	rTMP5, rTMP1, rTMP2
ce426f
 	li	rRTN, 0
ce426f
 	STORE_RETURN_VALUE (rSTR)
ce426f
 	beqlr
ce426f
-/* It did happen. Decide which one was first...
ce426f
-   I'm not sure if this is actually faster than a sequence of
ce426f
-   rotates, compares, and branches (we use it anyway because it's shorter).  */
ce426f
+/* At this point:
ce426f
+   rTMP5 bytes are 0x80 for each match of c, 0 otherwise.
ce426f
+   rTMP4 bytes are 0x80 for each match of 0, 0 otherwise.
ce426f
+   But there may be false matches in the next most significant byte from
ce426f
+   a true match due to carries.  This means we need to recalculate the
ce426f
+   matches using a longer method for big-endian.  */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	addi	rTMP1, rTMP5, -1
ce426f
+	andc	rTMP1, rTMP1, rTMP5
ce426f
+	cntlzw	rCLZB, rTMP1
ce426f
+	addi	rTMP2, rTMP4, -1
ce426f
+	andc	rTMP2, rTMP2, rTMP4
ce426f
+	cmplw	rTMP1, rTMP2
ce426f
+	bgtlr
ce426f
+	subfic	rCLZB, rCLZB, 32-7
ce426f
+#else
ce426f
+/* I think we could reduce this by two instructions by keeping the "nor"
ce426f
+   results from the loop for reuse here.  See strlen.S tail.  Similarly
ce426f
+   one instruction could be pruned from L(foundit).  */
ce426f
 	and	rFEFE, r7F7F, rWORD
ce426f
-	or	rMASK, r7F7F, rWORD
ce426f
+	or	rTMP5, r7F7F, rWORD
ce426f
 	and	rTMP1, r7F7F, rTMP3
ce426f
-	or	rIGN, r7F7F, rTMP3
ce426f
+	or	rTMP4, r7F7F, rTMP3
ce426f
 	add	rFEFE, rFEFE, r7F7F
ce426f
 	add	rTMP1, rTMP1, r7F7F
ce426f
-	nor	rWORD, rMASK, rFEFE
ce426f
-	nor	rTMP2, rIGN, rTMP1
ce426f
+	nor	rWORD, rTMP5, rFEFE
ce426f
+	nor	rTMP2, rTMP4, rTMP1
ce426f
+	cntlzw	rCLZB, rTMP2
ce426f
 	cmplw	rWORD, rTMP2
ce426f
 	bgtlr
ce426f
-	cntlzw	rCLZB, rTMP2
ce426f
+#endif
ce426f
 	srwi	rCLZB, rCLZB, 3
ce426f
 	add	rRTN, rSTR, rCLZB
ce426f
 	CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, twlge)
ce426f
@@ -113,13 +136,21 @@ L(missed):
ce426f
 	blr
ce426f
 
ce426f
 L(foundit):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	addi	rTMP1, rTMP5, -1
ce426f
+	andc	rTMP1, rTMP1, rTMP5
ce426f
+	cntlzw	rCLZB, rTMP1
ce426f
+	subfic	rCLZB, rCLZB, 32-7-32
ce426f
+	srawi	rCLZB, rCLZB, 3
ce426f
+#else
ce426f
 	and	rTMP1, r7F7F, rTMP3
ce426f
-	or	rIGN, r7F7F, rTMP3
ce426f
+	or	rTMP4, r7F7F, rTMP3
ce426f
 	add	rTMP1, rTMP1, r7F7F
ce426f
-	nor	rTMP2, rIGN, rTMP1
ce426f
+	nor	rTMP2, rTMP4, rTMP1
ce426f
 	cntlzw	rCLZB, rTMP2
ce426f
 	subi	rSTR, rSTR, 4
ce426f
 	srwi	rCLZB, rCLZB, 3
ce426f
+#endif
ce426f
 	add	rRTN, rSTR, rCLZB
ce426f
 	CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, twlge)
ce426f
 	STORE_RETURN_VALUE (rSTR)
ce426f
Index: glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strchr.S
ce426f
===================================================================
ce426f
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strchr.S.orig
ce426f
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strchr.S
ce426f
@@ -37,8 +37,8 @@ ENTRY (BP_SYM(strchr))
ce426f
 	beq	cr7,L(null_match)
ce426f
 
ce426f
 	/* Replicate byte to doubleword.  */
ce426f
-	rlwimi	r4,r4,8,16,23
ce426f
-	rlwimi	r4,r4,16,0,15
ce426f
+	insrdi	r4,r4,8,48
ce426f
+	insrdi	r4,r4,16,32
ce426f
 	insrdi  r4,r4,32,0
ce426f
 
ce426f
 	/* Now r4 has a doubleword of c bytes and r0 has
ce426f
@@ -49,11 +49,17 @@ ENTRY (BP_SYM(strchr))
ce426f
 
ce426f
 	/* Move the doublewords left and right to discard the bits that are
ce426f
 	   not part of the string and bring them back as zeros.  */
ce426f
-
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	srd	r10,r10,r6
ce426f
+	srd	r11,r11,r6
ce426f
+	sld	r10,r10,r6
ce426f
+	sld	r11,r11,r6
ce426f
+#else
ce426f
 	sld	r10,r10,r6
ce426f
 	sld	r11,r11,r6
ce426f
 	srd	r10,r10,r6
ce426f
 	srd	r11,r11,r6
ce426f
+#endif
ce426f
 	or	r5,r10,r11    /* OR the results to speed things up.  */
ce426f
 	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
ce426f
 				 have been found.  */
ce426f
@@ -110,15 +116,24 @@ L(loop):
ce426f
 	mr	r11,r7
ce426f
 	addi	r8,r8,8
ce426f
 
ce426f
-	/* r5 has the output of the cmpb instruction, that is, it contains
ce426f
+	/* r10/r11 have the output of the cmpb instructions, that is,
ce426f
 	   0xff in the same position as the c/null byte in the original
ce426f
 	   doubleword from the string.  Use that to calculate the pointer.  */
ce426f
 L(done):
ce426f
-	cntlzd	r4,r10	      /* Count leading zeroes before c matches.  */
ce426f
-	cntlzd	r0,r11	      /* Count leading zeroes before null matches.  */
ce426f
-	cmpld	cr7,r4,r0
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	addi    r3,r10,-1
ce426f
+	andc    r3,r3,r10
ce426f
+	popcntd	r0,r3
ce426f
+	addi    r4,r11,-1
ce426f
+	andc    r4,r4,r11
ce426f
+	cmpld	cr7,r3,r4
ce426f
 	bgt	cr7,L(no_match)
ce426f
-	srdi	r0,r4,3	      /* Convert leading zeroes to bytes.  */
ce426f
+#else
ce426f
+	cntlzd	r0,r10	      /* Count leading zeros before c matches.  */
ce426f
+	cmpld	cr7,r11,r10
ce426f
+	bgt	cr7,L(no_match)
ce426f
+#endif
ce426f
+	srdi	r0,r0,3	      /* Convert the zero-bit count to bytes.  */
ce426f
 	add	r3,r8,r0      /* Return address of the matching c byte
ce426f
 				 or null in case c was not found.  */
ce426f
 	blr
ce426f
@@ -137,9 +152,13 @@ L(null_match):
ce426f
 
ce426f
 	/* Move the doublewords left and right to discard the bits that are
ce426f
 	   not part of the string and bring them back as zeros.  */
ce426f
-
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	srd	r5,r5,r6
ce426f
+	sld	r5,r5,r6
ce426f
+#else
ce426f
 	sld	r5,r5,r6
ce426f
 	srd	r5,r5,r6
ce426f
+#endif
ce426f
 	cmpdi	cr7,r5,0      /* If r10 == 0, no c or null bytes
ce426f
 				 have been found.  */
ce426f
 	bne	cr7,L(done_null)
ce426f
@@ -194,7 +213,13 @@ L(loop_null):
ce426f
 	   0xff in the same position as the null byte in the original
ce426f
 	   doubleword from the string.  Use that to calculate the pointer.  */
ce426f
 L(done_null):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	addi    r0,r5,-1
ce426f
+	andc    r0,r0,r5
ce426f
+	popcntd	r0,r0
ce426f
+#else
ce426f
 	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
ce426f
+#endif
ce426f
 	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
ce426f
 	add	r3,r8,r0      /* Return address of the matching null byte.  */
ce426f
 	blr
ce426f
Index: glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strchrnul.S
ce426f
===================================================================
ce426f
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strchrnul.S.orig
ce426f
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strchrnul.S
ce426f
@@ -29,8 +29,8 @@ ENTRY (BP_SYM(__strchrnul))
ce426f
 	clrrdi	r8,r3,3	      /* Align the address to doubleword boundary.  */
ce426f
 
ce426f
 	/* Replicate byte to doubleword.  */
ce426f
-	rlwimi	r4,r4,8,16,23
ce426f
-	rlwimi	r4,r4,16,0,15
ce426f
+	insrdi	r4,r4,8,48
ce426f
+	insrdi	r4,r4,16,32
ce426f
 	insrdi	r4,r4,32,0
ce426f
 
ce426f
 	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
ce426f
@@ -46,10 +46,17 @@ ENTRY (BP_SYM(__strchrnul))
ce426f
 
ce426f
 	/* Move the doublewords left and right to discard the bits that are
ce426f
 	   not part of the string and to bring them back as zeros.  */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	srd	r10,r10,r6
ce426f
+	srd	r9,r9,r6
ce426f
+	sld	r10,r10,r6
ce426f
+	sld	r9,r9,r6
ce426f
+#else
ce426f
 	sld	r10,r10,r6
ce426f
 	sld	r9,r9,r6
ce426f
 	srd	r10,r10,r6
ce426f
 	srd	r9,r9,r6
ce426f
+#endif
ce426f
 	or	r5,r9,r10     /* OR the results to speed things up.  */
ce426f
 	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
ce426f
 				 have been found.  */
ce426f
@@ -99,7 +106,7 @@ L(loop):
ce426f
 	bne	cr6,L(done)
ce426f
 
ce426f
 	/* The c/null byte must be in the second doubleword.  Adjust the
ce426f
-	   address again and move the result of cmpb to r10 so we can calculate
ce426f
+	   address again and move the result of cmpb to r5 so we can calculate
ce426f
 	   the pointer.  */
ce426f
 	mr	r5,r10
ce426f
 	addi	r8,r8,8
ce426f
@@ -108,7 +115,13 @@ L(loop):
ce426f
 	   0xff in the same position as the c/null byte in the original
ce426f
 	   doubleword from the string.  Use that to calculate the pointer.  */
ce426f
 L(done):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	addi    r0,r5,-1
ce426f
+	andc    r0,r0,r5
ce426f
+	popcntd	r0,r0
ce426f
+#else
ce426f
 	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
ce426f
+#endif
ce426f
 	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
ce426f
 	add	r3,r8,r0      /* Return address of matching c/null byte.  */
ce426f
 	blr
ce426f
Index: glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strchr.S
ce426f
===================================================================
ce426f
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strchr.S.orig
ce426f
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strchr.S
ce426f
@@ -50,14 +50,16 @@ ENTRY (BP_SYM (strchr))
ce426f
 #define rIGN	r10	/* number of bits we should ignore in the first word */
ce426f
 #define rMASK	r11	/* mask with the bits to ignore set to 0 */
ce426f
 #define rTMP3	r12
ce426f
+#define rTMP4	rIGN
ce426f
+#define rTMP5	rMASK
ce426f
 
ce426f
 	CHECK_BOUNDS_LOW (rSTR, rTMP1, rTMP2)
ce426f
 	STORE_RETURN_BOUNDS (rTMP1, rTMP2)
ce426f
 
ce426f
 	dcbt	0,rRTN
ce426f
-	rlwimi	rCHR, rCHR, 8, 16, 23
ce426f
+	insrdi	rCHR, rCHR, 8, 48
ce426f
 	li	rMASK, -1
ce426f
-	rlwimi	rCHR, rCHR, 16, 0, 15
ce426f
+	insrdi	rCHR, rCHR, 16, 32
ce426f
 	rlwinm	rIGN, rRTN, 3, 26, 28
ce426f
 	insrdi	rCHR, rCHR, 32, 0
ce426f
 	lis	rFEFE, -0x101
ce426f
@@ -70,53 +72,74 @@ ENTRY (BP_SYM (strchr))
ce426f
 	add	rFEFE, rFEFE, rTMP1
ce426f
 /* Test the first (partial?) word.  */
ce426f
 	ld	rWORD, 0(rSTR)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	sld	rMASK, rMASK, rIGN
ce426f
+#else
ce426f
 	srd	rMASK, rMASK, rIGN
ce426f
+#endif
ce426f
 	orc	rWORD, rWORD, rMASK
ce426f
 	add	rTMP1, rFEFE, rWORD
ce426f
 	nor	rTMP2, r7F7F, rWORD
ce426f
-	and.	rTMP1, rTMP1, rTMP2
ce426f
+	and.	rTMP4, rTMP1, rTMP2
ce426f
 	xor	rTMP3, rCHR, rWORD
ce426f
 	orc	rTMP3, rTMP3, rMASK
ce426f
 	b	L(loopentry)
ce426f
 
ce426f
 /* The loop.  */
ce426f
 
ce426f
-L(loop):ldu rWORD, 8(rSTR)
ce426f
-	and.	rTMP1, rTMP1, rTMP2
ce426f
+L(loop):
ce426f
+	ldu	rWORD, 8(rSTR)
ce426f
+	and.	rTMP5, rTMP1, rTMP2
ce426f
 /* Test for 0.	*/
ce426f
-	add	rTMP1, rFEFE, rWORD
ce426f
-	nor	rTMP2, r7F7F, rWORD
ce426f
+	add	rTMP1, rFEFE, rWORD /* x - 0x01010101.  */
ce426f
+	nor	rTMP2, r7F7F, rWORD /* ~(x | 0x7f7f7f7f) == ~x & 0x80808080.  */
ce426f
 	bne	L(foundit)
ce426f
-	and.	rTMP1, rTMP1, rTMP2
ce426f
+	and.	rTMP4, rTMP1, rTMP2 /* (x - 0x01010101) & ~x & 0x80808080.  */
ce426f
 /* Start test for the bytes we're looking for.  */
ce426f
 	xor	rTMP3, rCHR, rWORD
ce426f
 L(loopentry):
ce426f
 	add	rTMP1, rFEFE, rTMP3
ce426f
 	nor	rTMP2, r7F7F, rTMP3
ce426f
 	beq	L(loop)
ce426f
+
ce426f
 /* There is a zero byte in the word, but may also be a matching byte (either
ce426f
    before or after the zero byte).  In fact, we may be looking for a
ce426f
-   zero byte, in which case we return a match.  We guess that this hasn't
ce426f
-   happened, though.  */
ce426f
-L(missed):
ce426f
-	and.	rTMP1, rTMP1, rTMP2
ce426f
+   zero byte, in which case we return a match.  */
ce426f
+	and.	rTMP5, rTMP1, rTMP2
ce426f
 	li	rRTN, 0
ce426f
 	STORE_RETURN_VALUE (rSTR)
ce426f
 	beqlr
ce426f
-/* It did happen. Decide which one was first...
ce426f
-   I'm not sure if this is actually faster than a sequence of
ce426f
-   rotates, compares, and branches (we use it anyway because it's shorter).  */
ce426f
+/* At this point:
ce426f
+   rTMP5 bytes are 0x80 for each match of c, 0 otherwise.
ce426f
+   rTMP4 bytes are 0x80 for each match of 0, 0 otherwise.
ce426f
+   But there may be false matches in the next most significant byte from
ce426f
+   a true match due to carries.  This means we need to recalculate the
ce426f
+   matches using a longer method for big-endian.  */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	addi	rTMP1, rTMP5, -1
ce426f
+	andc	rTMP1, rTMP1, rTMP5
ce426f
+	cntlzd	rCLZB, rTMP1
ce426f
+	addi	rTMP2, rTMP4, -1
ce426f
+	andc	rTMP2, rTMP2, rTMP4
ce426f
+	cmpld	rTMP1, rTMP2
ce426f
+	bgtlr
ce426f
+	subfic	rCLZB, rCLZB, 64-7
ce426f
+#else
ce426f
+/* I think we could reduce this by two instructions by keeping the "nor"
ce426f
+   results from the loop for reuse here.  See strlen.S tail.  Similarly
ce426f
+   one instruction could be pruned from L(foundit).  */
ce426f
 	and	rFEFE, r7F7F, rWORD
ce426f
-	or	rMASK, r7F7F, rWORD
ce426f
+	or	rTMP5, r7F7F, rWORD
ce426f
 	and	rTMP1, r7F7F, rTMP3
ce426f
-	or	rIGN, r7F7F, rTMP3
ce426f
+	or	rTMP4, r7F7F, rTMP3
ce426f
 	add	rFEFE, rFEFE, r7F7F
ce426f
 	add	rTMP1, rTMP1, r7F7F
ce426f
-	nor	rWORD, rMASK, rFEFE
ce426f
-	nor	rTMP2, rIGN, rTMP1
ce426f
+	nor	rWORD, rTMP5, rFEFE
ce426f
+	nor	rTMP2, rTMP4, rTMP1
ce426f
+	cntlzd	rCLZB, rTMP2
ce426f
 	cmpld	rWORD, rTMP2
ce426f
 	bgtlr
ce426f
-	cntlzd	rCLZB, rTMP2
ce426f
+#endif
ce426f
 	srdi	rCLZB, rCLZB, 3
ce426f
 	add	rRTN, rSTR, rCLZB
ce426f
 	CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, tdlge)
ce426f
@@ -124,13 +147,21 @@ L(missed):
ce426f
 	blr
ce426f
 
ce426f
 L(foundit):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	addi	rTMP1, rTMP5, -1
ce426f
+	andc	rTMP1, rTMP1, rTMP5
ce426f
+	cntlzd	rCLZB, rTMP1
ce426f
+	subfic	rCLZB, rCLZB, 64-7-64
ce426f
+	sradi	rCLZB, rCLZB, 3
ce426f
+#else
ce426f
 	and	rTMP1, r7F7F, rTMP3
ce426f
-	or	rIGN, r7F7F, rTMP3
ce426f
+	or	rTMP4, r7F7F, rTMP3
ce426f
 	add	rTMP1, rTMP1, r7F7F
ce426f
-	nor	rTMP2, rIGN, rTMP1
ce426f
+	nor	rTMP2, rTMP4, rTMP1
ce426f
 	cntlzd	rCLZB, rTMP2
ce426f
 	subi	rSTR, rSTR, 8
ce426f
 	srdi	rCLZB, rCLZB, 3
ce426f
+#endif
ce426f
 	add	rRTN, rSTR, rCLZB
ce426f
 	CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, tdlge)
ce426f
 	STORE_RETURN_VALUE (rSTR)