00db10
# commit 664318c3eb07032e2bfcf47cb2aa3c89280c19e7
00db10
# Author: Alan Modra <amodra@gmail.com>
00db10
# Date:   Sat Aug 17 18:46:05 2013 +0930
00db10
# 
00db10
#     PowerPC LE strchr
00db10
#     http://sourceware.org/ml/libc-alpha/2013-08/msg00101.html
00db10
#     
00db10
#     Adds little-endian support to optimised strchr assembly.  I've also
00db10
#     tweaked the big-endian code a little.  In power7/strchr.S there's a
00db10
#     check in the tail of the function that we didn't match 0 before
00db10
#     finding a c match, done by comparing leading zero counts.  It's just
00db10
#     as valid, and quicker, to compare the raw output from cmpb.
00db10
#     
00db10
#     Another little tweak is to use rldimi/insrdi in place of rlwimi for
00db10
#     the power7 strchr functions.  Since rlwimi is cracked, it is a few
00db10
#     cycles slower.  rldimi can be used on the 32-bit power7 functions
00db10
#     too.
00db10
#     
00db10
#         * sysdeps/powerpc/powerpc64/power7/strchr.S (strchr): Add little-endian
00db10
#         support.  Correct typos, formatting.  Optimize tail.  Use insrdi
00db10
#         rather than rlwimi.
00db10
#         * sysdeps/powerpc/powerpc32/power7/strchr.S: Likewise.
00db10
#         * sysdeps/powerpc/powerpc64/power7/strchrnul.S (__strchrnul): Add
00db10
#         little-endian support.  Correct typos.
00db10
#         * sysdeps/powerpc/powerpc32/power7/strchrnul.S: Likewise.  Use insrdi
00db10
#         rather than rlwimi.
00db10
#         * sysdeps/powerpc/powerpc64/strchr.S (rTMP4, rTMP5): Define.  Use
00db10
#         in loop and entry code to keep "and." results.
00db10
#         (strchr): Add little-endian support.  Comment.  Move cntlzd
00db10
#         earlier in tail.
00db10
#         * sysdeps/powerpc/powerpc32/strchr.S: Likewise.
00db10
# 
00db10
Index: glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strchr.S
00db10
===================================================================
00db10
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strchr.S.orig
00db10
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strchr.S
00db10
@@ -37,8 +37,8 @@ ENTRY (BP_SYM(strchr))
00db10
 	beq	cr7,L(null_match)
00db10
 
00db10
 	/* Replicate byte to word.  */
00db10
-	rlwimi	r4,r4,8,16,23
00db10
-	rlwimi	r4,r4,16,0,15
00db10
+	insrdi	r4,r4,8,48
00db10
+	insrdi	r4,r4,16,32
00db10
 
00db10
 	/* Now r4 has a word of c bytes and r0 has
00db10
 	   a word of null bytes.  */
00db10
@@ -48,11 +48,17 @@ ENTRY (BP_SYM(strchr))
00db10
 
00db10
 	/* Move the words left and right to discard the bits that are
00db10
 	   not part of the string and to bring them back as zeros.  */
00db10
-
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	srw	r10,r10,r6
00db10
+	srw	r11,r11,r6
00db10
+	slw	r10,r10,r6
00db10
+	slw	r11,r11,r6
00db10
+#else
00db10
 	slw	r10,r10,r6
00db10
 	slw	r11,r11,r6
00db10
 	srw	r10,r10,r6
00db10
 	srw	r11,r11,r6
00db10
+#endif
00db10
 	or	r5,r10,r11    /* OR the results to speed things up.  */
00db10
 	cmpwi	cr7,r5,0      /* If r5 == 0, no c or null bytes
00db10
 				 have been found.  */
00db10
@@ -67,7 +73,7 @@ ENTRY (BP_SYM(strchr))
00db10
 
00db10
 	/* Handle WORD2 of pair.  */
00db10
 	lwzu	r12,4(r8)
00db10
-	cmpb    r10,r12,r4
00db10
+	cmpb	r10,r12,r4
00db10
 	cmpb	r11,r12,r0
00db10
 	or	r5,r10,r11
00db10
 	cmpwi	cr7,r5,0
00db10
@@ -102,22 +108,31 @@ L(loop):
00db10
 	bne	cr6,L(done)
00db10
 
00db10
 	/* The c/null byte must be in the second word.  Adjust the address
00db10
-	   again and move the result of cmpb to r10 so we can calculate the
00db10
-	   pointer.  */
00db10
+	   again and move the result of cmpb to r10/r11 so we can calculate
00db10
+	   the pointer.  */
00db10
 
00db10
 	mr	r10,r6
00db10
 	mr	r11,r7
00db10
 	addi	r8,r8,4
00db10
 
00db10
-	/* r5 has the output of the cmpb instruction, that is, it contains
00db10
+	/* r10/r11 have the output of the cmpb instructions, that is,
00db10
 	   0xff in the same position as the c/null byte in the original
00db10
 	   word from the string.  Use that to calculate the pointer.  */
00db10
 L(done):
00db10
-	cntlzw	r4,r10	      /* Count leading zeroes before c matches.  */
00db10
-	cntlzw	r0,r11	      /* Count leading zeroes before null matches.  */
00db10
-	cmplw	cr7,r4,r0
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	addi    r3,r10,-1
00db10
+	andc    r3,r3,r10
00db10
+	popcntw	r0,r3
00db10
+	addi    r4,r11,-1
00db10
+	andc    r4,r4,r11
00db10
+	cmplw	cr7,r3,r4
00db10
+	bgt	cr7,L(no_match)
00db10
+#else
00db10
+	cntlzw	r0,r10	      /* Count leading zeros before c matches.  */
00db10
+	cmplw	cr7,r11,r10
00db10
 	bgt	cr7,L(no_match)
00db10
-	srwi	r0,r4,3	      /* Convert leading zeroes to bytes.  */
00db10
+#endif
00db10
+	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
00db10
 	add	r3,r8,r0      /* Return address of the matching c byte
00db10
 				 or null in case c was not found.  */
00db10
 	blr
00db10
@@ -135,10 +150,14 @@ L(null_match):
00db10
 	cmpb	r5,r12,r0     /* Compare each byte against null bytes.  */
00db10
 
00db10
 	/* Move the words left and right to discard the bits that are
00db10
-	   not part of the string and to bring them back as zeros.  */
00db10
-
00db10
+	   not part of the string and bring them back as zeros.  */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	srw	r5,r5,r6
00db10
+	slw	r5,r5,r6
00db10
+#else
00db10
 	slw	r5,r5,r6
00db10
 	srw	r5,r5,r6
00db10
+#endif
00db10
 	cmpwi	cr7,r5,0      /* If r10 == 0, no c or null bytes
00db10
 				 have been found.  */
00db10
 	bne	cr7,L(done_null)
00db10
@@ -193,7 +212,13 @@ L(loop_null):
00db10
 	   0xff in the same position as the null byte in the original
00db10
 	   word from the string.  Use that to calculate the pointer.  */
00db10
 L(done_null):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	addi    r0,r5,-1
00db10
+	andc    r0,r0,r5
00db10
+	popcntw	r0,r0
00db10
+#else
00db10
 	cntlzw	r0,r5	      /* Count leading zeros before the match.  */
00db10
+#endif
00db10
 	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
00db10
 	add	r3,r8,r0      /* Return address of the matching null byte.  */
00db10
 	blr
00db10
Index: glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strchrnul.S
00db10
===================================================================
00db10
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strchrnul.S.orig
00db10
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strchrnul.S
00db10
@@ -29,8 +29,8 @@ ENTRY (BP_SYM(__strchrnul))
00db10
 	clrrwi	r8,r3,2	      /* Align the address to word boundary.  */
00db10
 
00db10
 	/* Replicate byte to word.  */
00db10
-	rlwimi	r4,r4,8,16,23
00db10
-	rlwimi	r4,r4,16,0,15
00db10
+	insrdi	r4,r4,8,48
00db10
+	insrdi	r4,r4,16,32
00db10
 
00db10
 	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
00db10
 	lwz	r12,0(r8)     /* Load word from memory.  */
00db10
@@ -45,10 +45,17 @@ ENTRY (BP_SYM(__strchrnul))
00db10
 
00db10
 	/* Move the words left and right to discard the bits that are
00db10
 	   not part of the string and bring them back as zeros.  */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	srw	r10,r10,r6
00db10
+	srw	r9,r9,r6
00db10
+	slw	r10,r10,r6
00db10
+	slw	r9,r9,r6
00db10
+#else
00db10
 	slw	r10,r10,r6
00db10
 	slw	r9,r9,r6
00db10
 	srw	r10,r10,r6
00db10
 	srw	r9,r9,r6
00db10
+#endif
00db10
 	or	r5,r9,r10     /* OR the results to speed things up.  */
00db10
 	cmpwi	cr7,r5,0      /* If r5 == 0, no c or null bytes
00db10
 				 have been found.  */
00db10
@@ -56,7 +63,7 @@ ENTRY (BP_SYM(__strchrnul))
00db10
 
00db10
 	mtcrf   0x01,r8
00db10
 
00db10
-	/* Are we now aligned to a quadword boundary?  If so, skip to
00db10
+	/* Are we now aligned to a doubleword boundary?  If so, skip to
00db10
 	   the main loop.  Otherwise, go through the alignment code.  */
00db10
 
00db10
 	bt	29,L(loop)
00db10
@@ -78,7 +85,7 @@ L(loop):
00db10
 	   single register for speed.  This is an attempt
00db10
 	   to speed up the null-checking process for bigger strings.  */
00db10
 	lwz	r12,4(r8)
00db10
-	lwzu     r11,8(r8)
00db10
+	lwzu	r11,8(r8)
00db10
 	cmpb	r10,r12,r0
00db10
 	cmpb	r9,r12,r4
00db10
 	cmpb	r6,r11,r0
00db10
@@ -97,9 +104,9 @@ L(loop):
00db10
 	addi	r8,r8,-4
00db10
 	bne	cr6,L(done)
00db10
 
00db10
-	/* The c/null byte must be in the second word.  Adjust the
00db10
-	   address again and move the result of cmpb to r10 so we can calculate
00db10
-	   the pointer.  */
00db10
+	/* The c/null byte must be in the second word.  Adjust the address
00db10
+	   again and move the result of cmpb to r5 so we can calculate the
00db10
+	   pointer.  */
00db10
 	mr	r5,r10
00db10
 	addi	r8,r8,4
00db10
 
00db10
@@ -107,7 +114,13 @@ L(loop):
00db10
 	   0xff in the same position as the c/null byte in the original
00db10
 	   word from the string.  Use that to calculate the pointer.  */
00db10
 L(done):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	addi    r0,r5,-1
00db10
+	andc    r0,r0,r5
00db10
+	popcntw	r0,r0
00db10
+#else
00db10
 	cntlzw	r0,r5	      /* Count leading zeros before the match.  */
00db10
+#endif
00db10
 	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
00db10
 	add	r3,r8,r0      /* Return address of matching c/null byte.  */
00db10
 	blr
00db10
Index: glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strchr.S
00db10
===================================================================
00db10
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strchr.S.orig
00db10
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strchr.S
00db10
@@ -44,6 +44,8 @@ ENTRY (BP_SYM (strchr))
00db10
 #define rIGN	r10	/* number of bits we should ignore in the first word */
00db10
 #define rMASK	r11	/* mask with the bits to ignore set to 0 */
00db10
 #define rTMP3	r12
00db10
+#define rTMP4	rIGN
00db10
+#define rTMP5	rMASK
00db10
 
00db10
 	CHECK_BOUNDS_LOW (rSTR, rTMP1, rTMP2)
00db10
 	STORE_RETURN_BOUNDS (rTMP1, rTMP2)
00db10
@@ -59,53 +61,74 @@ ENTRY (BP_SYM (strchr))
00db10
 	addi	r7F7F, r7F7F, 0x7f7f
00db10
 /* Test the first (partial?) word.  */
00db10
 	lwz	rWORD, 0(rSTR)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	slw	rMASK, rMASK, rIGN
00db10
+#else
00db10
 	srw	rMASK, rMASK, rIGN
00db10
+#endif
00db10
 	orc	rWORD, rWORD, rMASK
00db10
 	add	rTMP1, rFEFE, rWORD
00db10
 	nor	rTMP2, r7F7F, rWORD
00db10
-	and.	rTMP1, rTMP1, rTMP2
00db10
+	and.	rTMP4, rTMP1, rTMP2
00db10
 	xor	rTMP3, rCHR, rWORD
00db10
 	orc	rTMP3, rTMP3, rMASK
00db10
 	b	L(loopentry)
00db10
 
00db10
 /* The loop.  */
00db10
 
00db10
-L(loop):lwzu rWORD, 4(rSTR)
00db10
-	and.	rTMP1, rTMP1, rTMP2
00db10
+L(loop):
00db10
+	lwzu	rWORD, 4(rSTR)
00db10
+	and.	rTMP5, rTMP1, rTMP2
00db10
 /* Test for 0.	*/
00db10
-	add	rTMP1, rFEFE, rWORD
00db10
-	nor	rTMP2, r7F7F, rWORD
00db10
+	add	rTMP1, rFEFE, rWORD /* x - 0x01010101.  */
00db10
+	nor	rTMP2, r7F7F, rWORD /* ~(x | 0x7f7f7f7f) == ~x & 0x80808080.  */
00db10
 	bne	L(foundit)
00db10
-	and.	rTMP1, rTMP1, rTMP2
00db10
+	and.	rTMP4, rTMP1, rTMP2 /* (x - 0x01010101) & ~x & 0x80808080.  */
00db10
 /* Start test for the bytes we're looking for.  */
00db10
 	xor	rTMP3, rCHR, rWORD
00db10
 L(loopentry):
00db10
 	add	rTMP1, rFEFE, rTMP3
00db10
 	nor	rTMP2, r7F7F, rTMP3
00db10
 	beq	L(loop)
00db10
+
00db10
 /* There is a zero byte in the word, but may also be a matching byte (either
00db10
    before or after the zero byte).  In fact, we may be looking for a
00db10
-   zero byte, in which case we return a match.  We guess that this hasn't
00db10
-   happened, though.  */
00db10
-L(missed):
00db10
-	and.	rTMP1, rTMP1, rTMP2
00db10
+   zero byte, in which case we return a match.  */
00db10
+	and.	rTMP5, rTMP1, rTMP2
00db10
 	li	rRTN, 0
00db10
 	STORE_RETURN_VALUE (rSTR)
00db10
 	beqlr
00db10
-/* It did happen. Decide which one was first...
00db10
-   I'm not sure if this is actually faster than a sequence of
00db10
-   rotates, compares, and branches (we use it anyway because it's shorter).  */
00db10
+/* At this point:
00db10
+   rTMP5 bytes are 0x80 for each match of c, 0 otherwise.
00db10
+   rTMP4 bytes are 0x80 for each match of 0, 0 otherwise.
00db10
+   But there may be false matches in the next most significant byte from
00db10
+   a true match due to carries.  This means we need to recalculate the
00db10
+   matches using a longer method for big-endian.  */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	addi	rTMP1, rTMP5, -1
00db10
+	andc	rTMP1, rTMP1, rTMP5
00db10
+	cntlzw	rCLZB, rTMP1
00db10
+	addi	rTMP2, rTMP4, -1
00db10
+	andc	rTMP2, rTMP2, rTMP4
00db10
+	cmplw	rTMP1, rTMP2
00db10
+	bgtlr
00db10
+	subfic	rCLZB, rCLZB, 32-7
00db10
+#else
00db10
+/* I think we could reduce this by two instructions by keeping the "nor"
00db10
+   results from the loop for reuse here.  See strlen.S tail.  Similarly
00db10
+   one instruction could be pruned from L(foundit).  */
00db10
 	and	rFEFE, r7F7F, rWORD
00db10
-	or	rMASK, r7F7F, rWORD
00db10
+	or	rTMP5, r7F7F, rWORD
00db10
 	and	rTMP1, r7F7F, rTMP3
00db10
-	or	rIGN, r7F7F, rTMP3
00db10
+	or	rTMP4, r7F7F, rTMP3
00db10
 	add	rFEFE, rFEFE, r7F7F
00db10
 	add	rTMP1, rTMP1, r7F7F
00db10
-	nor	rWORD, rMASK, rFEFE
00db10
-	nor	rTMP2, rIGN, rTMP1
00db10
+	nor	rWORD, rTMP5, rFEFE
00db10
+	nor	rTMP2, rTMP4, rTMP1
00db10
+	cntlzw	rCLZB, rTMP2
00db10
 	cmplw	rWORD, rTMP2
00db10
 	bgtlr
00db10
-	cntlzw	rCLZB, rTMP2
00db10
+#endif
00db10
 	srwi	rCLZB, rCLZB, 3
00db10
 	add	rRTN, rSTR, rCLZB
00db10
 	CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, twlge)
00db10
@@ -113,13 +136,21 @@ L(missed):
00db10
 	blr
00db10
 
00db10
 L(foundit):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	addi	rTMP1, rTMP5, -1
00db10
+	andc	rTMP1, rTMP1, rTMP5
00db10
+	cntlzw	rCLZB, rTMP1
00db10
+	subfic	rCLZB, rCLZB, 32-7-32
00db10
+	srawi	rCLZB, rCLZB, 3
00db10
+#else
00db10
 	and	rTMP1, r7F7F, rTMP3
00db10
-	or	rIGN, r7F7F, rTMP3
00db10
+	or	rTMP4, r7F7F, rTMP3
00db10
 	add	rTMP1, rTMP1, r7F7F
00db10
-	nor	rTMP2, rIGN, rTMP1
00db10
+	nor	rTMP2, rTMP4, rTMP1
00db10
 	cntlzw	rCLZB, rTMP2
00db10
 	subi	rSTR, rSTR, 4
00db10
 	srwi	rCLZB, rCLZB, 3
00db10
+#endif
00db10
 	add	rRTN, rSTR, rCLZB
00db10
 	CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, twlge)
00db10
 	STORE_RETURN_VALUE (rSTR)
00db10
Index: glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strchr.S
00db10
===================================================================
00db10
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strchr.S.orig
00db10
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strchr.S
00db10
@@ -37,8 +37,8 @@ ENTRY (BP_SYM(strchr))
00db10
 	beq	cr7,L(null_match)
00db10
 
00db10
 	/* Replicate byte to doubleword.  */
00db10
-	rlwimi	r4,r4,8,16,23
00db10
-	rlwimi	r4,r4,16,0,15
00db10
+	insrdi	r4,r4,8,48
00db10
+	insrdi	r4,r4,16,32
00db10
 	insrdi  r4,r4,32,0
00db10
 
00db10
 	/* Now r4 has a doubleword of c bytes and r0 has
00db10
@@ -49,11 +49,17 @@ ENTRY (BP_SYM(strchr))
00db10
 
00db10
 	/* Move the doublewords left and right to discard the bits that are
00db10
 	   not part of the string and bring them back as zeros.  */
00db10
-
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	srd	r10,r10,r6
00db10
+	srd	r11,r11,r6
00db10
+	sld	r10,r10,r6
00db10
+	sld	r11,r11,r6
00db10
+#else
00db10
 	sld	r10,r10,r6
00db10
 	sld	r11,r11,r6
00db10
 	srd	r10,r10,r6
00db10
 	srd	r11,r11,r6
00db10
+#endif
00db10
 	or	r5,r10,r11    /* OR the results to speed things up.  */
00db10
 	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
00db10
 				 have been found.  */
00db10
@@ -110,15 +116,24 @@ L(loop):
00db10
 	mr	r11,r7
00db10
 	addi	r8,r8,8
00db10
 
00db10
-	/* r5 has the output of the cmpb instruction, that is, it contains
00db10
+	/* r10/r11 have the output of the cmpb instructions, that is,
00db10
 	   0xff in the same position as the c/null byte in the original
00db10
 	   doubleword from the string.  Use that to calculate the pointer.  */
00db10
 L(done):
00db10
-	cntlzd	r4,r10	      /* Count leading zeroes before c matches.  */
00db10
-	cntlzd	r0,r11	      /* Count leading zeroes before null matches.  */
00db10
-	cmpld	cr7,r4,r0
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	addi    r3,r10,-1
00db10
+	andc    r3,r3,r10
00db10
+	popcntd	r0,r3
00db10
+	addi    r4,r11,-1
00db10
+	andc    r4,r4,r11
00db10
+	cmpld	cr7,r3,r4
00db10
 	bgt	cr7,L(no_match)
00db10
-	srdi	r0,r4,3	      /* Convert leading zeroes to bytes.  */
00db10
+#else
00db10
+	cntlzd	r0,r10	      /* Count leading zeros before c matches.  */
00db10
+	cmpld	cr7,r11,r10
00db10
+	bgt	cr7,L(no_match)
00db10
+#endif
00db10
+	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
00db10
 	add	r3,r8,r0      /* Return address of the matching c byte
00db10
 				 or null in case c was not found.  */
00db10
 	blr
00db10
@@ -137,9 +152,13 @@ L(null_match):
00db10
 
00db10
 	/* Move the doublewords left and right to discard the bits that are
00db10
 	   not part of the string and bring them back as zeros.  */
00db10
-
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	srd	r5,r5,r6
00db10
+	sld	r5,r5,r6
00db10
+#else
00db10
 	sld	r5,r5,r6
00db10
 	srd	r5,r5,r6
00db10
+#endif
00db10
 	cmpdi	cr7,r5,0      /* If r10 == 0, no c or null bytes
00db10
 				 have been found.  */
00db10
 	bne	cr7,L(done_null)
00db10
@@ -194,7 +213,13 @@ L(loop_null):
00db10
 	   0xff in the same position as the null byte in the original
00db10
 	   doubleword from the string.  Use that to calculate the pointer.  */
00db10
 L(done_null):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	addi    r0,r5,-1
00db10
+	andc    r0,r0,r5
00db10
+	popcntd	r0,r0
00db10
+#else
00db10
 	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
00db10
+#endif
00db10
 	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
00db10
 	add	r3,r8,r0      /* Return address of the matching null byte.  */
00db10
 	blr
00db10
Index: glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strchrnul.S
00db10
===================================================================
00db10
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strchrnul.S.orig
00db10
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strchrnul.S
00db10
@@ -29,8 +29,8 @@ ENTRY (BP_SYM(__strchrnul))
00db10
 	clrrdi	r8,r3,3	      /* Align the address to doubleword boundary.  */
00db10
 
00db10
 	/* Replicate byte to doubleword.  */
00db10
-	rlwimi	r4,r4,8,16,23
00db10
-	rlwimi	r4,r4,16,0,15
00db10
+	insrdi	r4,r4,8,48
00db10
+	insrdi	r4,r4,16,32
00db10
 	insrdi	r4,r4,32,0
00db10
 
00db10
 	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
00db10
@@ -46,10 +46,17 @@ ENTRY (BP_SYM(__strchrnul))
00db10
 
00db10
 	/* Move the doublewords left and right to discard the bits that are
00db10
 	   not part of the string and to bring them back as zeros.  */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	srd	r10,r10,r6
00db10
+	srd	r9,r9,r6
00db10
+	sld	r10,r10,r6
00db10
+	sld	r9,r9,r6
00db10
+#else
00db10
 	sld	r10,r10,r6
00db10
 	sld	r9,r9,r6
00db10
 	srd	r10,r10,r6
00db10
 	srd	r9,r9,r6
00db10
+#endif
00db10
 	or	r5,r9,r10     /* OR the results to speed things up.  */
00db10
 	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
00db10
 				 have been found.  */
00db10
@@ -99,7 +106,7 @@ L(loop):
00db10
 	bne	cr6,L(done)
00db10
 
00db10
 	/* The c/null byte must be in the second doubleword.  Adjust the
00db10
-	   address again and move the result of cmpb to r10 so we can calculate
00db10
+	   address again and move the result of cmpb to r5 so we can calculate
00db10
 	   the pointer.  */
00db10
 	mr	r5,r10
00db10
 	addi	r8,r8,8
00db10
@@ -108,7 +115,13 @@ L(loop):
00db10
 	   0xff in the same position as the c/null byte in the original
00db10
 	   doubleword from the string.  Use that to calculate the pointer.  */
00db10
 L(done):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	addi    r0,r5,-1
00db10
+	andc    r0,r0,r5
00db10
+	popcntd	r0,r0
00db10
+#else
00db10
 	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
00db10
+#endif
00db10
 	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
00db10
 	add	r3,r8,r0      /* Return address of matching c/null byte.  */
00db10
 	blr
00db10
Index: glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strchr.S
00db10
===================================================================
00db10
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strchr.S.orig
00db10
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strchr.S
00db10
@@ -50,14 +50,16 @@ ENTRY (BP_SYM (strchr))
00db10
 #define rIGN	r10	/* number of bits we should ignore in the first word */
00db10
 #define rMASK	r11	/* mask with the bits to ignore set to 0 */
00db10
 #define rTMP3	r12
00db10
+#define rTMP4	rIGN
00db10
+#define rTMP5	rMASK
00db10
 
00db10
 	CHECK_BOUNDS_LOW (rSTR, rTMP1, rTMP2)
00db10
 	STORE_RETURN_BOUNDS (rTMP1, rTMP2)
00db10
 
00db10
 	dcbt	0,rRTN
00db10
-	rlwimi	rCHR, rCHR, 8, 16, 23
00db10
+	insrdi	rCHR, rCHR, 8, 48
00db10
 	li	rMASK, -1
00db10
-	rlwimi	rCHR, rCHR, 16, 0, 15
00db10
+	insrdi	rCHR, rCHR, 16, 32
00db10
 	rlwinm	rIGN, rRTN, 3, 26, 28
00db10
 	insrdi	rCHR, rCHR, 32, 0
00db10
 	lis	rFEFE, -0x101
00db10
@@ -70,53 +72,74 @@ ENTRY (BP_SYM (strchr))
00db10
 	add	rFEFE, rFEFE, rTMP1
00db10
 /* Test the first (partial?) word.  */
00db10
 	ld	rWORD, 0(rSTR)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	sld	rMASK, rMASK, rIGN
00db10
+#else
00db10
 	srd	rMASK, rMASK, rIGN
00db10
+#endif
00db10
 	orc	rWORD, rWORD, rMASK
00db10
 	add	rTMP1, rFEFE, rWORD
00db10
 	nor	rTMP2, r7F7F, rWORD
00db10
-	and.	rTMP1, rTMP1, rTMP2
00db10
+	and.	rTMP4, rTMP1, rTMP2
00db10
 	xor	rTMP3, rCHR, rWORD
00db10
 	orc	rTMP3, rTMP3, rMASK
00db10
 	b	L(loopentry)
00db10
 
00db10
 /* The loop.  */
00db10
 
00db10
-L(loop):ldu rWORD, 8(rSTR)
00db10
-	and.	rTMP1, rTMP1, rTMP2
00db10
+L(loop):
00db10
+	ldu	rWORD, 8(rSTR)
00db10
+	and.	rTMP5, rTMP1, rTMP2
00db10
 /* Test for 0.	*/
00db10
-	add	rTMP1, rFEFE, rWORD
00db10
-	nor	rTMP2, r7F7F, rWORD
00db10
+	add	rTMP1, rFEFE, rWORD /* x - 0x01010101.  */
00db10
+	nor	rTMP2, r7F7F, rWORD /* ~(x | 0x7f7f7f7f) == ~x & 0x80808080.  */
00db10
 	bne	L(foundit)
00db10
-	and.	rTMP1, rTMP1, rTMP2
00db10
+	and.	rTMP4, rTMP1, rTMP2 /* (x - 0x01010101) & ~x & 0x80808080.  */
00db10
 /* Start test for the bytes we're looking for.  */
00db10
 	xor	rTMP3, rCHR, rWORD
00db10
 L(loopentry):
00db10
 	add	rTMP1, rFEFE, rTMP3
00db10
 	nor	rTMP2, r7F7F, rTMP3
00db10
 	beq	L(loop)
00db10
+
00db10
 /* There is a zero byte in the word, but may also be a matching byte (either
00db10
    before or after the zero byte).  In fact, we may be looking for a
00db10
-   zero byte, in which case we return a match.  We guess that this hasn't
00db10
-   happened, though.  */
00db10
-L(missed):
00db10
-	and.	rTMP1, rTMP1, rTMP2
00db10
+   zero byte, in which case we return a match.  */
00db10
+	and.	rTMP5, rTMP1, rTMP2
00db10
 	li	rRTN, 0
00db10
 	STORE_RETURN_VALUE (rSTR)
00db10
 	beqlr
00db10
-/* It did happen. Decide which one was first...
00db10
-   I'm not sure if this is actually faster than a sequence of
00db10
-   rotates, compares, and branches (we use it anyway because it's shorter).  */
00db10
+/* At this point:
00db10
+   rTMP5 bytes are 0x80 for each match of c, 0 otherwise.
00db10
+   rTMP4 bytes are 0x80 for each match of 0, 0 otherwise.
00db10
+   But there may be false matches in the next most significant byte from
00db10
+   a true match due to carries.  This means we need to recalculate the
00db10
+   matches using a longer method for big-endian.  */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	addi	rTMP1, rTMP5, -1
00db10
+	andc	rTMP1, rTMP1, rTMP5
00db10
+	cntlzd	rCLZB, rTMP1
00db10
+	addi	rTMP2, rTMP4, -1
00db10
+	andc	rTMP2, rTMP2, rTMP4
00db10
+	cmpld	rTMP1, rTMP2
00db10
+	bgtlr
00db10
+	subfic	rCLZB, rCLZB, 64-7
00db10
+#else
00db10
+/* I think we could reduce this by two instructions by keeping the "nor"
00db10
+   results from the loop for reuse here.  See strlen.S tail.  Similarly
00db10
+   one instruction could be pruned from L(foundit).  */
00db10
 	and	rFEFE, r7F7F, rWORD
00db10
-	or	rMASK, r7F7F, rWORD
00db10
+	or	rTMP5, r7F7F, rWORD
00db10
 	and	rTMP1, r7F7F, rTMP3
00db10
-	or	rIGN, r7F7F, rTMP3
00db10
+	or	rTMP4, r7F7F, rTMP3
00db10
 	add	rFEFE, rFEFE, r7F7F
00db10
 	add	rTMP1, rTMP1, r7F7F
00db10
-	nor	rWORD, rMASK, rFEFE
00db10
-	nor	rTMP2, rIGN, rTMP1
00db10
+	nor	rWORD, rTMP5, rFEFE
00db10
+	nor	rTMP2, rTMP4, rTMP1
00db10
+	cntlzd	rCLZB, rTMP2
00db10
 	cmpld	rWORD, rTMP2
00db10
 	bgtlr
00db10
-	cntlzd	rCLZB, rTMP2
00db10
+#endif
00db10
 	srdi	rCLZB, rCLZB, 3
00db10
 	add	rRTN, rSTR, rCLZB
00db10
 	CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, tdlge)
00db10
@@ -124,13 +147,21 @@ L(missed):
00db10
 	blr
00db10
 
00db10
 L(foundit):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	addi	rTMP1, rTMP5, -1
00db10
+	andc	rTMP1, rTMP1, rTMP5
00db10
+	cntlzd	rCLZB, rTMP1
00db10
+	subfic	rCLZB, rCLZB, 64-7-64
00db10
+	sradi	rCLZB, rCLZB, 3
00db10
+#else
00db10
 	and	rTMP1, r7F7F, rTMP3
00db10
-	or	rIGN, r7F7F, rTMP3
00db10
+	or	rTMP4, r7F7F, rTMP3
00db10
 	add	rTMP1, rTMP1, r7F7F
00db10
-	nor	rTMP2, rIGN, rTMP1
00db10
+	nor	rTMP2, rTMP4, rTMP1
00db10
 	cntlzd	rCLZB, rTMP2
00db10
 	subi	rSTR, rSTR, 8
00db10
 	srdi	rCLZB, rCLZB, 3
00db10
+#endif
00db10
 	add	rRTN, rSTR, rCLZB
00db10
 	CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, tdlge)
00db10
 	STORE_RETURN_VALUE (rSTR)