olga / rpms / glibc

Forked from rpms/glibc 5 years ago
Clone
00db10
    Backport of
00db10
    commit 72607db038df1a1a7987af814aad8d2ed466c45c
00db10
    Author: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
00db10
    Date:   Fri Jan 9 11:56:35 2015 -0500
00db10
    
00db10
        powerpc: Optimize POWER7 strcmp trailing checks
00db10
    
00db10
        This patch optimizes the POWER7 trailing check by avoiding byte
00db10
        read operations and instead using the doubleword already read with
00db10
        bitwise operations.
00db10
    
00db10
        ChangeLog:
00db10
    	2015-01-13  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
00db10
    		    Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
00db10
    
00db10
    	* sysdeps/powerpc/powerpc64/power7/strcmp.S (strcmp): Optimize
00db10
    	trailing byte check.
00db10
00db10
diff --git a/sysdeps/powerpc/powerpc64/power7/strcmp.S b/sysdeps/powerpc/powerpc64/power7/strcmp.S
00db10
index f16a9d8..ade2811 100644
00db10
--- a/sysdeps/powerpc/powerpc64/power7/strcmp.S
00db10
+++ b/sysdeps/powerpc/powerpc64/power7/strcmp.S
00db10
@@ -25,122 +25,96 @@
00db10
 
00db10
 /* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])  */
00db10
 
00db10
+	.machine	power7
00db10
 EALIGN (strcmp, 4, 0)
00db10
 	CALL_MCOUNT 2
00db10
 
00db10
 	or r9, r3, r4
00db10
 	rldicl. r10, r9, 0, 61	/* are s1 and s2 8 byte aligned..?  */
00db10
 	bne cr0, L(process_unaligned_bytes)
00db10
+	li	r5, 0
00db10
 
00db10
+	.align 4
00db10
 /* process input parameters on double word aligned boundary  */
00db10
-	ld r9, 0(r4)		/* load s2 at offset=0  */
00db10
-	li r10, 0		/* load mask=0  */
00db10
-	cmpb r10, r9, r10	/* compare bytes at s2 with mask  */
00db10
-	cmpdi cr7, r10, 0	/* is NULL found ..? is end of string HIT  */
00db10
-	bne cr7, L(process_unaligned_bytes)	/* process byte by byte  */
00db10
-
00db10
-	ld r10, 0(r3)		/* load s1 at offset=0  */
00db10
-	li r8, 0		/* load mask=0  */
00db10
-	cmpb r8, r10, r8	/* compare bytes at s1 with mask  */
00db10
-	cmpdi cr7, r8, 0	/* is NULL found ..? is end of string HIT  */
00db10
-	bne cr7, L(process_unaligned_bytes)	/* process byte by byte  */
00db10
-
00db10
-/*s1 and s2 does not contain NULL now , so compare all 8 bytes in a GO  */
00db10
-	cmpb r9, r10, r9	/* compare s1 and s2  */
00db10
-	cmpdi cr7, r9, -1	/* compare result with 0xFFFFFFFFFFFFFFFF  */
00db10
-	bne cr7, L(process_unaligned_bytes)	/* s1,s2 mismatch found  */
00db10
-
00db10
-	addi r5, r3, 8		/* save next offset of s2  */
00db10
-	addi r11, r4, 8		/* save next offset of s1  */
00db10
-	ld r8, 8(r4)		/* load s2 at offset=8  */
00db10
-	li r9, 0		/* load mask=0  */
00db10
-	cmpb r9, r8, r9		/* compare bytes at s2 with mask  */
00db10
-	cmpdi cr7, r9, 0	/* NULL found ..?  */
00db10
-	bne cr7, L(processBytes)/* update input and process bytes one by one  */
00db10
-
00db10
-	mr r9, r4		/* save s2  */
00db10
-	li r10, 0		/* load mask=0  */
00db10
-
00db10
-	ld r7, 8(r3)		/* load s1 at offset=8  */
00db10
-	cmpb r6, r7, r10	/* compare bytes at s1 with mask  */
00db10
-	cmpdi cr7, r6, 0	/* is NULL found  */
00db10
-	bne cr7, L(processBytes)/* mismatch, so process one by one  */
00db10
-
00db10
 L(unrollDword):
00db10
-	cmpb r8, r7, r8		/* compare s1 and s2  */
00db10
-	cmpdi cr7, r8, -1	/* compare result with 0xFFFFFFFFFFFFFFFF  */
00db10
-	bne cr7, L(processBytes)/* mismatch with s1 and s2  */
00db10
-
00db10
-	addi r5, r3, 16		/* save offset=16 of s1  */
00db10
-	addi r4, r9, 16		/* save offset=16 of s2  */
00db10
-	ld r8, 16(r9)		/* load s2 at offset=16  */
00db10
-	cmpb r7, r8, r10	/* compare bytes at s2 with mask  */
00db10
-	cmpdi cr7, r7, 0	/* NULL found  ..?  */
00db10
-	bne cr7, L(update2processBytes)
00db10
-
00db10
-	ld r7, 16(r3)		/* load s1 at offset=16  */
00db10
-	cmpb r6, r7, r10	/* check s1 for end of string  */
00db10
-	cmpdi cr7, r6, 0	/* end of s1 ?,then handle byte by byte  */
00db10
-	bne 7,L(update2processBytes)
00db10
-
00db10
-	cmpb r8, r7, r8		/* compare s1 and s2 double words  */
00db10
-	cmpdi cr7, r8, -1	/* compare results with 0xFFFFFFFFFFFFFFFF  */
00db10
-	bne cr7,L(update2processBytes)
00db10
-
00db10
-	addi r5, r3, 24		/* update s1 to offset=24  */
00db10
-	addi r4, r9, 24		/* update s2 to offset=24  */
00db10
-
00db10
-	ld r8, 24(r9)		/* load s2  */
00db10
-	cmpb r7, r8, r10	/* compare s2 for NULL  */
00db10
-	cmpdi cr7, r7, 0	/* verify if s2 is ending now  */
00db10
-	bne cr7,L(update2processBytes)
00db10
-
00db10
-	ld r7, 24(r3)		/* load s1 at offset=24  */
00db10
-	cmpb r6, r7, r10	/* verify for NULL  */
00db10
-	cmpdi cr7, r6, 0	/* is NULL found  */
00db10
-	bne cr7, L(update2processBytes)
00db10
-
00db10
-	cmpb r8, r7, r8		/* compare s1 and s2  */
00db10
-	cmpdi cr7, r8, -1	/* are s1 and s2 same ..?  */
00db10
-	bne cr7, L(update2processBytes)
00db10
-
00db10
-	addi r7, r9, 32		/* update s2 to next double word  */
00db10
-	addi r3, r3, 32		/* update s1 to next double word  */
00db10
-
00db10
-	ld r8, 32(r9)		/* load s2  */
00db10
-	mr r4, r7		/* save s2  */
00db10
-	cmpb r6, r8, r10	/* compare s2 with NULL  */
00db10
-	cmpdi cr7, r6, 0	/* end of s2 ..? */
00db10
-	bne cr7, L(process_unaligned_bytes)
00db10
-
00db10
-	ld r6, 0(r3)		/* load and compare s1 for NULL  */
00db10
-	cmpb r5, r6, r10
00db10
-	cmpdi cr7, r5, 0
00db10
-	bne cr7, L(process_unaligned_bytes)
00db10
-
00db10
-	cmpb r8, r6, r8		/* compare s1 and s2  */
00db10
-	cmpdi cr7, r8, -1
00db10
-	bne cr7, L(process_unaligned_bytes)
00db10
-
00db10
-	addi r5, r3, 8		/* increment s1 and d2 here  */
00db10
-	addi r11, r9, 40
00db10
-
00db10
-	ld r8, 40(r9)		/* process s2 now  */
00db10
-	cmpb r9, r8, r10
00db10
-	cmpdi cr7, r9, 0
00db10
-	bne cr7, L(processBytes)
00db10
-
00db10
-	mr r9, r7
00db10
-	ld r7, 8(r3)		/* process s1 now  */
00db10
-	cmpb r6, r7, r10
00db10
-	cmpdi cr7, r6, 0
00db10
-	beq cr7, L(unrollDword)	/* unroll to compare s1 and s2  */
00db10
-
00db10
-L(processBytes):
00db10
-	mr r4, r11		/* update input params  */
00db10
-	mr r3, r5
00db10
-
00db10
-	.p2align 4
00db10
+	ld	r8,0(r3)
00db10
+	ld	r10,0(r4)
00db10
+	cmpb	r7,r8,r5
00db10
+	cmpdi	cr7,r7,0
00db10
+	mr	r9,r7
00db10
+	bne 	cr7,L(null_found)
00db10
+	cmpld	cr7,r8,r10
00db10
+	bne	cr7,L(different)
00db10
+
00db10
+	ld	r8,8(r3)
00db10
+	ld	r10,8(r4)
00db10
+	cmpb	r7,r8,r5
00db10
+	cmpdi	cr7,r7,0
00db10
+	mr	r9,r7
00db10
+	bne 	cr7,L(null_found)
00db10
+	cmpld	cr7,r8,r10
00db10
+	bne	cr7,L(different)
00db10
+
00db10
+	ld	r8,16(r3)
00db10
+	ld	r10,16(r4)
00db10
+	cmpb	r7,r8,r5
00db10
+	cmpdi	cr7,r7,0
00db10
+	mr	r9,r7
00db10
+	bne 	cr7,L(null_found)
00db10
+	cmpld	cr7,r8,r10
00db10
+	bne	cr7,L(different)
00db10
+
00db10
+	ld	r8,24(r3)
00db10
+	ld	r10,24(r4)
00db10
+	cmpb	r7,r8,r5
00db10
+	cmpdi	cr7,r7,0
00db10
+	mr	r9,r7
00db10
+	bne 	cr7,L(null_found)
00db10
+	cmpld	cr7,r8,r10
00db10
+	bne	cr7,L(different)
00db10
+
00db10
+	addi r3, r3, 32
00db10
+	addi r4, r4, 32
00db10
+	beq cr7, L(unrollDword)
00db10
+
00db10
+	.align 4
00db10
+L(null_found):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	neg	r7,r9
00db10
+	and	r9,r9,r7
00db10
+	li	r7,-1
00db10
+	cntlzd	r9,r9
00db10
+	subfic	r9,r9,71
00db10
+	sld	r9,r7,r9
00db10
+#else
00db10
+	cntlzd	r9,r9
00db10
+	li	r7,-1
00db10
+	addi	r9,r9,8
00db10
+	srd	r9,r7,r9
00db10
+#endif
00db10
+	or	r8,r8,r9
00db10
+	or	r10,r10,r9
00db10
+
00db10
+L(different):
00db10
+	cmpb	r9,r8,r10
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	addi	r7,r9,1
00db10
+	andc	r9,r7,r9
00db10
+	cntlzd	r9,r9
00db10
+	subfic	r9,r9,63
00db10
+#else
00db10
+	not	r9,r9
00db10
+	cntlzd	r9,r9
00db10
+	subfic	r9,r9,56
00db10
+#endif
00db10
+	srd	r3,r8,r9
00db10
+	srd	r10,r10,r9
00db10
+	rldicl	r10,r10,0,56
00db10
+	rldicl	r3,r3,0,56
00db10
+	subf	r3,r10,r3
00db10
+	blr
00db10
+
00db10
+	.align 4
00db10
 L(process_unaligned_bytes):
00db10
 	lbz r9, 0(r3)		/* load byte from s1  */
00db10
 	lbz r10, 0(r4)		/* load byte from s2  */
00db10
@@ -172,24 +146,19 @@ L(process_unaligned_bytes):
00db10
 	addi r4, r4, 4		/* increment s2 by unroll factor  */
00db10
 	beq cr6, L(process_unaligned_bytes)	/* unroll byte processing  */
00db10
 
00db10
-	.p2align 4
00db10
+	.align 4
00db10
 L(ComputeDiff):
00db10
 	extsw r9, r9
00db10
 	subf r10, r10, r9	/* compute s1 - s2  */
00db10
 	extsw r3, r10
00db10
 	blr			/* return  */
00db10
 
00db10
-	.p2align 4
00db10
+	.align 4
00db10
 L(diffOfNULL):
00db10
 	li r9, 0
00db10
 	subf r10, r10, r9	/* compute s1 - s2  */
00db10
 	extsw r3, r10		/* sign extend result  */
00db10
 	blr			/* return  */
00db10
 
00db10
-	.p2align 4
00db10
-L(update2processBytes):
00db10
-	mr r3, r5		/* update and proceed  */
00db10
-	b L(process_unaligned_bytes)
00db10
-
00db10
 END (strcmp)
00db10
 libc_hidden_builtin_def (strcmp)