|
|
bca718 |
Backport of
|
|
|
bca718 |
commit ce6615c9c686acd34672a9f4eba9bcf5553496f6
|
|
|
bca718 |
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
|
|
|
bca718 |
Date: Sun Jan 11 19:33:17 2015 -0600
|
|
|
bca718 |
|
|
|
bca718 |
powerpc: Fix POWER7/PPC64 performance regression on LE
|
|
|
bca718 |
|
|
|
bca718 |
This patch fixes a performance regression on the POWER7/PPC64 memcmp
|
|
|
bca718 |
port for Little Endian. The LE code uses the 'ldbrx' instruction to read
|
|
|
bca718 |
the memory in byte-reversed form; however, ISA 2.06 only provides the indexed
|
|
|
bca718 |
form which uses a register value as additional index, instead of a fixed value
|
|
|
bca718 |
encoded in the instruction.
|
|
|
bca718 |
|
|
|
bca718 |
And the port strategy for LE uses r0 index value and updates the address
|
|
|
bca718 |
value on each compare loop iteration. For large compare size values,
|
|
|
bca718 |
it adds 8 more instructions plus some more depending on the trailing
|
|
|
bca718 |
size. This patch fixes it by adding pre-calculated indexes to remove the
|
|
|
bca718 |
address update on loops and trailing sizes.
|
|
|
bca718 |
|
|
|
bca718 |
For large sizes it shows a considerable gain, with double performance
|
|
|
bca718 |
pairing with BE.
|
|
|
bca718 |
|
|
|
bca718 |
ChangeLog:
|
|
|
bca718 |
2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
|
|
|
bca718 |
|
|
|
bca718 |
* sysdeps/powerpc/powerpc64/power7/memcmp.S (memcmp): Fix performance
|
|
|
bca718 |
regression on LE.
|
|
|
bca718 |
|
|
|
bca718 |
diff --git a/sysdeps/powerpc/powerpc64/power7/memcmp.S b/sysdeps/powerpc/powerpc64/power7/memcmp.S
|
|
|
bca718 |
index 09bff69..98b9e54 100644
|
|
|
bca718 |
--- a/sysdeps/powerpc/powerpc64/power7/memcmp.S
|
|
|
bca718 |
+++ b/sysdeps/powerpc/powerpc64/power7/memcmp.S
|
|
|
bca718 |
@@ -26,18 +26,48 @@
|
|
|
bca718 |
EALIGN (memcmp, 4, 0)
|
|
|
bca718 |
CALL_MCOUNT 3
|
|
|
bca718 |
|
|
|
bca718 |
-#define rRTN r3
|
|
|
bca718 |
-#define rSTR1 r3 /* first string arg */
|
|
|
bca718 |
-#define rSTR2 r4 /* second string arg */
|
|
|
bca718 |
-#define rN r5 /* max string length */
|
|
|
bca718 |
-#define rWORD1 r6 /* current word in s1 */
|
|
|
bca718 |
-#define rWORD2 r7 /* current word in s2 */
|
|
|
bca718 |
-#define rWORD3 r8 /* next word in s1 */
|
|
|
bca718 |
-#define rWORD4 r9 /* next word in s2 */
|
|
|
bca718 |
-#define rWORD5 r10 /* next word in s1 */
|
|
|
bca718 |
-#define rWORD6 r11 /* next word in s2 */
|
|
|
bca718 |
-#define rWORD7 r30 /* next word in s1 */
|
|
|
bca718 |
-#define rWORD8 r31 /* next word in s2 */
|
|
|
bca718 |
+#define rRTN r3
|
|
|
bca718 |
+#define rSTR1 r3 /* first string arg */
|
|
|
bca718 |
+#define rSTR2 r4 /* second string arg */
|
|
|
bca718 |
+#define rN r5 /* max string length */
|
|
|
bca718 |
+#define rWORD1 r6 /* current word in s1 */
|
|
|
bca718 |
+#define rWORD2 r7 /* current word in s2 */
|
|
|
bca718 |
+#define rWORD3 r8 /* next word in s1 */
|
|
|
bca718 |
+#define rWORD4 r9 /* next word in s2 */
|
|
|
bca718 |
+#define rWORD5 r10 /* next word in s1 */
|
|
|
bca718 |
+#define rWORD6 r11 /* next word in s2 */
|
|
|
bca718 |
+
|
|
|
bca718 |
+#define rOFF8 r20 /* 8 bytes offset. */
|
|
|
bca718 |
+#define rOFF16 r21 /* 16 bytes offset. */
|
|
|
bca718 |
+#define rOFF24 r22 /* 24 bytes offset. */
|
|
|
bca718 |
+#define rOFF32 r23 /* 32 bytes offset. */
|
|
|
bca718 |
+#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
|
|
|
bca718 |
+#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
|
|
|
bca718 |
+#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
|
|
|
bca718 |
+#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
|
|
|
bca718 |
+#define rSHR r28 /* Unaligned shift right count. */
|
|
|
bca718 |
+#define rSHL r29 /* Unaligned shift left count. */
|
|
|
bca718 |
+#define rWORD7 r30 /* next word in s1 */
|
|
|
bca718 |
+#define rWORD8 r31 /* next word in s2 */
|
|
|
bca718 |
+
|
|
|
bca718 |
+#define rWORD8SAVE (-8)
|
|
|
bca718 |
+#define rWORD7SAVE (-16)
|
|
|
bca718 |
+#define rOFF8SAVE (-24)
|
|
|
bca718 |
+#define rOFF16SAVE (-32)
|
|
|
bca718 |
+#define rOFF24SAVE (-40)
|
|
|
bca718 |
+#define rOFF32SAVE (-48)
|
|
|
bca718 |
+#define rSHRSAVE (-56)
|
|
|
bca718 |
+#define rSHLSAVE (-64)
|
|
|
bca718 |
+#define rWORD8SHIFTSAVE (-72)
|
|
|
bca718 |
+#define rWORD2SHIFTSAVE (-80)
|
|
|
bca718 |
+#define rWORD4SHIFTSAVE (-88)
|
|
|
bca718 |
+#define rWORD6SHIFTSAVE (-96)
|
|
|
bca718 |
+
|
|
|
bca718 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
+# define LD ldbrx
|
|
|
bca718 |
+#else
|
|
|
bca718 |
+# define LD ldx
|
|
|
bca718 |
+#endif
|
|
|
bca718 |
|
|
|
bca718 |
xor r0, rSTR2, rSTR1
|
|
|
bca718 |
cmpldi cr6, rN, 0
|
|
|
bca718 |
@@ -51,10 +81,24 @@ EALIGN (memcmp, 4, 0)
|
|
|
bca718 |
/* If less than 8 bytes or not aligned, use the unaligned
|
|
|
bca718 |
byte loop. */
|
|
|
bca718 |
blt cr1, L(bytealigned)
|
|
|
bca718 |
- std rWORD8, -8(r1)
|
|
|
bca718 |
- cfi_offset(rWORD8, -8)
|
|
|
bca718 |
- std rWORD7, -16(r1)
|
|
|
bca718 |
- cfi_offset(rWORD7, -16)
|
|
|
bca718 |
+ std rWORD8, rWORD8SAVE(r1)
|
|
|
bca718 |
+ cfi_offset(rWORD8, rWORD8SAVE)
|
|
|
bca718 |
+ std rWORD7, rWORD7SAVE(r1)
|
|
|
bca718 |
+ cfi_offset(rWORD7, rWORD7SAVE)
|
|
|
bca718 |
+ std rOFF8, rOFF8SAVE(r1) /* Save r20-r23 before reusing them as load indexes. */
|
|
|
bca718 |
+ cfi_offset(rOFF8, rOFF8SAVE)
|
|
|
bca718 |
+ std rOFF16, rOFF16SAVE(r1)
|
|
|
bca718 |
+ cfi_offset(rOFF16, rOFF16SAVE)
|
|
|
bca718 |
+ std rOFF24, rOFF24SAVE(r1)
|
|
|
bca718 |
+ cfi_offset(rOFF24, rOFF24SAVE)
|
|
|
bca718 |
+ std rOFF32, rOFF32SAVE(r1)
|
|
|
bca718 |
+ cfi_offset(rOFF32, rOFF32SAVE)
|
|
|
bca718 |
+
|
|
|
bca718 |
+ li rOFF8,8
|
|
|
bca718 |
+ li rOFF16,16
|
|
|
bca718 |
+ li rOFF24,24
|
|
|
bca718 |
+ li rOFF32,32
|
|
|
bca718 |
+
|
|
|
bca718 |
bne L(unaligned)
|
|
|
bca718 |
/* At this point we know both strings have the same alignment and the
|
|
|
bca718 |
compare length is at least 8 bytes. r12 contains the low order
|
|
|
bca718 |
@@ -79,15 +123,8 @@ L(samealignment):
|
|
|
bca718 |
sldi rWORD6, r12, 3
|
|
|
bca718 |
srdi r0, rN, 5 /* Divide by 32 */
|
|
|
bca718 |
andi. r12, rN, 24 /* Get the DW remainder */
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD1, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD2, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD1, 0(rSTR1)
|
|
|
bca718 |
- ld rWORD2, 0(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD1, 0, rSTR1
|
|
|
bca718 |
+ LD rWORD2, 0, rSTR2
|
|
|
bca718 |
cmpldi cr1, r12, 16
|
|
|
bca718 |
cmpldi cr7, rN, 32
|
|
|
bca718 |
clrldi rN, rN, 61
|
|
|
bca718 |
@@ -104,15 +141,8 @@ L(dsP1):
|
|
|
bca718 |
cmpld cr5, rWORD5, rWORD6
|
|
|
bca718 |
blt cr7, L(dP1x)
|
|
|
bca718 |
/* Do something useful in this cycle since we have to branch anyway. */
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD1, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD2, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD1, 8(rSTR1)
|
|
|
bca718 |
- ld rWORD2, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD1, rOFF8, rSTR1
|
|
|
bca718 |
+ LD rWORD2, rOFF8, rSTR2
|
|
|
bca718 |
cmpld cr7, rWORD1, rWORD2
|
|
|
bca718 |
b L(dP1e)
|
|
|
bca718 |
/* Remainder is 16 */
|
|
|
bca718 |
@@ -123,15 +153,8 @@ L(dPs2):
|
|
|
bca718 |
cmpld cr6, rWORD5, rWORD6
|
|
|
bca718 |
blt cr7, L(dP2x)
|
|
|
bca718 |
/* Do something useful in this cycle since we have to branch anyway. */
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD7, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD8, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD7, 8(rSTR1)
|
|
|
bca718 |
- ld rWORD8, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD7, rOFF8, rSTR1
|
|
|
bca718 |
+ LD rWORD8, rOFF8, rSTR2
|
|
|
bca718 |
cmpld cr5, rWORD7, rWORD8
|
|
|
bca718 |
b L(dP2e)
|
|
|
bca718 |
/* Remainder is 24 */
|
|
|
bca718 |
@@ -173,72 +196,43 @@ L(dP1):
|
|
|
bca718 |
change any on the early exit path. The key here is the non-early
|
|
|
bca718 |
exit path only cares about the condition code (cr5), not about which
|
|
|
bca718 |
register pair was used. */
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD5, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD6, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD5, 0(rSTR1)
|
|
|
bca718 |
- ld rWORD6, 0(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD5, 0, rSTR1
|
|
|
bca718 |
+ LD rWORD6, 0, rSTR2
|
|
|
bca718 |
cmpld cr5, rWORD5, rWORD6
|
|
|
bca718 |
blt cr7, L(dP1x)
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD1, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD2, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD1, 8(rSTR1)
|
|
|
bca718 |
- ld rWORD2, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD1, rOFF8, rSTR1
|
|
|
bca718 |
+ LD rWORD2, rOFF8, rSTR2
|
|
|
bca718 |
cmpld cr7, rWORD1, rWORD2
|
|
|
bca718 |
L(dP1e):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD3, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD4, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD3, 16(rSTR1)
|
|
|
bca718 |
- ld rWORD4, 16(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD3, rOFF16, rSTR1
|
|
|
bca718 |
+ LD rWORD4, rOFF16, rSTR2
|
|
|
bca718 |
cmpld cr1, rWORD3, rWORD4
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD5, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD6, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD5, 24(rSTR1)
|
|
|
bca718 |
- ld rWORD6, 24(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD5, rOFF24, rSTR1
|
|
|
bca718 |
+ LD rWORD6, rOFF24, rSTR2
|
|
|
bca718 |
cmpld cr6, rWORD5, rWORD6
|
|
|
bca718 |
bne cr5, L(dLcr5x)
|
|
|
bca718 |
bne cr7, L(dLcr7x)
|
|
|
bca718 |
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD7, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD8, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ldu rWORD7, 32(rSTR1)
|
|
|
bca718 |
- ldu rWORD8, 32(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD7, rOFF32, rSTR1
|
|
|
bca718 |
+ LD rWORD8, rOFF32, rSTR2
|
|
|
bca718 |
+ addi rSTR1, rSTR1, 32
|
|
|
bca718 |
+ addi rSTR2, rSTR2, 32
|
|
|
bca718 |
bne cr1, L(dLcr1)
|
|
|
bca718 |
cmpld cr5, rWORD7, rWORD8
|
|
|
bca718 |
bdnz L(dLoop)
|
|
|
bca718 |
bne cr6, L(dLcr6)
|
|
|
bca718 |
- ld rWORD8, -8(r1)
|
|
|
bca718 |
- ld rWORD7, -16(r1)
|
|
|
bca718 |
+ ld rWORD8, rWORD8SAVE(r1)
|
|
|
bca718 |
+ ld rWORD7, rWORD7SAVE(r1)
|
|
|
bca718 |
.align 3
|
|
|
bca718 |
L(dP1x):
|
|
|
bca718 |
sldi. r12, rN, 3
|
|
|
bca718 |
bne cr5, L(dLcr5x)
|
|
|
bca718 |
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
|
|
|
bca718 |
bne L(d00)
|
|
|
bca718 |
+ ld rOFF8, rOFF8SAVE(r1)
|
|
|
bca718 |
+ ld rOFF16, rOFF16SAVE(r1)
|
|
|
bca718 |
+ ld rOFF24, rOFF24SAVE(r1)
|
|
|
bca718 |
+ ld rOFF32, rOFF32SAVE(r1)
|
|
|
bca718 |
li rRTN, 0
|
|
|
bca718 |
blr
|
|
|
bca718 |
|
|
|
bca718 |
@@ -246,79 +240,41 @@ L(dP1x):
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(dP2):
|
|
|
bca718 |
mtctr r0
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD5, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD6, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD5, 0(rSTR1)
|
|
|
bca718 |
- ld rWORD6, 0(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD5, 0, rSTR1
|
|
|
bca718 |
+ LD rWORD6, 0, rSTR2
|
|
|
bca718 |
cmpld cr6, rWORD5, rWORD6
|
|
|
bca718 |
blt cr7, L(dP2x)
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD7, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD8, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD7, 8(rSTR1)
|
|
|
bca718 |
- ld rWORD8, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD7, rOFF8, rSTR1
|
|
|
bca718 |
+ LD rWORD8, rOFF8, rSTR2
|
|
|
bca718 |
cmpld cr5, rWORD7, rWORD8
|
|
|
bca718 |
L(dP2e):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD1, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD2, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD1, 16(rSTR1)
|
|
|
bca718 |
- ld rWORD2, 16(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD1, rOFF16, rSTR1
|
|
|
bca718 |
+ LD rWORD2, rOFF16, rSTR2
|
|
|
bca718 |
cmpld cr7, rWORD1, rWORD2
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD3, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD4, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD3, 24(rSTR1)
|
|
|
bca718 |
- ld rWORD4, 24(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD3, rOFF24, rSTR1
|
|
|
bca718 |
+ LD rWORD4, rOFF24, rSTR2
|
|
|
bca718 |
cmpld cr1, rWORD3, rWORD4
|
|
|
bca718 |
-#ifndef __LITTLE_ENDIAN__
|
|
|
bca718 |
addi rSTR1, rSTR1, 8
|
|
|
bca718 |
addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
bne cr6, L(dLcr6)
|
|
|
bca718 |
bne cr5, L(dLcr5)
|
|
|
bca718 |
b L(dLoop2)
|
|
|
bca718 |
-/* Again we are on a early exit path (16-23 byte compare), we want to
|
|
|
bca718 |
- only use volatile registers and avoid restoring non-volatile
|
|
|
bca718 |
- registers. */
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(dP2x):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD3, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD4, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD3, 8(rSTR1)
|
|
|
bca718 |
- ld rWORD4, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD3, rOFF8, rSTR1
|
|
|
bca718 |
+ LD rWORD4, rOFF8, rSTR2
|
|
|
bca718 |
cmpld cr1, rWORD3, rWORD4
|
|
|
bca718 |
sldi. r12, rN, 3
|
|
|
bca718 |
bne cr6, L(dLcr6x)
|
|
|
bca718 |
-#ifndef __LITTLE_ENDIAN__
|
|
|
bca718 |
addi rSTR1, rSTR1, 8
|
|
|
bca718 |
addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
bne cr1, L(dLcr1x)
|
|
|
bca718 |
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
|
|
|
bca718 |
bne L(d00)
|
|
|
bca718 |
+ ld rOFF8, rOFF8SAVE(r1)
|
|
|
bca718 |
+ ld rOFF16, rOFF16SAVE(r1)
|
|
|
bca718 |
+ ld rOFF24, rOFF24SAVE(r1)
|
|
|
bca718 |
+ ld rOFF32, rOFF32SAVE(r1)
|
|
|
bca718 |
li rRTN, 0
|
|
|
bca718 |
blr
|
|
|
bca718 |
|
|
|
bca718 |
@@ -326,52 +282,22 @@ L(dP2x):
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(dP3):
|
|
|
bca718 |
mtctr r0
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD3, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD4, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD3, 0(rSTR1)
|
|
|
bca718 |
- ld rWORD4, 0(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD3, 0, rSTR1
|
|
|
bca718 |
+ LD rWORD4, 0, rSTR2
|
|
|
bca718 |
cmpld cr1, rWORD3, rWORD4
|
|
|
bca718 |
L(dP3e):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD5, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD6, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD5, 8(rSTR1)
|
|
|
bca718 |
- ld rWORD6, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD5, rOFF8, rSTR1
|
|
|
bca718 |
+ LD rWORD6, rOFF8, rSTR2
|
|
|
bca718 |
cmpld cr6, rWORD5, rWORD6
|
|
|
bca718 |
blt cr7, L(dP3x)
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD7, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD8, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD7, 16(rSTR1)
|
|
|
bca718 |
- ld rWORD8, 16(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD7, rOFF16, rSTR1
|
|
|
bca718 |
+ LD rWORD8, rOFF16, rSTR2
|
|
|
bca718 |
cmpld cr5, rWORD7, rWORD8
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD1, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD2, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD1, 24(rSTR1)
|
|
|
bca718 |
- ld rWORD2, 24(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD1, rOFF24, rSTR1
|
|
|
bca718 |
+ LD rWORD2, rOFF24, rSTR2
|
|
|
bca718 |
cmpld cr7, rWORD1, rWORD2
|
|
|
bca718 |
-#ifndef __LITTLE_ENDIAN__
|
|
|
bca718 |
addi rSTR1, rSTR1, 16
|
|
|
bca718 |
addi rSTR2, rSTR2, 16
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
bne cr1, L(dLcr1)
|
|
|
bca718 |
bne cr6, L(dLcr6)
|
|
|
bca718 |
b L(dLoop1)
|
|
|
bca718 |
@@ -380,26 +306,21 @@ L(dP3e):
|
|
|
bca718 |
registers. */
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(dP3x):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD1, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD2, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD1, 16(rSTR1)
|
|
|
bca718 |
- ld rWORD2, 16(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD1, rOFF16, rSTR1
|
|
|
bca718 |
+ LD rWORD2, rOFF16, rSTR2
|
|
|
bca718 |
cmpld cr7, rWORD1, rWORD2
|
|
|
bca718 |
sldi. r12, rN, 3
|
|
|
bca718 |
bne cr1, L(dLcr1x)
|
|
|
bca718 |
-#ifndef __LITTLE_ENDIAN__
|
|
|
bca718 |
addi rSTR1, rSTR1, 16
|
|
|
bca718 |
addi rSTR2, rSTR2, 16
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
bne cr6, L(dLcr6x)
|
|
|
bca718 |
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
|
|
|
bca718 |
bne cr7, L(dLcr7x)
|
|
|
bca718 |
bne L(d00)
|
|
|
bca718 |
+ ld rOFF8, rOFF8SAVE(r1)
|
|
|
bca718 |
+ ld rOFF16, rOFF16SAVE(r1)
|
|
|
bca718 |
+ ld rOFF24, rOFF24SAVE(r1)
|
|
|
bca718 |
+ ld rOFF32, rOFF32SAVE(r1)
|
|
|
bca718 |
li rRTN, 0
|
|
|
bca718 |
blr
|
|
|
bca718 |
|
|
|
bca718 |
@@ -407,46 +328,20 @@ L(dP3x):
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(dP4):
|
|
|
bca718 |
mtctr r0
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD1, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD2, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD1, 0(rSTR1)
|
|
|
bca718 |
- ld rWORD2, 0(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD1, 0, rSTR1
|
|
|
bca718 |
+ LD rWORD2, 0, rSTR2
|
|
|
bca718 |
cmpld cr7, rWORD1, rWORD2
|
|
|
bca718 |
L(dP4e):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD3, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD4, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD3, 8(rSTR1)
|
|
|
bca718 |
- ld rWORD4, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD3, rOFF8, rSTR1
|
|
|
bca718 |
+ LD rWORD4, rOFF8, rSTR2
|
|
|
bca718 |
cmpld cr1, rWORD3, rWORD4
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD5, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD6, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD5, 16(rSTR1)
|
|
|
bca718 |
- ld rWORD6, 16(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD5, rOFF16, rSTR1
|
|
|
bca718 |
+ LD rWORD6, rOFF16, rSTR2
|
|
|
bca718 |
cmpld cr6, rWORD5, rWORD6
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD7, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD8, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ldu rWORD7, 24(rSTR1)
|
|
|
bca718 |
- ldu rWORD8, 24(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD7, rOFF24, rSTR1
|
|
|
bca718 |
+ LD rWORD8, rOFF24, rSTR2
|
|
|
bca718 |
+ addi rSTR1, rSTR1, 24
|
|
|
bca718 |
+ addi rSTR2, rSTR2, 24
|
|
|
bca718 |
cmpld cr5, rWORD7, rWORD8
|
|
|
bca718 |
bne cr7, L(dLcr7)
|
|
|
bca718 |
bne cr1, L(dLcr1)
|
|
|
bca718 |
@@ -454,51 +349,25 @@ L(dP4e):
|
|
|
bca718 |
/* This is the primary loop */
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(dLoop):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD1, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD2, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD1, 8(rSTR1)
|
|
|
bca718 |
- ld rWORD2, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD1, rOFF8, rSTR1
|
|
|
bca718 |
+ LD rWORD2, rOFF8, rSTR2
|
|
|
bca718 |
cmpld cr1, rWORD3, rWORD4
|
|
|
bca718 |
bne cr6, L(dLcr6)
|
|
|
bca718 |
L(dLoop1):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD3, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD4, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD3, 16(rSTR1)
|
|
|
bca718 |
- ld rWORD4, 16(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD3, rOFF16, rSTR1
|
|
|
bca718 |
+ LD rWORD4, rOFF16, rSTR2
|
|
|
bca718 |
cmpld cr6, rWORD5, rWORD6
|
|
|
bca718 |
bne cr5, L(dLcr5)
|
|
|
bca718 |
L(dLoop2):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD5, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD6, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD5, 24(rSTR1)
|
|
|
bca718 |
- ld rWORD6, 24(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD5, rOFF24, rSTR1
|
|
|
bca718 |
+ LD rWORD6, rOFF24, rSTR2
|
|
|
bca718 |
cmpld cr5, rWORD7, rWORD8
|
|
|
bca718 |
bne cr7, L(dLcr7)
|
|
|
bca718 |
L(dLoop3):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD7, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD8, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ldu rWORD7, 32(rSTR1)
|
|
|
bca718 |
- ldu rWORD8, 32(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD7, rOFF32, rSTR1
|
|
|
bca718 |
+ LD rWORD8, rOFF32, rSTR2
|
|
|
bca718 |
+ addi rSTR1, rSTR1, 32
|
|
|
bca718 |
+ addi rSTR2, rSTR2, 32
|
|
|
bca718 |
bne cr1, L(dLcr1)
|
|
|
bca718 |
cmpld cr7, rWORD1, rWORD2
|
|
|
bca718 |
bdnz L(dLoop)
|
|
|
bca718 |
@@ -519,62 +388,75 @@ L(d14):
|
|
|
bca718 |
sldi. r12, rN, 3
|
|
|
bca718 |
bne cr5, L(dLcr5)
|
|
|
bca718 |
L(d04):
|
|
|
bca718 |
- ld rWORD8, -8(r1)
|
|
|
bca718 |
- ld rWORD7, -16(r1)
|
|
|
bca718 |
+ ld rWORD8, rWORD8SAVE(r1)
|
|
|
bca718 |
+ ld rWORD7, rWORD7SAVE(r1)
|
|
|
bca718 |
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
|
|
|
bca718 |
- beq L(zeroLength)
|
|
|
bca718 |
+ beq L(duzeroLength)
|
|
|
bca718 |
/* At this point we have a remainder of 1 to 7 bytes to compare. Since
|
|
|
bca718 |
we are aligned it is safe to load the whole double word, and use
|
|
|
bca718 |
shift right double to eliminate bits beyond the compare length. */
|
|
|
bca718 |
L(d00):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD1, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD2, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD1, 8(rSTR1)
|
|
|
bca718 |
- ld rWORD2, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD1, rOFF8, rSTR1
|
|
|
bca718 |
+ LD rWORD2, rOFF8, rSTR2
|
|
|
bca718 |
srd rWORD1, rWORD1, rN
|
|
|
bca718 |
srd rWORD2, rWORD2, rN
|
|
|
bca718 |
cmpld cr7, rWORD1, rWORD2
|
|
|
bca718 |
bne cr7, L(dLcr7x)
|
|
|
bca718 |
+ ld rOFF8, rOFF8SAVE(r1)
|
|
|
bca718 |
+ ld rOFF16, rOFF16SAVE(r1)
|
|
|
bca718 |
+ ld rOFF24, rOFF24SAVE(r1)
|
|
|
bca718 |
+ ld rOFF32, rOFF32SAVE(r1)
|
|
|
bca718 |
li rRTN, 0
|
|
|
bca718 |
blr
|
|
|
bca718 |
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(dLcr7):
|
|
|
bca718 |
- ld rWORD8, -8(r1)
|
|
|
bca718 |
- ld rWORD7, -16(r1)
|
|
|
bca718 |
+ ld rWORD8, rWORD8SAVE(r1)
|
|
|
bca718 |
+ ld rWORD7, rWORD7SAVE(r1)
|
|
|
bca718 |
L(dLcr7x):
|
|
|
bca718 |
+ ld rOFF8, rOFF8SAVE(r1)
|
|
|
bca718 |
+ ld rOFF16, rOFF16SAVE(r1)
|
|
|
bca718 |
+ ld rOFF24, rOFF24SAVE(r1)
|
|
|
bca718 |
+ ld rOFF32, rOFF32SAVE(r1)
|
|
|
bca718 |
li rRTN, 1
|
|
|
bca718 |
bgtlr cr7
|
|
|
bca718 |
li rRTN, -1
|
|
|
bca718 |
blr
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(dLcr1):
|
|
|
bca718 |
- ld rWORD8, -8(r1)
|
|
|
bca718 |
- ld rWORD7, -16(r1)
|
|
|
bca718 |
+ ld rWORD8, rWORD8SAVE(r1)
|
|
|
bca718 |
+ ld rWORD7, rWORD7SAVE(r1)
|
|
|
bca718 |
L(dLcr1x):
|
|
|
bca718 |
+ ld rOFF8, rOFF8SAVE(r1)
|
|
|
bca718 |
+ ld rOFF16, rOFF16SAVE(r1)
|
|
|
bca718 |
+ ld rOFF24, rOFF24SAVE(r1)
|
|
|
bca718 |
+ ld rOFF32, rOFF32SAVE(r1)
|
|
|
bca718 |
li rRTN, 1
|
|
|
bca718 |
bgtlr cr1
|
|
|
bca718 |
li rRTN, -1
|
|
|
bca718 |
blr
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(dLcr6):
|
|
|
bca718 |
- ld rWORD8, -8(r1)
|
|
|
bca718 |
- ld rWORD7, -16(r1)
|
|
|
bca718 |
+ ld rWORD8, rWORD8SAVE(r1)
|
|
|
bca718 |
+ ld rWORD7, rWORD7SAVE(r1)
|
|
|
bca718 |
L(dLcr6x):
|
|
|
bca718 |
+ ld rOFF8, rOFF8SAVE(r1)
|
|
|
bca718 |
+ ld rOFF16, rOFF16SAVE(r1)
|
|
|
bca718 |
+ ld rOFF24, rOFF24SAVE(r1)
|
|
|
bca718 |
+ ld rOFF32, rOFF32SAVE(r1)
|
|
|
bca718 |
li rRTN, 1
|
|
|
bca718 |
bgtlr cr6
|
|
|
bca718 |
li rRTN, -1
|
|
|
bca718 |
blr
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(dLcr5):
|
|
|
bca718 |
- ld rWORD8, -8(r1)
|
|
|
bca718 |
- ld rWORD7, -16(r1)
|
|
|
bca718 |
+ ld rWORD8, rWORD8SAVE(r1)
|
|
|
bca718 |
+ ld rWORD7, rWORD7SAVE(r1)
|
|
|
bca718 |
L(dLcr5x):
|
|
|
bca718 |
+ ld rOFF8, rOFF8SAVE(r1)
|
|
|
bca718 |
+ ld rOFF16, rOFF16SAVE(r1)
|
|
|
bca718 |
+ ld rOFF24, rOFF24SAVE(r1)
|
|
|
bca718 |
+ ld rOFF32, rOFF32SAVE(r1)
|
|
|
bca718 |
li rRTN, 1
|
|
|
bca718 |
bgtlr cr5
|
|
|
bca718 |
li rRTN, -1
|
|
|
bca718 |
@@ -583,10 +465,6 @@ L(dLcr5x):
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(bytealigned):
|
|
|
bca718 |
mtctr rN
|
|
|
bca718 |
-#if 0
|
|
|
bca718 |
-/* Huh? We've already branched on cr6! */
|
|
|
bca718 |
- beq cr6, L(zeroLength)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
|
|
|
bca718 |
/* We need to prime this loop. This loop is swing modulo scheduled
|
|
|
bca718 |
to avoid pipe delays. The dependent instruction latencies (load to
|
|
|
bca718 |
@@ -685,6 +563,7 @@ L(b11):
|
|
|
bca718 |
L(bx12):
|
|
|
bca718 |
sub rRTN, rWORD1, rWORD2
|
|
|
bca718 |
blr
|
|
|
bca718 |
+
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(zeroLength):
|
|
|
bca718 |
li rRTN, 0
|
|
|
bca718 |
@@ -705,42 +584,36 @@ L(zeroLength):
|
|
|
bca718 |
we need to adjust the length (rN) and special case the loop
|
|
|
bca718 |
versioning for the first DW. This ensures that the loop count is
|
|
|
bca718 |
correct and the first DW (shifted) is in the expected resister pair. */
|
|
|
bca718 |
-#define rSHL r29 /* Unaligned shift left count. */
|
|
|
bca718 |
-#define rSHR r28 /* Unaligned shift right count. */
|
|
|
bca718 |
-#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
|
|
|
bca718 |
-#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
|
|
|
bca718 |
-#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
|
|
|
bca718 |
-#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
|
|
|
bca718 |
L(unaligned):
|
|
|
bca718 |
- std rSHL, -24(r1)
|
|
|
bca718 |
- cfi_offset(rSHL, -24)
|
|
|
bca718 |
+ std rSHL, rSHLSAVE(r1)
|
|
|
bca718 |
+ cfi_offset(rSHL, rSHLSAVE)
|
|
|
bca718 |
clrldi rSHL, rSTR2, 61
|
|
|
bca718 |
beq cr6, L(duzeroLength)
|
|
|
bca718 |
- std rSHR, -32(r1)
|
|
|
bca718 |
- cfi_offset(rSHR, -32)
|
|
|
bca718 |
+ std rSHR, rSHRSAVE(r1)
|
|
|
bca718 |
+ cfi_offset(rSHR, rSHRSAVE)
|
|
|
bca718 |
beq cr5, L(DWunaligned)
|
|
|
bca718 |
- std rWORD8_SHIFT, -40(r1)
|
|
|
bca718 |
- cfi_offset(rWORD8_SHIFT, -40)
|
|
|
bca718 |
+ std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
|
|
|
bca718 |
+ cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
|
|
|
bca718 |
/* Adjust the logical start of rSTR2 to compensate for the extra bits
|
|
|
bca718 |
in the 1st rSTR1 DW. */
|
|
|
bca718 |
sub rWORD8_SHIFT, rSTR2, r12
|
|
|
bca718 |
/* But do not attempt to address the DW before that DW that contains
|
|
|
bca718 |
the actual start of rSTR2. */
|
|
|
bca718 |
clrrdi rSTR2, rSTR2, 3
|
|
|
bca718 |
- std rWORD2_SHIFT, -48(r1)
|
|
|
bca718 |
- cfi_offset(rWORD2_SHIFT, -48)
|
|
|
bca718 |
+ std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
|
|
|
bca718 |
+ cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
|
|
|
bca718 |
/* Compute the left/right shift counts for the unaligned rSTR2,
|
|
|
bca718 |
compensating for the logical (DW aligned) start of rSTR1. */
|
|
|
bca718 |
clrldi rSHL, rWORD8_SHIFT, 61
|
|
|
bca718 |
clrrdi rSTR1, rSTR1, 3
|
|
|
bca718 |
- std rWORD4_SHIFT, -56(r1)
|
|
|
bca718 |
- cfi_offset(rWORD4_SHIFT, -56)
|
|
|
bca718 |
+ std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
|
|
|
bca718 |
+ cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
|
|
|
bca718 |
sldi rSHL, rSHL, 3
|
|
|
bca718 |
cmpld cr5, rWORD8_SHIFT, rSTR2
|
|
|
bca718 |
add rN, rN, r12
|
|
|
bca718 |
sldi rWORD6, r12, 3
|
|
|
bca718 |
- std rWORD6_SHIFT, -64(r1)
|
|
|
bca718 |
- cfi_offset(rWORD6_SHIFT, -64)
|
|
|
bca718 |
+ std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
|
|
|
bca718 |
+ cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
|
|
|
bca718 |
subfic rSHR, rSHL, 64
|
|
|
bca718 |
srdi r0, rN, 5 /* Divide by 32 */
|
|
|
bca718 |
andi. r12, rN, 24 /* Get the DW remainder */
|
|
|
bca718 |
@@ -750,25 +623,13 @@ L(unaligned):
|
|
|
bca718 |
this may cross a page boundary and cause a page fault. */
|
|
|
bca718 |
li rWORD8, 0
|
|
|
bca718 |
blt cr5, L(dus0)
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD8, 0, rSTR2
|
|
|
bca718 |
+ LD rWORD8, 0, rSTR2
|
|
|
bca718 |
addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD8, 0(rSTR2)
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
sld rWORD8, rWORD8, rSHL
|
|
|
bca718 |
|
|
|
bca718 |
L(dus0):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD1, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD2, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD1, 0(rSTR1)
|
|
|
bca718 |
- ld rWORD2, 0(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD1, 0, rSTR1
|
|
|
bca718 |
+ LD rWORD2, 0, rSTR2
|
|
|
bca718 |
cmpldi cr1, r12, 16
|
|
|
bca718 |
cmpldi cr7, rN, 32
|
|
|
bca718 |
srd r12, rWORD2, rSHR
|
|
|
bca718 |
@@ -796,12 +657,7 @@ L(dusP1):
|
|
|
bca718 |
beq L(duZeroReturn)
|
|
|
bca718 |
li r0, 0
|
|
|
bca718 |
ble cr7, L(dutrim)
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD2, 0, rSTR2
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD2, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD2, rOFF8, rSTR2
|
|
|
bca718 |
srd r0, rWORD2, rSHR
|
|
|
bca718 |
b L(dutrim)
|
|
|
bca718 |
/* Remainder is 16 */
|
|
|
bca718 |
@@ -832,27 +688,21 @@ L(duPs4):
|
|
|
bca718 |
compare length is at least 8 bytes. */
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(DWunaligned):
|
|
|
bca718 |
- std rWORD8_SHIFT, -40(r1)
|
|
|
bca718 |
- cfi_offset(rWORD8_SHIFT, -40)
|
|
|
bca718 |
+ std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
|
|
|
bca718 |
+ cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
|
|
|
bca718 |
clrrdi rSTR2, rSTR2, 3
|
|
|
bca718 |
- std rWORD2_SHIFT, -48(r1)
|
|
|
bca718 |
- cfi_offset(rWORD2_SHIFT, -48)
|
|
|
bca718 |
+ std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
|
|
|
bca718 |
+ cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
|
|
|
bca718 |
srdi r0, rN, 5 /* Divide by 32 */
|
|
|
bca718 |
- std rWORD4_SHIFT, -56(r1)
|
|
|
bca718 |
- cfi_offset(rWORD4_SHIFT, -56)
|
|
|
bca718 |
+ std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
|
|
|
bca718 |
+ cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
|
|
|
bca718 |
andi. r12, rN, 24 /* Get the DW remainder */
|
|
|
bca718 |
- std rWORD6_SHIFT, -64(r1)
|
|
|
bca718 |
- cfi_offset(rWORD6_SHIFT, -64)
|
|
|
bca718 |
+ std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
|
|
|
bca718 |
+ cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
|
|
|
bca718 |
sldi rSHL, rSHL, 3
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD6, 0, rSTR2
|
|
|
bca718 |
+ LD rWORD6, 0, rSTR2
|
|
|
bca718 |
+ LD rWORD8, rOFF8, rSTR2
|
|
|
bca718 |
addi rSTR2, rSTR2, 8
|
|
|
bca718 |
- ldbrx rWORD8, 0, rSTR2
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD6, 0(rSTR2)
|
|
|
bca718 |
- ldu rWORD8, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
cmpldi cr1, r12, 16
|
|
|
bca718 |
cmpldi cr7, rN, 32
|
|
|
bca718 |
clrldi rN, rN, 61
|
|
|
bca718 |
@@ -867,52 +717,26 @@ L(DWunaligned):
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(duP1):
|
|
|
bca718 |
srd r12, rWORD8, rSHR
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD7, 0, rSTR1
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD7, 0(rSTR1)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD7, 0, rSTR1
|
|
|
bca718 |
sld rWORD8_SHIFT, rWORD8, rSHL
|
|
|
bca718 |
or rWORD8, r12, rWORD6_SHIFT
|
|
|
bca718 |
blt cr7, L(duP1x)
|
|
|
bca718 |
L(duP1e):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD1, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD2, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD1, 8(rSTR1)
|
|
|
bca718 |
- ld rWORD2, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD1, rOFF8, rSTR1
|
|
|
bca718 |
+ LD rWORD2, rOFF8, rSTR2
|
|
|
bca718 |
cmpld cr5, rWORD7, rWORD8
|
|
|
bca718 |
srd r0, rWORD2, rSHR
|
|
|
bca718 |
sld rWORD2_SHIFT, rWORD2, rSHL
|
|
|
bca718 |
or rWORD2, r0, rWORD8_SHIFT
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD3, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD4, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD3, 16(rSTR1)
|
|
|
bca718 |
- ld rWORD4, 16(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD3, rOFF16, rSTR1
|
|
|
bca718 |
+ LD rWORD4, rOFF16, rSTR2
|
|
|
bca718 |
cmpld cr7, rWORD1, rWORD2
|
|
|
bca718 |
srd r12, rWORD4, rSHR
|
|
|
bca718 |
sld rWORD4_SHIFT, rWORD4, rSHL
|
|
|
bca718 |
bne cr5, L(duLcr5)
|
|
|
bca718 |
or rWORD4, r12, rWORD2_SHIFT
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD5, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD6, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD5, 24(rSTR1)
|
|
|
bca718 |
- ld rWORD6, 24(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD5, rOFF24, rSTR1
|
|
|
bca718 |
+ LD rWORD6, rOFF24, rSTR2
|
|
|
bca718 |
cmpld cr1, rWORD3, rWORD4
|
|
|
bca718 |
srd r0, rWORD6, rSHR
|
|
|
bca718 |
sld rWORD6_SHIFT, rWORD6, rSHL
|
|
|
bca718 |
@@ -932,82 +756,47 @@ L(duP1x):
|
|
|
bca718 |
beq L(duZeroReturn)
|
|
|
bca718 |
li r0, 0
|
|
|
bca718 |
ble cr7, L(dutrim)
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD2, 0, rSTR2
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD2, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD2, rOFF8, rSTR2
|
|
|
bca718 |
srd r0, rWORD2, rSHR
|
|
|
bca718 |
b L(dutrim)
|
|
|
bca718 |
/* Remainder is 16 */
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(duP2):
|
|
|
bca718 |
srd r0, rWORD8, rSHR
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD5, 0, rSTR1
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD5, 0(rSTR1)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD5, 0, rSTR1
|
|
|
bca718 |
or rWORD6, r0, rWORD6_SHIFT
|
|
|
bca718 |
sld rWORD6_SHIFT, rWORD8, rSHL
|
|
|
bca718 |
L(duP2e):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD7, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD8, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD7, 8(rSTR1)
|
|
|
bca718 |
- ld rWORD8, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD7, rOFF8, rSTR1
|
|
|
bca718 |
+ LD rWORD8, rOFF8, rSTR2
|
|
|
bca718 |
cmpld cr6, rWORD5, rWORD6
|
|
|
bca718 |
srd r12, rWORD8, rSHR
|
|
|
bca718 |
sld rWORD8_SHIFT, rWORD8, rSHL
|
|
|
bca718 |
or rWORD8, r12, rWORD6_SHIFT
|
|
|
bca718 |
blt cr7, L(duP2x)
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD1, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD2, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD1, 16(rSTR1)
|
|
|
bca718 |
- ld rWORD2, 16(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD1, rOFF16, rSTR1
|
|
|
bca718 |
+ LD rWORD2, rOFF16, rSTR2
|
|
|
bca718 |
cmpld cr5, rWORD7, rWORD8
|
|
|
bca718 |
bne cr6, L(duLcr6)
|
|
|
bca718 |
srd r0, rWORD2, rSHR
|
|
|
bca718 |
sld rWORD2_SHIFT, rWORD2, rSHL
|
|
|
bca718 |
or rWORD2, r0, rWORD8_SHIFT
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD3, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD4, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD3, 24(rSTR1)
|
|
|
bca718 |
- ld rWORD4, 24(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD3, rOFF24, rSTR1
|
|
|
bca718 |
+ LD rWORD4, rOFF24, rSTR2
|
|
|
bca718 |
cmpld cr7, rWORD1, rWORD2
|
|
|
bca718 |
bne cr5, L(duLcr5)
|
|
|
bca718 |
srd r12, rWORD4, rSHR
|
|
|
bca718 |
sld rWORD4_SHIFT, rWORD4, rSHL
|
|
|
bca718 |
or rWORD4, r12, rWORD2_SHIFT
|
|
|
bca718 |
-#ifndef __LITTLE_ENDIAN__
|
|
|
bca718 |
addi rSTR1, rSTR1, 8
|
|
|
bca718 |
addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
cmpld cr1, rWORD3, rWORD4
|
|
|
bca718 |
b L(duLoop2)
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(duP2x):
|
|
|
bca718 |
cmpld cr5, rWORD7, rWORD8
|
|
|
bca718 |
-#ifndef __LITTLE_ENDIAN__
|
|
|
bca718 |
addi rSTR1, rSTR1, 8
|
|
|
bca718 |
addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
bne cr6, L(duLcr6)
|
|
|
bca718 |
sldi. rN, rN, 3
|
|
|
bca718 |
bne cr5, L(duLcr5)
|
|
|
bca718 |
@@ -1015,12 +804,7 @@ L(duP2x):
|
|
|
bca718 |
beq L(duZeroReturn)
|
|
|
bca718 |
li r0, 0
|
|
|
bca718 |
ble cr7, L(dutrim)
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD2, 0, rSTR2
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD2, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD2, rOFF8, rSTR2
|
|
|
bca718 |
srd r0, rWORD2, rSHR
|
|
|
bca718 |
b L(dutrim)
|
|
|
bca718 |
|
|
|
bca718 |
@@ -1028,73 +812,39 @@ L(duP2x):
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(duP3):
|
|
|
bca718 |
srd r12, rWORD8, rSHR
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD3, 0, rSTR1
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD3, 0(rSTR1)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD3, 0, rSTR1
|
|
|
bca718 |
sld rWORD4_SHIFT, rWORD8, rSHL
|
|
|
bca718 |
or rWORD4, r12, rWORD6_SHIFT
|
|
|
bca718 |
L(duP3e):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD5, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD6, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD5, 8(rSTR1)
|
|
|
bca718 |
- ld rWORD6, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD5, rOFF8, rSTR1
|
|
|
bca718 |
+ LD rWORD6, rOFF8, rSTR2
|
|
|
bca718 |
cmpld cr1, rWORD3, rWORD4
|
|
|
bca718 |
srd r0, rWORD6, rSHR
|
|
|
bca718 |
sld rWORD6_SHIFT, rWORD6, rSHL
|
|
|
bca718 |
or rWORD6, r0, rWORD4_SHIFT
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD7, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD8, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD7, 16(rSTR1)
|
|
|
bca718 |
- ld rWORD8, 16(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD7, rOFF16, rSTR1
|
|
|
bca718 |
+ LD rWORD8, rOFF16, rSTR2
|
|
|
bca718 |
cmpld cr6, rWORD5, rWORD6
|
|
|
bca718 |
bne cr1, L(duLcr1)
|
|
|
bca718 |
srd r12, rWORD8, rSHR
|
|
|
bca718 |
sld rWORD8_SHIFT, rWORD8, rSHL
|
|
|
bca718 |
or rWORD8, r12, rWORD6_SHIFT
|
|
|
bca718 |
blt cr7, L(duP3x)
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD1, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD2, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD1, 24(rSTR1)
|
|
|
bca718 |
- ld rWORD2, 24(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD1, rOFF24, rSTR1
|
|
|
bca718 |
+ LD rWORD2, rOFF24, rSTR2
|
|
|
bca718 |
cmpld cr5, rWORD7, rWORD8
|
|
|
bca718 |
bne cr6, L(duLcr6)
|
|
|
bca718 |
srd r0, rWORD2, rSHR
|
|
|
bca718 |
sld rWORD2_SHIFT, rWORD2, rSHL
|
|
|
bca718 |
or rWORD2, r0, rWORD8_SHIFT
|
|
|
bca718 |
-#ifndef __LITTLE_ENDIAN__
|
|
|
bca718 |
addi rSTR1, rSTR1, 16
|
|
|
bca718 |
addi rSTR2, rSTR2, 16
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
cmpld cr7, rWORD1, rWORD2
|
|
|
bca718 |
b L(duLoop1)
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(duP3x):
|
|
|
bca718 |
-#ifndef __LITTLE_ENDIAN__
|
|
|
bca718 |
addi rSTR1, rSTR1, 16
|
|
|
bca718 |
addi rSTR2, rSTR2, 16
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
-#if 0
|
|
|
bca718 |
-/* Huh? We've already branched on cr1! */
|
|
|
bca718 |
- bne cr1, L(duLcr1)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
cmpld cr5, rWORD7, rWORD8
|
|
|
bca718 |
bne cr6, L(duLcr6)
|
|
|
bca718 |
sldi. rN, rN, 3
|
|
|
bca718 |
@@ -1103,12 +853,7 @@ L(duP3x):
|
|
|
bca718 |
beq L(duZeroReturn)
|
|
|
bca718 |
li r0, 0
|
|
|
bca718 |
ble cr7, L(dutrim)
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD2, 0, rSTR2
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD2, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD2, rOFF8, rSTR2
|
|
|
bca718 |
srd r0, rWORD2, rSHR
|
|
|
bca718 |
b L(dutrim)
|
|
|
bca718 |
|
|
|
bca718 |
@@ -1117,51 +862,27 @@ L(duP3x):
|
|
|
bca718 |
L(duP4):
|
|
|
bca718 |
mtctr r0
|
|
|
bca718 |
srd r0, rWORD8, rSHR
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD1, 0, rSTR1
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD1, 0(rSTR1)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD1, 0, rSTR1
|
|
|
bca718 |
sld rWORD2_SHIFT, rWORD8, rSHL
|
|
|
bca718 |
or rWORD2, r0, rWORD6_SHIFT
|
|
|
bca718 |
L(duP4e):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD3, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD4, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD3, 8(rSTR1)
|
|
|
bca718 |
- ld rWORD4, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD3, rOFF8, rSTR1
|
|
|
bca718 |
+ LD rWORD4, rOFF8, rSTR2
|
|
|
bca718 |
cmpld cr7, rWORD1, rWORD2
|
|
|
bca718 |
srd r12, rWORD4, rSHR
|
|
|
bca718 |
sld rWORD4_SHIFT, rWORD4, rSHL
|
|
|
bca718 |
or rWORD4, r12, rWORD2_SHIFT
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD5, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD6, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD5, 16(rSTR1)
|
|
|
bca718 |
- ld rWORD6, 16(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD5, rOFF16, rSTR1
|
|
|
bca718 |
+ LD rWORD6, rOFF16, rSTR2
|
|
|
bca718 |
cmpld cr1, rWORD3, rWORD4
|
|
|
bca718 |
bne cr7, L(duLcr7)
|
|
|
bca718 |
srd r0, rWORD6, rSHR
|
|
|
bca718 |
sld rWORD6_SHIFT, rWORD6, rSHL
|
|
|
bca718 |
or rWORD6, r0, rWORD4_SHIFT
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD7, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD8, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ldu rWORD7, 24(rSTR1)
|
|
|
bca718 |
- ldu rWORD8, 24(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD7, rOFF24, rSTR1
|
|
|
bca718 |
+ LD rWORD8, rOFF24, rSTR2
|
|
|
bca718 |
+ addi rSTR1, rSTR1, 24
|
|
|
bca718 |
+ addi rSTR2, rSTR2, 24
|
|
|
bca718 |
cmpld cr6, rWORD5, rWORD6
|
|
|
bca718 |
bne cr1, L(duLcr1)
|
|
|
bca718 |
srd r12, rWORD8, rSHR
|
|
|
bca718 |
@@ -1172,60 +893,34 @@ L(duP4e):
|
|
|
bca718 |
/* This is the primary loop */
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(duLoop):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD1, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD2, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD1, 8(rSTR1)
|
|
|
bca718 |
- ld rWORD2, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD1, rOFF8, rSTR1
|
|
|
bca718 |
+ LD rWORD2, rOFF8, rSTR2
|
|
|
bca718 |
cmpld cr1, rWORD3, rWORD4
|
|
|
bca718 |
bne cr6, L(duLcr6)
|
|
|
bca718 |
srd r0, rWORD2, rSHR
|
|
|
bca718 |
sld rWORD2_SHIFT, rWORD2, rSHL
|
|
|
bca718 |
or rWORD2, r0, rWORD8_SHIFT
|
|
|
bca718 |
L(duLoop1):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD3, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD4, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD3, 16(rSTR1)
|
|
|
bca718 |
- ld rWORD4, 16(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD3, rOFF16, rSTR1
|
|
|
bca718 |
+ LD rWORD4, rOFF16, rSTR2
|
|
|
bca718 |
cmpld cr6, rWORD5, rWORD6
|
|
|
bca718 |
bne cr5, L(duLcr5)
|
|
|
bca718 |
srd r12, rWORD4, rSHR
|
|
|
bca718 |
sld rWORD4_SHIFT, rWORD4, rSHL
|
|
|
bca718 |
or rWORD4, r12, rWORD2_SHIFT
|
|
|
bca718 |
L(duLoop2):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD5, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD6, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD5, 24(rSTR1)
|
|
|
bca718 |
- ld rWORD6, 24(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD5, rOFF24, rSTR1
|
|
|
bca718 |
+ LD rWORD6, rOFF24, rSTR2
|
|
|
bca718 |
cmpld cr5, rWORD7, rWORD8
|
|
|
bca718 |
bne cr7, L(duLcr7)
|
|
|
bca718 |
srd r0, rWORD6, rSHR
|
|
|
bca718 |
sld rWORD6_SHIFT, rWORD6, rSHL
|
|
|
bca718 |
or rWORD6, r0, rWORD4_SHIFT
|
|
|
bca718 |
L(duLoop3):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD7, 0, rSTR1
|
|
|
bca718 |
- ldbrx rWORD8, 0, rSTR2
|
|
|
bca718 |
- addi rSTR1, rSTR1, 8
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ldu rWORD7, 32(rSTR1)
|
|
|
bca718 |
- ldu rWORD8, 32(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD7, rOFF32, rSTR1
|
|
|
bca718 |
+ LD rWORD8, rOFF32, rSTR2
|
|
|
bca718 |
+ addi rSTR1, rSTR1, 32
|
|
|
bca718 |
+ addi rSTR2, rSTR2, 32
|
|
|
bca718 |
cmpld cr7, rWORD1, rWORD2
|
|
|
bca718 |
bne cr1, L(duLcr1)
|
|
|
bca718 |
srd r12, rWORD8, rSHR
|
|
|
bca718 |
@@ -1234,10 +929,6 @@ L(duLoop3):
|
|
|
bca718 |
bdnz L(duLoop)
|
|
|
bca718 |
|
|
|
bca718 |
L(duL4):
|
|
|
bca718 |
-#if 0
|
|
|
bca718 |
-/* Huh? We've already branched on cr1! */
|
|
|
bca718 |
- bne cr1, L(duLcr1)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
cmpld cr1, rWORD3, rWORD4
|
|
|
bca718 |
bne cr6, L(duLcr6)
|
|
|
bca718 |
cmpld cr6, rWORD5, rWORD6
|
|
|
bca718 |
@@ -1264,99 +955,102 @@ L(du14):
|
|
|
bca718 |
beq L(duZeroReturn)
|
|
|
bca718 |
li r0, 0
|
|
|
bca718 |
ble cr7, L(dutrim)
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD2, 0, rSTR2
|
|
|
bca718 |
- addi rSTR2, rSTR2, 8
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD2, 8(rSTR2)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD2, rOFF8, rSTR2
|
|
|
bca718 |
srd r0, rWORD2, rSHR
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(dutrim):
|
|
|
bca718 |
-#ifdef __LITTLE_ENDIAN__
|
|
|
bca718 |
- ldbrx rWORD1, 0, rSTR1
|
|
|
bca718 |
-#else
|
|
|
bca718 |
- ld rWORD1, 8(rSTR1)
|
|
|
bca718 |
-#endif
|
|
|
bca718 |
+ LD rWORD1, rOFF8, rSTR1
|
|
|
bca718 |
ld rWORD8, -8(r1)
|
|
|
bca718 |
subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */
|
|
|
bca718 |
or rWORD2, r0, rWORD8_SHIFT
|
|
|
bca718 |
- ld rWORD7, -16(r1)
|
|
|
bca718 |
- ld rSHL, -24(r1)
|
|
|
bca718 |
+ ld rWORD7, rWORD7SAVE(r1)
|
|
|
bca718 |
+ ld rSHL, rSHLSAVE(r1)
|
|
|
bca718 |
srd rWORD1, rWORD1, rN
|
|
|
bca718 |
srd rWORD2, rWORD2, rN
|
|
|
bca718 |
- ld rSHR, -32(r1)
|
|
|
bca718 |
- ld rWORD8_SHIFT, -40(r1)
|
|
|
bca718 |
+ ld rSHR, rSHRSAVE(r1)
|
|
|
bca718 |
+ ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
|
|
|
bca718 |
li rRTN, 0
|
|
|
bca718 |
cmpld cr7, rWORD1, rWORD2
|
|
|
bca718 |
- ld rWORD2_SHIFT, -48(r1)
|
|
|
bca718 |
- ld rWORD4_SHIFT, -56(r1)
|
|
|
bca718 |
+ ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
|
|
|
bca718 |
+ ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
|
|
|
bca718 |
beq cr7, L(dureturn24)
|
|
|
bca718 |
li rRTN, 1
|
|
|
bca718 |
- ld rWORD6_SHIFT, -64(r1)
|
|
|
bca718 |
+ ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
|
|
|
bca718 |
+ ld rOFF8, rOFF8SAVE(r1)
|
|
|
bca718 |
+ ld rOFF16, rOFF16SAVE(r1)
|
|
|
bca718 |
+ ld rOFF24, rOFF24SAVE(r1)
|
|
|
bca718 |
+ ld rOFF32, rOFF32SAVE(r1)
|
|
|
bca718 |
bgtlr cr7
|
|
|
bca718 |
li rRTN, -1
|
|
|
bca718 |
blr
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(duLcr7):
|
|
|
bca718 |
- ld rWORD8, -8(r1)
|
|
|
bca718 |
- ld rWORD7, -16(r1)
|
|
|
bca718 |
+ ld rWORD8, rWORD8SAVE(r1)
|
|
|
bca718 |
+ ld rWORD7, rWORD7SAVE(r1)
|
|
|
bca718 |
li rRTN, 1
|
|
|
bca718 |
bgt cr7, L(dureturn29)
|
|
|
bca718 |
- ld rSHL, -24(r1)
|
|
|
bca718 |
- ld rSHR, -32(r1)
|
|
|
bca718 |
+ ld rSHL, rSHLSAVE(r1)
|
|
|
bca718 |
+ ld rSHR, rSHRSAVE(r1)
|
|
|
bca718 |
li rRTN, -1
|
|
|
bca718 |
b L(dureturn27)
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(duLcr1):
|
|
|
bca718 |
- ld rWORD8, -8(r1)
|
|
|
bca718 |
- ld rWORD7, -16(r1)
|
|
|
bca718 |
+ ld rWORD8, rWORD8SAVE(r1)
|
|
|
bca718 |
+ ld rWORD7, rWORD7SAVE(r1)
|
|
|
bca718 |
li rRTN, 1
|
|
|
bca718 |
bgt cr1, L(dureturn29)
|
|
|
bca718 |
- ld rSHL, -24(r1)
|
|
|
bca718 |
- ld rSHR, -32(r1)
|
|
|
bca718 |
+ ld rSHL, rSHLSAVE(r1)
|
|
|
bca718 |
+ ld rSHR, rSHRSAVE(r1)
|
|
|
bca718 |
li rRTN, -1
|
|
|
bca718 |
b L(dureturn27)
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(duLcr6):
|
|
|
bca718 |
- ld rWORD8, -8(r1)
|
|
|
bca718 |
- ld rWORD7, -16(r1)
|
|
|
bca718 |
+ ld rWORD8, rWORD8SAVE(r1)
|
|
|
bca718 |
+ ld rWORD7, rWORD7SAVE(r1)
|
|
|
bca718 |
li rRTN, 1
|
|
|
bca718 |
bgt cr6, L(dureturn29)
|
|
|
bca718 |
- ld rSHL, -24(r1)
|
|
|
bca718 |
- ld rSHR, -32(r1)
|
|
|
bca718 |
+ ld rSHL, rSHLSAVE(r1)
|
|
|
bca718 |
+ ld rSHR, rSHRSAVE(r1)
|
|
|
bca718 |
li rRTN, -1
|
|
|
bca718 |
b L(dureturn27)
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(duLcr5):
|
|
|
bca718 |
- ld rWORD8, -8(r1)
|
|
|
bca718 |
- ld rWORD7, -16(r1)
|
|
|
bca718 |
+ ld rWORD8, rWORD8SAVE(r1)
|
|
|
bca718 |
+ ld rWORD7, rWORD7SAVE(r1)
|
|
|
bca718 |
li rRTN, 1
|
|
|
bca718 |
bgt cr5, L(dureturn29)
|
|
|
bca718 |
- ld rSHL, -24(r1)
|
|
|
bca718 |
- ld rSHR, -32(r1)
|
|
|
bca718 |
+ ld rSHL, rSHLSAVE(r1)
|
|
|
bca718 |
+ ld rSHR, rSHRSAVE(r1)
|
|
|
bca718 |
li rRTN, -1
|
|
|
bca718 |
b L(dureturn27)
|
|
|
bca718 |
+
|
|
|
bca718 |
.align 3
|
|
|
bca718 |
L(duZeroReturn):
|
|
|
bca718 |
li rRTN, 0
|
|
|
bca718 |
.align 4
|
|
|
bca718 |
L(dureturn):
|
|
|
bca718 |
- ld rWORD8, -8(r1)
|
|
|
bca718 |
- ld rWORD7, -16(r1)
|
|
|
bca718 |
+ ld rWORD8, rWORD8SAVE(r1)
|
|
|
bca718 |
+ ld rWORD7, rWORD7SAVE(r1)
|
|
|
bca718 |
L(dureturn29):
|
|
|
bca718 |
- ld rSHL, -24(r1)
|
|
|
bca718 |
- ld rSHR, -32(r1)
|
|
|
bca718 |
+ ld rSHL, rSHLSAVE(r1)
|
|
|
bca718 |
+ ld rSHR, rSHRSAVE(r1)
|
|
|
bca718 |
L(dureturn27):
|
|
|
bca718 |
- ld rWORD8_SHIFT, -40(r1)
|
|
|
bca718 |
-L(dureturn26):
|
|
|
bca718 |
- ld rWORD2_SHIFT, -48(r1)
|
|
|
bca718 |
-L(dureturn25):
|
|
|
bca718 |
- ld rWORD4_SHIFT, -56(r1)
|
|
|
bca718 |
+ ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
|
|
|
bca718 |
+ ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
|
|
|
bca718 |
+ ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
|
|
|
bca718 |
L(dureturn24):
|
|
|
bca718 |
- ld rWORD6_SHIFT, -64(r1)
|
|
|
bca718 |
+ ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
|
|
|
bca718 |
+ ld rOFF8, rOFF8SAVE(r1)
|
|
|
bca718 |
+ ld rOFF16, rOFF16SAVE(r1)
|
|
|
bca718 |
+ ld rOFF24, rOFF24SAVE(r1)
|
|
|
bca718 |
+ ld rOFF32, rOFF32SAVE(r1)
|
|
|
bca718 |
blr
|
|
|
bca718 |
+
|
|
|
bca718 |
L(duzeroLength):
|
|
|
bca718 |
+ ld rOFF8, rOFF8SAVE(r1)
|
|
|
bca718 |
+ ld rOFF16, rOFF16SAVE(r1)
|
|
|
bca718 |
+ ld rOFF24, rOFF24SAVE(r1)
|
|
|
bca718 |
+ ld rOFF32, rOFF32SAVE(r1)
|
|
|
bca718 |
li rRTN, 0
|
|
|
bca718 |
blr
|
|
|
bca718 |
|