5de29b
# commit 759cfef3ac4c07dba1ece0bbc1207e099348816d
5de29b
# Author: Alan Modra <amodra@gmail.com>
5de29b
# Date:   Sat Aug 17 18:47:22 2013 +0930
5de29b
# 
5de29b
#     PowerPC LE memcpy
5de29b
#     http://sourceware.org/ml/libc-alpha/2013-08/msg00103.html
5de29b
#     
5de29b
#     Little-endian support for memcpy.  I spent some time cleaning up the
5de29b
#     64-bit power7 memcpy, in order to avoid the extra alignment traps
5de29b
#     power7 takes for little-endian.  It probably would have been better
5de29b
#     to copy the linux kernel version of memcpy.
5de29b
#     
5de29b
#         * sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support.
5de29b
#         * sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.
5de29b
#         * sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
5de29b
#         * sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise.
5de29b
#         * sysdeps/powerpc/powerpc64/memcpy.S: Likewise.
5de29b
#         * sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise.
5de29b
#         * sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise.
5de29b
#         * sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
5de29b
#         * sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise.  Make better
5de29b
#         use of regs.  Use power7 mtocrf.  Tidy function tails.
5de29b
# 
12745e
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S
12745e
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S	2014-05-29 13:04:56.000000000 -0500
12745e
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S	2014-05-29 13:04:56.000000000 -0500
5de29b
@@ -205,15 +205,28 @@
5de29b
     blt   cr6,5f
5de29b
     srwi  7,6,16
5de29b
     bgt	  cr6,3f
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    sth   7,0(3)
5de29b
+#else
5de29b
     sth   6,0(3)
5de29b
+#endif
5de29b
     b     7f
5de29b
     .align  4
5de29b
 3:
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    rotlwi 6,6,24
5de29b
+    stb   6,0(3)
5de29b
+    sth   7,1(3)
5de29b
+#else
5de29b
     stb   7,0(3)
5de29b
     sth   6,1(3)
5de29b
+#endif
5de29b
     b     7f
5de29b
     .align  4
5de29b
 5:
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    rotlwi 6,6,8
5de29b
+#endif
5de29b
     stb   6,0(3)
5de29b
 7:
5de29b
     cmplwi	cr1,10,16
5de29b
@@ -341,13 +354,23 @@
5de29b
     bf      30,1f
5de29b
 
5de29b
     /* there are at least two words to copy, so copy them */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srw   0,6,10
5de29b
+    slw   8,7,9
5de29b
+#else
5de29b
     slw   0,6,10  /* shift 1st src word to left align it in R0 */
5de29b
     srw   8,7,9   /* shift 2nd src word to right align it in R8 */
5de29b
+#endif
5de29b
     or    0,0,8   /* or them to get word to store */
5de29b
     lwz   6,8(5)  /* load the 3rd src word */
5de29b
     stw   0,0(4)  /* store the 1st dst word */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srw   0,7,10
5de29b
+    slw   8,6,9
5de29b
+#else
5de29b
     slw   0,7,10  /* now left align 2nd src word into R0 */
5de29b
     srw   8,6,9   /* shift 3rd src word to right align it in R8 */
5de29b
+#endif
5de29b
     or    0,0,8   /* or them to get word to store */
5de29b
     lwz   7,12(5)
5de29b
     stw   0,4(4)  /* store the 2nd dst word */
5de29b
@@ -355,8 +378,13 @@
5de29b
     addi  5,5,16
5de29b
     bf    31,4f
5de29b
     /* there is a third word to copy, so copy it */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srw   0,6,10
5de29b
+    slw   8,7,9
5de29b
+#else
5de29b
     slw   0,6,10  /* shift 3rd src word to left align it in R0 */
5de29b
     srw   8,7,9   /* shift 4th src word to right align it in R8 */
5de29b
+#endif
5de29b
     or    0,0,8   /* or them to get word to store */
5de29b
     stw   0,0(4)  /* store 3rd dst word */
5de29b
     mr    6,7
5de29b
@@ -366,8 +394,13 @@
5de29b
     b     4f
5de29b
     .align 4
5de29b
 1:
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srw     0,6,10
5de29b
+    slw     8,7,9
5de29b
+#else
5de29b
     slw     0,6,10  /* shift 1st src word to left align it in R0 */
5de29b
     srw     8,7,9   /* shift 2nd src word to right align it in R8 */
5de29b
+#endif
5de29b
     addi  5,5,8
5de29b
     or    0,0,8   /* or them to get word to store */
5de29b
     bf    31,4f
5de29b
@@ -380,23 +413,43 @@
5de29b
     .align  4
5de29b
 4:
5de29b
     /* copy 16 bytes at a time */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srw   0,6,10
5de29b
+    slw   8,7,9
5de29b
+#else
5de29b
     slw   0,6,10
5de29b
     srw   8,7,9
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     lwz   6,0(5)
5de29b
     stw   0,0(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srw   0,7,10
5de29b
+    slw   8,6,9
5de29b
+#else
5de29b
     slw   0,7,10
5de29b
     srw   8,6,9
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     lwz   7,4(5)
5de29b
     stw   0,4(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srw   0,6,10
5de29b
+    slw   8,7,9
5de29b
+#else
5de29b
     slw   0,6,10
5de29b
     srw   8,7,9
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     lwz   6,8(5)
5de29b
     stw   0,8(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srw   0,7,10
5de29b
+    slw   8,6,9
5de29b
+#else
5de29b
     slw   0,7,10
5de29b
     srw   8,6,9
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     lwz   7,12(5)
5de29b
     stw   0,12(4)
5de29b
@@ -405,8 +458,13 @@
5de29b
     bdnz+ 4b
5de29b
 8:
5de29b
     /* calculate and store the final word */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srw   0,6,10
5de29b
+    slw   8,7,9
5de29b
+#else
5de29b
     slw   0,6,10
5de29b
     srw   8,7,9
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     stw   0,0(4)
5de29b
 3:
12745e
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S
12745e
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S	2014-05-29 13:04:56.000000000 -0500
12745e
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S	2014-05-29 13:04:56.000000000 -0500
5de29b
@@ -221,15 +221,28 @@
5de29b
     blt   cr6,5f
5de29b
     srwi  7,6,16
5de29b
     bgt	  cr6,3f
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    sth   7,0(3)
5de29b
+#else
5de29b
     sth   6,0(3)
5de29b
+#endif
5de29b
     b     7f
5de29b
     .align  4
5de29b
 3:
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    rotlwi 6,6,24
5de29b
+    stb   6,0(3)
5de29b
+    sth   7,1(3)
5de29b
+#else
5de29b
     stb   7,0(3)
5de29b
     sth   6,1(3)
5de29b
+#endif
5de29b
     b     7f
5de29b
     .align  4
5de29b
 5:
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    rotlwi 6,6,8
5de29b
+#endif
5de29b
     stb   6,0(3)
5de29b
 7:
5de29b
     cmplwi	cr1,10,16
5de29b
@@ -579,7 +592,11 @@
5de29b
     lwz     6,-1(4)
5de29b
     cmplwi  cr6,31,4
5de29b
     srwi    8,31,5    /* calculate the 32 byte loop count */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srwi    6,6,8
5de29b
+#else
5de29b
     slwi    6,6,8
5de29b
+#endif
5de29b
     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
5de29b
     blt     cr5,L(wdu1_32tail)
5de29b
     mtctr   8
5de29b
@@ -587,8 +604,12 @@
5de29b
 
5de29b
     lwz   8,3(4)
5de29b
     lwz   7,4(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    rldimi 6,8,24,32
5de29b
+#else
5de29b
 /*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
5de29b
     rlwimi 6,8,8,(32-8),31
5de29b
+#endif
5de29b
     b      L(wdu1_loop32x)
5de29b
     .align  4
5de29b
 L(wdu1_loop32):
5de29b
@@ -597,8 +618,12 @@
5de29b
     lwz   7,4(4)
5de29b
     stw   10,-8(3)
5de29b
     stw   11,-4(3)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    rldimi 6,8,24,32
5de29b
+#else
5de29b
 /*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
5de29b
     rlwimi 6,8,8,(32-8),31
5de29b
+#endif
5de29b
 L(wdu1_loop32x):
5de29b
     lwz   10,8(4)
5de29b
     lwz   11,12(4)
5de29b
@@ -615,7 +640,11 @@
5de29b
     stw   6,16(3)
5de29b
     stw   7,20(3)
5de29b
     addi  3,3,32
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srwi  6,8,8
5de29b
+#else
5de29b
     slwi  6,8,8
5de29b
+#endif
5de29b
     bdnz+ L(wdu1_loop32)
5de29b
     stw   10,-8(3)
5de29b
     stw   11,-4(3)
5de29b
@@ -626,8 +655,12 @@
5de29b
     blt     cr6,L(wdu_4tail)
5de29b
     /* calculate and store the final word */
5de29b
     lwz   8,3(4)
5de29b
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    rldimi 6,8,24,32
5de29b
+#else
5de29b
+/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8  */
5de29b
     rlwimi 6,8,8,(32-8),31
5de29b
+#endif
5de29b
     b     L(wdu_32tailx)
5de29b
 
5de29b
 L(wdu2_32):
5de29b
@@ -635,7 +668,11 @@
5de29b
     lwz     6,-2(4)
5de29b
     cmplwi  cr6,31,4
5de29b
     srwi    8,31,5    /* calculate the 32 byte loop count */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srwi    6,6,16
5de29b
+#else
5de29b
     slwi    6,6,16
5de29b
+#endif
5de29b
     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
5de29b
     blt     cr5,L(wdu2_32tail)
5de29b
     mtctr   8
5de29b
@@ -643,8 +680,11 @@
5de29b
 
5de29b
     lwz   8,2(4)
5de29b
     lwz   7,4(4)
5de29b
-/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    rldimi 6,8,16,32
5de29b
+#else
5de29b
     rlwimi 6,8,16,(32-16),31
5de29b
+#endif
5de29b
     b      L(wdu2_loop32x)
5de29b
     .align  4
5de29b
 L(wdu2_loop32):
5de29b
@@ -653,8 +693,11 @@
5de29b
     lwz   7,4(4)
5de29b
     stw   10,-8(3)
5de29b
     stw   11,-4(3)
5de29b
-/*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    rldimi 6,8,16,32
5de29b
+#else
5de29b
     rlwimi 6,8,16,(32-16),31
5de29b
+#endif
5de29b
 L(wdu2_loop32x):
5de29b
     lwz   10,8(4)
5de29b
     lwz   11,12(4)
5de29b
@@ -672,7 +715,11 @@
5de29b
     stw   6,16(3)
5de29b
     stw   7,20(3)
5de29b
     addi  3,3,32
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srwi  6,8,16
5de29b
+#else
5de29b
     slwi  6,8,16
5de29b
+#endif
5de29b
     bdnz+ L(wdu2_loop32)
5de29b
     stw   10,-8(3)
5de29b
     stw   11,-4(3)
5de29b
@@ -683,8 +730,11 @@
5de29b
     blt     cr6,L(wdu_4tail)
5de29b
     /* calculate and store the final word */
5de29b
     lwz   8,2(4)
5de29b
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    rldimi 6,8,16,32
5de29b
+#else
5de29b
     rlwimi 6,8,16,(32-16),31
5de29b
+#endif
5de29b
     b     L(wdu_32tailx)
5de29b
 
5de29b
 L(wdu3_32):
5de29b
@@ -692,7 +742,11 @@
5de29b
     lwz     6,-3(4)
5de29b
     cmplwi  cr6,31,4
5de29b
     srwi    8,31,5    /* calculate the 32 byte loop count */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srwi    6,6,24
5de29b
+#else
5de29b
     slwi    6,6,24
5de29b
+#endif
5de29b
     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
5de29b
     blt     cr5,L(wdu3_32tail)
5de29b
     mtctr   8
5de29b
@@ -700,8 +754,11 @@
5de29b
 
5de29b
     lwz   8,1(4)
5de29b
     lwz   7,4(4)
5de29b
-/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    rldimi 6,8,8,32
5de29b
+#else
5de29b
     rlwimi 6,8,24,(32-24),31
5de29b
+#endif
5de29b
     b      L(wdu3_loop32x)
5de29b
     .align  4
5de29b
 L(wdu3_loop32):
5de29b
@@ -710,8 +767,11 @@
5de29b
     lwz   7,4(4)
5de29b
     stw   10,-8(3)
5de29b
     stw   11,-4(3)
5de29b
-/*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    rldimi 6,8,8,32
5de29b
+#else
5de29b
     rlwimi 6,8,24,(32-24),31
5de29b
+#endif
5de29b
 L(wdu3_loop32x):
5de29b
     lwz   10,8(4)
5de29b
     lwz   11,12(4)
5de29b
@@ -728,7 +788,11 @@
5de29b
     stw   6,16(3)
5de29b
     stw   7,20(3)
5de29b
     addi  3,3,32
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srwi  6,8,24
5de29b
+#else
5de29b
     slwi  6,8,24
5de29b
+#endif
5de29b
     bdnz+ L(wdu3_loop32)
5de29b
     stw   10,-8(3)
5de29b
     stw   11,-4(3)
5de29b
@@ -739,8 +803,11 @@
5de29b
     blt     cr6,L(wdu_4tail)
5de29b
     /* calculate and store the final word */
5de29b
     lwz   8,1(4)
5de29b
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    rldimi 6,8,8,32
5de29b
+#else
5de29b
     rlwimi 6,8,24,(32-24),31
5de29b
+#endif
5de29b
     b     L(wdu_32tailx)
5de29b
     .align  4
5de29b
 L(wdu_32tailx):
12745e
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S
12745e
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S	2014-05-29 13:04:56.000000000 -0500
12745e
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S	2014-05-29 13:04:56.000000000 -0500
5de29b
@@ -385,7 +385,7 @@
5de29b
 
5de29b
 	beq    L(copy_GE_32_unaligned_cont)
5de29b
 
5de29b
-	/* SRC is not quadword aligned, get it aligned.  */
5de29b
+	/* DST is not quadword aligned, get it aligned.  */
5de29b
 
5de29b
 	mtcrf   0x01,0
5de29b
 	subf    31,0,5
5de29b
@@ -437,13 +437,21 @@
5de29b
 	mr      11,12
5de29b
 	mtcrf   0x01,9
5de29b
 	cmplwi  cr6,9,1
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	lvsr    5,0,12
5de29b
+#else
5de29b
 	lvsl    5,0,12
5de29b
+#endif
5de29b
 	lvx     3,0,12
5de29b
 	bf      31,L(setup_unaligned_loop)
5de29b
 
5de29b
 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
5de29b
 	lvx     4,12,6
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	vperm   6,4,3,5
5de29b
+#else
5de29b
 	vperm   6,3,4,5
5de29b
+#endif
5de29b
 	addi    11,12,16
5de29b
 	addi    10,3,16
5de29b
 	stvx    6,0,3
5de29b
@@ -463,11 +471,17 @@
5de29b
 	vector instructions though.  */
5de29b
 
5de29b
 	lvx	4,11,6	      /* vr4 = r11+16.  */
5de29b
-	vperm   6,3,4,5	      /* Merge the correctly-aligned portions
5de29b
-			      of vr3/vr4 into vr6.  */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	vperm   6,4,3,5
5de29b
+#else
5de29b
+	vperm   6,3,4,5
5de29b
+#endif
5de29b
 	lvx	3,11,7	      /* vr3 = r11+32.  */
5de29b
-	vperm   10,4,3,5      /* Merge the correctly-aligned portions
5de29b
-			      of vr3/vr4 into vr10.  */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	vperm   10,3,4,5
5de29b
+#else
5de29b
+	vperm   10,4,3,5
5de29b
+#endif
5de29b
 	addi    11,11,32
5de29b
 	stvx    6,0,10
5de29b
 	stvx    10,10,6
12745e
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S
12745e
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S	2014-05-29 13:04:56.000000000 -0500
12745e
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S	2014-05-29 13:04:56.000000000 -0500
5de29b
@@ -327,7 +327,7 @@
5de29b
 
5de29b
 	beq	L(copy_GE_32_unaligned_cont)
5de29b
 
5de29b
-	/* SRC is not quadword aligned, get it aligned.  */
5de29b
+	/* DST is not quadword aligned, get it aligned.  */
5de29b
 
5de29b
 	mtcrf	0x01,0
5de29b
 	subf	31,0,5
5de29b
@@ -379,13 +379,21 @@
5de29b
 	mr	11,12
5de29b
 	mtcrf	0x01,9
5de29b
 	cmplwi	cr6,9,1
5de29b
-	lvsl	5,0,12
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	lvsr    5,0,12
5de29b
+#else
5de29b
+	lvsl    5,0,12
5de29b
+#endif
5de29b
 	lvx	3,0,12
5de29b
 	bf	31,L(setup_unaligned_loop)
5de29b
 
5de29b
 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
5de29b
 	lvx	4,12,6
5de29b
-	vperm	6,3,4,5
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	vperm   6,4,3,5
5de29b
+#else
5de29b
+	vperm   6,3,4,5
5de29b
+#endif
5de29b
 	addi	11,12,16
5de29b
 	addi	10,3,16
5de29b
 	stvx	6,0,3
5de29b
@@ -405,11 +413,17 @@
5de29b
 	vector instructions though.  */
5de29b
 
5de29b
 	lvx	4,11,6	      /* vr4 = r11+16.  */
5de29b
-	vperm	6,3,4,5	      /* Merge the correctly-aligned portions
5de29b
-				 of vr3/vr4 into vr6.  */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	vperm   6,4,3,5
5de29b
+#else
5de29b
+	vperm   6,3,4,5
5de29b
+#endif
5de29b
 	lvx	3,11,7	      /* vr3 = r11+32.  */
5de29b
-	vperm	10,4,3,5      /* Merge the correctly-aligned portions
5de29b
-				 of vr3/vr4 into vr10.  */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	vperm   10,3,4,5
5de29b
+#else
5de29b
+	vperm   10,4,3,5
5de29b
+#endif
5de29b
 	addi	11,11,32
5de29b
 	stvx	6,0,10
5de29b
 	stvx	10,10,6
12745e
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S
12745e
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S	2014-05-29 13:04:56.000000000 -0500
12745e
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S	2014-05-29 13:04:56.000000000 -0500
5de29b
@@ -214,15 +214,28 @@
5de29b
     blt   cr6,5f
5de29b
     srdi  7,6,16
5de29b
     bgt	  cr6,3f
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    sth   7,0(3)
5de29b
+#else
5de29b
     sth   6,0(3)
5de29b
+#endif
5de29b
     b     7f
5de29b
     .align  4
5de29b
 3:
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    rotlwi 6,6,24
5de29b
+    stb   6,0(3)
5de29b
+    sth   7,1(3)
5de29b
+#else
5de29b
     stb   7,0(3)
5de29b
     sth   6,1(3)
5de29b
+#endif
5de29b
     b     7f
5de29b
     .align  4
5de29b
 5:
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    rotlwi 6,6,8
5de29b
+#endif
5de29b
     stb   6,0(3)
5de29b
 7:
5de29b
     cmpldi	cr1,10,16
5de29b
@@ -330,7 +343,11 @@
5de29b
     ld    7,8(5)
5de29b
     subfic  9,10,64
5de29b
     beq   2f
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srd   0,6,10
5de29b
+#else
5de29b
     sld   0,6,10
5de29b
+#endif
5de29b
     cmpldi  11,1
5de29b
     mr    6,7
5de29b
     addi  4,4,-8
5de29b
@@ -338,15 +355,25 @@
5de29b
     b     1f
5de29b
 2:  addi  5,5,8
5de29b
     .align  4
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+0:  srd   0,6,10
5de29b
+    sld   8,7,9
5de29b
+#else
5de29b
 0:  sld   0,6,10
5de29b
     srd   8,7,9
5de29b
+#endif
5de29b
     cmpldi  11,2
5de29b
     ld    6,8(5)
5de29b
     or    0,0,8
5de29b
     addi  11,11,-2
5de29b
     std   0,0(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srd   0,7,10
5de29b
+1:  sld   8,6,9
5de29b
+#else
5de29b
     sld   0,7,10
5de29b
 1:  srd   8,6,9
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     beq   8f
5de29b
     ld    7,16(5)
12745e
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S
12745e
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S	2014-05-29 13:04:56.000000000 -0500
12745e
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S	2014-05-29 13:05:51.000000000 -0500
5de29b
@@ -1,5 +1,5 @@
5de29b
 /* Optimized memcpy implementation for PowerPC64.
5de29b
-   Copyright (C) 2003, 2006, 2011 Free Software Foundation, Inc.
5de29b
+   Copyright (C) 2003-2014 Free Software Foundation, Inc.
5de29b
    This file is part of the GNU C Library.
5de29b
 
5de29b
    The GNU C Library is free software; you can redistribute it and/or
5de29b
@@ -17,26 +17,24 @@
5de29b
    <http://www.gnu.org/licenses/>.  */
5de29b
 
5de29b
 #include <sysdep.h>
5de29b
-#include <bp-sym.h>
5de29b
-#include <bp-asm.h>
5de29b
 
5de29b
 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
5de29b
    Returns 'dst'.
5de29b
 
5de29b
-   Memcpy handles short copies (< 32-bytes) using a binary move blocks 
5de29b
-   (no loops) of lwz/stw.  The tail (remaining 1-3) bytes is handled 
5de29b
-   with the appropriate combination of byte and halfword load/stores. 
5de29b
-   There is minimal effort to optimize the alignment of short moves.  
5de29b
+   Memcpy handles short copies (< 32-bytes) using a binary move blocks
5de29b
+   (no loops) of lwz/stw.  The tail (remaining 1-3) bytes is handled
5de29b
+   with the appropriate combination of byte and halfword load/stores.
5de29b
+   There is minimal effort to optimize the alignment of short moves.
5de29b
    The 64-bit implementations of POWER3 and POWER4 do a reasonable job
5de29b
-   of handling unligned load/stores that do not cross 32-byte boundries.
5de29b
+   of handling unaligned load/stores that do not cross 32-byte boundaries.
5de29b
 
5de29b
    Longer moves (>= 32-bytes) justify the effort to get at least the
5de29b
    destination doubleword (8-byte) aligned.  Further optimization is
5de29b
-   posible when both source and destination are doubleword aligned.
5de29b
+   possible when both source and destination are doubleword aligned.
5de29b
    Each case has a optimized unrolled loop.   */
5de29b
 
5de29b
 	.machine power4
5de29b
-EALIGN (BP_SYM (memcpy), 5, 0)
5de29b
+EALIGN (memcpy, 5, 0)
5de29b
 	CALL_MCOUNT 3
5de29b
 
5de29b
     cmpldi cr1,5,31
5de29b
@@ -44,20 +42,20 @@
5de29b
     std   3,-16(1)
5de29b
     std   31,-8(1)
5de29b
     cfi_offset(31,-8)
5de29b
-    andi. 11,3,7	/* check alignement of dst.  */
5de29b
+    andi. 11,3,7	/* check alignment of dst.  */
5de29b
     clrldi 0,0,61	/* Number of bytes until the 1st doubleword of dst.  */
5de29b
-    clrldi 10,4,61	/* check alignement of src.  */
5de29b
+    clrldi 10,4,61	/* check alignment of src.  */
5de29b
     cmpldi cr6,5,8
5de29b
     ble-  cr1,.L2	/* If move < 32 bytes use short move code.  */
5de29b
-    cmpld cr6,10,11     
5de29b
+    cmpld cr6,10,11
5de29b
     mr    12,4
5de29b
     srdi  9,5,3		/* Number of full double words remaining.  */
5de29b
     mtcrf 0x01,0
5de29b
     mr    31,5
5de29b
     beq   .L0
5de29b
-  
5de29b
+
5de29b
     subf  31,0,5
5de29b
-  /* Move 0-7 bytes as needed to get the destination doubleword alligned.  */
5de29b
+  /* Move 0-7 bytes as needed to get the destination doubleword aligned.  */
5de29b
 1:  bf    31,2f
5de29b
     lbz   6,0(12)
5de29b
     addi  12,12,1
5de29b
@@ -74,17 +72,17 @@
5de29b
     stw   6,0(3)
5de29b
     addi  3,3,4
5de29b
 0:
5de29b
-    clrldi 10,12,61	/* check alignement of src again.  */     
5de29b
+    clrldi 10,12,61	/* check alignment of src again.  */
5de29b
     srdi  9,31,3	/* Number of full double words remaining.  */
5de29b
-    
5de29b
-  /* Copy doublewords from source to destination, assumpting the
5de29b
+
5de29b
+  /* Copy doublewords from source to destination, assuming the
5de29b
      destination is aligned on a doubleword boundary.
5de29b
 
5de29b
      At this point we know there are at least 25 bytes left (32-7) to copy.
5de29b
-     The next step is to determine if the source is also doubleword aligned. 
5de29b
+     The next step is to determine if the source is also doubleword aligned.
5de29b
      If not branch to the unaligned move code at .L6. which uses
5de29b
      a load, shift, store strategy.
5de29b
-     
5de29b
+
5de29b
      Otherwise source and destination are doubleword aligned, and we can
5de29b
      the optimized doubleword copy loop.  */
5de29b
 .L0:
5de29b
@@ -97,14 +95,14 @@
5de29b
      Use a unrolled loop to copy 4 doubleword (32-bytes) per iteration.
5de29b
      If the copy is not an exact multiple of 32 bytes, 1-3
5de29b
      doublewords are copied as needed to set up the main loop.  After
5de29b
-     the main loop exits there may be a tail of 1-7 bytes. These byte are 
5de29b
+     the main loop exits there may be a tail of 1-7 bytes. These byte are
5de29b
      copied a word/halfword/byte at a time as needed to preserve alignment.  */
5de29b
 
5de29b
     srdi  8,31,5
5de29b
     cmpldi	cr1,9,4
5de29b
     cmpldi	cr6,11,0
5de29b
     mr    11,12
5de29b
-    
5de29b
+
5de29b
     bf    30,1f
5de29b
     ld    6,0(12)
5de29b
     ld    7,8(12)
5de29b
@@ -115,7 +113,7 @@
5de29b
     addi  10,3,16
5de29b
     bf    31,4f
5de29b
     ld    0,16(12)
5de29b
-    std   0,16(3)    
5de29b
+    std   0,16(3)
5de29b
     blt   cr1,3f
5de29b
     addi  11,12,24
5de29b
     addi  10,3,24
5de29b
@@ -129,7 +127,7 @@
5de29b
     addi  11,12,8
5de29b
     std   6,0(3)
5de29b
     addi  10,3,8
5de29b
-    
5de29b
+
5de29b
     .align  4
5de29b
 4:
5de29b
     ld    6,0(11)
5de29b
@@ -144,7 +142,7 @@
5de29b
     std   0,24(10)
5de29b
     addi  10,10,32
5de29b
     bdnz  4b
5de29b
-3:  
5de29b
+3:
5de29b
 
5de29b
     rldicr 0,31,0,60
5de29b
     mtcrf 0x01,31
5de29b
@@ -152,9 +150,9 @@
5de29b
 .L9:
5de29b
     add   3,3,0
5de29b
     add   12,12,0
5de29b
-    
5de29b
+
5de29b
 /*  At this point we have a tail of 0-7 bytes and we know that the
5de29b
-    destiniation is double word aligned.  */
5de29b
+    destination is double word aligned.  */
5de29b
 4:  bf    29,2f
5de29b
     lwz   6,0(12)
5de29b
     addi  12,12,4
5de29b
@@ -173,29 +171,29 @@
5de29b
     ld 31,-8(1)
5de29b
     ld 3,-16(1)
5de29b
     blr
5de29b
-       
5de29b
-/* Copy up to 31 bytes.  This divided into two cases 0-8 bytes and 9-31 
5de29b
-   bytes.  Each case is handled without loops, using binary (1,2,4,8) 
5de29b
-   tests.  
5de29b
-   
5de29b
+
5de29b
+/* Copy up to 31 bytes.  This divided into two cases 0-8 bytes and 9-31
5de29b
+   bytes.  Each case is handled without loops, using binary (1,2,4,8)
5de29b
+   tests.
5de29b
+
5de29b
    In the short (0-8 byte) case no attempt is made to force alignment
5de29b
-   of either source or destination.  The hardware will handle the 
5de29b
-   unaligned load/stores with small delays for crossing 32- 64-byte, and 
5de29b
+   of either source or destination.  The hardware will handle the
5de29b
+   unaligned load/stores with small delays for crossing 32- 64-byte, and
5de29b
    4096-byte boundaries. Since these short moves are unlikely to be
5de29b
-   unaligned or cross these boundaries, the overhead to force 
5de29b
+   unaligned or cross these boundaries, the overhead to force
5de29b
    alignment is not justified.
5de29b
-   
5de29b
+
5de29b
    The longer (9-31 byte) move is more likely to cross 32- or 64-byte
5de29b
    boundaries.  Since only loads are sensitive to the 32-/64-byte
5de29b
-   boundaries it is more important to align the source then the 
5de29b
+   boundaries it is more important to align the source then the
5de29b
    destination.  If the source is not already word aligned, we first
5de29b
-   move 1-3 bytes as needed.  Since we are only word aligned we don't 
5de29b
-   use double word load/stores to insure that all loads are aligned. 
5de29b
+   move 1-3 bytes as needed.  Since we are only word aligned we don't
5de29b
+   use double word load/stores to insure that all loads are aligned.
5de29b
    While the destination and stores may still be unaligned, this
5de29b
    is only an issue for page (4096 byte boundary) crossing, which
5de29b
    should be rare for these short moves.  The hardware handles this
5de29b
-   case automatically with a small delay.  */ 
5de29b
-   
5de29b
+   case automatically with a small delay.  */
5de29b
+
5de29b
     .align  4
5de29b
 .L2:
5de29b
     mtcrf 0x01,5
5de29b
@@ -216,15 +214,28 @@
5de29b
     blt   cr6,5f
5de29b
     srdi  7,6,16
5de29b
     bgt	  cr6,3f
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    sth   7,0(3)
5de29b
+#else
5de29b
     sth   6,0(3)
5de29b
+#endif
5de29b
     b     7f
5de29b
     .align  4
5de29b
 3:
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    rotlwi 6,6,24
5de29b
+    stb   6,0(3)
5de29b
+    sth   7,1(3)
5de29b
+#else
5de29b
     stb   7,0(3)
5de29b
     sth   6,1(3)
5de29b
+#endif
5de29b
     b     7f
5de29b
     .align  4
5de29b
 5:
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    rotlwi 6,6,8
5de29b
+#endif
5de29b
     stb   6,0(3)
5de29b
 7:
5de29b
     cmpldi	cr1,10,16
5de29b
@@ -258,11 +269,11 @@
5de29b
     lwz   6,0(12)
5de29b
     addi  12,12,4
5de29b
     stw   6,0(3)
5de29b
-    addi  3,3,4    
5de29b
+    addi  3,3,4
5de29b
 2:  /* Move 2-3 bytes.  */
5de29b
     bf    30,1f
5de29b
     lhz   6,0(12)
5de29b
-    sth   6,0(3) 
5de29b
+    sth   6,0(3)
5de29b
     bf    31,0f
5de29b
     lbz   7,2(12)
5de29b
     stb   7,2(3)
5de29b
@@ -283,8 +294,8 @@
5de29b
     mr    12,4
5de29b
     bne   cr6,4f
5de29b
 /* Would have liked to use use ld/std here but the 630 processors are
5de29b
-   slow for load/store doubles that are not at least word aligned.  
5de29b
-   Unaligned Load/Store word execute with only a 1 cycle penaltity.  */
5de29b
+   slow for load/store doubles that are not at least word aligned.
5de29b
+   Unaligned Load/Store word execute with only a 1 cycle penalty.  */
5de29b
     lwz   6,0(4)
5de29b
     lwz   7,4(4)
5de29b
     stw   6,0(3)
5de29b
@@ -299,14 +310,14 @@
5de29b
 6:
5de29b
     bf    30,5f
5de29b
     lhz   7,4(4)
5de29b
-    sth   7,4(3) 
5de29b
+    sth   7,4(3)
5de29b
     bf    31,0f
5de29b
     lbz   8,6(4)
5de29b
     stb   8,6(3)
5de29b
     ld 3,-16(1)
5de29b
     blr
5de29b
     .align  4
5de29b
-5:  
5de29b
+5:
5de29b
     bf    31,0f
5de29b
     lbz   6,4(4)
5de29b
     stb   6,4(3)
5de29b
@@ -336,13 +347,23 @@
5de29b
     bf      30,1f
5de29b
 
5de29b
     /* there are at least two DWs to copy */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srd     0,6,10
5de29b
+    sld     8,7,9
5de29b
+#else
5de29b
     sld     0,6,10
5de29b
     srd     8,7,9
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     ld      6,16(5)
5de29b
     std     0,0(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srd     0,7,10
5de29b
+    sld     8,6,9
5de29b
+#else
5de29b
     sld     0,7,10
5de29b
     srd     8,6,9
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     ld      7,24(5)
5de29b
     std     0,8(4)
5de29b
@@ -351,8 +372,13 @@
5de29b
     blt     cr6,8f  /* if total DWs = 3, then bypass loop */
5de29b
     bf      31,4f
5de29b
     /* there is a third DW to copy */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srd     0,6,10
5de29b
+    sld     8,7,9
5de29b
+#else
5de29b
     sld     0,6,10
5de29b
     srd     8,7,9
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     std     0,0(4)
5de29b
     mr      6,7
5de29b
@@ -363,8 +389,13 @@
5de29b
     b       4f
5de29b
     .align 4
5de29b
 1:
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srd     0,6,10
5de29b
+    sld     8,7,9
5de29b
+#else
5de29b
     sld     0,6,10
5de29b
     srd     8,7,9
5de29b
+#endif
5de29b
     addi    5,5,16
5de29b
     or      0,0,8
5de29b
     bf      31,4f
5de29b
@@ -375,23 +406,44 @@
5de29b
     addi    4,4,8
5de29b
     .align 4
5de29b
 /* copy 32 bytes at a time */
5de29b
-4:  sld   0,6,10
5de29b
+4:
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srd   0,6,10
5de29b
+    sld   8,7,9
5de29b
+#else
5de29b
+    sld   0,6,10
5de29b
     srd   8,7,9
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    6,0(5)
5de29b
     std   0,0(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srd   0,7,10
5de29b
+    sld   8,6,9
5de29b
+#else
5de29b
     sld   0,7,10
5de29b
     srd   8,6,9
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    7,8(5)
5de29b
     std   0,8(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srd   0,6,10
5de29b
+    sld   8,7,9
5de29b
+#else
5de29b
     sld   0,6,10
5de29b
     srd   8,7,9
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    6,16(5)
5de29b
     std   0,16(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srd   0,7,10
5de29b
+    sld   8,6,9
5de29b
+#else
5de29b
     sld   0,7,10
5de29b
     srd   8,6,9
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    7,24(5)
5de29b
     std   0,24(4)
5de29b
@@ -401,9 +453,14 @@
5de29b
     .align 4
5de29b
 8:
5de29b
     /* calculate and store the final DW */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srd   0,6,10
5de29b
+    sld   8,7,9
5de29b
+#else
5de29b
     sld   0,6,10
5de29b
     srd   8,7,9
5de29b
-    or    0,0,8  
5de29b
+#endif
5de29b
+    or    0,0,8
5de29b
     std   0,0(4)
5de29b
 3:
5de29b
     rldicr 0,31,0,60
5de29b
@@ -413,5 +470,5 @@
5de29b
     ld 31,-8(1)
5de29b
     ld 3,-16(1)
5de29b
     blr
5de29b
-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
5de29b
+END_GEN_TB (memcpy,TB_TOCLESS)
5de29b
 libc_hidden_builtin_def (memcpy)
12745e
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S
12745e
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S	2014-05-29 13:04:56.000000000 -0500
12745e
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S	2014-05-29 13:05:27.000000000 -0500
5de29b
@@ -1,5 +1,5 @@
5de29b
 /* Optimized memcpy implementation for PowerPC64.
5de29b
-   Copyright (C) 2003, 2006, 2007, 2011 Free Software Foundation, Inc.
5de29b
+   Copyright (C) 2003-2014 Free Software Foundation, Inc.
5de29b
    This file is part of the GNU C Library.
5de29b
 
5de29b
    The GNU C Library is free software; you can redistribute it and/or
5de29b
@@ -17,52 +17,50 @@
5de29b
    <http://www.gnu.org/licenses/>.  */
5de29b
 
5de29b
 #include <sysdep.h>
5de29b
-#include <bp-sym.h>
5de29b
-#include <bp-asm.h>
5de29b
 
5de29b
 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
5de29b
    Returns 'dst'.
5de29b
 
5de29b
-   Memcpy handles short copies (< 32-bytes) using a binary move blocks 
5de29b
-   (no loops) of lwz/stw.  The tail (remaining 1-3) bytes is handled 
5de29b
-   with the appropriate combination of byte and halfword load/stores. 
5de29b
-   There is minimal effort to optimize the alignment of short moves.  
5de29b
+   Memcpy handles short copies (< 32-bytes) using binary move blocks
5de29b
+   (no loops) of lwz/stw.  The tail (remaining 1-3) bytes is handled
5de29b
+   with the appropriate combination of byte and halfword load/stores.
5de29b
+   There is minimal effort to optimize the alignment of short moves.
5de29b
    The 64-bit implementations of POWER3 and POWER4 do a reasonable job
5de29b
-   of handling unligned load/stores that do not cross 32-byte boundries.
5de29b
+   of handling unaligned load/stores that do not cross 32-byte boundaries.
5de29b
 
5de29b
    Longer moves (>= 32-bytes) justify the effort to get at least the
5de29b
    destination doubleword (8-byte) aligned.  Further optimization is
5de29b
-   posible when both source and destination are doubleword aligned.
5de29b
-   Each case has a optimized unrolled loop.  
5de29b
-     
5de29b
-   For POWER6 unaligned loads will take a 20+ cycle hicup for any
5de29b
+   possible when both source and destination are doubleword aligned.
5de29b
+   Each case has an optimized unrolled loop.
5de29b
+
5de29b
+   For POWER6 unaligned loads will take a 20+ cycle hiccup for any
5de29b
    L1 cache miss that crosses a 32- or 128-byte boundary.  Store
5de29b
-   is more forgiving and does not take a hicup until page or 
5de29b
-   segment boundaries.  So we require doubleword alignment for 
5de29b
+   is more forgiving and does not take a hiccup until page or
5de29b
+   segment boundaries.  So we require doubleword alignment for
5de29b
    the source but may take a risk and only require word alignment
5de29b
    for the destination.  */
5de29b
 
5de29b
 	.machine	"power6"
5de29b
-EALIGN (BP_SYM (memcpy), 7, 0)
5de29b
+EALIGN (memcpy, 7, 0)
5de29b
 	CALL_MCOUNT 3
5de29b
 
5de29b
     cmpldi cr1,5,31
5de29b
     neg   0,3
5de29b
     std   3,-16(1)
5de29b
     std   31,-8(1)
5de29b
-    andi. 11,3,7	/* check alignement of dst.  */
5de29b
+    andi. 11,3,7	/* check alignment of dst.  */
5de29b
     clrldi 0,0,61	/* Number of bytes until the 1st doubleword of dst.  */
5de29b
-    clrldi 10,4,61	/* check alignement of src.  */
5de29b
+    clrldi 10,4,61	/* check alignment of src.  */
5de29b
     cmpldi cr6,5,8
5de29b
     ble-  cr1,.L2	/* If move < 32 bytes use short move code.  */
5de29b
     mtcrf 0x01,0
5de29b
-    cmpld cr6,10,11  
5de29b
+    cmpld cr6,10,11
5de29b
     srdi  9,5,3		/* Number of full double words remaining.  */
5de29b
     beq   .L0
5de29b
-  
5de29b
+
5de29b
     subf  5,0,5
5de29b
-  /* Move 0-7 bytes as needed to get the destination doubleword alligned.
5de29b
-     Duplicate some code to maximize fall-throught and minimize agen delays.  */
5de29b
+  /* Move 0-7 bytes as needed to get the destination doubleword aligned.
5de29b
+     Duplicate some code to maximize fall-through and minimize agen delays.  */
5de29b
 1:  bf    31,2f
5de29b
     lbz   6,0(4)
5de29b
     stb   6,0(3)
5de29b
@@ -78,7 +76,7 @@
5de29b
     lwz   6,1(4)
5de29b
     stw   6,1(3)
5de29b
     b     0f
5de29b
-    
5de29b
+
5de29b
 2:  bf    30,4f
5de29b
     lhz   6,0(4)
5de29b
     sth   6,0(3)
5de29b
@@ -86,26 +84,26 @@
5de29b
     lwz   6,2(4)
5de29b
     stw   6,2(3)
5de29b
     b     0f
5de29b
-    
5de29b
+
5de29b
 4:  bf    29,0f
5de29b
     lwz   6,0(4)
5de29b
     stw   6,0(3)
5de29b
-0: 
5de29b
+0:
5de29b
 /* Add the number of bytes until the 1st doubleword of dst to src and dst.  */
5de29b
     add   4,4,0
5de29b
     add   3,3,0
5de29b
-    
5de29b
-    clrldi 10,4,61	/* check alignement of src again.  */     
5de29b
+
5de29b
+    clrldi 10,4,61	/* check alignment of src again.  */
5de29b
     srdi  9,5,3	/* Number of full double words remaining.  */
5de29b
-    
5de29b
-  /* Copy doublewords from source to destination, assumpting the
5de29b
+
5de29b
+  /* Copy doublewords from source to destination, assuming the
5de29b
      destination is aligned on a doubleword boundary.
5de29b
 
5de29b
      At this point we know there are at least 25 bytes left (32-7) to copy.
5de29b
-     The next step is to determine if the source is also doubleword aligned. 
5de29b
+     The next step is to determine if the source is also doubleword aligned.
5de29b
      If not branch to the unaligned move code at .L6. which uses
5de29b
      a load, shift, store strategy.
5de29b
-     
5de29b
+
5de29b
      Otherwise source and destination are doubleword aligned, and we can
5de29b
      the optimized doubleword copy loop.  */
5de29b
     .align  4
5de29b
@@ -123,14 +121,14 @@
5de29b
      the main loop exits there may be a tail of 1-7 bytes. These byte
5de29b
      are copied a word/halfword/byte at a time as needed to preserve
5de29b
      alignment.
5de29b
-     
5de29b
+
5de29b
      For POWER6 the L1 is store-through and the L2 is store-in.  The
5de29b
      L2 is clocked at half CPU clock so we can store 16 bytes every
5de29b
      other cycle.  POWER6 also has a load/store bypass so we can do
5de29b
-     load, load, store, store every 2 cycles.  
5de29b
-     
5de29b
+     load, load, store, store every 2 cycles.
5de29b
+
5de29b
      The following code is sensitive to cache line alignment.  Do not
5de29b
-     make any change with out first making sure thay don't result in
5de29b
+     make any change without first making sure they don't result in
5de29b
      splitting ld/std pairs across a cache line.  */
5de29b
 
5de29b
     mtcrf 0x02,5
5de29b
@@ -273,7 +271,7 @@
5de29b
     std   8,16+96(10)
5de29b
     std   0,24+96(10)
5de29b
     ble   cr5,L(das_loop_e)
5de29b
-    
5de29b
+
5de29b
     mtctr   12
5de29b
     .align  4
5de29b
 L(das_loop2):
5de29b
@@ -326,10 +324,10 @@
5de29b
     .align  4
5de29b
 L(das_tail):
5de29b
     beq   cr1,0f
5de29b
-    
5de29b
+
5de29b
 L(das_tail2):
5de29b
 /*  At this point we have a tail of 0-7 bytes and we know that the
5de29b
-    destiniation is double word aligned.  */
5de29b
+    destination is double word aligned.  */
5de29b
 4:  bf    29,2f
5de29b
     lwz   6,0(4)
5de29b
     stw   6,0(3)
5de29b
@@ -344,7 +342,7 @@
5de29b
     lbz   6,4(4)
5de29b
     stb   6,4(3)
5de29b
     b     0f
5de29b
-  
5de29b
+
5de29b
 2:  bf    30,1f
5de29b
     lhz   6,0(4)
5de29b
     sth   6,0(3)
5de29b
@@ -352,7 +350,7 @@
5de29b
     lbz   6,2(4)
5de29b
     stb   6,2(3)
5de29b
     b     0f
5de29b
-    
5de29b
+
5de29b
 1:  bf    31,0f
5de29b
     lbz   6,0(4)
5de29b
     stb   6,0(3)
5de29b
@@ -361,7 +359,7 @@
5de29b
     ld 3,-16(1)
5de29b
     blr
5de29b
 
5de29b
-/* Copy up to 31 bytes.  This divided into two cases 0-8 bytes and 9-31 
5de29b
+/* Copy up to 31 bytes.  This is divided into two cases 0-8 bytes and 9-31
5de29b
    bytes.  Each case is handled without loops, using binary (1,2,4,8)
5de29b
    tests.
5de29b
 
5de29b
@@ -402,15 +400,28 @@
5de29b
     blt   cr6,5f
5de29b
     srdi  7,6,16
5de29b
     bgt	  cr6,3f
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    sth   7,0(3)
5de29b
+#else
5de29b
     sth   6,0(3)
5de29b
+#endif
5de29b
     b     7f
5de29b
     .align  4
5de29b
 3:
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    rotlwi 6,6,24
5de29b
+    stb   6,0(3)
5de29b
+    sth   7,1(3)
5de29b
+#else
5de29b
     stb   7,0(3)
5de29b
     sth   6,1(3)
5de29b
+#endif
5de29b
     b     7f
5de29b
     .align  4
5de29b
 5:
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    rotlwi 6,6,8
5de29b
+#endif
5de29b
     stb   6,0(3)
5de29b
 7:
5de29b
     cmpldi	cr1,10,16
5de29b
@@ -421,7 +432,7 @@
5de29b
 /* At least 6 bytes left and the source is word aligned.  This allows
5de29b
    some speculative loads up front.  */
5de29b
 /* We need to special case the fall-through because the biggest delays
5de29b
-   are due to address computation not being ready in time for the 
5de29b
+   are due to address computation not being ready in time for the
5de29b
    AGEN.  */
5de29b
     lwz   6,0(12)
5de29b
     lwz   7,4(12)
5de29b
@@ -452,7 +463,7 @@
5de29b
     ld    3,-16(1)
5de29b
     blr
5de29b
     .align  4
5de29b
-L(dus_tail16p8):  /* less then 8 bytes left.  */
5de29b
+L(dus_tail16p8):  /* less than 8 bytes left.  */
5de29b
     beq   cr1,L(dus_tailX) /* exactly 16 bytes, early exit.  */
5de29b
     cmpldi	cr1,10,20
5de29b
     bf    29,L(dus_tail16p2)
5de29b
@@ -466,7 +477,7 @@
5de29b
     ld    3,-16(1)
5de29b
     blr
5de29b
     .align  4
5de29b
-L(dus_tail16p4):  /* less then 4 bytes left.  */
5de29b
+L(dus_tail16p4):  /* less than 4 bytes left.  */
5de29b
     addi  12,12,24
5de29b
     addi  3,3,24
5de29b
     bgt   cr0,L(dus_tail2)
5de29b
@@ -474,7 +485,7 @@
5de29b
     ld    3,-16(1)
5de29b
     blr
5de29b
     .align  4
5de29b
-L(dus_tail16p2):  /* 16 bytes moved, less then 4 bytes left.  */
5de29b
+L(dus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
5de29b
     addi  12,12,16
5de29b
     addi  3,3,16
5de29b
     b     L(dus_tail2)
5de29b
@@ -499,7 +510,7 @@
5de29b
     ld    3,-16(1)
5de29b
     blr
5de29b
     .align  4
5de29b
-L(dus_tail8p4):  /* less then 4 bytes left.  */
5de29b
+L(dus_tail8p4):  /* less than 4 bytes left.  */
5de29b
     addi  12,12,8
5de29b
     addi  3,3,8
5de29b
     bgt   cr1,L(dus_tail2)
5de29b
@@ -510,14 +521,14 @@
5de29b
     .align  4
5de29b
 L(dus_tail4):  /* Move 4 bytes.  */
5de29b
 /*  r6 already loaded speculatively.  If we are here we know there is
5de29b
-    more then 4 bytes left.  So there is no need to test.  */
5de29b
+    more than 4 bytes left.  So there is no need to test.  */
5de29b
     addi  12,12,4
5de29b
     stw   6,0(3)
5de29b
     addi  3,3,4
5de29b
 L(dus_tail2):  /* Move 2-3 bytes.  */
5de29b
     bf    30,L(dus_tail1)
5de29b
     lhz   6,0(12)
5de29b
-    sth   6,0(3) 
5de29b
+    sth   6,0(3)
5de29b
     bf    31,L(dus_tailX)
5de29b
     lbz   7,2(12)
5de29b
     stb   7,2(3)
5de29b
@@ -537,7 +548,7 @@
5de29b
 .LE8:
5de29b
     mr    12,4
5de29b
     bne   cr6,L(dus_4)
5de29b
-/* Exactly 8 bytes.  We may cross a 32-/128-byte boundry and take a ~20
5de29b
+/* Exactly 8 bytes.  We may cross a 32-/128-byte boundary and take a ~20
5de29b
    cycle delay.  This case should be rare and any attempt to avoid this
5de29b
    would take most of 20 cycles any way.  */
5de29b
     ld   6,0(4)
5de29b
@@ -552,7 +563,7 @@
5de29b
     stw   6,0(3)
5de29b
     bf    30,L(dus_5)
5de29b
     lhz   7,4(4)
5de29b
-    sth   7,4(3) 
5de29b
+    sth   7,4(3)
5de29b
     bf    31,L(dus_0)
5de29b
     lbz   8,6(4)
5de29b
     stb   8,6(3)
5de29b
@@ -590,20 +601,31 @@
5de29b
     bge     cr0, L(du4_do)
5de29b
     blt     cr5, L(du1_do)
5de29b
     beq     cr5, L(du2_do)
5de29b
-    b       L(du3_do) 
5de29b
-       
5de29b
+    b       L(du3_do)
5de29b
+
5de29b
     .align 4
5de29b
 L(du1_do):
5de29b
     bf      30,L(du1_1dw)
5de29b
 
5de29b
     /* there are at least two DWs to copy */
5de29b
+    /* FIXME: can combine last shift and "or" into "rldimi" */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 8
5de29b
+    sldi     8,7, 64-8
5de29b
+#else
5de29b
     sldi     0,6, 8
5de29b
     srdi     8,7, 64-8
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     ld      6,16(5)
5de29b
     std     0,0(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,7, 8
5de29b
+    sldi     8,6, 64-8
5de29b
+#else
5de29b
     sldi     0,7, 8
5de29b
     srdi     8,6, 64-8
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     ld      7,24(5)
5de29b
     std     0,8(4)
5de29b
@@ -612,8 +634,13 @@
5de29b
     blt     cr6,L(du1_fini)  /* if total DWs = 3, then bypass loop */
5de29b
     bf      31,L(du1_loop)
5de29b
     /* there is a third DW to copy */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 8
5de29b
+    sldi     8,7, 64-8
5de29b
+#else
5de29b
     sldi     0,6, 8
5de29b
     srdi     8,7, 64-8
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     std     0,0(4)
5de29b
     mr      6,7
5de29b
@@ -624,8 +651,13 @@
5de29b
     b       L(du1_loop)
5de29b
     .align 4
5de29b
 L(du1_1dw):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 8
5de29b
+    sldi     8,7, 64-8
5de29b
+#else
5de29b
     sldi     0,6, 8
5de29b
     srdi     8,7, 64-8
5de29b
+#endif
5de29b
     addi    5,5,16
5de29b
     or      0,0,8
5de29b
     bf      31,L(du1_loop)
5de29b
@@ -637,23 +669,43 @@
5de29b
     .align 4
5de29b
 /* copy 32 bytes at a time */
5de29b
 L(du1_loop):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 8
5de29b
+    sldi   8,7, 64-8
5de29b
+#else
5de29b
     sldi   0,6, 8
5de29b
     srdi   8,7, 64-8
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    6,0(5)
5de29b
     std   0,0(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,7, 8
5de29b
+    sldi   8,6, 64-8
5de29b
+#else
5de29b
     sldi   0,7, 8
5de29b
     srdi   8,6, 64-8
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    7,8(5)
5de29b
     std   0,8(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 8
5de29b
+    sldi   8,7, 64-8
5de29b
+#else
5de29b
     sldi   0,6, 8
5de29b
     srdi   8,7, 64-8
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    6,16(5)
5de29b
     std   0,16(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,7, 8
5de29b
+    sldi   8,6, 64-8
5de29b
+#else
5de29b
     sldi   0,7, 8
5de29b
     srdi   8,6, 64-8
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    7,24(5)
5de29b
     std   0,24(4)
5de29b
@@ -663,9 +715,14 @@
5de29b
     .align 4
5de29b
 L(du1_fini):
5de29b
     /* calculate and store the final DW */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 8
5de29b
+    sldi   8,7, 64-8
5de29b
+#else
5de29b
     sldi   0,6, 8
5de29b
     srdi   8,7, 64-8
5de29b
-    or    0,0,8  
5de29b
+#endif
5de29b
+    or    0,0,8
5de29b
     std   0,0(4)
5de29b
     b     L(du_done)
5de29b
 
5de29b
@@ -674,13 +731,23 @@
5de29b
     bf      30,L(du2_1dw)
5de29b
 
5de29b
     /* there are at least two DWs to copy */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 16
5de29b
+    sldi     8,7, 64-16
5de29b
+#else
5de29b
     sldi     0,6, 16
5de29b
     srdi     8,7, 64-16
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     ld      6,16(5)
5de29b
     std     0,0(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,7, 16
5de29b
+    sldi     8,6, 64-16
5de29b
+#else
5de29b
     sldi     0,7, 16
5de29b
     srdi     8,6, 64-16
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     ld      7,24(5)
5de29b
     std     0,8(4)
5de29b
@@ -689,8 +756,13 @@
5de29b
     blt     cr6,L(du2_fini)  /* if total DWs = 3, then bypass loop */
5de29b
     bf      31,L(du2_loop)
5de29b
     /* there is a third DW to copy */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 16
5de29b
+    sldi     8,7, 64-16
5de29b
+#else
5de29b
     sldi     0,6, 16
5de29b
     srdi     8,7, 64-16
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     std     0,0(4)
5de29b
     mr      6,7
5de29b
@@ -701,8 +773,13 @@
5de29b
     b       L(du2_loop)
5de29b
     .align 4
5de29b
 L(du2_1dw):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 16
5de29b
+    sldi     8,7, 64-16
5de29b
+#else
5de29b
     sldi     0,6, 16
5de29b
     srdi     8,7, 64-16
5de29b
+#endif
5de29b
     addi    5,5,16
5de29b
     or      0,0,8
5de29b
     bf      31,L(du2_loop)
5de29b
@@ -714,23 +791,43 @@
5de29b
     .align 4
5de29b
 /* copy 32 bytes at a time */
5de29b
 L(du2_loop):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 16
5de29b
+    sldi   8,7, 64-16
5de29b
+#else
5de29b
     sldi   0,6, 16
5de29b
     srdi   8,7, 64-16
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    6,0(5)
5de29b
     std   0,0(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,7, 16
5de29b
+    sldi   8,6, 64-16
5de29b
+#else
5de29b
     sldi   0,7, 16
5de29b
     srdi   8,6, 64-16
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    7,8(5)
5de29b
     std   0,8(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 16
5de29b
+    sldi   8,7, 64-16
5de29b
+#else
5de29b
     sldi   0,6, 16
5de29b
     srdi   8,7, 64-16
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    6,16(5)
5de29b
     std   0,16(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,7, 16
5de29b
+    sldi   8,6, 64-16
5de29b
+#else
5de29b
     sldi   0,7, 16
5de29b
     srdi   8,6, 64-16
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    7,24(5)
5de29b
     std   0,24(4)
5de29b
@@ -740,9 +837,14 @@
5de29b
     .align 4
5de29b
 L(du2_fini):
5de29b
     /* calculate and store the final DW */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 16
5de29b
+    sldi   8,7, 64-16
5de29b
+#else
5de29b
     sldi   0,6, 16
5de29b
     srdi   8,7, 64-16
5de29b
-    or    0,0,8  
5de29b
+#endif
5de29b
+    or    0,0,8
5de29b
     std   0,0(4)
5de29b
     b     L(du_done)
5de29b
 
5de29b
@@ -751,13 +853,23 @@
5de29b
     bf      30,L(du3_1dw)
5de29b
 
5de29b
     /* there are at least two DWs to copy */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 24
5de29b
+    sldi     8,7, 64-24
5de29b
+#else
5de29b
     sldi     0,6, 24
5de29b
     srdi     8,7, 64-24
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     ld      6,16(5)
5de29b
     std     0,0(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,7, 24
5de29b
+    sldi     8,6, 64-24
5de29b
+#else
5de29b
     sldi     0,7, 24
5de29b
     srdi     8,6, 64-24
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     ld      7,24(5)
5de29b
     std     0,8(4)
5de29b
@@ -766,8 +878,13 @@
5de29b
     blt     cr6,L(du3_fini)  /* if total DWs = 3, then bypass loop */
5de29b
     bf      31,L(du3_loop)
5de29b
     /* there is a third DW to copy */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 24
5de29b
+    sldi     8,7, 64-24
5de29b
+#else
5de29b
     sldi     0,6, 24
5de29b
     srdi     8,7, 64-24
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     std     0,0(4)
5de29b
     mr      6,7
5de29b
@@ -778,8 +895,13 @@
5de29b
     b       L(du3_loop)
5de29b
     .align 4
5de29b
 L(du3_1dw):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 24
5de29b
+    sldi     8,7, 64-24
5de29b
+#else
5de29b
     sldi     0,6, 24
5de29b
     srdi     8,7, 64-24
5de29b
+#endif
5de29b
     addi    5,5,16
5de29b
     or      0,0,8
5de29b
     bf      31,L(du3_loop)
5de29b
@@ -791,23 +913,43 @@
5de29b
     .align 4
5de29b
 /* copy 32 bytes at a time */
5de29b
 L(du3_loop):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 24
5de29b
+    sldi   8,7, 64-24
5de29b
+#else
5de29b
     sldi   0,6, 24
5de29b
     srdi   8,7, 64-24
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    6,0(5)
5de29b
     std   0,0(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,7, 24
5de29b
+    sldi   8,6, 64-24
5de29b
+#else
5de29b
     sldi   0,7, 24
5de29b
     srdi   8,6, 64-24
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    7,8(5)
5de29b
     std   0,8(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 24
5de29b
+    sldi   8,7, 64-24
5de29b
+#else
5de29b
     sldi   0,6, 24
5de29b
     srdi   8,7, 64-24
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    6,16(5)
5de29b
     std   0,16(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,7, 24
5de29b
+    sldi   8,6, 64-24
5de29b
+#else
5de29b
     sldi   0,7, 24
5de29b
     srdi   8,6, 64-24
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    7,24(5)
5de29b
     std   0,24(4)
5de29b
@@ -817,9 +959,14 @@
5de29b
     .align 4
5de29b
 L(du3_fini):
5de29b
     /* calculate and store the final DW */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 24
5de29b
+    sldi   8,7, 64-24
5de29b
+#else
5de29b
     sldi   0,6, 24
5de29b
     srdi   8,7, 64-24
5de29b
-    or    0,0,8  
5de29b
+#endif
5de29b
+    or    0,0,8
5de29b
     std   0,0(4)
5de29b
     b     L(du_done)
5de29b
 
5de29b
@@ -834,13 +981,23 @@
5de29b
     bf      30,L(du4_1dw)
5de29b
 
5de29b
     /* there are at least two DWs to copy */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 32
5de29b
+    sldi     8,7, 64-32
5de29b
+#else
5de29b
     sldi     0,6, 32
5de29b
     srdi     8,7, 64-32
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     ld      6,16(5)
5de29b
     std     0,0(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,7, 32
5de29b
+    sldi     8,6, 64-32
5de29b
+#else
5de29b
     sldi     0,7, 32
5de29b
     srdi     8,6, 64-32
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     ld      7,24(5)
5de29b
     std     0,8(4)
5de29b
@@ -849,8 +1006,13 @@
5de29b
     blt     cr6,L(du4_fini)  /* if total DWs = 3, then bypass loop */
5de29b
     bf      31,L(du4_loop)
5de29b
     /* there is a third DW to copy */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 32
5de29b
+    sldi     8,7, 64-32
5de29b
+#else
5de29b
     sldi     0,6, 32
5de29b
     srdi     8,7, 64-32
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     std     0,0(4)
5de29b
     mr      6,7
5de29b
@@ -861,8 +1023,13 @@
5de29b
     b       L(du4_loop)
5de29b
     .align 4
5de29b
 L(du4_1dw):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 32
5de29b
+    sldi     8,7, 64-32
5de29b
+#else
5de29b
     sldi     0,6, 32
5de29b
     srdi     8,7, 64-32
5de29b
+#endif
5de29b
     addi    5,5,16
5de29b
     or      0,0,8
5de29b
     bf      31,L(du4_loop)
5de29b
@@ -874,23 +1041,43 @@
5de29b
     .align 4
5de29b
 /* copy 32 bytes at a time */
5de29b
 L(du4_loop):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 32
5de29b
+    sldi   8,7, 64-32
5de29b
+#else
5de29b
     sldi   0,6, 32
5de29b
     srdi   8,7, 64-32
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    6,0(5)
5de29b
     std   0,0(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,7, 32
5de29b
+    sldi   8,6, 64-32
5de29b
+#else
5de29b
     sldi   0,7, 32
5de29b
     srdi   8,6, 64-32
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    7,8(5)
5de29b
     std   0,8(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 32
5de29b
+    sldi   8,7, 64-32
5de29b
+#else
5de29b
     sldi   0,6, 32
5de29b
     srdi   8,7, 64-32
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    6,16(5)
5de29b
     std   0,16(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,7, 32
5de29b
+    sldi   8,6, 64-32
5de29b
+#else
5de29b
     sldi   0,7, 32
5de29b
     srdi   8,6, 64-32
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    7,24(5)
5de29b
     std   0,24(4)
5de29b
@@ -900,9 +1087,14 @@
5de29b
     .align 4
5de29b
 L(du4_fini):
5de29b
     /* calculate and store the final DW */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 32
5de29b
+    sldi   8,7, 64-32
5de29b
+#else
5de29b
     sldi   0,6, 32
5de29b
     srdi   8,7, 64-32
5de29b
-    or    0,0,8  
5de29b
+#endif
5de29b
+    or    0,0,8
5de29b
     std   0,0(4)
5de29b
     b     L(du_done)
5de29b
 
5de29b
@@ -911,13 +1103,23 @@
5de29b
     bf      30,L(du5_1dw)
5de29b
 
5de29b
     /* there are at least two DWs to copy */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 40
5de29b
+    sldi     8,7, 64-40
5de29b
+#else
5de29b
     sldi     0,6, 40
5de29b
     srdi     8,7, 64-40
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     ld      6,16(5)
5de29b
     std     0,0(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,7, 40
5de29b
+    sldi     8,6, 64-40
5de29b
+#else
5de29b
     sldi     0,7, 40
5de29b
     srdi     8,6, 64-40
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     ld      7,24(5)
5de29b
     std     0,8(4)
5de29b
@@ -926,8 +1128,13 @@
5de29b
     blt     cr6,L(du5_fini)  /* if total DWs = 3, then bypass loop */
5de29b
     bf      31,L(du5_loop)
5de29b
     /* there is a third DW to copy */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 40
5de29b
+    sldi     8,7, 64-40
5de29b
+#else
5de29b
     sldi     0,6, 40
5de29b
     srdi     8,7, 64-40
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     std     0,0(4)
5de29b
     mr      6,7
5de29b
@@ -938,8 +1145,13 @@
5de29b
     b       L(du5_loop)
5de29b
     .align 4
5de29b
 L(du5_1dw):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 40
5de29b
+    sldi     8,7, 64-40
5de29b
+#else
5de29b
     sldi     0,6, 40
5de29b
     srdi     8,7, 64-40
5de29b
+#endif
5de29b
     addi    5,5,16
5de29b
     or      0,0,8
5de29b
     bf      31,L(du5_loop)
5de29b
@@ -951,23 +1163,43 @@
5de29b
     .align 4
5de29b
 /* copy 32 bytes at a time */
5de29b
 L(du5_loop):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 40
5de29b
+    sldi   8,7, 64-40
5de29b
+#else
5de29b
     sldi   0,6, 40
5de29b
     srdi   8,7, 64-40
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    6,0(5)
5de29b
     std   0,0(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,7, 40
5de29b
+    sldi   8,6, 64-40
5de29b
+#else
5de29b
     sldi   0,7, 40
5de29b
     srdi   8,6, 64-40
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    7,8(5)
5de29b
     std   0,8(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 40
5de29b
+    sldi   8,7, 64-40
5de29b
+#else
5de29b
     sldi   0,6, 40
5de29b
     srdi   8,7, 64-40
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    6,16(5)
5de29b
     std   0,16(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,7, 40
5de29b
+    sldi   8,6, 64-40
5de29b
+#else
5de29b
     sldi   0,7, 40
5de29b
     srdi   8,6, 64-40
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    7,24(5)
5de29b
     std   0,24(4)
5de29b
@@ -977,9 +1209,14 @@
5de29b
     .align 4
5de29b
 L(du5_fini):
5de29b
     /* calculate and store the final DW */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 40
5de29b
+    sldi   8,7, 64-40
5de29b
+#else
5de29b
     sldi   0,6, 40
5de29b
     srdi   8,7, 64-40
5de29b
-    or    0,0,8  
5de29b
+#endif
5de29b
+    or    0,0,8
5de29b
     std   0,0(4)
5de29b
     b     L(du_done)
5de29b
 
5de29b
@@ -988,13 +1225,23 @@
5de29b
     bf      30,L(du6_1dw)
5de29b
 
5de29b
     /* there are at least two DWs to copy */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 48
5de29b
+    sldi     8,7, 64-48
5de29b
+#else
5de29b
     sldi     0,6, 48
5de29b
     srdi     8,7, 64-48
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     ld      6,16(5)
5de29b
     std     0,0(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,7, 48
5de29b
+    sldi     8,6, 64-48
5de29b
+#else
5de29b
     sldi     0,7, 48
5de29b
     srdi     8,6, 64-48
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     ld      7,24(5)
5de29b
     std     0,8(4)
5de29b
@@ -1003,8 +1250,13 @@
5de29b
     blt     cr6,L(du6_fini)  /* if total DWs = 3, then bypass loop */
5de29b
     bf      31,L(du6_loop)
5de29b
     /* there is a third DW to copy */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 48
5de29b
+    sldi     8,7, 64-48
5de29b
+#else
5de29b
     sldi     0,6, 48
5de29b
     srdi     8,7, 64-48
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     std     0,0(4)
5de29b
     mr      6,7
5de29b
@@ -1015,8 +1267,13 @@
5de29b
     b       L(du6_loop)
5de29b
     .align 4
5de29b
 L(du6_1dw):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 48
5de29b
+    sldi     8,7, 64-48
5de29b
+#else
5de29b
     sldi     0,6, 48
5de29b
     srdi     8,7, 64-48
5de29b
+#endif
5de29b
     addi    5,5,16
5de29b
     or      0,0,8
5de29b
     bf      31,L(du6_loop)
5de29b
@@ -1028,23 +1285,43 @@
5de29b
     .align 4
5de29b
 /* copy 32 bytes at a time */
5de29b
 L(du6_loop):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 48
5de29b
+    sldi   8,7, 64-48
5de29b
+#else
5de29b
     sldi   0,6, 48
5de29b
     srdi   8,7, 64-48
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    6,0(5)
5de29b
     std   0,0(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,7, 48
5de29b
+    sldi   8,6, 64-48
5de29b
+#else
5de29b
     sldi   0,7, 48
5de29b
     srdi   8,6, 64-48
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    7,8(5)
5de29b
     std   0,8(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 48
5de29b
+    sldi   8,7, 64-48
5de29b
+#else
5de29b
     sldi   0,6, 48
5de29b
     srdi   8,7, 64-48
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    6,16(5)
5de29b
     std   0,16(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,7, 48
5de29b
+    sldi   8,6, 64-48
5de29b
+#else
5de29b
     sldi   0,7, 48
5de29b
     srdi   8,6, 64-48
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    7,24(5)
5de29b
     std   0,24(4)
5de29b
@@ -1054,9 +1331,14 @@
5de29b
     .align 4
5de29b
 L(du6_fini):
5de29b
     /* calculate and store the final DW */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 48
5de29b
+    sldi   8,7, 64-48
5de29b
+#else
5de29b
     sldi   0,6, 48
5de29b
     srdi   8,7, 64-48
5de29b
-    or    0,0,8  
5de29b
+#endif
5de29b
+    or    0,0,8
5de29b
     std   0,0(4)
5de29b
     b     L(du_done)
5de29b
 
5de29b
@@ -1065,13 +1347,23 @@
5de29b
     bf      30,L(du7_1dw)
5de29b
 
5de29b
     /* there are at least two DWs to copy */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 56
5de29b
+    sldi     8,7, 64-56
5de29b
+#else
5de29b
     sldi     0,6, 56
5de29b
     srdi     8,7, 64-56
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     ld      6,16(5)
5de29b
     std     0,0(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,7, 56
5de29b
+    sldi     8,6, 64-56
5de29b
+#else
5de29b
     sldi     0,7, 56
5de29b
     srdi     8,6, 64-56
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     ld      7,24(5)
5de29b
     std     0,8(4)
5de29b
@@ -1080,8 +1372,13 @@
5de29b
     blt     cr6,L(du7_fini)  /* if total DWs = 3, then bypass loop */
5de29b
     bf      31,L(du7_loop)
5de29b
     /* there is a third DW to copy */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 56
5de29b
+    sldi     8,7, 64-56
5de29b
+#else
5de29b
     sldi     0,6, 56
5de29b
     srdi     8,7, 64-56
5de29b
+#endif
5de29b
     or      0,0,8
5de29b
     std     0,0(4)
5de29b
     mr      6,7
5de29b
@@ -1092,8 +1389,13 @@
5de29b
     b       L(du7_loop)
5de29b
     .align 4
5de29b
 L(du7_1dw):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi     0,6, 56
5de29b
+    sldi     8,7, 64-56
5de29b
+#else
5de29b
     sldi     0,6, 56
5de29b
     srdi     8,7, 64-56
5de29b
+#endif
5de29b
     addi    5,5,16
5de29b
     or      0,0,8
5de29b
     bf      31,L(du7_loop)
5de29b
@@ -1105,23 +1407,43 @@
5de29b
     .align 4
5de29b
 /* copy 32 bytes at a time */
5de29b
 L(du7_loop):
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 56
5de29b
+    sldi   8,7, 64-56
5de29b
+#else
5de29b
     sldi   0,6, 56
5de29b
     srdi   8,7, 64-56
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    6,0(5)
5de29b
     std   0,0(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,7, 56
5de29b
+    sldi   8,6, 64-56
5de29b
+#else
5de29b
     sldi   0,7, 56
5de29b
     srdi   8,6, 64-56
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    7,8(5)
5de29b
     std   0,8(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 56
5de29b
+    sldi   8,7, 64-56
5de29b
+#else
5de29b
     sldi   0,6, 56
5de29b
     srdi   8,7, 64-56
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    6,16(5)
5de29b
     std   0,16(4)
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,7, 56
5de29b
+    sldi   8,6, 64-56
5de29b
+#else
5de29b
     sldi   0,7, 56
5de29b
     srdi   8,6, 64-56
5de29b
+#endif
5de29b
     or    0,0,8
5de29b
     ld    7,24(5)
5de29b
     std   0,24(4)
5de29b
@@ -1131,12 +1453,17 @@
5de29b
     .align 4
5de29b
 L(du7_fini):
5de29b
     /* calculate and store the final DW */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+    srdi   0,6, 56
5de29b
+    sldi   8,7, 64-56
5de29b
+#else
5de29b
     sldi   0,6, 56
5de29b
     srdi   8,7, 64-56
5de29b
-    or    0,0,8  
5de29b
+#endif
5de29b
+    or    0,0,8
5de29b
     std   0,0(4)
5de29b
     b     L(du_done)
5de29b
-    
5de29b
+
5de29b
     .align 4
5de29b
 L(du_done):
5de29b
     rldicr 0,31,0,60
5de29b
@@ -1144,9 +1471,9 @@
5de29b
     beq   cr1,0f	/* If the tail is 0 bytes we are done!  */
5de29b
 
5de29b
     add   3,3,0
5de29b
-    add   12,12,0    
5de29b
+    add   12,12,0
5de29b
 /*  At this point we have a tail of 0-7 bytes and we know that the
5de29b
-    destiniation is double word aligned.  */
5de29b
+    destination is double word aligned.  */
5de29b
 4:  bf    29,2f
5de29b
     lwz   6,0(12)
5de29b
     addi  12,12,4
5de29b
@@ -1165,5 +1492,5 @@
5de29b
     ld 31,-8(1)
5de29b
     ld 3,-16(1)
5de29b
     blr
5de29b
-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
5de29b
+END_GEN_TB (memcpy,TB_TOCLESS)
5de29b
 libc_hidden_builtin_def (memcpy)
12745e
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S
12745e
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S	2014-05-29 13:04:56.000000000 -0500
12745e
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S	2014-05-29 13:05:40.000000000 -0500
5de29b
@@ -1,5 +1,5 @@
5de29b
 /* Optimized memcpy implementation for PowerPC64/POWER7.
5de29b
-   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
5de29b
+   Copyright (C) 2010-2014 Free Software Foundation, Inc.
5de29b
    Contributed by Luis Machado <luisgpm@br.ibm.com>.
5de29b
    This file is part of the GNU C Library.
5de29b
 
5de29b
@@ -18,425 +18,366 @@
5de29b
    <http://www.gnu.org/licenses/>.  */
5de29b
 
5de29b
 #include <sysdep.h>
5de29b
-#include <bp-sym.h>
5de29b
-#include <bp-asm.h>
5de29b
 
5de29b
 
5de29b
 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
5de29b
    Returns 'dst'.  */
5de29b
 
5de29b
+#define dst 11		/* Use r11 so r3 kept unchanged.  */
5de29b
+#define src 4
5de29b
+#define cnt 5
5de29b
+
5de29b
 	.machine power7
5de29b
-EALIGN (BP_SYM (memcpy), 5, 0)
5de29b
+EALIGN (memcpy, 5, 0)
5de29b
 	CALL_MCOUNT 3
5de29b
 
5de29b
-	cmpldi  cr1,5,31
5de29b
+	cmpldi	cr1,cnt,31
5de29b
 	neg	0,3
5de29b
-	std	3,-16(1)
5de29b
-	std	31,-8(1)
5de29b
-	cfi_offset(31,-8)
5de29b
 	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
5de29b
 				    code.  */
5de29b
 
5de29b
-	andi.   11,3,7	      /* Check alignment of DST.  */
5de29b
-
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+/* In little-endian mode, power7 takes an alignment trap on any lxvd2x
5de29b
+   or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy
5de29b
+   loop is only used for quadword aligned copies.  */
5de29b
+	andi.	10,3,15
5de29b
+	clrldi	11,4,60
5de29b
+#else
5de29b
+	andi.	10,3,7		/* Check alignment of DST.  */
5de29b
+	clrldi	11,4,61		/* Check alignment of SRC.  */
5de29b
+#endif
5de29b
+	cmpld	cr6,10,11	/* SRC and DST alignments match?  */
5de29b
 
5de29b
-	clrldi  10,4,61       /* Check alignment of SRC.  */
5de29b
-	cmpld   cr6,10,11     /* SRC and DST alignments match?  */
5de29b
-	mr	12,4
5de29b
-	mr	31,5
5de29b
+	mr	dst,3
5de29b
 	bne	cr6,L(copy_GE_32_unaligned)
5de29b
+	beq	L(aligned_copy)
5de29b
 
5de29b
-	srdi    9,5,3	      /* Number of full quadwords remaining.  */
5de29b
-
5de29b
-	beq    L(copy_GE_32_aligned_cont)
5de29b
-
5de29b
-	clrldi  0,0,61
5de29b
-	mtcrf   0x01,0
5de29b
-	subf    31,0,5
5de29b
-
5de29b
-	/* Get the SRC aligned to 8 bytes.  */
5de29b
-
5de29b
-1:	bf	31,2f
5de29b
-	lbz	6,0(12)
5de29b
-	addi    12,12,1
5de29b
-	stb	6,0(3)
5de29b
-	addi    3,3,1
5de29b
-2:	bf      30,4f
5de29b
-	lhz     6,0(12)
5de29b
-	addi    12,12,2
5de29b
-	sth     6,0(3)
5de29b
-	addi    3,3,2
5de29b
-4:	bf      29,0f
5de29b
-	lwz     6,0(12)
5de29b
-	addi    12,12,4
5de29b
-	stw     6,0(3)
5de29b
-	addi    3,3,4
5de29b
-0:
5de29b
-	clrldi  10,12,61      /* Check alignment of SRC again.  */
5de29b
-	srdi    9,31,3	      /* Number of full doublewords remaining.  */
5de29b
-
5de29b
-L(copy_GE_32_aligned_cont):
5de29b
-
5de29b
-	clrldi  11,31,61
5de29b
-	mtcrf   0x01,9
5de29b
-
5de29b
-	srdi    8,31,5
5de29b
-	cmpldi  cr1,9,4
5de29b
-	cmpldi  cr6,11,0
5de29b
-	mr	11,12
5de29b
+	mtocrf	0x01,0
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	clrldi	0,0,60
5de29b
+#else
5de29b
+	clrldi	0,0,61
5de29b
+#endif
5de29b
 
5de29b
-	/* Copy 1~3 doublewords so the main loop starts
5de29b
-	at a multiple of 32 bytes.  */
5de29b
-
5de29b
-	bf	30,1f
5de29b
-	ld      6,0(12)
5de29b
-	ld      7,8(12)
5de29b
-	addi    11,12,16
5de29b
-	mtctr   8
5de29b
-	std     6,0(3)
5de29b
-	std     7,8(3)
5de29b
-	addi    10,3,16
5de29b
-	bf      31,4f
5de29b
-	ld      0,16(12)
5de29b
-	std     0,16(3)
5de29b
-	blt     cr1,3f
5de29b
-	addi    11,12,24
5de29b
-	addi    10,3,24
5de29b
-	b       4f
5de29b
-
5de29b
-	.align  4
5de29b
-1:	/* Copy 1 doubleword and set the counter.  */
5de29b
-	mr	10,3
5de29b
-	mtctr   8
5de29b
-	bf      31,4f
5de29b
-	ld      6,0(12)
5de29b
-	addi    11,12,8
5de29b
-	std     6,0(3)
5de29b
-	addi    10,3,8
5de29b
+/* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
5de29b
+1:
5de29b
+	bf	31,2f
5de29b
+	lbz	6,0(src)
5de29b
+	addi	src,src,1
5de29b
+	stb	6,0(dst)
5de29b
+	addi	dst,dst,1
5de29b
+2:
5de29b
+	bf	30,4f
5de29b
+	lhz	6,0(src)
5de29b
+	addi	src,src,2
5de29b
+	sth	6,0(dst)
5de29b
+	addi	dst,dst,2
5de29b
+4:
5de29b
+	bf	29,8f
5de29b
+	lwz	6,0(src)
5de29b
+	addi	src,src,4
5de29b
+	stw	6,0(dst)
5de29b
+	addi	dst,dst,4
5de29b
+8:
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	bf	28,16f
5de29b
+	ld	6,0(src)
5de29b
+	addi	src,src,8
5de29b
+	std	6,0(dst)
5de29b
+	addi	dst,dst,8
5de29b
+16:
5de29b
+#endif
5de29b
+	subf	cnt,0,cnt
5de29b
 
5de29b
+/* Main aligned copy loop. Copies 128 bytes at a time. */
5de29b
 L(aligned_copy):
5de29b
-	/* Main aligned copy loop. Copies up to 128-bytes at a time. */
5de29b
-	.align  4
5de29b
-4:
5de29b
-	/* check for any 32-byte or 64-byte lumps that are outside of a
5de29b
-	   nice 128-byte range.  R8 contains the number of 32-byte
5de29b
-	   lumps, so drop this into the CR, and use the SO/EQ bits to help
5de29b
-	   handle the 32- or 64- byte lumps.  Then handle the rest with an
5de29b
-	   unrolled 128-bytes-at-a-time copy loop. */
5de29b
-	mtocrf	1,8
5de29b
-	li	6,16	# 16() index
5de29b
-	li	7,32	# 32() index
5de29b
-	li	8,48	# 48() index
5de29b
-
5de29b
-L(aligned_32byte):
5de29b
-	/* if the SO bit (indicating a 32-byte lump) is not set, move along. */
5de29b
-	bns	cr7,L(aligned_64byte)
5de29b
-	lxvd2x	6,0,11
5de29b
-	lxvd2x	7,11,6
5de29b
-	addi	11,11,32
5de29b
-	stxvd2x	6,0,10
5de29b
-	stxvd2x	7,10,6
5de29b
-	addi	10,10,32
5de29b
-
5de29b
-L(aligned_64byte):
5de29b
-	/* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
5de29b
-	bne	cr7,L(aligned_128setup)
5de29b
-	lxvd2x	6,0,11
5de29b
-	lxvd2x	7,11,6
5de29b
-	lxvd2x	8,11,7
5de29b
-	lxvd2x	9,11,8
5de29b
-	addi	11,11,64
5de29b
-	stxvd2x	6,0,10
5de29b
-	stxvd2x	7,10,6
5de29b
-	stxvd2x	8,10,7
5de29b
-	stxvd2x	9,10,8
5de29b
-	addi	10,10,64
5de29b
-
5de29b
-L(aligned_128setup):
5de29b
-	/* Set up for the 128-byte at a time copy loop.  */
5de29b
-	srdi	8,31,7
5de29b
-	cmpdi	8,0	# Any 4x lumps left?
5de29b
-	beq	3f	# if not, move along.
5de29b
-	lxvd2x	6,0,11
5de29b
-	lxvd2x	7,11,6
5de29b
-	mtctr	8	# otherwise, load the ctr and begin.
5de29b
-	li	8,48	# 48() index
5de29b
+	li	6,16
5de29b
+	li	7,32
5de29b
+	li	8,48
5de29b
+	mtocrf	0x02,cnt
5de29b
+	srdi	12,cnt,7
5de29b
+	cmpdi	12,0
5de29b
+	beq	L(aligned_tail)
5de29b
+	lxvd2x	6,0,src
5de29b
+	lxvd2x	7,src,6
5de29b
+	mtctr	12
5de29b
 	b	L(aligned_128loop)
5de29b
 
5de29b
+	.align  4
5de29b
 L(aligned_128head):
5de29b
 	/* for the 2nd + iteration of this loop. */
5de29b
-	lxvd2x	6,0,11
5de29b
-	lxvd2x	7,11,6
5de29b
+	lxvd2x	6,0,src
5de29b
+	lxvd2x	7,src,6
5de29b
 L(aligned_128loop):
5de29b
-	lxvd2x	8,11,7
5de29b
-	lxvd2x	9,11,8
5de29b
-	stxvd2x	6,0,10
5de29b
-	addi	11,11,64
5de29b
-	stxvd2x	7,10,6
5de29b
-	stxvd2x	8,10,7
5de29b
-	stxvd2x	9,10,8
5de29b
-	lxvd2x	6,0,11
5de29b
-	lxvd2x	7,11,6
5de29b
-	addi	10,10,64
5de29b
-	lxvd2x	8,11,7
5de29b
-	lxvd2x	9,11,8
5de29b
-	addi	11,11,64
5de29b
-	stxvd2x	6,0,10
5de29b
-	stxvd2x	7,10,6
5de29b
-	stxvd2x	8,10,7
5de29b
-	stxvd2x	9,10,8
5de29b
-	addi	10,10,64
5de29b
+	lxvd2x	8,src,7
5de29b
+	lxvd2x	9,src,8
5de29b
+	stxvd2x	6,0,dst
5de29b
+	addi	src,src,64
5de29b
+	stxvd2x	7,dst,6
5de29b
+	stxvd2x	8,dst,7
5de29b
+	stxvd2x	9,dst,8
5de29b
+	lxvd2x	6,0,src
5de29b
+	lxvd2x	7,src,6
5de29b
+	addi	dst,dst,64
5de29b
+	lxvd2x	8,src,7
5de29b
+	lxvd2x	9,src,8
5de29b
+	addi	src,src,64
5de29b
+	stxvd2x	6,0,dst
5de29b
+	stxvd2x	7,dst,6
5de29b
+	stxvd2x	8,dst,7
5de29b
+	stxvd2x	9,dst,8
5de29b
+	addi	dst,dst,64
5de29b
 	bdnz	L(aligned_128head)
5de29b
 
5de29b
-3:
5de29b
-	/* Check for tail bytes.  */
5de29b
-	rldicr  0,31,0,60
5de29b
-	mtcrf   0x01,31
5de29b
-	beq	cr6,0f
5de29b
-
5de29b
-.L9:
5de29b
-	add	3,3,0
5de29b
-	add	12,12,0
5de29b
-
5de29b
-	/*  At this point we have a tail of 0-7 bytes and we know that the
5de29b
-	destination is doubleword-aligned.  */
5de29b
-4:	/* Copy 4 bytes.  */
5de29b
-	bf	29,2f
5de29b
-
5de29b
-	lwz     6,0(12)
5de29b
-	addi    12,12,4
5de29b
-	stw     6,0(3)
5de29b
-	addi    3,3,4
5de29b
-2:	/* Copy 2 bytes.  */
5de29b
-	bf	30,1f
5de29b
-
5de29b
-	lhz     6,0(12)
5de29b
-	addi    12,12,2
5de29b
-	sth     6,0(3)
5de29b
-	addi    3,3,2
5de29b
-1:	/* Copy 1 byte.  */
5de29b
-	bf	31,0f
5de29b
-
5de29b
-	lbz	6,0(12)
5de29b
-	stb	6,0(3)
5de29b
-0:	/* Return original DST pointer.  */
5de29b
-	ld	31,-8(1)
5de29b
-	ld	3,-16(1)
5de29b
+L(aligned_tail):
5de29b
+	mtocrf	0x01,cnt
5de29b
+	bf	25,32f
5de29b
+	lxvd2x	6,0,src
5de29b
+	lxvd2x	7,src,6
5de29b
+	lxvd2x	8,src,7
5de29b
+	lxvd2x	9,src,8
5de29b
+	addi	src,src,64
5de29b
+	stxvd2x	6,0,dst
5de29b
+	stxvd2x	7,dst,6
5de29b
+	stxvd2x	8,dst,7
5de29b
+	stxvd2x	9,dst,8
5de29b
+	addi	dst,dst,64
5de29b
+32:
5de29b
+	bf	26,16f
5de29b
+	lxvd2x	6,0,src
5de29b
+	lxvd2x	7,src,6
5de29b
+	addi	src,src,32
5de29b
+	stxvd2x	6,0,dst
5de29b
+	stxvd2x	7,dst,6
5de29b
+	addi	dst,dst,32
5de29b
+16:
5de29b
+	bf	27,8f
5de29b
+	lxvd2x	6,0,src
5de29b
+	addi	src,src,16
5de29b
+	stxvd2x	6,0,dst
5de29b
+	addi	dst,dst,16
5de29b
+8:
5de29b
+	bf	28,4f
5de29b
+	ld	6,0(src)
5de29b
+	addi	src,src,8
5de29b
+	std     6,0(dst)
5de29b
+	addi	dst,dst,8
5de29b
+4:	/* Copies 4~7 bytes.  */
5de29b
+	bf	29,L(tail2)
5de29b
+	lwz	6,0(src)
5de29b
+	stw     6,0(dst)
5de29b
+	bf      30,L(tail5)
5de29b
+	lhz     7,4(src)
5de29b
+	sth     7,4(dst)
5de29b
+	bflr	31
5de29b
+	lbz     8,6(src)
5de29b
+	stb     8,6(dst)
5de29b
+	/* Return original DST pointer.  */
5de29b
 	blr
5de29b
 
5de29b
-	/* Handle copies of 0~31 bytes.  */
5de29b
-	.align  4
5de29b
+
5de29b
+/* Handle copies of 0~31 bytes.  */
5de29b
+	.align	4
5de29b
 L(copy_LT_32):
5de29b
-	cmpldi  cr6,5,8
5de29b
-	mr	12,4
5de29b
-	mtcrf   0x01,5
5de29b
+	mr	dst,3
5de29b
+	cmpldi	cr6,cnt,8
5de29b
+	mtocrf	0x01,cnt
5de29b
 	ble	cr6,L(copy_LE_8)
5de29b
 
5de29b
 	/* At least 9 bytes to go.  */
5de29b
 	neg	8,4
5de29b
-	clrrdi  11,4,2
5de29b
-	andi.   0,8,3
5de29b
-	cmpldi  cr1,5,16
5de29b
-	mr	10,5
5de29b
+	andi.	0,8,3
5de29b
+	cmpldi	cr1,cnt,16
5de29b
 	beq	L(copy_LT_32_aligned)
5de29b
 
5de29b
-	/* Force 4-bytes alignment for SRC.  */
5de29b
-	mtocrf  0x01,0
5de29b
-	subf    10,0,5
5de29b
-2:	bf	30,1f
5de29b
-
5de29b
-	lhz	6,0(12)
5de29b
-	addi    12,12,2
5de29b
-	sth	6,0(3)
5de29b
-	addi    3,3,2
5de29b
-1:	bf	31,L(end_4bytes_alignment)
5de29b
-
5de29b
-	lbz	6,0(12)
5de29b
-	addi    12,12,1
5de29b
-	stb	6,0(3)
5de29b
-	addi    3,3,1
5de29b
+	/* Force 4-byte alignment for SRC.  */
5de29b
+	mtocrf	0x01,0
5de29b
+	subf	cnt,0,cnt
5de29b
+2:
5de29b
+	bf	30,1f
5de29b
+	lhz	6,0(src)
5de29b
+	addi	src,src,2
5de29b
+	sth	6,0(dst)
5de29b
+	addi	dst,dst,2
5de29b
+1:
5de29b
+	bf	31,L(end_4bytes_alignment)
5de29b
+	lbz	6,0(src)
5de29b
+	addi	src,src,1
5de29b
+	stb	6,0(dst)
5de29b
+	addi	dst,dst,1
5de29b
 
5de29b
-	.align  4
5de29b
+	.align	4
5de29b
 L(end_4bytes_alignment):
5de29b
-	cmpldi  cr1,10,16
5de29b
-	mtcrf   0x01,10
5de29b
+	cmpldi	cr1,cnt,16
5de29b
+	mtocrf	0x01,cnt
5de29b
 
5de29b
 L(copy_LT_32_aligned):
5de29b
 	/* At least 6 bytes to go, and SRC is word-aligned.  */
5de29b
 	blt	cr1,8f
5de29b
 
5de29b
 	/* Copy 16 bytes.  */
5de29b
-	lwz	6,0(12)
5de29b
-	lwz     7,4(12)
5de29b
-	stw     6,0(3)
5de29b
-	lwz     8,8(12)
5de29b
-	stw     7,4(3)
5de29b
-	lwz     6,12(12)
5de29b
-	addi    12,12,16
5de29b
-	stw     8,8(3)
5de29b
-	stw     6,12(3)
5de29b
-	addi    3,3,16
5de29b
+	lwz	6,0(src)
5de29b
+	lwz	7,4(src)
5de29b
+	stw	6,0(dst)
5de29b
+	lwz	8,8(src)
5de29b
+	stw	7,4(dst)
5de29b
+	lwz	6,12(src)
5de29b
+	addi	src,src,16
5de29b
+	stw	8,8(dst)
5de29b
+	stw	6,12(dst)
5de29b
+	addi	dst,dst,16
5de29b
 8:	/* Copy 8 bytes.  */
5de29b
-	bf	28,4f
5de29b
+	bf	28,L(tail4)
5de29b
+	lwz	6,0(src)
5de29b
+	lwz	7,4(src)
5de29b
+	addi	src,src,8
5de29b
+	stw	6,0(dst)
5de29b
+	stw	7,4(dst)
5de29b
+	addi	dst,dst,8
5de29b
+
5de29b
+	.align	4
5de29b
+/* Copies 4~7 bytes.  */
5de29b
+L(tail4):
5de29b
+	bf	29,L(tail2)
5de29b
+	lwz	6,0(src)
5de29b
+	stw	6,0(dst)
5de29b
+	bf	30,L(tail5)
5de29b
+	lhz	7,4(src)
5de29b
+	sth	7,4(dst)
5de29b
+	bflr	31
5de29b
+	lbz	8,6(src)
5de29b
+	stb	8,6(dst)
5de29b
+	/* Return original DST pointer.  */
5de29b
+	blr
5de29b
 
5de29b
-	lwz     6,0(12)
5de29b
-	lwz     7,4(12)
5de29b
-	addi    12,12,8
5de29b
-	stw     6,0(3)
5de29b
-	stw     7,4(3)
5de29b
-	addi    3,3,8
5de29b
-4:	/* Copy 4 bytes.  */
5de29b
-	bf	29,2f
5de29b
-
5de29b
-	lwz     6,0(12)
5de29b
-	addi    12,12,4
5de29b
-	stw     6,0(3)
5de29b
-	addi    3,3,4
5de29b
-2:	/* Copy 2-3 bytes.  */
5de29b
+	.align	4
5de29b
+/* Copies 2~3 bytes.  */
5de29b
+L(tail2):
5de29b
 	bf	30,1f
5de29b
-
5de29b
-	lhz     6,0(12)
5de29b
-	sth     6,0(3)
5de29b
-	bf      31,0f
5de29b
-	lbz     7,2(12)
5de29b
-	stb     7,2(3)
5de29b
-	ld	3,-16(1)
5de29b
+	lhz	6,0(src)
5de29b
+	sth	6,0(dst)
5de29b
+	bflr	31
5de29b
+	lbz	7,2(src)
5de29b
+	stb	7,2(dst)
5de29b
 	blr
5de29b
 
5de29b
-	.align  4
5de29b
-1:	/* Copy 1 byte.  */
5de29b
-	bf	31,0f
5de29b
+	.align	4
5de29b
+L(tail5):
5de29b
+	bflr	31
5de29b
+	lbz	6,4(src)
5de29b
+	stb	6,4(dst)
5de29b
+	blr
5de29b
 
5de29b
-	lbz	6,0(12)
5de29b
-	stb	6,0(3)
5de29b
-0:	/* Return original DST pointer.  */
5de29b
-	ld	3,-16(1)
5de29b
+	.align	4
5de29b
+1:
5de29b
+	bflr	31
5de29b
+	lbz	6,0(src)
5de29b
+	stb	6,0(dst)
5de29b
+	/* Return original DST pointer.  */
5de29b
 	blr
5de29b
 
5de29b
-	/* Handles copies of 0~8 bytes.  */
5de29b
-	.align  4
5de29b
+
5de29b
+/* Handles copies of 0~8 bytes.  */
5de29b
+	.align	4
5de29b
 L(copy_LE_8):
5de29b
-	bne	cr6,4f
5de29b
+	bne	cr6,L(tail4)
5de29b
 
5de29b
 	/* Though we could've used ld/std here, they are still
5de29b
 	slow for unaligned cases.  */
5de29b
 
5de29b
-	lwz	6,0(4)
5de29b
-	lwz     7,4(4)
5de29b
-	stw     6,0(3)
5de29b
-	stw     7,4(3)
5de29b
-	ld      3,-16(1)      /* Return original DST pointers.  */
5de29b
+	lwz	6,0(src)
5de29b
+	lwz	7,4(src)
5de29b
+	stw	6,0(dst)
5de29b
+	stw	7,4(dst)
5de29b
 	blr
5de29b
 
5de29b
-	.align  4
5de29b
-4:	/* Copies 4~7 bytes.  */
5de29b
-	bf	29,2b
5de29b
 
5de29b
-	lwz	6,0(4)
5de29b
-	stw     6,0(3)
5de29b
-	bf      30,5f
5de29b
-	lhz     7,4(4)
5de29b
-	sth     7,4(3)
5de29b
-	bf      31,0f
5de29b
-	lbz     8,6(4)
5de29b
-	stb     8,6(3)
5de29b
-	ld	3,-16(1)
5de29b
-	blr
5de29b
-
5de29b
-	.align  4
5de29b
-5:	/* Copy 1 byte.  */
5de29b
-	bf	31,0f
5de29b
-
5de29b
-	lbz	6,4(4)
5de29b
-	stb	6,4(3)
5de29b
-
5de29b
-0:	/* Return original DST pointer.  */
5de29b
-	ld	3,-16(1)
5de29b
-	blr
5de29b
-
5de29b
-	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
5de29b
-	SRC is not.  Use aligned quadword loads from SRC, shifted to realign
5de29b
-	the data, allowing for aligned DST stores.  */
5de29b
-	.align  4
5de29b
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
5de29b
+   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
5de29b
+   the data, allowing for aligned DST stores.  */
5de29b
+	.align	4
5de29b
 L(copy_GE_32_unaligned):
5de29b
-	clrldi  0,0,60	      /* Number of bytes until the 1st
5de29b
-			      quadword.  */
5de29b
-	andi.   11,3,15       /* Check alignment of DST (against
5de29b
-			      quadwords).  */
5de29b
-	srdi    9,5,4	      /* Number of full quadwords remaining.  */
5de29b
+	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
5de29b
+#ifndef __LITTLE_ENDIAN__
5de29b
+	andi.	10,3,15	      /* Check alignment of DST (against quadwords).  */
5de29b
+#endif
5de29b
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
5de29b
 
5de29b
 	beq	L(copy_GE_32_unaligned_cont)
5de29b
 
5de29b
-	/* SRC is not quadword aligned, get it aligned.  */
5de29b
+	/* DST is not quadword aligned, get it aligned.  */
5de29b
 
5de29b
-	mtcrf   0x01,0
5de29b
-	subf    31,0,5
5de29b
+	mtocrf	0x01,0
5de29b
+	subf	cnt,0,cnt
5de29b
 
5de29b
 	/* Vector instructions work best when proper alignment (16-bytes)
5de29b
 	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
5de29b
-1:	/* Copy 1 byte.  */
5de29b
+1:
5de29b
 	bf	31,2f
5de29b
-
5de29b
-	lbz	6,0(12)
5de29b
-	addi    12,12,1
5de29b
-	stb	6,0(3)
5de29b
-	addi    3,3,1
5de29b
-2:	/* Copy 2 bytes.  */
5de29b
+	lbz	6,0(src)
5de29b
+	addi	src,src,1
5de29b
+	stb	6,0(dst)
5de29b
+	addi	dst,dst,1
5de29b
+2:
5de29b
 	bf	30,4f
5de29b
-
5de29b
-	lhz     6,0(12)
5de29b
-	addi    12,12,2
5de29b
-	sth     6,0(3)
5de29b
-	addi    3,3,2
5de29b
-4:	/* Copy 4 bytes.  */
5de29b
+	lhz	6,0(src)
5de29b
+	addi	src,src,2
5de29b
+	sth	6,0(dst)
5de29b
+	addi	dst,dst,2
5de29b
+4:
5de29b
 	bf	29,8f
5de29b
-
5de29b
-	lwz     6,0(12)
5de29b
-	addi    12,12,4
5de29b
-	stw     6,0(3)
5de29b
-	addi    3,3,4
5de29b
-8:	/* Copy 8 bytes.  */
5de29b
+	lwz	6,0(src)
5de29b
+	addi	src,src,4
5de29b
+	stw	6,0(dst)
5de29b
+	addi	dst,dst,4
5de29b
+8:
5de29b
 	bf	28,0f
5de29b
-
5de29b
-	ld	6,0(12)
5de29b
-	addi    12,12,8
5de29b
-	std	6,0(3)
5de29b
-	addi    3,3,8
5de29b
+	ld	6,0(src)
5de29b
+	addi	src,src,8
5de29b
+	std	6,0(dst)
5de29b
+	addi	dst,dst,8
5de29b
 0:
5de29b
-	clrldi  10,12,60      /* Check alignment of SRC.  */
5de29b
-	srdi    9,31,4	      /* Number of full quadwords remaining.  */
5de29b
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
5de29b
 
5de29b
 	/* The proper alignment is present, it is OK to copy the bytes now.  */
5de29b
 L(copy_GE_32_unaligned_cont):
5de29b
 
5de29b
 	/* Setup two indexes to speed up the indexed vector operations.  */
5de29b
-	clrldi  11,31,60
5de29b
-	li      6,16	      /* Index for 16-bytes offsets.  */
5de29b
+	clrldi	10,cnt,60
5de29b
+	li	6,16	      /* Index for 16-bytes offsets.  */
5de29b
 	li	7,32	      /* Index for 32-bytes offsets.  */
5de29b
-	cmpldi  cr1,11,0
5de29b
-	srdi    8,31,5	      /* Setup the loop counter.  */
5de29b
-	mr      10,3
5de29b
-	mr      11,12
5de29b
-	mtcrf   0x01,9
5de29b
-	cmpldi  cr6,9,1
5de29b
-	lvsl    5,0,12
5de29b
-	lvx     3,0,12
5de29b
-	bf      31,L(setup_unaligned_loop)
5de29b
-
5de29b
-	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
5de29b
-	lvx     4,12,6
5de29b
-	vperm   6,3,4,5
5de29b
-	addi    11,12,16
5de29b
-	addi    10,3,16
5de29b
-	stvx    6,0,3
5de29b
+	cmpldi	cr1,10,0
5de29b
+	srdi	8,cnt,5	      /* Setup the loop counter.  */
5de29b
+	mtocrf	0x01,9
5de29b
+	cmpldi	cr6,9,1
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	lvsr	5,0,src
5de29b
+#else
5de29b
+	lvsl	5,0,src
5de29b
+#endif
5de29b
+	lvx	3,0,src
5de29b
+	li	0,0
5de29b
+	bf	31,L(setup_unaligned_loop)
5de29b
+
5de29b
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
5de29b
+	lvx	4,src,6
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	vperm	6,4,3,5
5de29b
+#else
5de29b
+	vperm	6,3,4,5
5de29b
+#endif
5de29b
+	addi	src,src,16
5de29b
+	stvx	6,0,dst
5de29b
+	addi	dst,dst,16
5de29b
 	vor	3,4,4
5de29b
+	clrrdi	0,src,60
5de29b
 
5de29b
 L(setup_unaligned_loop):
5de29b
-	mtctr   8
5de29b
-	ble     cr6,L(end_unaligned_loop)
5de29b
+	mtctr	8
5de29b
+	ble	cr6,L(end_unaligned_loop)
5de29b
 
5de29b
 	/* Copy 32 bytes at a time using vector instructions.  */
5de29b
-	.align  4
5de29b
+	.align	4
5de29b
 L(unaligned_loop):
5de29b
 
5de29b
 	/* Note: vr6/vr10 may contain data that was already copied,
5de29b
@@ -444,63 +385,56 @@
5de29b
 	some portions again. This is faster than having unaligned
5de29b
 	vector instructions though.  */
5de29b
 
5de29b
-	lvx	4,11,6	      /* vr4 = r11+16.  */
5de29b
-	vperm   6,3,4,5	      /* Merge the correctly-aligned portions
5de29b
-			      of vr3/vr4 into vr6.  */
5de29b
-	lvx	3,11,7	      /* vr3 = r11+32.  */
5de29b
-	vperm   10,4,3,5      /* Merge the correctly-aligned portions
5de29b
-			      of vr3/vr4 into vr10.  */
5de29b
-	addi    11,11,32
5de29b
-	stvx    6,0,10
5de29b
-	stvx    10,10,6
5de29b
-	addi    10,10,32
5de29b
-
5de29b
+	lvx	4,src,6
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	vperm	6,4,3,5
5de29b
+#else
5de29b
+	vperm	6,3,4,5
5de29b
+#endif
5de29b
+	lvx	3,src,7
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	vperm	10,3,4,5
5de29b
+#else
5de29b
+	vperm	10,4,3,5
5de29b
+#endif
5de29b
+	addi	src,src,32
5de29b
+	stvx	6,0,dst
5de29b
+	stvx	10,dst,6
5de29b
+	addi	dst,dst,32
5de29b
 	bdnz	L(unaligned_loop)
5de29b
 
5de29b
-	.align  4
5de29b
+	clrrdi	0,src,60
5de29b
+
5de29b
+	.align	4
5de29b
 L(end_unaligned_loop):
5de29b
 
5de29b
 	/* Check for tail bytes.  */
5de29b
-	rldicr  0,31,0,59
5de29b
-	mtcrf   0x01,31
5de29b
-	beq	cr1,0f
5de29b
+	mtocrf	0x01,cnt
5de29b
+	beqlr	cr1
5de29b
 
5de29b
-	add	3,3,0
5de29b
-	add	12,12,0
5de29b
+	add	src,src,0
5de29b
 
5de29b
 	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
5de29b
-8:	/* Copy 8 bytes.  */
5de29b
+	/* Copy 8 bytes.  */
5de29b
 	bf	28,4f
5de29b
-
5de29b
-	lwz	6,0(12)
5de29b
-	lwz	7,4(12)
5de29b
-	addi    12,12,8
5de29b
-	stw	6,0(3)
5de29b
-	stw	7,4(3)
5de29b
-	addi    3,3,8
5de29b
-4:	/* Copy 4 bytes.  */
5de29b
-	bf	29,2f
5de29b
-
5de29b
-	lwz	6,0(12)
5de29b
-	addi    12,12,4
5de29b
-	stw	6,0(3)
5de29b
-	addi    3,3,4
5de29b
-2:	/* Copy 2~3 bytes.  */
5de29b
-	bf	30,1f
5de29b
-
5de29b
-	lhz	6,0(12)
5de29b
-	addi    12,12,2
5de29b
-	sth	6,0(3)
5de29b
-	addi    3,3,2
5de29b
-1:	/* Copy 1 byte.  */
5de29b
-	bf	31,0f
5de29b
-
5de29b
-	lbz	6,0(12)
5de29b
-	stb	6,0(3)
5de29b
-0:	/* Return original DST pointer.  */
5de29b
-	ld	31,-8(1)
5de29b
-	ld	3,-16(1)
5de29b
+	lwz	6,0(src)
5de29b
+	lwz	7,4(src)
5de29b
+	addi	src,src,8
5de29b
+	stw	6,0(dst)
5de29b
+	stw	7,4(dst)
5de29b
+	addi	dst,dst,8
5de29b
+4:	/* Copy 4~7 bytes.  */
5de29b
+	bf	29,L(tail2)
5de29b
+	lwz	6,0(src)
5de29b
+	stw	6,0(dst)
5de29b
+	bf	30,L(tail5)
5de29b
+	lhz	7,4(src)
5de29b
+	sth	7,4(dst)
5de29b
+	bflr	31
5de29b
+	lbz	8,6(src)
5de29b
+	stb	8,6(dst)
5de29b
+	/* Return original DST pointer.  */
5de29b
 	blr
5de29b
 
5de29b
-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
5de29b
+END_GEN_TB (memcpy,TB_TOCLESS)
5de29b
 libc_hidden_builtin_def (memcpy)
12745e
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S
12745e
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S	2014-05-29 13:04:56.000000000 -0500
12745e
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S	2014-05-29 13:04:56.000000000 -0500
5de29b
@@ -367,13 +367,21 @@
5de29b
 	mr	11,12
5de29b
 	mtcrf	0x01,9
5de29b
 	cmpldi	cr6,9,1
5de29b
-	lvsl	5,0,12
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	lvsr    5,0,12
5de29b
+#else
5de29b
+	lvsl    5,0,12
5de29b
+#endif
5de29b
 	lvx	3,0,12
5de29b
 	bf	31,L(setup_unaligned_loop)
5de29b
 
5de29b
 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
5de29b
 	lvx	4,12,6
5de29b
-	vperm	6,3,4,5
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	vperm   6,4,3,5
5de29b
+#else
5de29b
+	vperm   6,3,4,5
5de29b
+#endif
5de29b
 	addi	11,12,16
5de29b
 	addi	10,3,16
5de29b
 	stvx	6,0,3
5de29b
@@ -393,11 +401,17 @@
5de29b
 	vector instructions though.  */
5de29b
 
5de29b
 	lvx	4,11,6	      /* vr4 = r11+16.  */
5de29b
-	vperm	6,3,4,5	      /* Merge the correctly-aligned portions
5de29b
-				 of vr3/vr4 into vr6.  */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	vperm   6,4,3,5
5de29b
+#else
5de29b
+	vperm   6,3,4,5
5de29b
+#endif
5de29b
 	lvx	3,11,7	      /* vr3 = r11+32.  */
5de29b
-	vperm	10,4,3,5      /* Merge the correctly-aligned portions
5de29b
-				 of vr3/vr4 into vr10.  */
5de29b
+#ifdef __LITTLE_ENDIAN__
5de29b
+	vperm   10,3,4,5
5de29b
+#else
5de29b
+	vperm   10,4,3,5
5de29b
+#endif
5de29b
 	addi	11,11,32
5de29b
 	stvx	6,0,10
5de29b
 	stvx	10,10,6