00db10
# commit 759cfef3ac4c07dba1ece0bbc1207e099348816d
00db10
# Author: Alan Modra <amodra@gmail.com>
00db10
# Date:   Sat Aug 17 18:47:22 2013 +0930
00db10
# 
00db10
#     PowerPC LE memcpy
00db10
#     http://sourceware.org/ml/libc-alpha/2013-08/msg00103.html
00db10
#     
00db10
#     Little-endian support for memcpy.  I spent some time cleaning up the
00db10
#     64-bit power7 memcpy, in order to avoid the extra alignment traps
00db10
#     power7 takes for little-endian.  It probably would have been better
00db10
#     to copy the linux kernel version of memcpy.
00db10
#     
00db10
#         * sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support.
00db10
#         * sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.
00db10
#         * sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
00db10
#         * sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise.
00db10
#         * sysdeps/powerpc/powerpc64/memcpy.S: Likewise.
00db10
#         * sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise.
00db10
#         * sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise.
00db10
#         * sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
00db10
#         * sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise.  Make better
00db10
#         use of regs.  Use power7 mtocrf.  Tidy function tails.
00db10
# 
00db10
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S
00db10
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S	2014-05-29 13:04:56.000000000 -0500
00db10
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S	2014-05-29 13:04:56.000000000 -0500
00db10
@@ -205,15 +205,28 @@
00db10
     blt   cr6,5f
00db10
     srwi  7,6,16
00db10
     bgt	  cr6,3f
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    sth   7,0(3)
00db10
+#else
00db10
     sth   6,0(3)
00db10
+#endif
00db10
     b     7f
00db10
     .align  4
00db10
 3:
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    rotlwi 6,6,24
00db10
+    stb   6,0(3)
00db10
+    sth   7,1(3)
00db10
+#else
00db10
     stb   7,0(3)
00db10
     sth   6,1(3)
00db10
+#endif
00db10
     b     7f
00db10
     .align  4
00db10
 5:
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    rotlwi 6,6,8
00db10
+#endif
00db10
     stb   6,0(3)
00db10
 7:
00db10
     cmplwi	cr1,10,16
00db10
@@ -341,13 +354,23 @@
00db10
     bf      30,1f
00db10
 
00db10
     /* there are at least two words to copy, so copy them */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srw   0,6,10
00db10
+    slw   8,7,9
00db10
+#else
00db10
     slw   0,6,10  /* shift 1st src word to left align it in R0 */
00db10
     srw   8,7,9   /* shift 2nd src word to right align it in R8 */
00db10
+#endif
00db10
     or    0,0,8   /* or them to get word to store */
00db10
     lwz   6,8(5)  /* load the 3rd src word */
00db10
     stw   0,0(4)  /* store the 1st dst word */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srw   0,7,10
00db10
+    slw   8,6,9
00db10
+#else
00db10
     slw   0,7,10  /* now left align 2nd src word into R0 */
00db10
     srw   8,6,9   /* shift 3rd src word to right align it in R8 */
00db10
+#endif
00db10
     or    0,0,8   /* or them to get word to store */
00db10
     lwz   7,12(5)
00db10
     stw   0,4(4)  /* store the 2nd dst word */
00db10
@@ -355,8 +378,13 @@
00db10
     addi  5,5,16
00db10
     bf    31,4f
00db10
     /* there is a third word to copy, so copy it */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srw   0,6,10
00db10
+    slw   8,7,9
00db10
+#else
00db10
     slw   0,6,10  /* shift 3rd src word to left align it in R0 */
00db10
     srw   8,7,9   /* shift 4th src word to right align it in R8 */
00db10
+#endif
00db10
     or    0,0,8   /* or them to get word to store */
00db10
     stw   0,0(4)  /* store 3rd dst word */
00db10
     mr    6,7
00db10
@@ -366,8 +394,13 @@
00db10
     b     4f
00db10
     .align 4
00db10
 1:
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srw     0,6,10
00db10
+    slw     8,7,9
00db10
+#else
00db10
     slw     0,6,10  /* shift 1st src word to left align it in R0 */
00db10
     srw     8,7,9   /* shift 2nd src word to right align it in R8 */
00db10
+#endif
00db10
     addi  5,5,8
00db10
     or    0,0,8   /* or them to get word to store */
00db10
     bf    31,4f
00db10
@@ -380,23 +413,43 @@
00db10
     .align  4
00db10
 4:
00db10
     /* copy 16 bytes at a time */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srw   0,6,10
00db10
+    slw   8,7,9
00db10
+#else
00db10
     slw   0,6,10
00db10
     srw   8,7,9
00db10
+#endif
00db10
     or    0,0,8
00db10
     lwz   6,0(5)
00db10
     stw   0,0(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srw   0,7,10
00db10
+    slw   8,6,9
00db10
+#else
00db10
     slw   0,7,10
00db10
     srw   8,6,9
00db10
+#endif
00db10
     or    0,0,8
00db10
     lwz   7,4(5)
00db10
     stw   0,4(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srw   0,6,10
00db10
+    slw   8,7,9
00db10
+#else
00db10
     slw   0,6,10
00db10
     srw   8,7,9
00db10
+#endif
00db10
     or    0,0,8
00db10
     lwz   6,8(5)
00db10
     stw   0,8(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srw   0,7,10
00db10
+    slw   8,6,9
00db10
+#else
00db10
     slw   0,7,10
00db10
     srw   8,6,9
00db10
+#endif
00db10
     or    0,0,8
00db10
     lwz   7,12(5)
00db10
     stw   0,12(4)
00db10
@@ -405,8 +458,13 @@
00db10
     bdnz+ 4b
00db10
 8:
00db10
     /* calculate and store the final word */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srw   0,6,10
00db10
+    slw   8,7,9
00db10
+#else
00db10
     slw   0,6,10
00db10
     srw   8,7,9
00db10
+#endif
00db10
     or    0,0,8
00db10
     stw   0,0(4)
00db10
 3:
00db10
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S
00db10
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S	2014-05-29 13:04:56.000000000 -0500
00db10
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S	2014-05-29 13:04:56.000000000 -0500
00db10
@@ -221,15 +221,28 @@
00db10
     blt   cr6,5f
00db10
     srwi  7,6,16
00db10
     bgt	  cr6,3f
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    sth   7,0(3)
00db10
+#else
00db10
     sth   6,0(3)
00db10
+#endif
00db10
     b     7f
00db10
     .align  4
00db10
 3:
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    rotlwi 6,6,24
00db10
+    stb   6,0(3)
00db10
+    sth   7,1(3)
00db10
+#else
00db10
     stb   7,0(3)
00db10
     sth   6,1(3)
00db10
+#endif
00db10
     b     7f
00db10
     .align  4
00db10
 5:
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    rotlwi 6,6,8
00db10
+#endif
00db10
     stb   6,0(3)
00db10
 7:
00db10
     cmplwi	cr1,10,16
00db10
@@ -579,7 +592,11 @@
00db10
     lwz     6,-1(4)
00db10
     cmplwi  cr6,31,4
00db10
     srwi    8,31,5    /* calculate the 32 byte loop count */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srwi    6,6,8
00db10
+#else
00db10
     slwi    6,6,8
00db10
+#endif
00db10
     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
00db10
     blt     cr5,L(wdu1_32tail)
00db10
     mtctr   8
00db10
@@ -587,8 +604,12 @@
00db10
 
00db10
     lwz   8,3(4)
00db10
     lwz   7,4(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    rldimi 6,8,24,32
00db10
+#else
00db10
 /*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
00db10
     rlwimi 6,8,8,(32-8),31
00db10
+#endif
00db10
     b      L(wdu1_loop32x)
00db10
     .align  4
00db10
 L(wdu1_loop32):
00db10
@@ -597,8 +618,12 @@
00db10
     lwz   7,4(4)
00db10
     stw   10,-8(3)
00db10
     stw   11,-4(3)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    rldimi 6,8,24,32
00db10
+#else
00db10
 /*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
00db10
     rlwimi 6,8,8,(32-8),31
00db10
+#endif
00db10
 L(wdu1_loop32x):
00db10
     lwz   10,8(4)
00db10
     lwz   11,12(4)
00db10
@@ -615,7 +640,11 @@
00db10
     stw   6,16(3)
00db10
     stw   7,20(3)
00db10
     addi  3,3,32
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srwi  6,8,8
00db10
+#else
00db10
     slwi  6,8,8
00db10
+#endif
00db10
     bdnz+ L(wdu1_loop32)
00db10
     stw   10,-8(3)
00db10
     stw   11,-4(3)
00db10
@@ -626,8 +655,12 @@
00db10
     blt     cr6,L(wdu_4tail)
00db10
     /* calculate and store the final word */
00db10
     lwz   8,3(4)
00db10
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    rldimi 6,8,24,32
00db10
+#else
00db10
+/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8  */
00db10
     rlwimi 6,8,8,(32-8),31
00db10
+#endif
00db10
     b     L(wdu_32tailx)
00db10
 
00db10
 L(wdu2_32):
00db10
@@ -635,7 +668,11 @@
00db10
     lwz     6,-2(4)
00db10
     cmplwi  cr6,31,4
00db10
     srwi    8,31,5    /* calculate the 32 byte loop count */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srwi    6,6,16
00db10
+#else
00db10
     slwi    6,6,16
00db10
+#endif
00db10
     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
00db10
     blt     cr5,L(wdu2_32tail)
00db10
     mtctr   8
00db10
@@ -643,8 +680,11 @@
00db10
 
00db10
     lwz   8,2(4)
00db10
     lwz   7,4(4)
00db10
-/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    rldimi 6,8,16,32
00db10
+#else
00db10
     rlwimi 6,8,16,(32-16),31
00db10
+#endif
00db10
     b      L(wdu2_loop32x)
00db10
     .align  4
00db10
 L(wdu2_loop32):
00db10
@@ -653,8 +693,11 @@
00db10
     lwz   7,4(4)
00db10
     stw   10,-8(3)
00db10
     stw   11,-4(3)
00db10
-/*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    rldimi 6,8,16,32
00db10
+#else
00db10
     rlwimi 6,8,16,(32-16),31
00db10
+#endif
00db10
 L(wdu2_loop32x):
00db10
     lwz   10,8(4)
00db10
     lwz   11,12(4)
00db10
@@ -672,7 +715,11 @@
00db10
     stw   6,16(3)
00db10
     stw   7,20(3)
00db10
     addi  3,3,32
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srwi  6,8,16
00db10
+#else
00db10
     slwi  6,8,16
00db10
+#endif
00db10
     bdnz+ L(wdu2_loop32)
00db10
     stw   10,-8(3)
00db10
     stw   11,-4(3)
00db10
@@ -683,8 +730,11 @@
00db10
     blt     cr6,L(wdu_4tail)
00db10
     /* calculate and store the final word */
00db10
     lwz   8,2(4)
00db10
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    rldimi 6,8,16,32
00db10
+#else
00db10
     rlwimi 6,8,16,(32-16),31
00db10
+#endif
00db10
     b     L(wdu_32tailx)
00db10
 
00db10
 L(wdu3_32):
00db10
@@ -692,7 +742,11 @@
00db10
     lwz     6,-3(4)
00db10
     cmplwi  cr6,31,4
00db10
     srwi    8,31,5    /* calculate the 32 byte loop count */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srwi    6,6,24
00db10
+#else
00db10
     slwi    6,6,24
00db10
+#endif
00db10
     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
00db10
     blt     cr5,L(wdu3_32tail)
00db10
     mtctr   8
00db10
@@ -700,8 +754,11 @@
00db10
 
00db10
     lwz   8,1(4)
00db10
     lwz   7,4(4)
00db10
-/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    rldimi 6,8,8,32
00db10
+#else
00db10
     rlwimi 6,8,24,(32-24),31
00db10
+#endif
00db10
     b      L(wdu3_loop32x)
00db10
     .align  4
00db10
 L(wdu3_loop32):
00db10
@@ -710,8 +767,11 @@
00db10
     lwz   7,4(4)
00db10
     stw   10,-8(3)
00db10
     stw   11,-4(3)
00db10
-/*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    rldimi 6,8,8,32
00db10
+#else
00db10
     rlwimi 6,8,24,(32-24),31
00db10
+#endif
00db10
 L(wdu3_loop32x):
00db10
     lwz   10,8(4)
00db10
     lwz   11,12(4)
00db10
@@ -728,7 +788,11 @@
00db10
     stw   6,16(3)
00db10
     stw   7,20(3)
00db10
     addi  3,3,32
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srwi  6,8,24
00db10
+#else
00db10
     slwi  6,8,24
00db10
+#endif
00db10
     bdnz+ L(wdu3_loop32)
00db10
     stw   10,-8(3)
00db10
     stw   11,-4(3)
00db10
@@ -739,8 +803,11 @@
00db10
     blt     cr6,L(wdu_4tail)
00db10
     /* calculate and store the final word */
00db10
     lwz   8,1(4)
00db10
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    rldimi 6,8,8,32
00db10
+#else
00db10
     rlwimi 6,8,24,(32-24),31
00db10
+#endif
00db10
     b     L(wdu_32tailx)
00db10
     .align  4
00db10
 L(wdu_32tailx):
00db10
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S
00db10
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S	2014-05-29 13:04:56.000000000 -0500
00db10
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S	2014-05-29 13:04:56.000000000 -0500
00db10
@@ -385,7 +385,7 @@
00db10
 
00db10
 	beq    L(copy_GE_32_unaligned_cont)
00db10
 
00db10
-	/* SRC is not quadword aligned, get it aligned.  */
00db10
+	/* DST is not quadword aligned, get it aligned.  */
00db10
 
00db10
 	mtcrf   0x01,0
00db10
 	subf    31,0,5
00db10
@@ -437,13 +437,21 @@
00db10
 	mr      11,12
00db10
 	mtcrf   0x01,9
00db10
 	cmplwi  cr6,9,1
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	lvsr    5,0,12
00db10
+#else
00db10
 	lvsl    5,0,12
00db10
+#endif
00db10
 	lvx     3,0,12
00db10
 	bf      31,L(setup_unaligned_loop)
00db10
 
00db10
 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
00db10
 	lvx     4,12,6
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	vperm   6,4,3,5
00db10
+#else
00db10
 	vperm   6,3,4,5
00db10
+#endif
00db10
 	addi    11,12,16
00db10
 	addi    10,3,16
00db10
 	stvx    6,0,3
00db10
@@ -463,11 +471,17 @@
00db10
 	vector instructions though.  */
00db10
 
00db10
 	lvx	4,11,6	      /* vr4 = r11+16.  */
00db10
-	vperm   6,3,4,5	      /* Merge the correctly-aligned portions
00db10
-			      of vr3/vr4 into vr6.  */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	vperm   6,4,3,5
00db10
+#else
00db10
+	vperm   6,3,4,5
00db10
+#endif
00db10
 	lvx	3,11,7	      /* vr3 = r11+32.  */
00db10
-	vperm   10,4,3,5      /* Merge the correctly-aligned portions
00db10
-			      of vr3/vr4 into vr10.  */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	vperm   10,3,4,5
00db10
+#else
00db10
+	vperm   10,4,3,5
00db10
+#endif
00db10
 	addi    11,11,32
00db10
 	stvx    6,0,10
00db10
 	stvx    10,10,6
00db10
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S
00db10
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S	2014-05-29 13:04:56.000000000 -0500
00db10
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S	2014-05-29 13:04:56.000000000 -0500
00db10
@@ -327,7 +327,7 @@
00db10
 
00db10
 	beq	L(copy_GE_32_unaligned_cont)
00db10
 
00db10
-	/* SRC is not quadword aligned, get it aligned.  */
00db10
+	/* DST is not quadword aligned, get it aligned.  */
00db10
 
00db10
 	mtcrf	0x01,0
00db10
 	subf	31,0,5
00db10
@@ -379,13 +379,21 @@
00db10
 	mr	11,12
00db10
 	mtcrf	0x01,9
00db10
 	cmplwi	cr6,9,1
00db10
-	lvsl	5,0,12
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	lvsr    5,0,12
00db10
+#else
00db10
+	lvsl    5,0,12
00db10
+#endif
00db10
 	lvx	3,0,12
00db10
 	bf	31,L(setup_unaligned_loop)
00db10
 
00db10
 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
00db10
 	lvx	4,12,6
00db10
-	vperm	6,3,4,5
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	vperm   6,4,3,5
00db10
+#else
00db10
+	vperm   6,3,4,5
00db10
+#endif
00db10
 	addi	11,12,16
00db10
 	addi	10,3,16
00db10
 	stvx	6,0,3
00db10
@@ -405,11 +413,17 @@
00db10
 	vector instructions though.  */
00db10
 
00db10
 	lvx	4,11,6	      /* vr4 = r11+16.  */
00db10
-	vperm	6,3,4,5	      /* Merge the correctly-aligned portions
00db10
-				 of vr3/vr4 into vr6.  */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	vperm   6,4,3,5
00db10
+#else
00db10
+	vperm   6,3,4,5
00db10
+#endif
00db10
 	lvx	3,11,7	      /* vr3 = r11+32.  */
00db10
-	vperm	10,4,3,5      /* Merge the correctly-aligned portions
00db10
-				 of vr3/vr4 into vr10.  */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	vperm   10,3,4,5
00db10
+#else
00db10
+	vperm   10,4,3,5
00db10
+#endif
00db10
 	addi	11,11,32
00db10
 	stvx	6,0,10
00db10
 	stvx	10,10,6
00db10
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S
00db10
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S	2014-05-29 13:04:56.000000000 -0500
00db10
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S	2014-05-29 13:04:56.000000000 -0500
00db10
@@ -214,15 +214,28 @@
00db10
     blt   cr6,5f
00db10
     srdi  7,6,16
00db10
     bgt	  cr6,3f
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    sth   7,0(3)
00db10
+#else
00db10
     sth   6,0(3)
00db10
+#endif
00db10
     b     7f
00db10
     .align  4
00db10
 3:
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    rotlwi 6,6,24
00db10
+    stb   6,0(3)
00db10
+    sth   7,1(3)
00db10
+#else
00db10
     stb   7,0(3)
00db10
     sth   6,1(3)
00db10
+#endif
00db10
     b     7f
00db10
     .align  4
00db10
 5:
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    rotlwi 6,6,8
00db10
+#endif
00db10
     stb   6,0(3)
00db10
 7:
00db10
     cmpldi	cr1,10,16
00db10
@@ -330,7 +343,11 @@
00db10
     ld    7,8(5)
00db10
     subfic  9,10,64
00db10
     beq   2f
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srd   0,6,10
00db10
+#else
00db10
     sld   0,6,10
00db10
+#endif
00db10
     cmpldi  11,1
00db10
     mr    6,7
00db10
     addi  4,4,-8
00db10
@@ -338,15 +355,25 @@
00db10
     b     1f
00db10
 2:  addi  5,5,8
00db10
     .align  4
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+0:  srd   0,6,10
00db10
+    sld   8,7,9
00db10
+#else
00db10
 0:  sld   0,6,10
00db10
     srd   8,7,9
00db10
+#endif
00db10
     cmpldi  11,2
00db10
     ld    6,8(5)
00db10
     or    0,0,8
00db10
     addi  11,11,-2
00db10
     std   0,0(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srd   0,7,10
00db10
+1:  sld   8,6,9
00db10
+#else
00db10
     sld   0,7,10
00db10
 1:  srd   8,6,9
00db10
+#endif
00db10
     or    0,0,8
00db10
     beq   8f
00db10
     ld    7,16(5)
00db10
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S
00db10
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S	2014-05-29 13:04:56.000000000 -0500
00db10
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S	2014-05-29 13:05:51.000000000 -0500
00db10
@@ -1,5 +1,5 @@
00db10
 /* Optimized memcpy implementation for PowerPC64.
00db10
-   Copyright (C) 2003, 2006, 2011 Free Software Foundation, Inc.
00db10
+   Copyright (C) 2003-2014 Free Software Foundation, Inc.
00db10
    This file is part of the GNU C Library.
00db10
 
00db10
    The GNU C Library is free software; you can redistribute it and/or
00db10
@@ -17,26 +17,24 @@
00db10
    <http://www.gnu.org/licenses/>.  */
00db10
 
00db10
 #include <sysdep.h>
00db10
-#include <bp-sym.h>
00db10
-#include <bp-asm.h>
00db10
 
00db10
 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
00db10
    Returns 'dst'.
00db10
 
00db10
-   Memcpy handles short copies (< 32-bytes) using a binary move blocks 
00db10
-   (no loops) of lwz/stw.  The tail (remaining 1-3) bytes is handled 
00db10
-   with the appropriate combination of byte and halfword load/stores. 
00db10
-   There is minimal effort to optimize the alignment of short moves.  
00db10
+   Memcpy handles short copies (< 32-bytes) using a binary move blocks
00db10
+   (no loops) of lwz/stw.  The tail (remaining 1-3) bytes is handled
00db10
+   with the appropriate combination of byte and halfword load/stores.
00db10
+   There is minimal effort to optimize the alignment of short moves.
00db10
    The 64-bit implementations of POWER3 and POWER4 do a reasonable job
00db10
-   of handling unligned load/stores that do not cross 32-byte boundries.
00db10
+   of handling unaligned load/stores that do not cross 32-byte boundaries.
00db10
 
00db10
    Longer moves (>= 32-bytes) justify the effort to get at least the
00db10
    destination doubleword (8-byte) aligned.  Further optimization is
00db10
-   posible when both source and destination are doubleword aligned.
00db10
+   possible when both source and destination are doubleword aligned.
00db10
    Each case has a optimized unrolled loop.   */
00db10
 
00db10
 	.machine power4
00db10
-EALIGN (BP_SYM (memcpy), 5, 0)
00db10
+EALIGN (memcpy, 5, 0)
00db10
 	CALL_MCOUNT 3
00db10
 
00db10
     cmpldi cr1,5,31
00db10
@@ -44,20 +42,20 @@
00db10
     std   3,-16(1)
00db10
     std   31,-8(1)
00db10
     cfi_offset(31,-8)
00db10
-    andi. 11,3,7	/* check alignement of dst.  */
00db10
+    andi. 11,3,7	/* check alignment of dst.  */
00db10
     clrldi 0,0,61	/* Number of bytes until the 1st doubleword of dst.  */
00db10
-    clrldi 10,4,61	/* check alignement of src.  */
00db10
+    clrldi 10,4,61	/* check alignment of src.  */
00db10
     cmpldi cr6,5,8
00db10
     ble-  cr1,.L2	/* If move < 32 bytes use short move code.  */
00db10
-    cmpld cr6,10,11     
00db10
+    cmpld cr6,10,11
00db10
     mr    12,4
00db10
     srdi  9,5,3		/* Number of full double words remaining.  */
00db10
     mtcrf 0x01,0
00db10
     mr    31,5
00db10
     beq   .L0
00db10
-  
00db10
+
00db10
     subf  31,0,5
00db10
-  /* Move 0-7 bytes as needed to get the destination doubleword alligned.  */
00db10
+  /* Move 0-7 bytes as needed to get the destination doubleword aligned.  */
00db10
 1:  bf    31,2f
00db10
     lbz   6,0(12)
00db10
     addi  12,12,1
00db10
@@ -74,17 +72,17 @@
00db10
     stw   6,0(3)
00db10
     addi  3,3,4
00db10
 0:
00db10
-    clrldi 10,12,61	/* check alignement of src again.  */     
00db10
+    clrldi 10,12,61	/* check alignment of src again.  */
00db10
     srdi  9,31,3	/* Number of full double words remaining.  */
00db10
-    
00db10
-  /* Copy doublewords from source to destination, assumpting the
00db10
+
00db10
+  /* Copy doublewords from source to destination, assuming the
00db10
      destination is aligned on a doubleword boundary.
00db10
 
00db10
      At this point we know there are at least 25 bytes left (32-7) to copy.
00db10
-     The next step is to determine if the source is also doubleword aligned. 
00db10
+     The next step is to determine if the source is also doubleword aligned.
00db10
      If not branch to the unaligned move code at .L6. which uses
00db10
      a load, shift, store strategy.
00db10
-     
00db10
+
00db10
      Otherwise source and destination are doubleword aligned, and we can
00db10
      the optimized doubleword copy loop.  */
00db10
 .L0:
00db10
@@ -97,14 +95,14 @@
00db10
      Use a unrolled loop to copy 4 doubleword (32-bytes) per iteration.
00db10
      If the copy is not an exact multiple of 32 bytes, 1-3
00db10
      doublewords are copied as needed to set up the main loop.  After
00db10
-     the main loop exits there may be a tail of 1-7 bytes. These byte are 
00db10
+     the main loop exits there may be a tail of 1-7 bytes. These byte are
00db10
      copied a word/halfword/byte at a time as needed to preserve alignment.  */
00db10
 
00db10
     srdi  8,31,5
00db10
     cmpldi	cr1,9,4
00db10
     cmpldi	cr6,11,0
00db10
     mr    11,12
00db10
-    
00db10
+
00db10
     bf    30,1f
00db10
     ld    6,0(12)
00db10
     ld    7,8(12)
00db10
@@ -115,7 +113,7 @@
00db10
     addi  10,3,16
00db10
     bf    31,4f
00db10
     ld    0,16(12)
00db10
-    std   0,16(3)    
00db10
+    std   0,16(3)
00db10
     blt   cr1,3f
00db10
     addi  11,12,24
00db10
     addi  10,3,24
00db10
@@ -129,7 +127,7 @@
00db10
     addi  11,12,8
00db10
     std   6,0(3)
00db10
     addi  10,3,8
00db10
-    
00db10
+
00db10
     .align  4
00db10
 4:
00db10
     ld    6,0(11)
00db10
@@ -144,7 +142,7 @@
00db10
     std   0,24(10)
00db10
     addi  10,10,32
00db10
     bdnz  4b
00db10
-3:  
00db10
+3:
00db10
 
00db10
     rldicr 0,31,0,60
00db10
     mtcrf 0x01,31
00db10
@@ -152,9 +150,9 @@
00db10
 .L9:
00db10
     add   3,3,0
00db10
     add   12,12,0
00db10
-    
00db10
+
00db10
 /*  At this point we have a tail of 0-7 bytes and we know that the
00db10
-    destiniation is double word aligned.  */
00db10
+    destination is double word aligned.  */
00db10
 4:  bf    29,2f
00db10
     lwz   6,0(12)
00db10
     addi  12,12,4
00db10
@@ -173,29 +171,29 @@
00db10
     ld 31,-8(1)
00db10
     ld 3,-16(1)
00db10
     blr
00db10
-       
00db10
-/* Copy up to 31 bytes.  This divided into two cases 0-8 bytes and 9-31 
00db10
-   bytes.  Each case is handled without loops, using binary (1,2,4,8) 
00db10
-   tests.  
00db10
-   
00db10
+
00db10
+/* Copy up to 31 bytes.  This divided into two cases 0-8 bytes and 9-31
00db10
+   bytes.  Each case is handled without loops, using binary (1,2,4,8)
00db10
+   tests.
00db10
+
00db10
    In the short (0-8 byte) case no attempt is made to force alignment
00db10
-   of either source or destination.  The hardware will handle the 
00db10
-   unaligned load/stores with small delays for crossing 32- 64-byte, and 
00db10
+   of either source or destination.  The hardware will handle the
00db10
+   unaligned load/stores with small delays for crossing 32- 64-byte, and
00db10
    4096-byte boundaries. Since these short moves are unlikely to be
00db10
-   unaligned or cross these boundaries, the overhead to force 
00db10
+   unaligned or cross these boundaries, the overhead to force
00db10
    alignment is not justified.
00db10
-   
00db10
+
00db10
    The longer (9-31 byte) move is more likely to cross 32- or 64-byte
00db10
    boundaries.  Since only loads are sensitive to the 32-/64-byte
00db10
-   boundaries it is more important to align the source then the 
00db10
+   boundaries it is more important to align the source then the
00db10
    destination.  If the source is not already word aligned, we first
00db10
-   move 1-3 bytes as needed.  Since we are only word aligned we don't 
00db10
-   use double word load/stores to insure that all loads are aligned. 
00db10
+   move 1-3 bytes as needed.  Since we are only word aligned we don't
00db10
+   use double word load/stores to insure that all loads are aligned.
00db10
    While the destination and stores may still be unaligned, this
00db10
    is only an issue for page (4096 byte boundary) crossing, which
00db10
    should be rare for these short moves.  The hardware handles this
00db10
-   case automatically with a small delay.  */ 
00db10
-   
00db10
+   case automatically with a small delay.  */
00db10
+
00db10
     .align  4
00db10
 .L2:
00db10
     mtcrf 0x01,5
00db10
@@ -216,15 +214,28 @@
00db10
     blt   cr6,5f
00db10
     srdi  7,6,16
00db10
     bgt	  cr6,3f
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    sth   7,0(3)
00db10
+#else
00db10
     sth   6,0(3)
00db10
+#endif
00db10
     b     7f
00db10
     .align  4
00db10
 3:
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    rotlwi 6,6,24
00db10
+    stb   6,0(3)
00db10
+    sth   7,1(3)
00db10
+#else
00db10
     stb   7,0(3)
00db10
     sth   6,1(3)
00db10
+#endif
00db10
     b     7f
00db10
     .align  4
00db10
 5:
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    rotlwi 6,6,8
00db10
+#endif
00db10
     stb   6,0(3)
00db10
 7:
00db10
     cmpldi	cr1,10,16
00db10
@@ -258,11 +269,11 @@
00db10
     lwz   6,0(12)
00db10
     addi  12,12,4
00db10
     stw   6,0(3)
00db10
-    addi  3,3,4    
00db10
+    addi  3,3,4
00db10
 2:  /* Move 2-3 bytes.  */
00db10
     bf    30,1f
00db10
     lhz   6,0(12)
00db10
-    sth   6,0(3) 
00db10
+    sth   6,0(3)
00db10
     bf    31,0f
00db10
     lbz   7,2(12)
00db10
     stb   7,2(3)
00db10
@@ -283,8 +294,8 @@
00db10
     mr    12,4
00db10
     bne   cr6,4f
00db10
 /* Would have liked to use use ld/std here but the 630 processors are
00db10
-   slow for load/store doubles that are not at least word aligned.  
00db10
-   Unaligned Load/Store word execute with only a 1 cycle penaltity.  */
00db10
+   slow for load/store doubles that are not at least word aligned.
00db10
+   Unaligned Load/Store word execute with only a 1 cycle penalty.  */
00db10
     lwz   6,0(4)
00db10
     lwz   7,4(4)
00db10
     stw   6,0(3)
00db10
@@ -299,14 +310,14 @@
00db10
 6:
00db10
     bf    30,5f
00db10
     lhz   7,4(4)
00db10
-    sth   7,4(3) 
00db10
+    sth   7,4(3)
00db10
     bf    31,0f
00db10
     lbz   8,6(4)
00db10
     stb   8,6(3)
00db10
     ld 3,-16(1)
00db10
     blr
00db10
     .align  4
00db10
-5:  
00db10
+5:
00db10
     bf    31,0f
00db10
     lbz   6,4(4)
00db10
     stb   6,4(3)
00db10
@@ -336,13 +347,23 @@
00db10
     bf      30,1f
00db10
 
00db10
     /* there are at least two DWs to copy */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srd     0,6,10
00db10
+    sld     8,7,9
00db10
+#else
00db10
     sld     0,6,10
00db10
     srd     8,7,9
00db10
+#endif
00db10
     or      0,0,8
00db10
     ld      6,16(5)
00db10
     std     0,0(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srd     0,7,10
00db10
+    sld     8,6,9
00db10
+#else
00db10
     sld     0,7,10
00db10
     srd     8,6,9
00db10
+#endif
00db10
     or      0,0,8
00db10
     ld      7,24(5)
00db10
     std     0,8(4)
00db10
@@ -351,8 +372,13 @@
00db10
     blt     cr6,8f  /* if total DWs = 3, then bypass loop */
00db10
     bf      31,4f
00db10
     /* there is a third DW to copy */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srd     0,6,10
00db10
+    sld     8,7,9
00db10
+#else
00db10
     sld     0,6,10
00db10
     srd     8,7,9
00db10
+#endif
00db10
     or      0,0,8
00db10
     std     0,0(4)
00db10
     mr      6,7
00db10
@@ -363,8 +389,13 @@
00db10
     b       4f
00db10
     .align 4
00db10
 1:
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srd     0,6,10
00db10
+    sld     8,7,9
00db10
+#else
00db10
     sld     0,6,10
00db10
     srd     8,7,9
00db10
+#endif
00db10
     addi    5,5,16
00db10
     or      0,0,8
00db10
     bf      31,4f
00db10
@@ -375,23 +406,44 @@
00db10
     addi    4,4,8
00db10
     .align 4
00db10
 /* copy 32 bytes at a time */
00db10
-4:  sld   0,6,10
00db10
+4:
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srd   0,6,10
00db10
+    sld   8,7,9
00db10
+#else
00db10
+    sld   0,6,10
00db10
     srd   8,7,9
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    6,0(5)
00db10
     std   0,0(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srd   0,7,10
00db10
+    sld   8,6,9
00db10
+#else
00db10
     sld   0,7,10
00db10
     srd   8,6,9
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    7,8(5)
00db10
     std   0,8(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srd   0,6,10
00db10
+    sld   8,7,9
00db10
+#else
00db10
     sld   0,6,10
00db10
     srd   8,7,9
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    6,16(5)
00db10
     std   0,16(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srd   0,7,10
00db10
+    sld   8,6,9
00db10
+#else
00db10
     sld   0,7,10
00db10
     srd   8,6,9
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    7,24(5)
00db10
     std   0,24(4)
00db10
@@ -401,9 +453,14 @@
00db10
     .align 4
00db10
 8:
00db10
     /* calculate and store the final DW */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srd   0,6,10
00db10
+    sld   8,7,9
00db10
+#else
00db10
     sld   0,6,10
00db10
     srd   8,7,9
00db10
-    or    0,0,8  
00db10
+#endif
00db10
+    or    0,0,8
00db10
     std   0,0(4)
00db10
 3:
00db10
     rldicr 0,31,0,60
00db10
@@ -413,5 +470,5 @@
00db10
     ld 31,-8(1)
00db10
     ld 3,-16(1)
00db10
     blr
00db10
-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
00db10
+END_GEN_TB (memcpy,TB_TOCLESS)
00db10
 libc_hidden_builtin_def (memcpy)
00db10
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S
00db10
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S	2014-05-29 13:04:56.000000000 -0500
00db10
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S	2014-05-29 13:05:27.000000000 -0500
00db10
@@ -1,5 +1,5 @@
00db10
 /* Optimized memcpy implementation for PowerPC64.
00db10
-   Copyright (C) 2003, 2006, 2007, 2011 Free Software Foundation, Inc.
00db10
+   Copyright (C) 2003-2014 Free Software Foundation, Inc.
00db10
    This file is part of the GNU C Library.
00db10
 
00db10
    The GNU C Library is free software; you can redistribute it and/or
00db10
@@ -17,52 +17,50 @@
00db10
    <http://www.gnu.org/licenses/>.  */
00db10
 
00db10
 #include <sysdep.h>
00db10
-#include <bp-sym.h>
00db10
-#include <bp-asm.h>
00db10
 
00db10
 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
00db10
    Returns 'dst'.
00db10
 
00db10
-   Memcpy handles short copies (< 32-bytes) using a binary move blocks 
00db10
-   (no loops) of lwz/stw.  The tail (remaining 1-3) bytes is handled 
00db10
-   with the appropriate combination of byte and halfword load/stores. 
00db10
-   There is minimal effort to optimize the alignment of short moves.  
00db10
+   Memcpy handles short copies (< 32-bytes) using binary move blocks
00db10
+   (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
00db10
+   with the appropriate combination of byte and halfword load/stores.
00db10
+   There is minimal effort to optimize the alignment of short moves.
00db10
    The 64-bit implementations of POWER3 and POWER4 do a reasonable job
00db10
-   of handling unligned load/stores that do not cross 32-byte boundries.
00db10
+   of handling unaligned load/stores that do not cross 32-byte boundaries.
00db10
 
00db10
    Longer moves (>= 32-bytes) justify the effort to get at least the
00db10
    destination doubleword (8-byte) aligned.  Further optimization is
00db10
-   posible when both source and destination are doubleword aligned.
00db10
-   Each case has a optimized unrolled loop.  
00db10
-     
00db10
-   For POWER6 unaligned loads will take a 20+ cycle hicup for any
00db10
+   possible when both source and destination are doubleword aligned.
00db10
+   Each case has an optimized unrolled loop.
00db10
+
00db10
+   For POWER6 unaligned loads will take a 20+ cycle hiccup for any
00db10
    L1 cache miss that crosses a 32- or 128-byte boundary.  Store
00db10
-   is more forgiving and does not take a hicup until page or 
00db10
-   segment boundaries.  So we require doubleword alignment for 
00db10
+   is more forgiving and does not take a hiccup until page or
00db10
+   segment boundaries.  So we require doubleword alignment for
00db10
    the source but may take a risk and only require word alignment
00db10
    for the destination.  */
00db10
 
00db10
 	.machine	"power6"
00db10
-EALIGN (BP_SYM (memcpy), 7, 0)
00db10
+EALIGN (memcpy, 7, 0)
00db10
 	CALL_MCOUNT 3
00db10
 
00db10
     cmpldi cr1,5,31
00db10
     neg   0,3
00db10
     std   3,-16(1)
00db10
     std   31,-8(1)
00db10
-    andi. 11,3,7	/* check alignement of dst.  */
00db10
+    andi. 11,3,7	/* check alignment of dst.  */
00db10
     clrldi 0,0,61	/* Number of bytes until the 1st doubleword of dst.  */
00db10
-    clrldi 10,4,61	/* check alignement of src.  */
00db10
+    clrldi 10,4,61	/* check alignment of src.  */
00db10
     cmpldi cr6,5,8
00db10
     ble-  cr1,.L2	/* If move < 32 bytes use short move code.  */
00db10
     mtcrf 0x01,0
00db10
-    cmpld cr6,10,11  
00db10
+    cmpld cr6,10,11
00db10
     srdi  9,5,3		/* Number of full double words remaining.  */
00db10
     beq   .L0
00db10
-  
00db10
+
00db10
     subf  5,0,5
00db10
-  /* Move 0-7 bytes as needed to get the destination doubleword alligned.
00db10
-     Duplicate some code to maximize fall-throught and minimize agen delays.  */
00db10
+  /* Move 0-7 bytes as needed to get the destination doubleword aligned.
00db10
+     Duplicate some code to maximize fall-through and minimize agen delays.  */
00db10
 1:  bf    31,2f
00db10
     lbz   6,0(4)
00db10
     stb   6,0(3)
00db10
@@ -78,7 +76,7 @@
00db10
     lwz   6,1(4)
00db10
     stw   6,1(3)
00db10
     b     0f
00db10
-    
00db10
+
00db10
 2:  bf    30,4f
00db10
     lhz   6,0(4)
00db10
     sth   6,0(3)
00db10
@@ -86,26 +84,26 @@
00db10
     lwz   6,2(4)
00db10
     stw   6,2(3)
00db10
     b     0f
00db10
-    
00db10
+
00db10
 4:  bf    29,0f
00db10
     lwz   6,0(4)
00db10
     stw   6,0(3)
00db10
-0: 
00db10
+0:
00db10
 /* Add the number of bytes until the 1st doubleword of dst to src and dst.  */
00db10
     add   4,4,0
00db10
     add   3,3,0
00db10
-    
00db10
-    clrldi 10,4,61	/* check alignement of src again.  */     
00db10
+
00db10
+    clrldi 10,4,61	/* check alignment of src again.  */
00db10
     srdi  9,5,3	/* Number of full double words remaining.  */
00db10
-    
00db10
-  /* Copy doublewords from source to destination, assumpting the
00db10
+
00db10
+  /* Copy doublewords from source to destination, assuming the
00db10
      destination is aligned on a doubleword boundary.
00db10
 
00db10
      At this point we know there are at least 25 bytes left (32-7) to copy.
00db10
-     The next step is to determine if the source is also doubleword aligned. 
00db10
+     The next step is to determine if the source is also doubleword aligned.
00db10
      If not branch to the unaligned move code at .L6. which uses
00db10
      a load, shift, store strategy.
00db10
-     
00db10
+
00db10
      Otherwise source and destination are doubleword aligned, and we can
00db10
      the optimized doubleword copy loop.  */
00db10
     .align  4
00db10
@@ -123,14 +121,14 @@
00db10
      the main loop exits there may be a tail of 1-7 bytes. These byte
00db10
      are copied a word/halfword/byte at a time as needed to preserve
00db10
      alignment.
00db10
-     
00db10
+
00db10
      For POWER6 the L1 is store-through and the L2 is store-in.  The
00db10
      L2 is clocked at half CPU clock so we can store 16 bytes every
00db10
      other cycle.  POWER6 also has a load/store bypass so we can do
00db10
-     load, load, store, store every 2 cycles.  
00db10
-     
00db10
+     load, load, store, store every 2 cycles.
00db10
+
00db10
      The following code is sensitive to cache line alignment.  Do not
00db10
-     make any change with out first making sure thay don't result in
00db10
+     make any change without first making sure it doesn't result in
00db10
      splitting ld/std pairs across a cache line.  */
00db10
 
00db10
     mtcrf 0x02,5
00db10
@@ -273,7 +271,7 @@
00db10
     std   8,16+96(10)
00db10
     std   0,24+96(10)
00db10
     ble   cr5,L(das_loop_e)
00db10
-    
00db10
+
00db10
     mtctr   12
00db10
     .align  4
00db10
 L(das_loop2):
00db10
@@ -326,10 +324,10 @@
00db10
     .align  4
00db10
 L(das_tail):
00db10
     beq   cr1,0f
00db10
-    
00db10
+
00db10
 L(das_tail2):
00db10
 /*  At this point we have a tail of 0-7 bytes and we know that the
00db10
-    destiniation is double word aligned.  */
00db10
+    destination is double word aligned.  */
00db10
 4:  bf    29,2f
00db10
     lwz   6,0(4)
00db10
     stw   6,0(3)
00db10
@@ -344,7 +342,7 @@
00db10
     lbz   6,4(4)
00db10
     stb   6,4(3)
00db10
     b     0f
00db10
-  
00db10
+
00db10
 2:  bf    30,1f
00db10
     lhz   6,0(4)
00db10
     sth   6,0(3)
00db10
@@ -352,7 +350,7 @@
00db10
     lbz   6,2(4)
00db10
     stb   6,2(3)
00db10
     b     0f
00db10
-    
00db10
+
00db10
 1:  bf    31,0f
00db10
     lbz   6,0(4)
00db10
     stb   6,0(3)
00db10
@@ -361,7 +359,7 @@
00db10
     ld 3,-16(1)
00db10
     blr
00db10
 
00db10
-/* Copy up to 31 bytes.  This divided into two cases 0-8 bytes and 9-31 
00db10
+/* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and 9-31
00db10
    bytes.  Each case is handled without loops, using binary (1,2,4,8)
00db10
    tests.
00db10
 
00db10
@@ -402,15 +400,28 @@
00db10
     blt   cr6,5f
00db10
     srdi  7,6,16
00db10
     bgt	  cr6,3f
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    sth   7,0(3)
00db10
+#else
00db10
     sth   6,0(3)
00db10
+#endif
00db10
     b     7f
00db10
     .align  4
00db10
 3:
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    rotlwi 6,6,24
00db10
+    stb   6,0(3)
00db10
+    sth   7,1(3)
00db10
+#else
00db10
     stb   7,0(3)
00db10
     sth   6,1(3)
00db10
+#endif
00db10
     b     7f
00db10
     .align  4
00db10
 5:
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    rotlwi 6,6,8
00db10
+#endif
00db10
     stb   6,0(3)
00db10
 7:
00db10
     cmpldi	cr1,10,16
00db10
@@ -421,7 +432,7 @@
00db10
 /* At least 6 bytes left and the source is word aligned.  This allows
00db10
    some speculative loads up front.  */
00db10
 /* We need to special case the fall-through because the biggest delays
00db10
-   are due to address computation not being ready in time for the 
00db10
+   are due to address computation not being ready in time for the
00db10
    AGEN.  */
00db10
     lwz   6,0(12)
00db10
     lwz   7,4(12)
00db10
@@ -452,7 +463,7 @@
00db10
     ld    3,-16(1)
00db10
     blr
00db10
     .align  4
00db10
-L(dus_tail16p8):  /* less then 8 bytes left.  */
00db10
+L(dus_tail16p8):  /* less than 8 bytes left.  */
00db10
     beq   cr1,L(dus_tailX) /* exactly 16 bytes, early exit.  */
00db10
     cmpldi	cr1,10,20
00db10
     bf    29,L(dus_tail16p2)
00db10
@@ -466,7 +477,7 @@
00db10
     ld    3,-16(1)
00db10
     blr
00db10
     .align  4
00db10
-L(dus_tail16p4):  /* less then 4 bytes left.  */
00db10
+L(dus_tail16p4):  /* less than 4 bytes left.  */
00db10
     addi  12,12,24
00db10
     addi  3,3,24
00db10
     bgt   cr0,L(dus_tail2)
00db10
@@ -474,7 +485,7 @@
00db10
     ld    3,-16(1)
00db10
     blr
00db10
     .align  4
00db10
-L(dus_tail16p2):  /* 16 bytes moved, less then 4 bytes left.  */
00db10
+L(dus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
00db10
     addi  12,12,16
00db10
     addi  3,3,16
00db10
     b     L(dus_tail2)
00db10
@@ -499,7 +510,7 @@
00db10
     ld    3,-16(1)
00db10
     blr
00db10
     .align  4
00db10
-L(dus_tail8p4):  /* less then 4 bytes left.  */
00db10
+L(dus_tail8p4):  /* less than 4 bytes left.  */
00db10
     addi  12,12,8
00db10
     addi  3,3,8
00db10
     bgt   cr1,L(dus_tail2)
00db10
@@ -510,14 +521,14 @@
00db10
     .align  4
00db10
 L(dus_tail4):  /* Move 4 bytes.  */
00db10
 /*  r6 already loaded speculatively.  If we are here we know there is
00db10
-    more then 4 bytes left.  So there is no need to test.  */
00db10
+    more than 4 bytes left.  So there is no need to test.  */
00db10
     addi  12,12,4
00db10
     stw   6,0(3)
00db10
     addi  3,3,4
00db10
 L(dus_tail2):  /* Move 2-3 bytes.  */
00db10
     bf    30,L(dus_tail1)
00db10
     lhz   6,0(12)
00db10
-    sth   6,0(3) 
00db10
+    sth   6,0(3)
00db10
     bf    31,L(dus_tailX)
00db10
     lbz   7,2(12)
00db10
     stb   7,2(3)
00db10
@@ -537,7 +548,7 @@
00db10
 .LE8:
00db10
     mr    12,4
00db10
     bne   cr6,L(dus_4)
00db10
-/* Exactly 8 bytes.  We may cross a 32-/128-byte boundry and take a ~20
00db10
+/* Exactly 8 bytes.  We may cross a 32-/128-byte boundary and take a ~20
00db10
    cycle delay.  This case should be rare and any attempt to avoid this
00db10
    would take most of 20 cycles any way.  */
00db10
     ld   6,0(4)
00db10
@@ -552,7 +563,7 @@
00db10
     stw   6,0(3)
00db10
     bf    30,L(dus_5)
00db10
     lhz   7,4(4)
00db10
-    sth   7,4(3) 
00db10
+    sth   7,4(3)
00db10
     bf    31,L(dus_0)
00db10
     lbz   8,6(4)
00db10
     stb   8,6(3)
00db10
@@ -590,20 +601,31 @@
00db10
     bge     cr0, L(du4_do)
00db10
     blt     cr5, L(du1_do)
00db10
     beq     cr5, L(du2_do)
00db10
-    b       L(du3_do) 
00db10
-       
00db10
+    b       L(du3_do)
00db10
+
00db10
     .align 4
00db10
 L(du1_do):
00db10
     bf      30,L(du1_1dw)
00db10
 
00db10
     /* there are at least two DWs to copy */
00db10
+    /* FIXME: can combine last shift and "or" into "rldimi" */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 8
00db10
+    sldi     8,7, 64-8
00db10
+#else
00db10
     sldi     0,6, 8
00db10
     srdi     8,7, 64-8
00db10
+#endif
00db10
     or      0,0,8
00db10
     ld      6,16(5)
00db10
     std     0,0(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,7, 8
00db10
+    sldi     8,6, 64-8
00db10
+#else
00db10
     sldi     0,7, 8
00db10
     srdi     8,6, 64-8
00db10
+#endif
00db10
     or      0,0,8
00db10
     ld      7,24(5)
00db10
     std     0,8(4)
00db10
@@ -612,8 +634,13 @@
00db10
     blt     cr6,L(du1_fini)  /* if total DWs = 3, then bypass loop */
00db10
     bf      31,L(du1_loop)
00db10
     /* there is a third DW to copy */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 8
00db10
+    sldi     8,7, 64-8
00db10
+#else
00db10
     sldi     0,6, 8
00db10
     srdi     8,7, 64-8
00db10
+#endif
00db10
     or      0,0,8
00db10
     std     0,0(4)
00db10
     mr      6,7
00db10
@@ -624,8 +651,13 @@
00db10
     b       L(du1_loop)
00db10
     .align 4
00db10
 L(du1_1dw):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 8
00db10
+    sldi     8,7, 64-8
00db10
+#else
00db10
     sldi     0,6, 8
00db10
     srdi     8,7, 64-8
00db10
+#endif
00db10
     addi    5,5,16
00db10
     or      0,0,8
00db10
     bf      31,L(du1_loop)
00db10
@@ -637,23 +669,43 @@
00db10
     .align 4
00db10
 /* copy 32 bytes at a time */
00db10
 L(du1_loop):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 8
00db10
+    sldi   8,7, 64-8
00db10
+#else
00db10
     sldi   0,6, 8
00db10
     srdi   8,7, 64-8
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    6,0(5)
00db10
     std   0,0(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,7, 8
00db10
+    sldi   8,6, 64-8
00db10
+#else
00db10
     sldi   0,7, 8
00db10
     srdi   8,6, 64-8
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    7,8(5)
00db10
     std   0,8(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 8
00db10
+    sldi   8,7, 64-8
00db10
+#else
00db10
     sldi   0,6, 8
00db10
     srdi   8,7, 64-8
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    6,16(5)
00db10
     std   0,16(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,7, 8
00db10
+    sldi   8,6, 64-8
00db10
+#else
00db10
     sldi   0,7, 8
00db10
     srdi   8,6, 64-8
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    7,24(5)
00db10
     std   0,24(4)
00db10
@@ -663,9 +715,14 @@
00db10
     .align 4
00db10
 L(du1_fini):
00db10
     /* calculate and store the final DW */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 8
00db10
+    sldi   8,7, 64-8
00db10
+#else
00db10
     sldi   0,6, 8
00db10
     srdi   8,7, 64-8
00db10
-    or    0,0,8  
00db10
+#endif
00db10
+    or    0,0,8
00db10
     std   0,0(4)
00db10
     b     L(du_done)
00db10
 
00db10
@@ -674,13 +731,23 @@
00db10
     bf      30,L(du2_1dw)
00db10
 
00db10
     /* there are at least two DWs to copy */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 16
00db10
+    sldi     8,7, 64-16
00db10
+#else
00db10
     sldi     0,6, 16
00db10
     srdi     8,7, 64-16
00db10
+#endif
00db10
     or      0,0,8
00db10
     ld      6,16(5)
00db10
     std     0,0(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,7, 16
00db10
+    sldi     8,6, 64-16
00db10
+#else
00db10
     sldi     0,7, 16
00db10
     srdi     8,6, 64-16
00db10
+#endif
00db10
     or      0,0,8
00db10
     ld      7,24(5)
00db10
     std     0,8(4)
00db10
@@ -689,8 +756,13 @@
00db10
     blt     cr6,L(du2_fini)  /* if total DWs = 3, then bypass loop */
00db10
     bf      31,L(du2_loop)
00db10
     /* there is a third DW to copy */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 16
00db10
+    sldi     8,7, 64-16
00db10
+#else
00db10
     sldi     0,6, 16
00db10
     srdi     8,7, 64-16
00db10
+#endif
00db10
     or      0,0,8
00db10
     std     0,0(4)
00db10
     mr      6,7
00db10
@@ -701,8 +773,13 @@
00db10
     b       L(du2_loop)
00db10
     .align 4
00db10
 L(du2_1dw):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 16
00db10
+    sldi     8,7, 64-16
00db10
+#else
00db10
     sldi     0,6, 16
00db10
     srdi     8,7, 64-16
00db10
+#endif
00db10
     addi    5,5,16
00db10
     or      0,0,8
00db10
     bf      31,L(du2_loop)
00db10
@@ -714,23 +791,43 @@
00db10
     .align 4
00db10
 /* copy 32 bytes at a time */
00db10
 L(du2_loop):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 16
00db10
+    sldi   8,7, 64-16
00db10
+#else
00db10
     sldi   0,6, 16
00db10
     srdi   8,7, 64-16
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    6,0(5)
00db10
     std   0,0(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,7, 16
00db10
+    sldi   8,6, 64-16
00db10
+#else
00db10
     sldi   0,7, 16
00db10
     srdi   8,6, 64-16
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    7,8(5)
00db10
     std   0,8(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 16
00db10
+    sldi   8,7, 64-16
00db10
+#else
00db10
     sldi   0,6, 16
00db10
     srdi   8,7, 64-16
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    6,16(5)
00db10
     std   0,16(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,7, 16
00db10
+    sldi   8,6, 64-16
00db10
+#else
00db10
     sldi   0,7, 16
00db10
     srdi   8,6, 64-16
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    7,24(5)
00db10
     std   0,24(4)
00db10
@@ -740,9 +837,14 @@
00db10
     .align 4
00db10
 L(du2_fini):
00db10
     /* calculate and store the final DW */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 16
00db10
+    sldi   8,7, 64-16
00db10
+#else
00db10
     sldi   0,6, 16
00db10
     srdi   8,7, 64-16
00db10
-    or    0,0,8  
00db10
+#endif
00db10
+    or    0,0,8
00db10
     std   0,0(4)
00db10
     b     L(du_done)
00db10
 
00db10
@@ -751,13 +853,23 @@
00db10
     bf      30,L(du3_1dw)
00db10
 
00db10
     /* there are at least two DWs to copy */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 24
00db10
+    sldi     8,7, 64-24
00db10
+#else
00db10
     sldi     0,6, 24
00db10
     srdi     8,7, 64-24
00db10
+#endif
00db10
     or      0,0,8
00db10
     ld      6,16(5)
00db10
     std     0,0(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,7, 24
00db10
+    sldi     8,6, 64-24
00db10
+#else
00db10
     sldi     0,7, 24
00db10
     srdi     8,6, 64-24
00db10
+#endif
00db10
     or      0,0,8
00db10
     ld      7,24(5)
00db10
     std     0,8(4)
00db10
@@ -766,8 +878,13 @@
00db10
     blt     cr6,L(du3_fini)  /* if total DWs = 3, then bypass loop */
00db10
     bf      31,L(du3_loop)
00db10
     /* there is a third DW to copy */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 24
00db10
+    sldi     8,7, 64-24
00db10
+#else
00db10
     sldi     0,6, 24
00db10
     srdi     8,7, 64-24
00db10
+#endif
00db10
     or      0,0,8
00db10
     std     0,0(4)
00db10
     mr      6,7
00db10
@@ -778,8 +895,13 @@
00db10
     b       L(du3_loop)
00db10
     .align 4
00db10
 L(du3_1dw):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 24
00db10
+    sldi     8,7, 64-24
00db10
+#else
00db10
     sldi     0,6, 24
00db10
     srdi     8,7, 64-24
00db10
+#endif
00db10
     addi    5,5,16
00db10
     or      0,0,8
00db10
     bf      31,L(du3_loop)
00db10
@@ -791,23 +913,43 @@
00db10
     .align 4
00db10
 /* copy 32 bytes at a time */
00db10
 L(du3_loop):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 24
00db10
+    sldi   8,7, 64-24
00db10
+#else
00db10
     sldi   0,6, 24
00db10
     srdi   8,7, 64-24
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    6,0(5)
00db10
     std   0,0(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,7, 24
00db10
+    sldi   8,6, 64-24
00db10
+#else
00db10
     sldi   0,7, 24
00db10
     srdi   8,6, 64-24
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    7,8(5)
00db10
     std   0,8(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 24
00db10
+    sldi   8,7, 64-24
00db10
+#else
00db10
     sldi   0,6, 24
00db10
     srdi   8,7, 64-24
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    6,16(5)
00db10
     std   0,16(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,7, 24
00db10
+    sldi   8,6, 64-24
00db10
+#else
00db10
     sldi   0,7, 24
00db10
     srdi   8,6, 64-24
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    7,24(5)
00db10
     std   0,24(4)
00db10
@@ -817,9 +959,14 @@
00db10
     .align 4
00db10
 L(du3_fini):
00db10
     /* calculate and store the final DW */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 24
00db10
+    sldi   8,7, 64-24
00db10
+#else
00db10
     sldi   0,6, 24
00db10
     srdi   8,7, 64-24
00db10
-    or    0,0,8  
00db10
+#endif
00db10
+    or    0,0,8
00db10
     std   0,0(4)
00db10
     b     L(du_done)
00db10
 
00db10
@@ -834,13 +981,23 @@
00db10
     bf      30,L(du4_1dw)
00db10
 
00db10
     /* there are at least two DWs to copy */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 32
00db10
+    sldi     8,7, 64-32
00db10
+#else
00db10
     sldi     0,6, 32
00db10
     srdi     8,7, 64-32
00db10
+#endif
00db10
     or      0,0,8
00db10
     ld      6,16(5)
00db10
     std     0,0(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,7, 32
00db10
+    sldi     8,6, 64-32
00db10
+#else
00db10
     sldi     0,7, 32
00db10
     srdi     8,6, 64-32
00db10
+#endif
00db10
     or      0,0,8
00db10
     ld      7,24(5)
00db10
     std     0,8(4)
00db10
@@ -849,8 +1006,13 @@
00db10
     blt     cr6,L(du4_fini)  /* if total DWs = 3, then bypass loop */
00db10
     bf      31,L(du4_loop)
00db10
     /* there is a third DW to copy */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 32
00db10
+    sldi     8,7, 64-32
00db10
+#else
00db10
     sldi     0,6, 32
00db10
     srdi     8,7, 64-32
00db10
+#endif
00db10
     or      0,0,8
00db10
     std     0,0(4)
00db10
     mr      6,7
00db10
@@ -861,8 +1023,13 @@
00db10
     b       L(du4_loop)
00db10
     .align 4
00db10
 L(du4_1dw):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 32
00db10
+    sldi     8,7, 64-32
00db10
+#else
00db10
     sldi     0,6, 32
00db10
     srdi     8,7, 64-32
00db10
+#endif
00db10
     addi    5,5,16
00db10
     or      0,0,8
00db10
     bf      31,L(du4_loop)
00db10
@@ -874,23 +1041,43 @@
00db10
     .align 4
00db10
 /* copy 32 bytes at a time */
00db10
 L(du4_loop):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 32
00db10
+    sldi   8,7, 64-32
00db10
+#else
00db10
     sldi   0,6, 32
00db10
     srdi   8,7, 64-32
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    6,0(5)
00db10
     std   0,0(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,7, 32
00db10
+    sldi   8,6, 64-32
00db10
+#else
00db10
     sldi   0,7, 32
00db10
     srdi   8,6, 64-32
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    7,8(5)
00db10
     std   0,8(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 32
00db10
+    sldi   8,7, 64-32
00db10
+#else
00db10
     sldi   0,6, 32
00db10
     srdi   8,7, 64-32
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    6,16(5)
00db10
     std   0,16(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,7, 32
00db10
+    sldi   8,6, 64-32
00db10
+#else
00db10
     sldi   0,7, 32
00db10
     srdi   8,6, 64-32
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    7,24(5)
00db10
     std   0,24(4)
00db10
@@ -900,9 +1087,14 @@
00db10
     .align 4
00db10
 L(du4_fini):
00db10
     /* calculate and store the final DW */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 32
00db10
+    sldi   8,7, 64-32
00db10
+#else
00db10
     sldi   0,6, 32
00db10
     srdi   8,7, 64-32
00db10
-    or    0,0,8  
00db10
+#endif
00db10
+    or    0,0,8
00db10
     std   0,0(4)
00db10
     b     L(du_done)
00db10
 
00db10
@@ -911,13 +1103,23 @@
00db10
     bf      30,L(du5_1dw)
00db10
 
00db10
     /* there are at least two DWs to copy */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 40
00db10
+    sldi     8,7, 64-40
00db10
+#else
00db10
     sldi     0,6, 40
00db10
     srdi     8,7, 64-40
00db10
+#endif
00db10
     or      0,0,8
00db10
     ld      6,16(5)
00db10
     std     0,0(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,7, 40
00db10
+    sldi     8,6, 64-40
00db10
+#else
00db10
     sldi     0,7, 40
00db10
     srdi     8,6, 64-40
00db10
+#endif
00db10
     or      0,0,8
00db10
     ld      7,24(5)
00db10
     std     0,8(4)
00db10
@@ -926,8 +1128,13 @@
00db10
     blt     cr6,L(du5_fini)  /* if total DWs = 3, then bypass loop */
00db10
     bf      31,L(du5_loop)
00db10
     /* there is a third DW to copy */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 40
00db10
+    sldi     8,7, 64-40
00db10
+#else
00db10
     sldi     0,6, 40
00db10
     srdi     8,7, 64-40
00db10
+#endif
00db10
     or      0,0,8
00db10
     std     0,0(4)
00db10
     mr      6,7
00db10
@@ -938,8 +1145,13 @@
00db10
     b       L(du5_loop)
00db10
     .align 4
00db10
 L(du5_1dw):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 40
00db10
+    sldi     8,7, 64-40
00db10
+#else
00db10
     sldi     0,6, 40
00db10
     srdi     8,7, 64-40
00db10
+#endif
00db10
     addi    5,5,16
00db10
     or      0,0,8
00db10
     bf      31,L(du5_loop)
00db10
@@ -951,23 +1163,43 @@
00db10
     .align 4
00db10
 /* copy 32 bytes at a time */
00db10
 L(du5_loop):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 40
00db10
+    sldi   8,7, 64-40
00db10
+#else
00db10
     sldi   0,6, 40
00db10
     srdi   8,7, 64-40
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    6,0(5)
00db10
     std   0,0(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,7, 40
00db10
+    sldi   8,6, 64-40
00db10
+#else
00db10
     sldi   0,7, 40
00db10
     srdi   8,6, 64-40
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    7,8(5)
00db10
     std   0,8(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 40
00db10
+    sldi   8,7, 64-40
00db10
+#else
00db10
     sldi   0,6, 40
00db10
     srdi   8,7, 64-40
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    6,16(5)
00db10
     std   0,16(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,7, 40
00db10
+    sldi   8,6, 64-40
00db10
+#else
00db10
     sldi   0,7, 40
00db10
     srdi   8,6, 64-40
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    7,24(5)
00db10
     std   0,24(4)
00db10
@@ -977,9 +1209,14 @@
00db10
     .align 4
00db10
 L(du5_fini):
00db10
     /* calculate and store the final DW */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 40
00db10
+    sldi   8,7, 64-40
00db10
+#else
00db10
     sldi   0,6, 40
00db10
     srdi   8,7, 64-40
00db10
-    or    0,0,8  
00db10
+#endif
00db10
+    or    0,0,8
00db10
     std   0,0(4)
00db10
     b     L(du_done)
00db10
 
00db10
@@ -988,13 +1225,23 @@
00db10
     bf      30,L(du6_1dw)
00db10
 
00db10
     /* there are at least two DWs to copy */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 48
00db10
+    sldi     8,7, 64-48
00db10
+#else
00db10
     sldi     0,6, 48
00db10
     srdi     8,7, 64-48
00db10
+#endif
00db10
     or      0,0,8
00db10
     ld      6,16(5)
00db10
     std     0,0(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,7, 48
00db10
+    sldi     8,6, 64-48
00db10
+#else
00db10
     sldi     0,7, 48
00db10
     srdi     8,6, 64-48
00db10
+#endif
00db10
     or      0,0,8
00db10
     ld      7,24(5)
00db10
     std     0,8(4)
00db10
@@ -1003,8 +1250,13 @@
00db10
     blt     cr6,L(du6_fini)  /* if total DWs = 3, then bypass loop */
00db10
     bf      31,L(du6_loop)
00db10
     /* there is a third DW to copy */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 48
00db10
+    sldi     8,7, 64-48
00db10
+#else
00db10
     sldi     0,6, 48
00db10
     srdi     8,7, 64-48
00db10
+#endif
00db10
     or      0,0,8
00db10
     std     0,0(4)
00db10
     mr      6,7
00db10
@@ -1015,8 +1267,13 @@
00db10
     b       L(du6_loop)
00db10
     .align 4
00db10
 L(du6_1dw):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 48
00db10
+    sldi     8,7, 64-48
00db10
+#else
00db10
     sldi     0,6, 48
00db10
     srdi     8,7, 64-48
00db10
+#endif
00db10
     addi    5,5,16
00db10
     or      0,0,8
00db10
     bf      31,L(du6_loop)
00db10
@@ -1028,23 +1285,43 @@
00db10
     .align 4
00db10
 /* copy 32 bytes at a time */
00db10
 L(du6_loop):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 48
00db10
+    sldi   8,7, 64-48
00db10
+#else
00db10
     sldi   0,6, 48
00db10
     srdi   8,7, 64-48
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    6,0(5)
00db10
     std   0,0(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,7, 48
00db10
+    sldi   8,6, 64-48
00db10
+#else
00db10
     sldi   0,7, 48
00db10
     srdi   8,6, 64-48
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    7,8(5)
00db10
     std   0,8(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 48
00db10
+    sldi   8,7, 64-48
00db10
+#else
00db10
     sldi   0,6, 48
00db10
     srdi   8,7, 64-48
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    6,16(5)
00db10
     std   0,16(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,7, 48
00db10
+    sldi   8,6, 64-48
00db10
+#else
00db10
     sldi   0,7, 48
00db10
     srdi   8,6, 64-48
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    7,24(5)
00db10
     std   0,24(4)
00db10
@@ -1054,9 +1331,14 @@
00db10
     .align 4
00db10
 L(du6_fini):
00db10
     /* calculate and store the final DW */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 48
00db10
+    sldi   8,7, 64-48
00db10
+#else
00db10
     sldi   0,6, 48
00db10
     srdi   8,7, 64-48
00db10
-    or    0,0,8  
00db10
+#endif
00db10
+    or    0,0,8
00db10
     std   0,0(4)
00db10
     b     L(du_done)
00db10
 
00db10
@@ -1065,13 +1347,23 @@
00db10
     bf      30,L(du7_1dw)
00db10
 
00db10
     /* there are at least two DWs to copy */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 56
00db10
+    sldi     8,7, 64-56
00db10
+#else
00db10
     sldi     0,6, 56
00db10
     srdi     8,7, 64-56
00db10
+#endif
00db10
     or      0,0,8
00db10
     ld      6,16(5)
00db10
     std     0,0(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,7, 56
00db10
+    sldi     8,6, 64-56
00db10
+#else
00db10
     sldi     0,7, 56
00db10
     srdi     8,6, 64-56
00db10
+#endif
00db10
     or      0,0,8
00db10
     ld      7,24(5)
00db10
     std     0,8(4)
00db10
@@ -1080,8 +1372,13 @@
00db10
     blt     cr6,L(du7_fini)  /* if total DWs = 3, then bypass loop */
00db10
     bf      31,L(du7_loop)
00db10
     /* there is a third DW to copy */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 56
00db10
+    sldi     8,7, 64-56
00db10
+#else
00db10
     sldi     0,6, 56
00db10
     srdi     8,7, 64-56
00db10
+#endif
00db10
     or      0,0,8
00db10
     std     0,0(4)
00db10
     mr      6,7
00db10
@@ -1092,8 +1389,13 @@
00db10
     b       L(du7_loop)
00db10
     .align 4
00db10
 L(du7_1dw):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi     0,6, 56
00db10
+    sldi     8,7, 64-56
00db10
+#else
00db10
     sldi     0,6, 56
00db10
     srdi     8,7, 64-56
00db10
+#endif
00db10
     addi    5,5,16
00db10
     or      0,0,8
00db10
     bf      31,L(du7_loop)
00db10
@@ -1105,23 +1407,43 @@
00db10
     .align 4
00db10
 /* copy 32 bytes at a time */
00db10
 L(du7_loop):
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 56
00db10
+    sldi   8,7, 64-56
00db10
+#else
00db10
     sldi   0,6, 56
00db10
     srdi   8,7, 64-56
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    6,0(5)
00db10
     std   0,0(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,7, 56
00db10
+    sldi   8,6, 64-56
00db10
+#else
00db10
     sldi   0,7, 56
00db10
     srdi   8,6, 64-56
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    7,8(5)
00db10
     std   0,8(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 56
00db10
+    sldi   8,7, 64-56
00db10
+#else
00db10
     sldi   0,6, 56
00db10
     srdi   8,7, 64-56
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    6,16(5)
00db10
     std   0,16(4)
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,7, 56
00db10
+    sldi   8,6, 64-56
00db10
+#else
00db10
     sldi   0,7, 56
00db10
     srdi   8,6, 64-56
00db10
+#endif
00db10
     or    0,0,8
00db10
     ld    7,24(5)
00db10
     std   0,24(4)
00db10
@@ -1131,12 +1453,17 @@
00db10
     .align 4
00db10
 L(du7_fini):
00db10
     /* calculate and store the final DW */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+    srdi   0,6, 56
00db10
+    sldi   8,7, 64-56
00db10
+#else
00db10
     sldi   0,6, 56
00db10
     srdi   8,7, 64-56
00db10
-    or    0,0,8  
00db10
+#endif
00db10
+    or    0,0,8
00db10
     std   0,0(4)
00db10
     b     L(du_done)
00db10
-    
00db10
+
00db10
     .align 4
00db10
 L(du_done):
00db10
     rldicr 0,31,0,60
00db10
@@ -1144,9 +1471,9 @@
00db10
     beq   cr1,0f	/* If the tail is 0 bytes we are done!  */
00db10
 
00db10
     add   3,3,0
00db10
-    add   12,12,0    
00db10
+    add   12,12,0
00db10
 /*  At this point we have a tail of 0-7 bytes and we know that the
00db10
-    destiniation is double word aligned.  */
00db10
+    destination is double word aligned.  */
00db10
 4:  bf    29,2f
00db10
     lwz   6,0(12)
00db10
     addi  12,12,4
00db10
@@ -1165,5 +1492,5 @@
00db10
     ld 31,-8(1)
00db10
     ld 3,-16(1)
00db10
     blr
00db10
-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
00db10
+END_GEN_TB (memcpy,TB_TOCLESS)
00db10
 libc_hidden_builtin_def (memcpy)
00db10
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S
00db10
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S	2014-05-29 13:04:56.000000000 -0500
00db10
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S	2014-05-29 13:05:40.000000000 -0500
00db10
@@ -1,5 +1,5 @@
00db10
 /* Optimized memcpy implementation for PowerPC64/POWER7.
00db10
-   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
00db10
+   Copyright (C) 2010-2014 Free Software Foundation, Inc.
00db10
    Contributed by Luis Machado <luisgpm@br.ibm.com>.
00db10
    This file is part of the GNU C Library.
00db10
 
00db10
@@ -18,425 +18,366 @@
00db10
    <http://www.gnu.org/licenses/>.  */
00db10
 
00db10
 #include <sysdep.h>
00db10
-#include <bp-sym.h>
00db10
-#include <bp-asm.h>
00db10
 
00db10
 
00db10
 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
00db10
    Returns 'dst'.  */
00db10
 
00db10
+#define dst 11		/* Use r11 so r3 kept unchanged.  */
00db10
+#define src 4
00db10
+#define cnt 5
00db10
+
00db10
 	.machine power7
00db10
-EALIGN (BP_SYM (memcpy), 5, 0)
00db10
+EALIGN (memcpy, 5, 0)
00db10
 	CALL_MCOUNT 3
00db10
 
00db10
-	cmpldi  cr1,5,31
00db10
+	cmpldi	cr1,cnt,31
00db10
 	neg	0,3
00db10
-	std	3,-16(1)
00db10
-	std	31,-8(1)
00db10
-	cfi_offset(31,-8)
00db10
 	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
00db10
 				    code.  */
00db10
 
00db10
-	andi.   11,3,7	      /* Check alignment of DST.  */
00db10
-
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+/* In little-endian mode, power7 takes an alignment trap on any lxvd2x
00db10
+   or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy
00db10
+   loop is only used for quadword aligned copies.  */
00db10
+	andi.	10,3,15
00db10
+	clrldi	11,4,60
00db10
+#else
00db10
+	andi.	10,3,7		/* Check alignment of DST.  */
00db10
+	clrldi	11,4,61		/* Check alignment of SRC.  */
00db10
+#endif
00db10
+	cmpld	cr6,10,11	/* SRC and DST alignments match?  */
00db10
 
00db10
-	clrldi  10,4,61       /* Check alignment of SRC.  */
00db10
-	cmpld   cr6,10,11     /* SRC and DST alignments match?  */
00db10
-	mr	12,4
00db10
-	mr	31,5
00db10
+	mr	dst,3
00db10
 	bne	cr6,L(copy_GE_32_unaligned)
00db10
+	beq	L(aligned_copy)
00db10
 
00db10
-	srdi    9,5,3	      /* Number of full quadwords remaining.  */
00db10
-
00db10
-	beq    L(copy_GE_32_aligned_cont)
00db10
-
00db10
-	clrldi  0,0,61
00db10
-	mtcrf   0x01,0
00db10
-	subf    31,0,5
00db10
-
00db10
-	/* Get the SRC aligned to 8 bytes.  */
00db10
-
00db10
-1:	bf	31,2f
00db10
-	lbz	6,0(12)
00db10
-	addi    12,12,1
00db10
-	stb	6,0(3)
00db10
-	addi    3,3,1
00db10
-2:	bf      30,4f
00db10
-	lhz     6,0(12)
00db10
-	addi    12,12,2
00db10
-	sth     6,0(3)
00db10
-	addi    3,3,2
00db10
-4:	bf      29,0f
00db10
-	lwz     6,0(12)
00db10
-	addi    12,12,4
00db10
-	stw     6,0(3)
00db10
-	addi    3,3,4
00db10
-0:
00db10
-	clrldi  10,12,61      /* Check alignment of SRC again.  */
00db10
-	srdi    9,31,3	      /* Number of full doublewords remaining.  */
00db10
-
00db10
-L(copy_GE_32_aligned_cont):
00db10
-
00db10
-	clrldi  11,31,61
00db10
-	mtcrf   0x01,9
00db10
-
00db10
-	srdi    8,31,5
00db10
-	cmpldi  cr1,9,4
00db10
-	cmpldi  cr6,11,0
00db10
-	mr	11,12
00db10
+	mtocrf	0x01,0
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	clrldi	0,0,60
00db10
+#else
00db10
+	clrldi	0,0,61
00db10
+#endif
00db10
 
00db10
-	/* Copy 1~3 doublewords so the main loop starts
00db10
-	at a multiple of 32 bytes.  */
00db10
-
00db10
-	bf	30,1f
00db10
-	ld      6,0(12)
00db10
-	ld      7,8(12)
00db10
-	addi    11,12,16
00db10
-	mtctr   8
00db10
-	std     6,0(3)
00db10
-	std     7,8(3)
00db10
-	addi    10,3,16
00db10
-	bf      31,4f
00db10
-	ld      0,16(12)
00db10
-	std     0,16(3)
00db10
-	blt     cr1,3f
00db10
-	addi    11,12,24
00db10
-	addi    10,3,24
00db10
-	b       4f
00db10
-
00db10
-	.align  4
00db10
-1:	/* Copy 1 doubleword and set the counter.  */
00db10
-	mr	10,3
00db10
-	mtctr   8
00db10
-	bf      31,4f
00db10
-	ld      6,0(12)
00db10
-	addi    11,12,8
00db10
-	std     6,0(3)
00db10
-	addi    10,3,8
00db10
+/* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
00db10
+1:
00db10
+	bf	31,2f
00db10
+	lbz	6,0(src)
00db10
+	addi	src,src,1
00db10
+	stb	6,0(dst)
00db10
+	addi	dst,dst,1
00db10
+2:
00db10
+	bf	30,4f
00db10
+	lhz	6,0(src)
00db10
+	addi	src,src,2
00db10
+	sth	6,0(dst)
00db10
+	addi	dst,dst,2
00db10
+4:
00db10
+	bf	29,8f
00db10
+	lwz	6,0(src)
00db10
+	addi	src,src,4
00db10
+	stw	6,0(dst)
00db10
+	addi	dst,dst,4
00db10
+8:
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	bf	28,16f
00db10
+	ld	6,0(src)
00db10
+	addi	src,src,8
00db10
+	std	6,0(dst)
00db10
+	addi	dst,dst,8
00db10
+16:
00db10
+#endif
00db10
+	subf	cnt,0,cnt
00db10
 
00db10
+/* Main aligned copy loop. Copies 128 bytes at a time. */
00db10
 L(aligned_copy):
00db10
-	/* Main aligned copy loop. Copies up to 128-bytes at a time. */
00db10
-	.align  4
00db10
-4:
00db10
-	/* check for any 32-byte or 64-byte lumps that are outside of a
00db10
-	   nice 128-byte range.  R8 contains the number of 32-byte
00db10
-	   lumps, so drop this into the CR, and use the SO/EQ bits to help
00db10
-	   handle the 32- or 64- byte lumps.  Then handle the rest with an
00db10
-	   unrolled 128-bytes-at-a-time copy loop. */
00db10
-	mtocrf	1,8
00db10
-	li	6,16	# 16() index
00db10
-	li	7,32	# 32() index
00db10
-	li	8,48	# 48() index
00db10
-
00db10
-L(aligned_32byte):
00db10
-	/* if the SO bit (indicating a 32-byte lump) is not set, move along. */
00db10
-	bns	cr7,L(aligned_64byte)
00db10
-	lxvd2x	6,0,11
00db10
-	lxvd2x	7,11,6
00db10
-	addi	11,11,32
00db10
-	stxvd2x	6,0,10
00db10
-	stxvd2x	7,10,6
00db10
-	addi	10,10,32
00db10
-
00db10
-L(aligned_64byte):
00db10
-	/* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
00db10
-	bne	cr7,L(aligned_128setup)
00db10
-	lxvd2x	6,0,11
00db10
-	lxvd2x	7,11,6
00db10
-	lxvd2x	8,11,7
00db10
-	lxvd2x	9,11,8
00db10
-	addi	11,11,64
00db10
-	stxvd2x	6,0,10
00db10
-	stxvd2x	7,10,6
00db10
-	stxvd2x	8,10,7
00db10
-	stxvd2x	9,10,8
00db10
-	addi	10,10,64
00db10
-
00db10
-L(aligned_128setup):
00db10
-	/* Set up for the 128-byte at a time copy loop.  */
00db10
-	srdi	8,31,7
00db10
-	cmpdi	8,0	# Any 4x lumps left?
00db10
-	beq	3f	# if not, move along.
00db10
-	lxvd2x	6,0,11
00db10
-	lxvd2x	7,11,6
00db10
-	mtctr	8	# otherwise, load the ctr and begin.
00db10
-	li	8,48	# 48() index
00db10
+	li	6,16
00db10
+	li	7,32
00db10
+	li	8,48
00db10
+	mtocrf	0x02,cnt
00db10
+	srdi	12,cnt,7
00db10
+	cmpdi	12,0
00db10
+	beq	L(aligned_tail)
00db10
+	lxvd2x	6,0,src
00db10
+	lxvd2x	7,src,6
00db10
+	mtctr	12
00db10
 	b	L(aligned_128loop)
00db10
 
00db10
+	.align  4
00db10
 L(aligned_128head):
00db10
 	/* for the 2nd + iteration of this loop. */
00db10
-	lxvd2x	6,0,11
00db10
-	lxvd2x	7,11,6
00db10
+	lxvd2x	6,0,src
00db10
+	lxvd2x	7,src,6
00db10
 L(aligned_128loop):
00db10
-	lxvd2x	8,11,7
00db10
-	lxvd2x	9,11,8
00db10
-	stxvd2x	6,0,10
00db10
-	addi	11,11,64
00db10
-	stxvd2x	7,10,6
00db10
-	stxvd2x	8,10,7
00db10
-	stxvd2x	9,10,8
00db10
-	lxvd2x	6,0,11
00db10
-	lxvd2x	7,11,6
00db10
-	addi	10,10,64
00db10
-	lxvd2x	8,11,7
00db10
-	lxvd2x	9,11,8
00db10
-	addi	11,11,64
00db10
-	stxvd2x	6,0,10
00db10
-	stxvd2x	7,10,6
00db10
-	stxvd2x	8,10,7
00db10
-	stxvd2x	9,10,8
00db10
-	addi	10,10,64
00db10
+	lxvd2x	8,src,7
00db10
+	lxvd2x	9,src,8
00db10
+	stxvd2x	6,0,dst
00db10
+	addi	src,src,64
00db10
+	stxvd2x	7,dst,6
00db10
+	stxvd2x	8,dst,7
00db10
+	stxvd2x	9,dst,8
00db10
+	lxvd2x	6,0,src
00db10
+	lxvd2x	7,src,6
00db10
+	addi	dst,dst,64
00db10
+	lxvd2x	8,src,7
00db10
+	lxvd2x	9,src,8
00db10
+	addi	src,src,64
00db10
+	stxvd2x	6,0,dst
00db10
+	stxvd2x	7,dst,6
00db10
+	stxvd2x	8,dst,7
00db10
+	stxvd2x	9,dst,8
00db10
+	addi	dst,dst,64
00db10
 	bdnz	L(aligned_128head)
00db10
 
00db10
-3:
00db10
-	/* Check for tail bytes.  */
00db10
-	rldicr  0,31,0,60
00db10
-	mtcrf   0x01,31
00db10
-	beq	cr6,0f
00db10
-
00db10
-.L9:
00db10
-	add	3,3,0
00db10
-	add	12,12,0
00db10
-
00db10
-	/*  At this point we have a tail of 0-7 bytes and we know that the
00db10
-	destination is doubleword-aligned.  */
00db10
-4:	/* Copy 4 bytes.  */
00db10
-	bf	29,2f
00db10
-
00db10
-	lwz     6,0(12)
00db10
-	addi    12,12,4
00db10
-	stw     6,0(3)
00db10
-	addi    3,3,4
00db10
-2:	/* Copy 2 bytes.  */
00db10
-	bf	30,1f
00db10
-
00db10
-	lhz     6,0(12)
00db10
-	addi    12,12,2
00db10
-	sth     6,0(3)
00db10
-	addi    3,3,2
00db10
-1:	/* Copy 1 byte.  */
00db10
-	bf	31,0f
00db10
-
00db10
-	lbz	6,0(12)
00db10
-	stb	6,0(3)
00db10
-0:	/* Return original DST pointer.  */
00db10
-	ld	31,-8(1)
00db10
-	ld	3,-16(1)
00db10
+L(aligned_tail):
00db10
+	mtocrf	0x01,cnt
00db10
+	bf	25,32f
00db10
+	lxvd2x	6,0,src
00db10
+	lxvd2x	7,src,6
00db10
+	lxvd2x	8,src,7
00db10
+	lxvd2x	9,src,8
00db10
+	addi	src,src,64
00db10
+	stxvd2x	6,0,dst
00db10
+	stxvd2x	7,dst,6
00db10
+	stxvd2x	8,dst,7
00db10
+	stxvd2x	9,dst,8
00db10
+	addi	dst,dst,64
00db10
+32:
00db10
+	bf	26,16f
00db10
+	lxvd2x	6,0,src
00db10
+	lxvd2x	7,src,6
00db10
+	addi	src,src,32
00db10
+	stxvd2x	6,0,dst
00db10
+	stxvd2x	7,dst,6
00db10
+	addi	dst,dst,32
00db10
+16:
00db10
+	bf	27,8f
00db10
+	lxvd2x	6,0,src
00db10
+	addi	src,src,16
00db10
+	stxvd2x	6,0,dst
00db10
+	addi	dst,dst,16
00db10
+8:
00db10
+	bf	28,4f
00db10
+	ld	6,0(src)
00db10
+	addi	src,src,8
00db10
+	std     6,0(dst)
00db10
+	addi	dst,dst,8
00db10
+4:	/* Copies 4~7 bytes.  */
00db10
+	bf	29,L(tail2)
00db10
+	lwz	6,0(src)
00db10
+	stw     6,0(dst)
00db10
+	bf      30,L(tail5)
00db10
+	lhz     7,4(src)
00db10
+	sth     7,4(dst)
00db10
+	bflr	31
00db10
+	lbz     8,6(src)
00db10
+	stb     8,6(dst)
00db10
+	/* Return original DST pointer.  */
00db10
 	blr
00db10
 
00db10
-	/* Handle copies of 0~31 bytes.  */
00db10
-	.align  4
00db10
+
00db10
+/* Handle copies of 0~31 bytes.  */
00db10
+	.align	4
00db10
 L(copy_LT_32):
00db10
-	cmpldi  cr6,5,8
00db10
-	mr	12,4
00db10
-	mtcrf   0x01,5
00db10
+	mr	dst,3
00db10
+	cmpldi	cr6,cnt,8
00db10
+	mtocrf	0x01,cnt
00db10
 	ble	cr6,L(copy_LE_8)
00db10
 
00db10
 	/* At least 9 bytes to go.  */
00db10
 	neg	8,4
00db10
-	clrrdi  11,4,2
00db10
-	andi.   0,8,3
00db10
-	cmpldi  cr1,5,16
00db10
-	mr	10,5
00db10
+	andi.	0,8,3
00db10
+	cmpldi	cr1,cnt,16
00db10
 	beq	L(copy_LT_32_aligned)
00db10
 
00db10
-	/* Force 4-bytes alignment for SRC.  */
00db10
-	mtocrf  0x01,0
00db10
-	subf    10,0,5
00db10
-2:	bf	30,1f
00db10
-
00db10
-	lhz	6,0(12)
00db10
-	addi    12,12,2
00db10
-	sth	6,0(3)
00db10
-	addi    3,3,2
00db10
-1:	bf	31,L(end_4bytes_alignment)
00db10
-
00db10
-	lbz	6,0(12)
00db10
-	addi    12,12,1
00db10
-	stb	6,0(3)
00db10
-	addi    3,3,1
00db10
+	/* Force 4-byte alignment for SRC.  */
00db10
+	mtocrf	0x01,0
00db10
+	subf	cnt,0,cnt
00db10
+2:
00db10
+	bf	30,1f
00db10
+	lhz	6,0(src)
00db10
+	addi	src,src,2
00db10
+	sth	6,0(dst)
00db10
+	addi	dst,dst,2
00db10
+1:
00db10
+	bf	31,L(end_4bytes_alignment)
00db10
+	lbz	6,0(src)
00db10
+	addi	src,src,1
00db10
+	stb	6,0(dst)
00db10
+	addi	dst,dst,1
00db10
 
00db10
-	.align  4
00db10
+	.align	4
00db10
 L(end_4bytes_alignment):
00db10
-	cmpldi  cr1,10,16
00db10
-	mtcrf   0x01,10
00db10
+	cmpldi	cr1,cnt,16
00db10
+	mtocrf	0x01,cnt
00db10
 
00db10
 L(copy_LT_32_aligned):
00db10
 	/* At least 6 bytes to go, and SRC is word-aligned.  */
00db10
 	blt	cr1,8f
00db10
 
00db10
 	/* Copy 16 bytes.  */
00db10
-	lwz	6,0(12)
00db10
-	lwz     7,4(12)
00db10
-	stw     6,0(3)
00db10
-	lwz     8,8(12)
00db10
-	stw     7,4(3)
00db10
-	lwz     6,12(12)
00db10
-	addi    12,12,16
00db10
-	stw     8,8(3)
00db10
-	stw     6,12(3)
00db10
-	addi    3,3,16
00db10
+	lwz	6,0(src)
00db10
+	lwz	7,4(src)
00db10
+	stw	6,0(dst)
00db10
+	lwz	8,8(src)
00db10
+	stw	7,4(dst)
00db10
+	lwz	6,12(src)
00db10
+	addi	src,src,16
00db10
+	stw	8,8(dst)
00db10
+	stw	6,12(dst)
00db10
+	addi	dst,dst,16
00db10
 8:	/* Copy 8 bytes.  */
00db10
-	bf	28,4f
00db10
+	bf	28,L(tail4)
00db10
+	lwz	6,0(src)
00db10
+	lwz	7,4(src)
00db10
+	addi	src,src,8
00db10
+	stw	6,0(dst)
00db10
+	stw	7,4(dst)
00db10
+	addi	dst,dst,8
00db10
+
00db10
+	.align	4
00db10
+/* Copies 4~7 bytes.  */
00db10
+L(tail4):
00db10
+	bf	29,L(tail2)
00db10
+	lwz	6,0(src)
00db10
+	stw	6,0(dst)
00db10
+	bf	30,L(tail5)
00db10
+	lhz	7,4(src)
00db10
+	sth	7,4(dst)
00db10
+	bflr	31
00db10
+	lbz	8,6(src)
00db10
+	stb	8,6(dst)
00db10
+	/* Return original DST pointer.  */
00db10
+	blr
00db10
 
00db10
-	lwz     6,0(12)
00db10
-	lwz     7,4(12)
00db10
-	addi    12,12,8
00db10
-	stw     6,0(3)
00db10
-	stw     7,4(3)
00db10
-	addi    3,3,8
00db10
-4:	/* Copy 4 bytes.  */
00db10
-	bf	29,2f
00db10
-
00db10
-	lwz     6,0(12)
00db10
-	addi    12,12,4
00db10
-	stw     6,0(3)
00db10
-	addi    3,3,4
00db10
-2:	/* Copy 2-3 bytes.  */
00db10
+	.align	4
00db10
+/* Copies 2~3 bytes.  */
00db10
+L(tail2):
00db10
 	bf	30,1f
00db10
-
00db10
-	lhz     6,0(12)
00db10
-	sth     6,0(3)
00db10
-	bf      31,0f
00db10
-	lbz     7,2(12)
00db10
-	stb     7,2(3)
00db10
-	ld	3,-16(1)
00db10
+	lhz	6,0(src)
00db10
+	sth	6,0(dst)
00db10
+	bflr	31
00db10
+	lbz	7,2(src)
00db10
+	stb	7,2(dst)
00db10
 	blr
00db10
 
00db10
-	.align  4
00db10
-1:	/* Copy 1 byte.  */
00db10
-	bf	31,0f
00db10
+	.align	4
00db10
+L(tail5):
00db10
+	bflr	31
00db10
+	lbz	6,4(src)
00db10
+	stb	6,4(dst)
00db10
+	blr
00db10
 
00db10
-	lbz	6,0(12)
00db10
-	stb	6,0(3)
00db10
-0:	/* Return original DST pointer.  */
00db10
-	ld	3,-16(1)
00db10
+	.align	4
00db10
+1:
00db10
+	bflr	31
00db10
+	lbz	6,0(src)
00db10
+	stb	6,0(dst)
00db10
+	/* Return original DST pointer.  */
00db10
 	blr
00db10
 
00db10
-	/* Handles copies of 0~8 bytes.  */
00db10
-	.align  4
00db10
+
00db10
+/* Handles copies of 0~8 bytes.  */
00db10
+	.align	4
00db10
 L(copy_LE_8):
00db10
-	bne	cr6,4f
00db10
+	bne	cr6,L(tail4)
00db10
 
00db10
 	/* Though we could've used ld/std here, they are still
00db10
 	slow for unaligned cases.  */
00db10
 
00db10
-	lwz	6,0(4)
00db10
-	lwz     7,4(4)
00db10
-	stw     6,0(3)
00db10
-	stw     7,4(3)
00db10
-	ld      3,-16(1)      /* Return original DST pointers.  */
00db10
+	lwz	6,0(src)
00db10
+	lwz	7,4(src)
00db10
+	stw	6,0(dst)
00db10
+	stw	7,4(dst)
00db10
 	blr
00db10
 
00db10
-	.align  4
00db10
-4:	/* Copies 4~7 bytes.  */
00db10
-	bf	29,2b
00db10
 
00db10
-	lwz	6,0(4)
00db10
-	stw     6,0(3)
00db10
-	bf      30,5f
00db10
-	lhz     7,4(4)
00db10
-	sth     7,4(3)
00db10
-	bf      31,0f
00db10
-	lbz     8,6(4)
00db10
-	stb     8,6(3)
00db10
-	ld	3,-16(1)
00db10
-	blr
00db10
-
00db10
-	.align  4
00db10
-5:	/* Copy 1 byte.  */
00db10
-	bf	31,0f
00db10
-
00db10
-	lbz	6,4(4)
00db10
-	stb	6,4(3)
00db10
-
00db10
-0:	/* Return original DST pointer.  */
00db10
-	ld	3,-16(1)
00db10
-	blr
00db10
-
00db10
-	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
00db10
-	SRC is not.  Use aligned quadword loads from SRC, shifted to realign
00db10
-	the data, allowing for aligned DST stores.  */
00db10
-	.align  4
00db10
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
00db10
+   SRC is not.  Use aligned quadword loads from SRC, shifted to realign
00db10
+   the data, allowing for aligned DST stores.  */
00db10
+	.align	4
00db10
 L(copy_GE_32_unaligned):
00db10
-	clrldi  0,0,60	      /* Number of bytes until the 1st
00db10
-			      quadword.  */
00db10
-	andi.   11,3,15       /* Check alignment of DST (against
00db10
-			      quadwords).  */
00db10
-	srdi    9,5,4	      /* Number of full quadwords remaining.  */
00db10
+	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
00db10
+#ifndef __LITTLE_ENDIAN__
00db10
+	andi.	10,3,15	      /* Check alignment of DST (against quadwords).  */
00db10
+#endif
00db10
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
00db10
 
00db10
 	beq	L(copy_GE_32_unaligned_cont)
00db10
 
00db10
-	/* SRC is not quadword aligned, get it aligned.  */
00db10
+	/* DST is not quadword aligned, get it aligned.  */
00db10
 
00db10
-	mtcrf   0x01,0
00db10
-	subf    31,0,5
00db10
+	mtocrf	0x01,0
00db10
+	subf	cnt,0,cnt
00db10
 
00db10
 	/* Vector instructions work best when proper alignment (16-bytes)
00db10
 	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
00db10
-1:	/* Copy 1 byte.  */
00db10
+1:
00db10
 	bf	31,2f
00db10
-
00db10
-	lbz	6,0(12)
00db10
-	addi    12,12,1
00db10
-	stb	6,0(3)
00db10
-	addi    3,3,1
00db10
-2:	/* Copy 2 bytes.  */
00db10
+	lbz	6,0(src)
00db10
+	addi	src,src,1
00db10
+	stb	6,0(dst)
00db10
+	addi	dst,dst,1
00db10
+2:
00db10
 	bf	30,4f
00db10
-
00db10
-	lhz     6,0(12)
00db10
-	addi    12,12,2
00db10
-	sth     6,0(3)
00db10
-	addi    3,3,2
00db10
-4:	/* Copy 4 bytes.  */
00db10
+	lhz	6,0(src)
00db10
+	addi	src,src,2
00db10
+	sth	6,0(dst)
00db10
+	addi	dst,dst,2
00db10
+4:
00db10
 	bf	29,8f
00db10
-
00db10
-	lwz     6,0(12)
00db10
-	addi    12,12,4
00db10
-	stw     6,0(3)
00db10
-	addi    3,3,4
00db10
-8:	/* Copy 8 bytes.  */
00db10
+	lwz	6,0(src)
00db10
+	addi	src,src,4
00db10
+	stw	6,0(dst)
00db10
+	addi	dst,dst,4
00db10
+8:
00db10
 	bf	28,0f
00db10
-
00db10
-	ld	6,0(12)
00db10
-	addi    12,12,8
00db10
-	std	6,0(3)
00db10
-	addi    3,3,8
00db10
+	ld	6,0(src)
00db10
+	addi	src,src,8
00db10
+	std	6,0(dst)
00db10
+	addi	dst,dst,8
00db10
 0:
00db10
-	clrldi  10,12,60      /* Check alignment of SRC.  */
00db10
-	srdi    9,31,4	      /* Number of full quadwords remaining.  */
00db10
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
00db10
 
00db10
 	/* The proper alignment is present, it is OK to copy the bytes now.  */
00db10
 L(copy_GE_32_unaligned_cont):
00db10
 
00db10
 	/* Setup two indexes to speed up the indexed vector operations.  */
00db10
-	clrldi  11,31,60
00db10
-	li      6,16	      /* Index for 16-bytes offsets.  */
00db10
+	clrldi	10,cnt,60
00db10
+	li	6,16	      /* Index for 16-bytes offsets.  */
00db10
 	li	7,32	      /* Index for 32-bytes offsets.  */
00db10
-	cmpldi  cr1,11,0
00db10
-	srdi    8,31,5	      /* Setup the loop counter.  */
00db10
-	mr      10,3
00db10
-	mr      11,12
00db10
-	mtcrf   0x01,9
00db10
-	cmpldi  cr6,9,1
00db10
-	lvsl    5,0,12
00db10
-	lvx     3,0,12
00db10
-	bf      31,L(setup_unaligned_loop)
00db10
-
00db10
-	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
00db10
-	lvx     4,12,6
00db10
-	vperm   6,3,4,5
00db10
-	addi    11,12,16
00db10
-	addi    10,3,16
00db10
-	stvx    6,0,3
00db10
+	cmpldi	cr1,10,0
00db10
+	srdi	8,cnt,5	      /* Setup the loop counter.  */
00db10
+	mtocrf	0x01,9
00db10
+	cmpldi	cr6,9,1
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	lvsr	5,0,src
00db10
+#else
00db10
+	lvsl	5,0,src
00db10
+#endif
00db10
+	lvx	3,0,src
00db10
+	li	0,0
00db10
+	bf	31,L(setup_unaligned_loop)
00db10
+
00db10
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
00db10
+	lvx	4,src,6
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	vperm	6,4,3,5
00db10
+#else
00db10
+	vperm	6,3,4,5
00db10
+#endif
00db10
+	addi	src,src,16
00db10
+	stvx	6,0,dst
00db10
+	addi	dst,dst,16
00db10
 	vor	3,4,4
00db10
+	clrrdi	0,src,60
00db10
 
00db10
 L(setup_unaligned_loop):
00db10
-	mtctr   8
00db10
-	ble     cr6,L(end_unaligned_loop)
00db10
+	mtctr	8
00db10
+	ble	cr6,L(end_unaligned_loop)
00db10
 
00db10
 	/* Copy 32 bytes at a time using vector instructions.  */
00db10
-	.align  4
00db10
+	.align	4
00db10
 L(unaligned_loop):
00db10
 
00db10
 	/* Note: vr6/vr10 may contain data that was already copied,
00db10
@@ -444,63 +385,56 @@
00db10
 	some portions again. This is faster than having unaligned
00db10
 	vector instructions though.  */
00db10
 
00db10
-	lvx	4,11,6	      /* vr4 = r11+16.  */
00db10
-	vperm   6,3,4,5	      /* Merge the correctly-aligned portions
00db10
-			      of vr3/vr4 into vr6.  */
00db10
-	lvx	3,11,7	      /* vr3 = r11+32.  */
00db10
-	vperm   10,4,3,5      /* Merge the correctly-aligned portions
00db10
-			      of vr3/vr4 into vr10.  */
00db10
-	addi    11,11,32
00db10
-	stvx    6,0,10
00db10
-	stvx    10,10,6
00db10
-	addi    10,10,32
00db10
-
00db10
+	lvx	4,src,6
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	vperm	6,4,3,5
00db10
+#else
00db10
+	vperm	6,3,4,5
00db10
+#endif
00db10
+	lvx	3,src,7
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	vperm	10,3,4,5
00db10
+#else
00db10
+	vperm	10,4,3,5
00db10
+#endif
00db10
+	addi	src,src,32
00db10
+	stvx	6,0,dst
00db10
+	stvx	10,dst,6
00db10
+	addi	dst,dst,32
00db10
 	bdnz	L(unaligned_loop)
00db10
 
00db10
-	.align  4
00db10
+	clrrdi	0,src,60
00db10
+
00db10
+	.align	4
00db10
 L(end_unaligned_loop):
00db10
 
00db10
 	/* Check for tail bytes.  */
00db10
-	rldicr  0,31,0,59
00db10
-	mtcrf   0x01,31
00db10
-	beq	cr1,0f
00db10
+	mtocrf	0x01,cnt
00db10
+	beqlr	cr1
00db10
 
00db10
-	add	3,3,0
00db10
-	add	12,12,0
00db10
+	add	src,src,0
00db10
 
00db10
 	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
00db10
-8:	/* Copy 8 bytes.  */
00db10
+	/* Copy 8 bytes.  */
00db10
 	bf	28,4f
00db10
-
00db10
-	lwz	6,0(12)
00db10
-	lwz	7,4(12)
00db10
-	addi    12,12,8
00db10
-	stw	6,0(3)
00db10
-	stw	7,4(3)
00db10
-	addi    3,3,8
00db10
-4:	/* Copy 4 bytes.  */
00db10
-	bf	29,2f
00db10
-
00db10
-	lwz	6,0(12)
00db10
-	addi    12,12,4
00db10
-	stw	6,0(3)
00db10
-	addi    3,3,4
00db10
-2:	/* Copy 2~3 bytes.  */
00db10
-	bf	30,1f
00db10
-
00db10
-	lhz	6,0(12)
00db10
-	addi    12,12,2
00db10
-	sth	6,0(3)
00db10
-	addi    3,3,2
00db10
-1:	/* Copy 1 byte.  */
00db10
-	bf	31,0f
00db10
-
00db10
-	lbz	6,0(12)
00db10
-	stb	6,0(3)
00db10
-0:	/* Return original DST pointer.  */
00db10
-	ld	31,-8(1)
00db10
-	ld	3,-16(1)
00db10
+	lwz	6,0(src)
00db10
+	lwz	7,4(src)
00db10
+	addi	src,src,8
00db10
+	stw	6,0(dst)
00db10
+	stw	7,4(dst)
00db10
+	addi	dst,dst,8
00db10
+4:	/* Copy 4~7 bytes.  */
00db10
+	bf	29,L(tail2)
00db10
+	lwz	6,0(src)
00db10
+	stw	6,0(dst)
00db10
+	bf	30,L(tail5)
00db10
+	lhz	7,4(src)
00db10
+	sth	7,4(dst)
00db10
+	bflr	31
00db10
+	lbz	8,6(src)
00db10
+	stb	8,6(dst)
00db10
+	/* Return original DST pointer.  */
00db10
 	blr
00db10
 
00db10
-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
00db10
+END_GEN_TB (memcpy,TB_TOCLESS)
00db10
 libc_hidden_builtin_def (memcpy)
00db10
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S
00db10
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S	2014-05-29 13:04:56.000000000 -0500
00db10
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S	2014-05-29 13:04:56.000000000 -0500
00db10
@@ -367,13 +367,21 @@
00db10
 	mr	11,12
00db10
 	mtcrf	0x01,9
00db10
 	cmpldi	cr6,9,1
00db10
-	lvsl	5,0,12
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	lvsr    5,0,12
00db10
+#else
00db10
+	lvsl    5,0,12
00db10
+#endif
00db10
 	lvx	3,0,12
00db10
 	bf	31,L(setup_unaligned_loop)
00db10
 
00db10
 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
00db10
 	lvx	4,12,6
00db10
-	vperm	6,3,4,5
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	vperm   6,4,3,5
00db10
+#else
00db10
+	vperm   6,3,4,5
00db10
+#endif
00db10
 	addi	11,12,16
00db10
 	addi	10,3,16
00db10
 	stvx	6,0,3
00db10
@@ -393,11 +401,17 @@
00db10
 	vector instructions though.  */
00db10
 
00db10
 	lvx	4,11,6	      /* vr4 = r11+16.  */
00db10
-	vperm	6,3,4,5	      /* Merge the correctly-aligned portions
00db10
-				 of vr3/vr4 into vr6.  */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	vperm   6,4,3,5
00db10
+#else
00db10
+	vperm   6,3,4,5
00db10
+#endif
00db10
 	lvx	3,11,7	      /* vr3 = r11+32.  */
00db10
-	vperm	10,4,3,5      /* Merge the correctly-aligned portions
00db10
-				 of vr3/vr4 into vr10.  */
00db10
+#ifdef __LITTLE_ENDIAN__
00db10
+	vperm   10,3,4,5
00db10
+#else
00db10
+	vperm   10,4,3,5
00db10
+#endif
00db10
 	addi	11,11,32
00db10
 	stvx	6,0,10
00db10
 	stvx	10,10,6