olga / rpms / glibc

Forked from rpms/glibc 5 years ago
Clone
ce426f
# commit 759cfef3ac4c07dba1ece0bbc1207e099348816d
ce426f
# Author: Alan Modra <amodra@gmail.com>
ce426f
# Date:   Sat Aug 17 18:47:22 2013 +0930
ce426f
# 
ce426f
#     PowerPC LE memcpy
ce426f
#     http://sourceware.org/ml/libc-alpha/2013-08/msg00103.html
ce426f
#     
ce426f
#     Little-endian support for memcpy.  I spent some time cleaning up the
ce426f
#     64-bit power7 memcpy, in order to avoid the extra alignment traps
ce426f
#     power7 takes for little-endian.  It probably would have been better
ce426f
#     to copy the linux kernel version of memcpy.
ce426f
#     
ce426f
#         * sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support.
ce426f
#         * sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.
ce426f
#         * sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
ce426f
#         * sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise.
ce426f
#         * sysdeps/powerpc/powerpc64/memcpy.S: Likewise.
ce426f
#         * sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise.
ce426f
#         * sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise.
ce426f
#         * sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
ce426f
#         * sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise.  Make better
ce426f
#         use of regs.  Use power7 mtocrf.  Tidy function tails.
ce426f
# 
ce426f
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S
ce426f
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S	2014-05-29 13:04:56.000000000 -0500
ce426f
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S	2014-05-29 13:04:56.000000000 -0500
ce426f
@@ -205,15 +205,28 @@
ce426f
     blt   cr6,5f
ce426f
     srwi  7,6,16
ce426f
     bgt	  cr6,3f
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    sth   7,0(3)
ce426f
+#else
ce426f
     sth   6,0(3)
ce426f
+#endif
ce426f
     b     7f
ce426f
     .align  4
ce426f
 3:
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    rotlwi 6,6,24
ce426f
+    stb   6,0(3)
ce426f
+    sth   7,1(3)
ce426f
+#else
ce426f
     stb   7,0(3)
ce426f
     sth   6,1(3)
ce426f
+#endif
ce426f
     b     7f
ce426f
     .align  4
ce426f
 5:
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    rotlwi 6,6,8
ce426f
+#endif
ce426f
     stb   6,0(3)
ce426f
 7:
ce426f
     cmplwi	cr1,10,16
ce426f
@@ -341,13 +354,23 @@
ce426f
     bf      30,1f
ce426f
 
ce426f
     /* there are at least two words to copy, so copy them */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srw   0,6,10
ce426f
+    slw   8,7,9
ce426f
+#else
ce426f
     slw   0,6,10  /* shift 1st src word to left align it in R0 */
ce426f
     srw   8,7,9   /* shift 2nd src word to right align it in R8 */
ce426f
+#endif
ce426f
     or    0,0,8   /* or them to get word to store */
ce426f
     lwz   6,8(5)  /* load the 3rd src word */
ce426f
     stw   0,0(4)  /* store the 1st dst word */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srw   0,7,10
ce426f
+    slw   8,6,9
ce426f
+#else
ce426f
     slw   0,7,10  /* now left align 2nd src word into R0 */
ce426f
     srw   8,6,9   /* shift 3rd src word to right align it in R8 */
ce426f
+#endif
ce426f
     or    0,0,8   /* or them to get word to store */
ce426f
     lwz   7,12(5)
ce426f
     stw   0,4(4)  /* store the 2nd dst word */
ce426f
@@ -355,8 +378,13 @@
ce426f
     addi  5,5,16
ce426f
     bf    31,4f
ce426f
     /* there is a third word to copy, so copy it */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srw   0,6,10
ce426f
+    slw   8,7,9
ce426f
+#else
ce426f
     slw   0,6,10  /* shift 3rd src word to left align it in R0 */
ce426f
     srw   8,7,9   /* shift 4th src word to right align it in R8 */
ce426f
+#endif
ce426f
     or    0,0,8   /* or them to get word to store */
ce426f
     stw   0,0(4)  /* store 3rd dst word */
ce426f
     mr    6,7
ce426f
@@ -366,8 +394,13 @@
ce426f
     b     4f
ce426f
     .align 4
ce426f
 1:
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srw     0,6,10
ce426f
+    slw     8,7,9
ce426f
+#else
ce426f
     slw     0,6,10  /* shift 1st src word to left align it in R0 */
ce426f
     srw     8,7,9   /* shift 2nd src word to right align it in R8 */
ce426f
+#endif
ce426f
     addi  5,5,8
ce426f
     or    0,0,8   /* or them to get word to store */
ce426f
     bf    31,4f
ce426f
@@ -380,23 +413,43 @@
ce426f
     .align  4
ce426f
 4:
ce426f
     /* copy 16 bytes at a time */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srw   0,6,10
ce426f
+    slw   8,7,9
ce426f
+#else
ce426f
     slw   0,6,10
ce426f
     srw   8,7,9
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     lwz   6,0(5)
ce426f
     stw   0,0(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srw   0,7,10
ce426f
+    slw   8,6,9
ce426f
+#else
ce426f
     slw   0,7,10
ce426f
     srw   8,6,9
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     lwz   7,4(5)
ce426f
     stw   0,4(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srw   0,6,10
ce426f
+    slw   8,7,9
ce426f
+#else
ce426f
     slw   0,6,10
ce426f
     srw   8,7,9
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     lwz   6,8(5)
ce426f
     stw   0,8(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srw   0,7,10
ce426f
+    slw   8,6,9
ce426f
+#else
ce426f
     slw   0,7,10
ce426f
     srw   8,6,9
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     lwz   7,12(5)
ce426f
     stw   0,12(4)
ce426f
@@ -405,8 +458,13 @@
ce426f
     bdnz+ 4b
ce426f
 8:
ce426f
     /* calculate and store the final word */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srw   0,6,10
ce426f
+    slw   8,7,9
ce426f
+#else
ce426f
     slw   0,6,10
ce426f
     srw   8,7,9
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     stw   0,0(4)
ce426f
 3:
ce426f
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S
ce426f
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S	2014-05-29 13:04:56.000000000 -0500
ce426f
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S	2014-05-29 13:04:56.000000000 -0500
ce426f
@@ -221,15 +221,28 @@
ce426f
     blt   cr6,5f
ce426f
     srwi  7,6,16
ce426f
     bgt	  cr6,3f
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    sth   7,0(3)
ce426f
+#else
ce426f
     sth   6,0(3)
ce426f
+#endif
ce426f
     b     7f
ce426f
     .align  4
ce426f
 3:
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    rotlwi 6,6,24
ce426f
+    stb   6,0(3)
ce426f
+    sth   7,1(3)
ce426f
+#else
ce426f
     stb   7,0(3)
ce426f
     sth   6,1(3)
ce426f
+#endif
ce426f
     b     7f
ce426f
     .align  4
ce426f
 5:
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    rotlwi 6,6,8
ce426f
+#endif
ce426f
     stb   6,0(3)
ce426f
 7:
ce426f
     cmplwi	cr1,10,16
ce426f
@@ -579,7 +592,11 @@
ce426f
     lwz     6,-1(4)
ce426f
     cmplwi  cr6,31,4
ce426f
     srwi    8,31,5    /* calculate the 32 byte loop count */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srwi    6,6,8
ce426f
+#else
ce426f
     slwi    6,6,8
ce426f
+#endif
ce426f
     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
ce426f
     blt     cr5,L(wdu1_32tail)
ce426f
     mtctr   8
ce426f
@@ -587,8 +604,12 @@
ce426f
 
ce426f
     lwz   8,3(4)
ce426f
     lwz   7,4(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    rldimi 6,8,24,32
ce426f
+#else
ce426f
 /*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
ce426f
     rlwimi 6,8,8,(32-8),31
ce426f
+#endif
ce426f
     b      L(wdu1_loop32x)
ce426f
     .align  4
ce426f
 L(wdu1_loop32):
ce426f
@@ -597,8 +618,12 @@
ce426f
     lwz   7,4(4)
ce426f
     stw   10,-8(3)
ce426f
     stw   11,-4(3)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    rldimi 6,8,24,32
ce426f
+#else
ce426f
 /*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
ce426f
     rlwimi 6,8,8,(32-8),31
ce426f
+#endif
ce426f
 L(wdu1_loop32x):
ce426f
     lwz   10,8(4)
ce426f
     lwz   11,12(4)
ce426f
@@ -615,7 +640,11 @@
ce426f
     stw   6,16(3)
ce426f
     stw   7,20(3)
ce426f
     addi  3,3,32
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srwi  6,8,8
ce426f
+#else
ce426f
     slwi  6,8,8
ce426f
+#endif
ce426f
     bdnz+ L(wdu1_loop32)
ce426f
     stw   10,-8(3)
ce426f
     stw   11,-4(3)
ce426f
@@ -626,8 +655,12 @@
ce426f
     blt     cr6,L(wdu_4tail)
ce426f
     /* calculate and store the final word */
ce426f
     lwz   8,3(4)
ce426f
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    rldimi 6,8,24,32
ce426f
+#else
ce426f
+/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8  */
ce426f
     rlwimi 6,8,8,(32-8),31
ce426f
+#endif
ce426f
     b     L(wdu_32tailx)
ce426f
 
ce426f
 L(wdu2_32):
ce426f
@@ -635,7 +668,11 @@
ce426f
     lwz     6,-2(4)
ce426f
     cmplwi  cr6,31,4
ce426f
     srwi    8,31,5    /* calculate the 32 byte loop count */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srwi    6,6,16
ce426f
+#else
ce426f
     slwi    6,6,16
ce426f
+#endif
ce426f
     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
ce426f
     blt     cr5,L(wdu2_32tail)
ce426f
     mtctr   8
ce426f
@@ -643,8 +680,11 @@
ce426f
 
ce426f
     lwz   8,2(4)
ce426f
     lwz   7,4(4)
ce426f
-/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    rldimi 6,8,16,32
ce426f
+#else
ce426f
     rlwimi 6,8,16,(32-16),31
ce426f
+#endif
ce426f
     b      L(wdu2_loop32x)
ce426f
     .align  4
ce426f
 L(wdu2_loop32):
ce426f
@@ -653,8 +693,11 @@
ce426f
     lwz   7,4(4)
ce426f
     stw   10,-8(3)
ce426f
     stw   11,-4(3)
ce426f
-/*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    rldimi 6,8,16,32
ce426f
+#else
ce426f
     rlwimi 6,8,16,(32-16),31
ce426f
+#endif
ce426f
 L(wdu2_loop32x):
ce426f
     lwz   10,8(4)
ce426f
     lwz   11,12(4)
ce426f
@@ -672,7 +715,11 @@
ce426f
     stw   6,16(3)
ce426f
     stw   7,20(3)
ce426f
     addi  3,3,32
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srwi  6,8,16
ce426f
+#else
ce426f
     slwi  6,8,16
ce426f
+#endif
ce426f
     bdnz+ L(wdu2_loop32)
ce426f
     stw   10,-8(3)
ce426f
     stw   11,-4(3)
ce426f
@@ -683,8 +730,11 @@
ce426f
     blt     cr6,L(wdu_4tail)
ce426f
     /* calculate and store the final word */
ce426f
     lwz   8,2(4)
ce426f
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    rldimi 6,8,16,32
ce426f
+#else
ce426f
     rlwimi 6,8,16,(32-16),31
ce426f
+#endif
ce426f
     b     L(wdu_32tailx)
ce426f
 
ce426f
 L(wdu3_32):
ce426f
@@ -692,7 +742,11 @@
ce426f
     lwz     6,-3(4)
ce426f
     cmplwi  cr6,31,4
ce426f
     srwi    8,31,5    /* calculate the 32 byte loop count */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srwi    6,6,24
ce426f
+#else
ce426f
     slwi    6,6,24
ce426f
+#endif
ce426f
     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
ce426f
     blt     cr5,L(wdu3_32tail)
ce426f
     mtctr   8
ce426f
@@ -700,8 +754,11 @@
ce426f
 
ce426f
     lwz   8,1(4)
ce426f
     lwz   7,4(4)
ce426f
-/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    rldimi 6,8,8,32
ce426f
+#else
ce426f
     rlwimi 6,8,24,(32-24),31
ce426f
+#endif
ce426f
     b      L(wdu3_loop32x)
ce426f
     .align  4
ce426f
 L(wdu3_loop32):
ce426f
@@ -710,8 +767,11 @@
ce426f
     lwz   7,4(4)
ce426f
     stw   10,-8(3)
ce426f
     stw   11,-4(3)
ce426f
-/*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    rldimi 6,8,8,32
ce426f
+#else
ce426f
     rlwimi 6,8,24,(32-24),31
ce426f
+#endif
ce426f
 L(wdu3_loop32x):
ce426f
     lwz   10,8(4)
ce426f
     lwz   11,12(4)
ce426f
@@ -728,7 +788,11 @@
ce426f
     stw   6,16(3)
ce426f
     stw   7,20(3)
ce426f
     addi  3,3,32
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srwi  6,8,24
ce426f
+#else
ce426f
     slwi  6,8,24
ce426f
+#endif
ce426f
     bdnz+ L(wdu3_loop32)
ce426f
     stw   10,-8(3)
ce426f
     stw   11,-4(3)
ce426f
@@ -739,8 +803,11 @@
ce426f
     blt     cr6,L(wdu_4tail)
ce426f
     /* calculate and store the final word */
ce426f
     lwz   8,1(4)
ce426f
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    rldimi 6,8,8,32
ce426f
+#else
ce426f
     rlwimi 6,8,24,(32-24),31
ce426f
+#endif
ce426f
     b     L(wdu_32tailx)
ce426f
     .align  4
ce426f
 L(wdu_32tailx):
ce426f
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S
ce426f
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S	2014-05-29 13:04:56.000000000 -0500
ce426f
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S	2014-05-29 13:04:56.000000000 -0500
ce426f
@@ -385,7 +385,7 @@
ce426f
 
ce426f
 	beq    L(copy_GE_32_unaligned_cont)
ce426f
 
ce426f
-	/* SRC is not quadword aligned, get it aligned.  */
ce426f
+	/* DST is not quadword aligned, get it aligned.  */
ce426f
 
ce426f
 	mtcrf   0x01,0
ce426f
 	subf    31,0,5
ce426f
@@ -437,13 +437,21 @@
ce426f
 	mr      11,12
ce426f
 	mtcrf   0x01,9
ce426f
 	cmplwi  cr6,9,1
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	lvsr    5,0,12
ce426f
+#else
ce426f
 	lvsl    5,0,12
ce426f
+#endif
ce426f
 	lvx     3,0,12
ce426f
 	bf      31,L(setup_unaligned_loop)
ce426f
 
ce426f
 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
ce426f
 	lvx     4,12,6
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	vperm   6,4,3,5
ce426f
+#else
ce426f
 	vperm   6,3,4,5
ce426f
+#endif
ce426f
 	addi    11,12,16
ce426f
 	addi    10,3,16
ce426f
 	stvx    6,0,3
ce426f
@@ -463,11 +471,17 @@
ce426f
 	vector instructions though.  */
ce426f
 
ce426f
 	lvx	4,11,6	      /* vr4 = r11+16.  */
ce426f
-	vperm   6,3,4,5	      /* Merge the correctly-aligned portions
ce426f
-			      of vr3/vr4 into vr6.  */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	vperm   6,4,3,5
ce426f
+#else
ce426f
+	vperm   6,3,4,5
ce426f
+#endif
ce426f
 	lvx	3,11,7	      /* vr3 = r11+32.  */
ce426f
-	vperm   10,4,3,5      /* Merge the correctly-aligned portions
ce426f
-			      of vr3/vr4 into vr10.  */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	vperm   10,3,4,5
ce426f
+#else
ce426f
+	vperm   10,4,3,5
ce426f
+#endif
ce426f
 	addi    11,11,32
ce426f
 	stvx    6,0,10
ce426f
 	stvx    10,10,6
ce426f
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S
ce426f
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S	2014-05-29 13:04:56.000000000 -0500
ce426f
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S	2014-05-29 13:04:56.000000000 -0500
ce426f
@@ -327,7 +327,7 @@
ce426f
 
ce426f
 	beq	L(copy_GE_32_unaligned_cont)
ce426f
 
ce426f
-	/* SRC is not quadword aligned, get it aligned.  */
ce426f
+	/* DST is not quadword aligned, get it aligned.  */
ce426f
 
ce426f
 	mtcrf	0x01,0
ce426f
 	subf	31,0,5
ce426f
@@ -379,13 +379,21 @@
ce426f
 	mr	11,12
ce426f
 	mtcrf	0x01,9
ce426f
 	cmplwi	cr6,9,1
ce426f
-	lvsl	5,0,12
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	lvsr    5,0,12
ce426f
+#else
ce426f
+	lvsl    5,0,12
ce426f
+#endif
ce426f
 	lvx	3,0,12
ce426f
 	bf	31,L(setup_unaligned_loop)
ce426f
 
ce426f
 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
ce426f
 	lvx	4,12,6
ce426f
-	vperm	6,3,4,5
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	vperm   6,4,3,5
ce426f
+#else
ce426f
+	vperm   6,3,4,5
ce426f
+#endif
ce426f
 	addi	11,12,16
ce426f
 	addi	10,3,16
ce426f
 	stvx	6,0,3
ce426f
@@ -405,11 +413,17 @@
ce426f
 	vector instructions though.  */
ce426f
 
ce426f
 	lvx	4,11,6	      /* vr4 = r11+16.  */
ce426f
-	vperm	6,3,4,5	      /* Merge the correctly-aligned portions
ce426f
-				 of vr3/vr4 into vr6.  */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	vperm   6,4,3,5
ce426f
+#else
ce426f
+	vperm   6,3,4,5
ce426f
+#endif
ce426f
 	lvx	3,11,7	      /* vr3 = r11+32.  */
ce426f
-	vperm	10,4,3,5      /* Merge the correctly-aligned portions
ce426f
-				 of vr3/vr4 into vr10.  */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	vperm   10,3,4,5
ce426f
+#else
ce426f
+	vperm   10,4,3,5
ce426f
+#endif
ce426f
 	addi	11,11,32
ce426f
 	stvx	6,0,10
ce426f
 	stvx	10,10,6
ce426f
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S
ce426f
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S	2014-05-29 13:04:56.000000000 -0500
ce426f
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S	2014-05-29 13:04:56.000000000 -0500
ce426f
@@ -214,15 +214,28 @@
ce426f
     blt   cr6,5f
ce426f
     srdi  7,6,16
ce426f
     bgt	  cr6,3f
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    sth   7,0(3)
ce426f
+#else
ce426f
     sth   6,0(3)
ce426f
+#endif
ce426f
     b     7f
ce426f
     .align  4
ce426f
 3:
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    rotlwi 6,6,24
ce426f
+    stb   6,0(3)
ce426f
+    sth   7,1(3)
ce426f
+#else
ce426f
     stb   7,0(3)
ce426f
     sth   6,1(3)
ce426f
+#endif
ce426f
     b     7f
ce426f
     .align  4
ce426f
 5:
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    rotlwi 6,6,8
ce426f
+#endif
ce426f
     stb   6,0(3)
ce426f
 7:
ce426f
     cmpldi	cr1,10,16
ce426f
@@ -330,7 +343,11 @@
ce426f
     ld    7,8(5)
ce426f
     subfic  9,10,64
ce426f
     beq   2f
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srd   0,6,10
ce426f
+#else
ce426f
     sld   0,6,10
ce426f
+#endif
ce426f
     cmpldi  11,1
ce426f
     mr    6,7
ce426f
     addi  4,4,-8
ce426f
@@ -338,15 +355,25 @@
ce426f
     b     1f
ce426f
 2:  addi  5,5,8
ce426f
     .align  4
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+0:  srd   0,6,10
ce426f
+    sld   8,7,9
ce426f
+#else
ce426f
 0:  sld   0,6,10
ce426f
     srd   8,7,9
ce426f
+#endif
ce426f
     cmpldi  11,2
ce426f
     ld    6,8(5)
ce426f
     or    0,0,8
ce426f
     addi  11,11,-2
ce426f
     std   0,0(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srd   0,7,10
ce426f
+1:  sld   8,6,9
ce426f
+#else
ce426f
     sld   0,7,10
ce426f
 1:  srd   8,6,9
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     beq   8f
ce426f
     ld    7,16(5)
ce426f
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S
ce426f
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S	2014-05-29 13:04:56.000000000 -0500
ce426f
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S	2014-05-29 13:05:51.000000000 -0500
ce426f
@@ -1,5 +1,5 @@
ce426f
 /* Optimized memcpy implementation for PowerPC64.
ce426f
-   Copyright (C) 2003, 2006, 2011 Free Software Foundation, Inc.
ce426f
+   Copyright (C) 2003-2014 Free Software Foundation, Inc.
ce426f
    This file is part of the GNU C Library.
ce426f
 
ce426f
    The GNU C Library is free software; you can redistribute it and/or
ce426f
@@ -17,26 +17,24 @@
ce426f
    <http://www.gnu.org/licenses/>.  */
ce426f
 
ce426f
 #include <sysdep.h>
ce426f
-#include <bp-sym.h>
ce426f
-#include <bp-asm.h>
ce426f
 
ce426f
 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
ce426f
    Returns 'dst'.
ce426f
 
ce426f
-   Memcpy handles short copies (< 32-bytes) using a binary move blocks 
ce426f
-   (no loops) of lwz/stw.  The tail (remaining 1-3) bytes is handled 
ce426f
-   with the appropriate combination of byte and halfword load/stores. 
ce426f
-   There is minimal effort to optimize the alignment of short moves.  
ce426f
+   Memcpy handles short copies (< 32-bytes) using a binary move blocks
ce426f
+   (no loops) of lwz/stw.  The tail (remaining 1-3) bytes is handled
ce426f
+   with the appropriate combination of byte and halfword load/stores.
ce426f
+   There is minimal effort to optimize the alignment of short moves.
ce426f
    The 64-bit implementations of POWER3 and POWER4 do a reasonable job
ce426f
-   of handling unligned load/stores that do not cross 32-byte boundries.
ce426f
+   of handling unaligned load/stores that do not cross 32-byte boundaries.
ce426f
 
ce426f
    Longer moves (>= 32-bytes) justify the effort to get at least the
ce426f
    destination doubleword (8-byte) aligned.  Further optimization is
ce426f
-   posible when both source and destination are doubleword aligned.
ce426f
+   possible when both source and destination are doubleword aligned.
ce426f
    Each case has a optimized unrolled loop.   */
ce426f
 
ce426f
 	.machine power4
ce426f
-EALIGN (BP_SYM (memcpy), 5, 0)
ce426f
+EALIGN (memcpy, 5, 0)
ce426f
 	CALL_MCOUNT 3
ce426f
 
ce426f
     cmpldi cr1,5,31
ce426f
@@ -44,20 +42,20 @@
ce426f
     std   3,-16(1)
ce426f
     std   31,-8(1)
ce426f
     cfi_offset(31,-8)
ce426f
-    andi. 11,3,7	/* check alignement of dst.  */
ce426f
+    andi. 11,3,7	/* check alignment of dst.  */
ce426f
     clrldi 0,0,61	/* Number of bytes until the 1st doubleword of dst.  */
ce426f
-    clrldi 10,4,61	/* check alignement of src.  */
ce426f
+    clrldi 10,4,61	/* check alignment of src.  */
ce426f
     cmpldi cr6,5,8
ce426f
     ble-  cr1,.L2	/* If move < 32 bytes use short move code.  */
ce426f
-    cmpld cr6,10,11     
ce426f
+    cmpld cr6,10,11
ce426f
     mr    12,4
ce426f
     srdi  9,5,3		/* Number of full double words remaining.  */
ce426f
     mtcrf 0x01,0
ce426f
     mr    31,5
ce426f
     beq   .L0
ce426f
-  
ce426f
+
ce426f
     subf  31,0,5
ce426f
-  /* Move 0-7 bytes as needed to get the destination doubleword alligned.  */
ce426f
+  /* Move 0-7 bytes as needed to get the destination doubleword aligned.  */
ce426f
 1:  bf    31,2f
ce426f
     lbz   6,0(12)
ce426f
     addi  12,12,1
ce426f
@@ -74,17 +72,17 @@
ce426f
     stw   6,0(3)
ce426f
     addi  3,3,4
ce426f
 0:
ce426f
-    clrldi 10,12,61	/* check alignement of src again.  */     
ce426f
+    clrldi 10,12,61	/* check alignment of src again.  */
ce426f
     srdi  9,31,3	/* Number of full double words remaining.  */
ce426f
-    
ce426f
-  /* Copy doublewords from source to destination, assumpting the
ce426f
+
ce426f
+  /* Copy doublewords from source to destination, assuming the
ce426f
      destination is aligned on a doubleword boundary.
ce426f
 
ce426f
      At this point we know there are at least 25 bytes left (32-7) to copy.
ce426f
-     The next step is to determine if the source is also doubleword aligned. 
ce426f
+     The next step is to determine if the source is also doubleword aligned.
ce426f
      If not branch to the unaligned move code at .L6. which uses
ce426f
      a load, shift, store strategy.
ce426f
-     
ce426f
+
ce426f
      Otherwise source and destination are doubleword aligned, and we can
ce426f
      the optimized doubleword copy loop.  */
ce426f
 .L0:
ce426f
@@ -97,14 +95,14 @@
ce426f
      Use a unrolled loop to copy 4 doubleword (32-bytes) per iteration.
ce426f
      If the copy is not an exact multiple of 32 bytes, 1-3
ce426f
      doublewords are copied as needed to set up the main loop.  After
ce426f
-     the main loop exits there may be a tail of 1-7 bytes. These byte are 
ce426f
+     the main loop exits there may be a tail of 1-7 bytes. These byte are
ce426f
      copied a word/halfword/byte at a time as needed to preserve alignment.  */
ce426f
 
ce426f
     srdi  8,31,5
ce426f
     cmpldi	cr1,9,4
ce426f
     cmpldi	cr6,11,0
ce426f
     mr    11,12
ce426f
-    
ce426f
+
ce426f
     bf    30,1f
ce426f
     ld    6,0(12)
ce426f
     ld    7,8(12)
ce426f
@@ -115,7 +113,7 @@
ce426f
     addi  10,3,16
ce426f
     bf    31,4f
ce426f
     ld    0,16(12)
ce426f
-    std   0,16(3)    
ce426f
+    std   0,16(3)
ce426f
     blt   cr1,3f
ce426f
     addi  11,12,24
ce426f
     addi  10,3,24
ce426f
@@ -129,7 +127,7 @@
ce426f
     addi  11,12,8
ce426f
     std   6,0(3)
ce426f
     addi  10,3,8
ce426f
-    
ce426f
+
ce426f
     .align  4
ce426f
 4:
ce426f
     ld    6,0(11)
ce426f
@@ -144,7 +142,7 @@
ce426f
     std   0,24(10)
ce426f
     addi  10,10,32
ce426f
     bdnz  4b
ce426f
-3:  
ce426f
+3:
ce426f
 
ce426f
     rldicr 0,31,0,60
ce426f
     mtcrf 0x01,31
ce426f
@@ -152,9 +150,9 @@
ce426f
 .L9:
ce426f
     add   3,3,0
ce426f
     add   12,12,0
ce426f
-    
ce426f
+
ce426f
 /*  At this point we have a tail of 0-7 bytes and we know that the
ce426f
-    destiniation is double word aligned.  */
ce426f
+    destination is double word aligned.  */
ce426f
 4:  bf    29,2f
ce426f
     lwz   6,0(12)
ce426f
     addi  12,12,4
ce426f
@@ -173,29 +171,29 @@
ce426f
     ld 31,-8(1)
ce426f
     ld 3,-16(1)
ce426f
     blr
ce426f
-       
ce426f
-/* Copy up to 31 bytes.  This divided into two cases 0-8 bytes and 9-31 
ce426f
-   bytes.  Each case is handled without loops, using binary (1,2,4,8) 
ce426f
-   tests.  
ce426f
-   
ce426f
+
ce426f
+/* Copy up to 31 bytes.  This divided into two cases 0-8 bytes and 9-31
ce426f
+   bytes.  Each case is handled without loops, using binary (1,2,4,8)
ce426f
+   tests.
ce426f
+
ce426f
    In the short (0-8 byte) case no attempt is made to force alignment
ce426f
-   of either source or destination.  The hardware will handle the 
ce426f
-   unaligned load/stores with small delays for crossing 32- 64-byte, and 
ce426f
+   of either source or destination.  The hardware will handle the
ce426f
+   unaligned load/stores with small delays for crossing 32- 64-byte, and
ce426f
    4096-byte boundaries. Since these short moves are unlikely to be
ce426f
-   unaligned or cross these boundaries, the overhead to force 
ce426f
+   unaligned or cross these boundaries, the overhead to force
ce426f
    alignment is not justified.
ce426f
-   
ce426f
+
ce426f
    The longer (9-31 byte) move is more likely to cross 32- or 64-byte
ce426f
    boundaries.  Since only loads are sensitive to the 32-/64-byte
ce426f
-   boundaries it is more important to align the source then the 
ce426f
+   boundaries it is more important to align the source then the
ce426f
    destination.  If the source is not already word aligned, we first
ce426f
-   move 1-3 bytes as needed.  Since we are only word aligned we don't 
ce426f
-   use double word load/stores to insure that all loads are aligned. 
ce426f
+   move 1-3 bytes as needed.  Since we are only word aligned we don't
ce426f
+   use double word load/stores to insure that all loads are aligned.
ce426f
    While the destination and stores may still be unaligned, this
ce426f
    is only an issue for page (4096 byte boundary) crossing, which
ce426f
    should be rare for these short moves.  The hardware handles this
ce426f
-   case automatically with a small delay.  */ 
ce426f
-   
ce426f
+   case automatically with a small delay.  */
ce426f
+
ce426f
     .align  4
ce426f
 .L2:
ce426f
     mtcrf 0x01,5
ce426f
@@ -216,15 +214,28 @@
ce426f
     blt   cr6,5f
ce426f
     srdi  7,6,16
ce426f
     bgt	  cr6,3f
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    sth   7,0(3)
ce426f
+#else
ce426f
     sth   6,0(3)
ce426f
+#endif
ce426f
     b     7f
ce426f
     .align  4
ce426f
 3:
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    rotlwi 6,6,24
ce426f
+    stb   6,0(3)
ce426f
+    sth   7,1(3)
ce426f
+#else
ce426f
     stb   7,0(3)
ce426f
     sth   6,1(3)
ce426f
+#endif
ce426f
     b     7f
ce426f
     .align  4
ce426f
 5:
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    rotlwi 6,6,8
ce426f
+#endif
ce426f
     stb   6,0(3)
ce426f
 7:
ce426f
     cmpldi	cr1,10,16
ce426f
@@ -258,11 +269,11 @@
ce426f
     lwz   6,0(12)
ce426f
     addi  12,12,4
ce426f
     stw   6,0(3)
ce426f
-    addi  3,3,4    
ce426f
+    addi  3,3,4
ce426f
 2:  /* Move 2-3 bytes.  */
ce426f
     bf    30,1f
ce426f
     lhz   6,0(12)
ce426f
-    sth   6,0(3) 
ce426f
+    sth   6,0(3)
ce426f
     bf    31,0f
ce426f
     lbz   7,2(12)
ce426f
     stb   7,2(3)
ce426f
@@ -283,8 +294,8 @@
ce426f
     mr    12,4
ce426f
     bne   cr6,4f
ce426f
 /* Would have liked to use use ld/std here but the 630 processors are
ce426f
-   slow for load/store doubles that are not at least word aligned.  
ce426f
-   Unaligned Load/Store word execute with only a 1 cycle penaltity.  */
ce426f
+   slow for load/store doubles that are not at least word aligned.
ce426f
+   Unaligned Load/Store word execute with only a 1 cycle penalty.  */
ce426f
     lwz   6,0(4)
ce426f
     lwz   7,4(4)
ce426f
     stw   6,0(3)
ce426f
@@ -299,14 +310,14 @@
ce426f
 6:
ce426f
     bf    30,5f
ce426f
     lhz   7,4(4)
ce426f
-    sth   7,4(3) 
ce426f
+    sth   7,4(3)
ce426f
     bf    31,0f
ce426f
     lbz   8,6(4)
ce426f
     stb   8,6(3)
ce426f
     ld 3,-16(1)
ce426f
     blr
ce426f
     .align  4
ce426f
-5:  
ce426f
+5:
ce426f
     bf    31,0f
ce426f
     lbz   6,4(4)
ce426f
     stb   6,4(3)
ce426f
@@ -336,13 +347,23 @@
ce426f
     bf      30,1f
ce426f
 
ce426f
     /* there are at least two DWs to copy */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srd     0,6,10
ce426f
+    sld     8,7,9
ce426f
+#else
ce426f
     sld     0,6,10
ce426f
     srd     8,7,9
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     ld      6,16(5)
ce426f
     std     0,0(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srd     0,7,10
ce426f
+    sld     8,6,9
ce426f
+#else
ce426f
     sld     0,7,10
ce426f
     srd     8,6,9
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     ld      7,24(5)
ce426f
     std     0,8(4)
ce426f
@@ -351,8 +372,13 @@
ce426f
     blt     cr6,8f  /* if total DWs = 3, then bypass loop */
ce426f
     bf      31,4f
ce426f
     /* there is a third DW to copy */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srd     0,6,10
ce426f
+    sld     8,7,9
ce426f
+#else
ce426f
     sld     0,6,10
ce426f
     srd     8,7,9
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     std     0,0(4)
ce426f
     mr      6,7
ce426f
@@ -363,8 +389,13 @@
ce426f
     b       4f
ce426f
     .align 4
ce426f
 1:
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srd     0,6,10
ce426f
+    sld     8,7,9
ce426f
+#else
ce426f
     sld     0,6,10
ce426f
     srd     8,7,9
ce426f
+#endif
ce426f
     addi    5,5,16
ce426f
     or      0,0,8
ce426f
     bf      31,4f
ce426f
@@ -375,23 +406,44 @@
ce426f
     addi    4,4,8
ce426f
     .align 4
ce426f
 /* copy 32 bytes at a time */
ce426f
-4:  sld   0,6,10
ce426f
+4:
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srd   0,6,10
ce426f
+    sld   8,7,9
ce426f
+#else
ce426f
+    sld   0,6,10
ce426f
     srd   8,7,9
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    6,0(5)
ce426f
     std   0,0(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srd   0,7,10
ce426f
+    sld   8,6,9
ce426f
+#else
ce426f
     sld   0,7,10
ce426f
     srd   8,6,9
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    7,8(5)
ce426f
     std   0,8(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srd   0,6,10
ce426f
+    sld   8,7,9
ce426f
+#else
ce426f
     sld   0,6,10
ce426f
     srd   8,7,9
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    6,16(5)
ce426f
     std   0,16(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srd   0,7,10
ce426f
+    sld   8,6,9
ce426f
+#else
ce426f
     sld   0,7,10
ce426f
     srd   8,6,9
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    7,24(5)
ce426f
     std   0,24(4)
ce426f
@@ -401,9 +453,14 @@
ce426f
     .align 4
ce426f
 8:
ce426f
     /* calculate and store the final DW */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srd   0,6,10
ce426f
+    sld   8,7,9
ce426f
+#else
ce426f
     sld   0,6,10
ce426f
     srd   8,7,9
ce426f
-    or    0,0,8  
ce426f
+#endif
ce426f
+    or    0,0,8
ce426f
     std   0,0(4)
ce426f
 3:
ce426f
     rldicr 0,31,0,60
ce426f
@@ -413,5 +470,5 @@
ce426f
     ld 31,-8(1)
ce426f
     ld 3,-16(1)
ce426f
     blr
ce426f
-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
ce426f
+END_GEN_TB (memcpy,TB_TOCLESS)
ce426f
 libc_hidden_builtin_def (memcpy)
ce426f
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S
ce426f
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S	2014-05-29 13:04:56.000000000 -0500
ce426f
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S	2014-05-29 13:05:27.000000000 -0500
ce426f
@@ -1,5 +1,5 @@
ce426f
 /* Optimized memcpy implementation for PowerPC64.
ce426f
-   Copyright (C) 2003, 2006, 2007, 2011 Free Software Foundation, Inc.
ce426f
+   Copyright (C) 2003-2014 Free Software Foundation, Inc.
ce426f
    This file is part of the GNU C Library.
ce426f
 
ce426f
    The GNU C Library is free software; you can redistribute it and/or
ce426f
@@ -17,52 +17,50 @@
ce426f
    <http://www.gnu.org/licenses/>.  */
ce426f
 
ce426f
 #include <sysdep.h>
ce426f
-#include <bp-sym.h>
ce426f
-#include <bp-asm.h>
ce426f
 
ce426f
 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
ce426f
    Returns 'dst'.
ce426f
 
ce426f
-   Memcpy handles short copies (< 32-bytes) using a binary move blocks 
ce426f
-   (no loops) of lwz/stw.  The tail (remaining 1-3) bytes is handled 
ce426f
-   with the appropriate combination of byte and halfword load/stores. 
ce426f
-   There is minimal effort to optimize the alignment of short moves.  
ce426f
+   Memcpy handles short copies (< 32-bytes) using binary move blocks
ce426f
+   (no loops) of lwz/stw.  The tail (remaining 1-3) bytes is handled
ce426f
+   with the appropriate combination of byte and halfword load/stores.
ce426f
+   There is minimal effort to optimize the alignment of short moves.
ce426f
    The 64-bit implementations of POWER3 and POWER4 do a reasonable job
ce426f
-   of handling unligned load/stores that do not cross 32-byte boundries.
ce426f
+   of handling unaligned load/stores that do not cross 32-byte boundaries.
ce426f
 
ce426f
    Longer moves (>= 32-bytes) justify the effort to get at least the
ce426f
    destination doubleword (8-byte) aligned.  Further optimization is
ce426f
-   posible when both source and destination are doubleword aligned.
ce426f
-   Each case has a optimized unrolled loop.  
ce426f
-     
ce426f
-   For POWER6 unaligned loads will take a 20+ cycle hicup for any
ce426f
+   possible when both source and destination are doubleword aligned.
ce426f
+   Each case has an optimized unrolled loop.
ce426f
+
ce426f
+   For POWER6 unaligned loads will take a 20+ cycle hiccup for any
ce426f
    L1 cache miss that crosses a 32- or 128-byte boundary.  Store
ce426f
-   is more forgiving and does not take a hicup until page or 
ce426f
-   segment boundaries.  So we require doubleword alignment for 
ce426f
+   is more forgiving and does not take a hiccup until page or
ce426f
+   segment boundaries.  So we require doubleword alignment for
ce426f
    the source but may take a risk and only require word alignment
ce426f
    for the destination.  */
ce426f
 
ce426f
 	.machine	"power6"
ce426f
-EALIGN (BP_SYM (memcpy), 7, 0)
ce426f
+EALIGN (memcpy, 7, 0)
ce426f
 	CALL_MCOUNT 3
ce426f
 
ce426f
     cmpldi cr1,5,31
ce426f
     neg   0,3
ce426f
     std   3,-16(1)
ce426f
     std   31,-8(1)
ce426f
-    andi. 11,3,7	/* check alignement of dst.  */
ce426f
+    andi. 11,3,7	/* check alignment of dst.  */
ce426f
     clrldi 0,0,61	/* Number of bytes until the 1st doubleword of dst.  */
ce426f
-    clrldi 10,4,61	/* check alignement of src.  */
ce426f
+    clrldi 10,4,61	/* check alignment of src.  */
ce426f
     cmpldi cr6,5,8
ce426f
     ble-  cr1,.L2	/* If move < 32 bytes use short move code.  */
ce426f
     mtcrf 0x01,0
ce426f
-    cmpld cr6,10,11  
ce426f
+    cmpld cr6,10,11
ce426f
     srdi  9,5,3		/* Number of full double words remaining.  */
ce426f
     beq   .L0
ce426f
-  
ce426f
+
ce426f
     subf  5,0,5
ce426f
-  /* Move 0-7 bytes as needed to get the destination doubleword alligned.
ce426f
-     Duplicate some code to maximize fall-throught and minimize agen delays.  */
ce426f
+  /* Move 0-7 bytes as needed to get the destination doubleword aligned.
ce426f
+     Duplicate some code to maximize fall-through and minimize agen delays.  */
ce426f
 1:  bf    31,2f
ce426f
     lbz   6,0(4)
ce426f
     stb   6,0(3)
ce426f
@@ -78,7 +76,7 @@
ce426f
     lwz   6,1(4)
ce426f
     stw   6,1(3)
ce426f
     b     0f
ce426f
-    
ce426f
+
ce426f
 2:  bf    30,4f
ce426f
     lhz   6,0(4)
ce426f
     sth   6,0(3)
ce426f
@@ -86,26 +84,26 @@
ce426f
     lwz   6,2(4)
ce426f
     stw   6,2(3)
ce426f
     b     0f
ce426f
-    
ce426f
+
ce426f
 4:  bf    29,0f
ce426f
     lwz   6,0(4)
ce426f
     stw   6,0(3)
ce426f
-0: 
ce426f
+0:
ce426f
 /* Add the number of bytes until the 1st doubleword of dst to src and dst.  */
ce426f
     add   4,4,0
ce426f
     add   3,3,0
ce426f
-    
ce426f
-    clrldi 10,4,61	/* check alignement of src again.  */     
ce426f
+
ce426f
+    clrldi 10,4,61	/* check alignment of src again.  */
ce426f
     srdi  9,5,3	/* Number of full double words remaining.  */
ce426f
-    
ce426f
-  /* Copy doublewords from source to destination, assumpting the
ce426f
+
ce426f
+  /* Copy doublewords from source to destination, assuming the
ce426f
      destination is aligned on a doubleword boundary.
ce426f
 
ce426f
      At this point we know there are at least 25 bytes left (32-7) to copy.
ce426f
-     The next step is to determine if the source is also doubleword aligned. 
ce426f
+     The next step is to determine if the source is also doubleword aligned.
ce426f
      If not branch to the unaligned move code at .L6. which uses
ce426f
      a load, shift, store strategy.
ce426f
-     
ce426f
+
ce426f
      Otherwise source and destination are doubleword aligned, and we can
ce426f
      the optimized doubleword copy loop.  */
ce426f
     .align  4
ce426f
@@ -123,14 +121,14 @@
ce426f
      the main loop exits there may be a tail of 1-7 bytes. These byte
ce426f
      are copied a word/halfword/byte at a time as needed to preserve
ce426f
      alignment.
ce426f
-     
ce426f
+
ce426f
      For POWER6 the L1 is store-through and the L2 is store-in.  The
ce426f
      L2 is clocked at half CPU clock so we can store 16 bytes every
ce426f
      other cycle.  POWER6 also has a load/store bypass so we can do
ce426f
-     load, load, store, store every 2 cycles.  
ce426f
-     
ce426f
+     load, load, store, store every 2 cycles.
ce426f
+
ce426f
      The following code is sensitive to cache line alignment.  Do not
ce426f
-     make any change with out first making sure thay don't result in
ce426f
+     make any change without first making sure it doesn't result in
ce426f
      splitting ld/std pairs across a cache line.  */
ce426f
 
ce426f
     mtcrf 0x02,5
ce426f
@@ -273,7 +271,7 @@
ce426f
     std   8,16+96(10)
ce426f
     std   0,24+96(10)
ce426f
     ble   cr5,L(das_loop_e)
ce426f
-    
ce426f
+
ce426f
     mtctr   12
ce426f
     .align  4
ce426f
 L(das_loop2):
ce426f
@@ -326,10 +324,10 @@
ce426f
     .align  4
ce426f
 L(das_tail):
ce426f
     beq   cr1,0f
ce426f
-    
ce426f
+
ce426f
 L(das_tail2):
ce426f
 /*  At this point we have a tail of 0-7 bytes and we know that the
ce426f
-    destiniation is double word aligned.  */
ce426f
+    destination is double word aligned.  */
ce426f
 4:  bf    29,2f
ce426f
     lwz   6,0(4)
ce426f
     stw   6,0(3)
ce426f
@@ -344,7 +342,7 @@
ce426f
     lbz   6,4(4)
ce426f
     stb   6,4(3)
ce426f
     b     0f
ce426f
-  
ce426f
+
ce426f
 2:  bf    30,1f
ce426f
     lhz   6,0(4)
ce426f
     sth   6,0(3)
ce426f
@@ -352,7 +350,7 @@
ce426f
     lbz   6,2(4)
ce426f
     stb   6,2(3)
ce426f
     b     0f
ce426f
-    
ce426f
+
ce426f
 1:  bf    31,0f
ce426f
     lbz   6,0(4)
ce426f
     stb   6,0(3)
ce426f
@@ -361,7 +359,7 @@
ce426f
     ld 3,-16(1)
ce426f
     blr
ce426f
 
ce426f
-/* Copy up to 31 bytes.  This divided into two cases 0-8 bytes and 9-31 
ce426f
+/* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and 9-31
ce426f
    bytes.  Each case is handled without loops, using binary (1,2,4,8)
ce426f
    tests.
ce426f
 
ce426f
@@ -402,15 +400,28 @@
ce426f
     blt   cr6,5f
ce426f
     srdi  7,6,16
ce426f
     bgt	  cr6,3f
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    sth   7,0(3)
ce426f
+#else
ce426f
     sth   6,0(3)
ce426f
+#endif
ce426f
     b     7f
ce426f
     .align  4
ce426f
 3:
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    rotlwi 6,6,24
ce426f
+    stb   6,0(3)
ce426f
+    sth   7,1(3)
ce426f
+#else
ce426f
     stb   7,0(3)
ce426f
     sth   6,1(3)
ce426f
+#endif
ce426f
     b     7f
ce426f
     .align  4
ce426f
 5:
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    rotlwi 6,6,8
ce426f
+#endif
ce426f
     stb   6,0(3)
ce426f
 7:
ce426f
     cmpldi	cr1,10,16
ce426f
@@ -421,7 +432,7 @@
ce426f
 /* At least 6 bytes left and the source is word aligned.  This allows
ce426f
    some speculative loads up front.  */
ce426f
 /* We need to special case the fall-through because the biggest delays
ce426f
-   are due to address computation not being ready in time for the 
ce426f
+   are due to address computation not being ready in time for the
ce426f
    AGEN.  */
ce426f
     lwz   6,0(12)
ce426f
     lwz   7,4(12)
ce426f
@@ -452,7 +463,7 @@
ce426f
     ld    3,-16(1)
ce426f
     blr
ce426f
     .align  4
ce426f
-L(dus_tail16p8):  /* less then 8 bytes left.  */
ce426f
+L(dus_tail16p8):  /* less than 8 bytes left.  */
ce426f
     beq   cr1,L(dus_tailX) /* exactly 16 bytes, early exit.  */
ce426f
     cmpldi	cr1,10,20
ce426f
     bf    29,L(dus_tail16p2)
ce426f
@@ -466,7 +477,7 @@
ce426f
     ld    3,-16(1)
ce426f
     blr
ce426f
     .align  4
ce426f
-L(dus_tail16p4):  /* less then 4 bytes left.  */
ce426f
+L(dus_tail16p4):  /* less than 4 bytes left.  */
ce426f
     addi  12,12,24
ce426f
     addi  3,3,24
ce426f
     bgt   cr0,L(dus_tail2)
ce426f
@@ -474,7 +485,7 @@
ce426f
     ld    3,-16(1)
ce426f
     blr
ce426f
     .align  4
ce426f
-L(dus_tail16p2):  /* 16 bytes moved, less then 4 bytes left.  */
ce426f
+L(dus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
ce426f
     addi  12,12,16
ce426f
     addi  3,3,16
ce426f
     b     L(dus_tail2)
ce426f
@@ -499,7 +510,7 @@
ce426f
     ld    3,-16(1)
ce426f
     blr
ce426f
     .align  4
ce426f
-L(dus_tail8p4):  /* less then 4 bytes left.  */
ce426f
+L(dus_tail8p4):  /* less than 4 bytes left.  */
ce426f
     addi  12,12,8
ce426f
     addi  3,3,8
ce426f
     bgt   cr1,L(dus_tail2)
ce426f
@@ -510,14 +521,14 @@
ce426f
     .align  4
ce426f
 L(dus_tail4):  /* Move 4 bytes.  */
ce426f
 /*  r6 already loaded speculatively.  If we are here we know there is
ce426f
-    more then 4 bytes left.  So there is no need to test.  */
ce426f
+    more than 4 bytes left.  So there is no need to test.  */
ce426f
     addi  12,12,4
ce426f
     stw   6,0(3)
ce426f
     addi  3,3,4
ce426f
 L(dus_tail2):  /* Move 2-3 bytes.  */
ce426f
     bf    30,L(dus_tail1)
ce426f
     lhz   6,0(12)
ce426f
-    sth   6,0(3) 
ce426f
+    sth   6,0(3)
ce426f
     bf    31,L(dus_tailX)
ce426f
     lbz   7,2(12)
ce426f
     stb   7,2(3)
ce426f
@@ -537,7 +548,7 @@
ce426f
 .LE8:
ce426f
     mr    12,4
ce426f
     bne   cr6,L(dus_4)
ce426f
-/* Exactly 8 bytes.  We may cross a 32-/128-byte boundry and take a ~20
ce426f
+/* Exactly 8 bytes.  We may cross a 32-/128-byte boundary and take a ~20
ce426f
    cycle delay.  This case should be rare and any attempt to avoid this
ce426f
    would take most of 20 cycles any way.  */
ce426f
     ld   6,0(4)
ce426f
@@ -552,7 +563,7 @@
ce426f
     stw   6,0(3)
ce426f
     bf    30,L(dus_5)
ce426f
     lhz   7,4(4)
ce426f
-    sth   7,4(3) 
ce426f
+    sth   7,4(3)
ce426f
     bf    31,L(dus_0)
ce426f
     lbz   8,6(4)
ce426f
     stb   8,6(3)
ce426f
@@ -590,20 +601,31 @@
ce426f
     bge     cr0, L(du4_do)
ce426f
     blt     cr5, L(du1_do)
ce426f
     beq     cr5, L(du2_do)
ce426f
-    b       L(du3_do) 
ce426f
-       
ce426f
+    b       L(du3_do)
ce426f
+
ce426f
     .align 4
ce426f
 L(du1_do):
ce426f
     bf      30,L(du1_1dw)
ce426f
 
ce426f
     /* there are at least two DWs to copy */
ce426f
+    /* FIXME: can combine last shift and "or" into "rldimi" */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 8
ce426f
+    sldi     8,7, 64-8
ce426f
+#else
ce426f
     sldi     0,6, 8
ce426f
     srdi     8,7, 64-8
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     ld      6,16(5)
ce426f
     std     0,0(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,7, 8
ce426f
+    sldi     8,6, 64-8
ce426f
+#else
ce426f
     sldi     0,7, 8
ce426f
     srdi     8,6, 64-8
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     ld      7,24(5)
ce426f
     std     0,8(4)
ce426f
@@ -612,8 +634,13 @@
ce426f
     blt     cr6,L(du1_fini)  /* if total DWs = 3, then bypass loop */
ce426f
     bf      31,L(du1_loop)
ce426f
     /* there is a third DW to copy */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 8
ce426f
+    sldi     8,7, 64-8
ce426f
+#else
ce426f
     sldi     0,6, 8
ce426f
     srdi     8,7, 64-8
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     std     0,0(4)
ce426f
     mr      6,7
ce426f
@@ -624,8 +651,13 @@
ce426f
     b       L(du1_loop)
ce426f
     .align 4
ce426f
 L(du1_1dw):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 8
ce426f
+    sldi     8,7, 64-8
ce426f
+#else
ce426f
     sldi     0,6, 8
ce426f
     srdi     8,7, 64-8
ce426f
+#endif
ce426f
     addi    5,5,16
ce426f
     or      0,0,8
ce426f
     bf      31,L(du1_loop)
ce426f
@@ -637,23 +669,43 @@
ce426f
     .align 4
ce426f
 /* copy 32 bytes at a time */
ce426f
 L(du1_loop):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 8
ce426f
+    sldi   8,7, 64-8
ce426f
+#else
ce426f
     sldi   0,6, 8
ce426f
     srdi   8,7, 64-8
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    6,0(5)
ce426f
     std   0,0(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,7, 8
ce426f
+    sldi   8,6, 64-8
ce426f
+#else
ce426f
     sldi   0,7, 8
ce426f
     srdi   8,6, 64-8
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    7,8(5)
ce426f
     std   0,8(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 8
ce426f
+    sldi   8,7, 64-8
ce426f
+#else
ce426f
     sldi   0,6, 8
ce426f
     srdi   8,7, 64-8
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    6,16(5)
ce426f
     std   0,16(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,7, 8
ce426f
+    sldi   8,6, 64-8
ce426f
+#else
ce426f
     sldi   0,7, 8
ce426f
     srdi   8,6, 64-8
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    7,24(5)
ce426f
     std   0,24(4)
ce426f
@@ -663,9 +715,14 @@
ce426f
     .align 4
ce426f
 L(du1_fini):
ce426f
     /* calculate and store the final DW */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 8
ce426f
+    sldi   8,7, 64-8
ce426f
+#else
ce426f
     sldi   0,6, 8
ce426f
     srdi   8,7, 64-8
ce426f
-    or    0,0,8  
ce426f
+#endif
ce426f
+    or    0,0,8
ce426f
     std   0,0(4)
ce426f
     b     L(du_done)
ce426f
 
ce426f
@@ -674,13 +731,23 @@
ce426f
     bf      30,L(du2_1dw)
ce426f
 
ce426f
     /* there are at least two DWs to copy */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 16
ce426f
+    sldi     8,7, 64-16
ce426f
+#else
ce426f
     sldi     0,6, 16
ce426f
     srdi     8,7, 64-16
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     ld      6,16(5)
ce426f
     std     0,0(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,7, 16
ce426f
+    sldi     8,6, 64-16
ce426f
+#else
ce426f
     sldi     0,7, 16
ce426f
     srdi     8,6, 64-16
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     ld      7,24(5)
ce426f
     std     0,8(4)
ce426f
@@ -689,8 +756,13 @@
ce426f
     blt     cr6,L(du2_fini)  /* if total DWs = 3, then bypass loop */
ce426f
     bf      31,L(du2_loop)
ce426f
     /* there is a third DW to copy */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 16
ce426f
+    sldi     8,7, 64-16
ce426f
+#else
ce426f
     sldi     0,6, 16
ce426f
     srdi     8,7, 64-16
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     std     0,0(4)
ce426f
     mr      6,7
ce426f
@@ -701,8 +773,13 @@
ce426f
     b       L(du2_loop)
ce426f
     .align 4
ce426f
 L(du2_1dw):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 16
ce426f
+    sldi     8,7, 64-16
ce426f
+#else
ce426f
     sldi     0,6, 16
ce426f
     srdi     8,7, 64-16
ce426f
+#endif
ce426f
     addi    5,5,16
ce426f
     or      0,0,8
ce426f
     bf      31,L(du2_loop)
ce426f
@@ -714,23 +791,43 @@
ce426f
     .align 4
ce426f
 /* copy 32 bytes at a time */
ce426f
 L(du2_loop):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 16
ce426f
+    sldi   8,7, 64-16
ce426f
+#else
ce426f
     sldi   0,6, 16
ce426f
     srdi   8,7, 64-16
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    6,0(5)
ce426f
     std   0,0(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,7, 16
ce426f
+    sldi   8,6, 64-16
ce426f
+#else
ce426f
     sldi   0,7, 16
ce426f
     srdi   8,6, 64-16
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    7,8(5)
ce426f
     std   0,8(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 16
ce426f
+    sldi   8,7, 64-16
ce426f
+#else
ce426f
     sldi   0,6, 16
ce426f
     srdi   8,7, 64-16
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    6,16(5)
ce426f
     std   0,16(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,7, 16
ce426f
+    sldi   8,6, 64-16
ce426f
+#else
ce426f
     sldi   0,7, 16
ce426f
     srdi   8,6, 64-16
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    7,24(5)
ce426f
     std   0,24(4)
ce426f
@@ -740,9 +837,14 @@
ce426f
     .align 4
ce426f
 L(du2_fini):
ce426f
     /* calculate and store the final DW */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 16
ce426f
+    sldi   8,7, 64-16
ce426f
+#else
ce426f
     sldi   0,6, 16
ce426f
     srdi   8,7, 64-16
ce426f
-    or    0,0,8  
ce426f
+#endif
ce426f
+    or    0,0,8
ce426f
     std   0,0(4)
ce426f
     b     L(du_done)
ce426f
 
ce426f
@@ -751,13 +853,23 @@
ce426f
     bf      30,L(du3_1dw)
ce426f
 
ce426f
     /* there are at least two DWs to copy */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 24
ce426f
+    sldi     8,7, 64-24
ce426f
+#else
ce426f
     sldi     0,6, 24
ce426f
     srdi     8,7, 64-24
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     ld      6,16(5)
ce426f
     std     0,0(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,7, 24
ce426f
+    sldi     8,6, 64-24
ce426f
+#else
ce426f
     sldi     0,7, 24
ce426f
     srdi     8,6, 64-24
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     ld      7,24(5)
ce426f
     std     0,8(4)
ce426f
@@ -766,8 +878,13 @@
ce426f
     blt     cr6,L(du3_fini)  /* if total DWs = 3, then bypass loop */
ce426f
     bf      31,L(du3_loop)
ce426f
     /* there is a third DW to copy */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 24
ce426f
+    sldi     8,7, 64-24
ce426f
+#else
ce426f
     sldi     0,6, 24
ce426f
     srdi     8,7, 64-24
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     std     0,0(4)
ce426f
     mr      6,7
ce426f
@@ -778,8 +895,13 @@
ce426f
     b       L(du3_loop)
ce426f
     .align 4
ce426f
 L(du3_1dw):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 24
ce426f
+    sldi     8,7, 64-24
ce426f
+#else
ce426f
     sldi     0,6, 24
ce426f
     srdi     8,7, 64-24
ce426f
+#endif
ce426f
     addi    5,5,16
ce426f
     or      0,0,8
ce426f
     bf      31,L(du3_loop)
ce426f
@@ -791,23 +913,43 @@
ce426f
     .align 4
ce426f
 /* copy 32 bytes at a time */
ce426f
 L(du3_loop):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 24
ce426f
+    sldi   8,7, 64-24
ce426f
+#else
ce426f
     sldi   0,6, 24
ce426f
     srdi   8,7, 64-24
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    6,0(5)
ce426f
     std   0,0(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,7, 24
ce426f
+    sldi   8,6, 64-24
ce426f
+#else
ce426f
     sldi   0,7, 24
ce426f
     srdi   8,6, 64-24
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    7,8(5)
ce426f
     std   0,8(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 24
ce426f
+    sldi   8,7, 64-24
ce426f
+#else
ce426f
     sldi   0,6, 24
ce426f
     srdi   8,7, 64-24
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    6,16(5)
ce426f
     std   0,16(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,7, 24
ce426f
+    sldi   8,6, 64-24
ce426f
+#else
ce426f
     sldi   0,7, 24
ce426f
     srdi   8,6, 64-24
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    7,24(5)
ce426f
     std   0,24(4)
ce426f
@@ -817,9 +959,14 @@
ce426f
     .align 4
ce426f
 L(du3_fini):
ce426f
     /* calculate and store the final DW */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 24
ce426f
+    sldi   8,7, 64-24
ce426f
+#else
ce426f
     sldi   0,6, 24
ce426f
     srdi   8,7, 64-24
ce426f
-    or    0,0,8  
ce426f
+#endif
ce426f
+    or    0,0,8
ce426f
     std   0,0(4)
ce426f
     b     L(du_done)
ce426f
 
ce426f
@@ -834,13 +981,23 @@
ce426f
     bf      30,L(du4_1dw)
ce426f
 
ce426f
     /* there are at least two DWs to copy */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 32
ce426f
+    sldi     8,7, 64-32
ce426f
+#else
ce426f
     sldi     0,6, 32
ce426f
     srdi     8,7, 64-32
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     ld      6,16(5)
ce426f
     std     0,0(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,7, 32
ce426f
+    sldi     8,6, 64-32
ce426f
+#else
ce426f
     sldi     0,7, 32
ce426f
     srdi     8,6, 64-32
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     ld      7,24(5)
ce426f
     std     0,8(4)
ce426f
@@ -849,8 +1006,13 @@
ce426f
     blt     cr6,L(du4_fini)  /* if total DWs = 3, then bypass loop */
ce426f
     bf      31,L(du4_loop)
ce426f
     /* there is a third DW to copy */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 32
ce426f
+    sldi     8,7, 64-32
ce426f
+#else
ce426f
     sldi     0,6, 32
ce426f
     srdi     8,7, 64-32
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     std     0,0(4)
ce426f
     mr      6,7
ce426f
@@ -861,8 +1023,13 @@
ce426f
     b       L(du4_loop)
ce426f
     .align 4
ce426f
 L(du4_1dw):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 32
ce426f
+    sldi     8,7, 64-32
ce426f
+#else
ce426f
     sldi     0,6, 32
ce426f
     srdi     8,7, 64-32
ce426f
+#endif
ce426f
     addi    5,5,16
ce426f
     or      0,0,8
ce426f
     bf      31,L(du4_loop)
ce426f
@@ -874,23 +1041,43 @@
ce426f
     .align 4
ce426f
 /* copy 32 bytes at a time */
ce426f
 L(du4_loop):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 32
ce426f
+    sldi   8,7, 64-32
ce426f
+#else
ce426f
     sldi   0,6, 32
ce426f
     srdi   8,7, 64-32
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    6,0(5)
ce426f
     std   0,0(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,7, 32
ce426f
+    sldi   8,6, 64-32
ce426f
+#else
ce426f
     sldi   0,7, 32
ce426f
     srdi   8,6, 64-32
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    7,8(5)
ce426f
     std   0,8(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 32
ce426f
+    sldi   8,7, 64-32
ce426f
+#else
ce426f
     sldi   0,6, 32
ce426f
     srdi   8,7, 64-32
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    6,16(5)
ce426f
     std   0,16(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,7, 32
ce426f
+    sldi   8,6, 64-32
ce426f
+#else
ce426f
     sldi   0,7, 32
ce426f
     srdi   8,6, 64-32
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    7,24(5)
ce426f
     std   0,24(4)
ce426f
@@ -900,9 +1087,14 @@
ce426f
     .align 4
ce426f
 L(du4_fini):
ce426f
     /* calculate and store the final DW */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 32
ce426f
+    sldi   8,7, 64-32
ce426f
+#else
ce426f
     sldi   0,6, 32
ce426f
     srdi   8,7, 64-32
ce426f
-    or    0,0,8  
ce426f
+#endif
ce426f
+    or    0,0,8
ce426f
     std   0,0(4)
ce426f
     b     L(du_done)
ce426f
 
ce426f
@@ -911,13 +1103,23 @@
ce426f
     bf      30,L(du5_1dw)
ce426f
 
ce426f
     /* there are at least two DWs to copy */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 40
ce426f
+    sldi     8,7, 64-40
ce426f
+#else
ce426f
     sldi     0,6, 40
ce426f
     srdi     8,7, 64-40
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     ld      6,16(5)
ce426f
     std     0,0(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,7, 40
ce426f
+    sldi     8,6, 64-40
ce426f
+#else
ce426f
     sldi     0,7, 40
ce426f
     srdi     8,6, 64-40
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     ld      7,24(5)
ce426f
     std     0,8(4)
ce426f
@@ -926,8 +1128,13 @@
ce426f
     blt     cr6,L(du5_fini)  /* if total DWs = 3, then bypass loop */
ce426f
     bf      31,L(du5_loop)
ce426f
     /* there is a third DW to copy */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 40
ce426f
+    sldi     8,7, 64-40
ce426f
+#else
ce426f
     sldi     0,6, 40
ce426f
     srdi     8,7, 64-40
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     std     0,0(4)
ce426f
     mr      6,7
ce426f
@@ -938,8 +1145,13 @@
ce426f
     b       L(du5_loop)
ce426f
     .align 4
ce426f
 L(du5_1dw):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 40
ce426f
+    sldi     8,7, 64-40
ce426f
+#else
ce426f
     sldi     0,6, 40
ce426f
     srdi     8,7, 64-40
ce426f
+#endif
ce426f
     addi    5,5,16
ce426f
     or      0,0,8
ce426f
     bf      31,L(du5_loop)
ce426f
@@ -951,23 +1163,43 @@
ce426f
     .align 4
ce426f
 /* copy 32 bytes at a time */
ce426f
 L(du5_loop):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 40
ce426f
+    sldi   8,7, 64-40
ce426f
+#else
ce426f
     sldi   0,6, 40
ce426f
     srdi   8,7, 64-40
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    6,0(5)
ce426f
     std   0,0(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,7, 40
ce426f
+    sldi   8,6, 64-40
ce426f
+#else
ce426f
     sldi   0,7, 40
ce426f
     srdi   8,6, 64-40
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    7,8(5)
ce426f
     std   0,8(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 40
ce426f
+    sldi   8,7, 64-40
ce426f
+#else
ce426f
     sldi   0,6, 40
ce426f
     srdi   8,7, 64-40
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    6,16(5)
ce426f
     std   0,16(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,7, 40
ce426f
+    sldi   8,6, 64-40
ce426f
+#else
ce426f
     sldi   0,7, 40
ce426f
     srdi   8,6, 64-40
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    7,24(5)
ce426f
     std   0,24(4)
ce426f
@@ -977,9 +1209,14 @@
ce426f
     .align 4
ce426f
 L(du5_fini):
ce426f
     /* calculate and store the final DW */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 40
ce426f
+    sldi   8,7, 64-40
ce426f
+#else
ce426f
     sldi   0,6, 40
ce426f
     srdi   8,7, 64-40
ce426f
-    or    0,0,8  
ce426f
+#endif
ce426f
+    or    0,0,8
ce426f
     std   0,0(4)
ce426f
     b     L(du_done)
ce426f
 
ce426f
@@ -988,13 +1225,23 @@
ce426f
     bf      30,L(du6_1dw)
ce426f
 
ce426f
     /* there are at least two DWs to copy */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 48
ce426f
+    sldi     8,7, 64-48
ce426f
+#else
ce426f
     sldi     0,6, 48
ce426f
     srdi     8,7, 64-48
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     ld      6,16(5)
ce426f
     std     0,0(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,7, 48
ce426f
+    sldi     8,6, 64-48
ce426f
+#else
ce426f
     sldi     0,7, 48
ce426f
     srdi     8,6, 64-48
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     ld      7,24(5)
ce426f
     std     0,8(4)
ce426f
@@ -1003,8 +1250,13 @@
ce426f
     blt     cr6,L(du6_fini)  /* if total DWs = 3, then bypass loop */
ce426f
     bf      31,L(du6_loop)
ce426f
     /* there is a third DW to copy */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 48
ce426f
+    sldi     8,7, 64-48
ce426f
+#else
ce426f
     sldi     0,6, 48
ce426f
     srdi     8,7, 64-48
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     std     0,0(4)
ce426f
     mr      6,7
ce426f
@@ -1015,8 +1267,13 @@
ce426f
     b       L(du6_loop)
ce426f
     .align 4
ce426f
 L(du6_1dw):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 48
ce426f
+    sldi     8,7, 64-48
ce426f
+#else
ce426f
     sldi     0,6, 48
ce426f
     srdi     8,7, 64-48
ce426f
+#endif
ce426f
     addi    5,5,16
ce426f
     or      0,0,8
ce426f
     bf      31,L(du6_loop)
ce426f
@@ -1028,23 +1285,43 @@
ce426f
     .align 4
ce426f
 /* copy 32 bytes at a time */
ce426f
 L(du6_loop):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 48
ce426f
+    sldi   8,7, 64-48
ce426f
+#else
ce426f
     sldi   0,6, 48
ce426f
     srdi   8,7, 64-48
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    6,0(5)
ce426f
     std   0,0(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,7, 48
ce426f
+    sldi   8,6, 64-48
ce426f
+#else
ce426f
     sldi   0,7, 48
ce426f
     srdi   8,6, 64-48
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    7,8(5)
ce426f
     std   0,8(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 48
ce426f
+    sldi   8,7, 64-48
ce426f
+#else
ce426f
     sldi   0,6, 48
ce426f
     srdi   8,7, 64-48
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    6,16(5)
ce426f
     std   0,16(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,7, 48
ce426f
+    sldi   8,6, 64-48
ce426f
+#else
ce426f
     sldi   0,7, 48
ce426f
     srdi   8,6, 64-48
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    7,24(5)
ce426f
     std   0,24(4)
ce426f
@@ -1054,9 +1331,14 @@
ce426f
     .align 4
ce426f
 L(du6_fini):
ce426f
     /* calculate and store the final DW */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 48
ce426f
+    sldi   8,7, 64-48
ce426f
+#else
ce426f
     sldi   0,6, 48
ce426f
     srdi   8,7, 64-48
ce426f
-    or    0,0,8  
ce426f
+#endif
ce426f
+    or    0,0,8
ce426f
     std   0,0(4)
ce426f
     b     L(du_done)
ce426f
 
ce426f
@@ -1065,13 +1347,23 @@
ce426f
     bf      30,L(du7_1dw)
ce426f
 
ce426f
     /* there are at least two DWs to copy */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 56
ce426f
+    sldi     8,7, 64-56
ce426f
+#else
ce426f
     sldi     0,6, 56
ce426f
     srdi     8,7, 64-56
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     ld      6,16(5)
ce426f
     std     0,0(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,7, 56
ce426f
+    sldi     8,6, 64-56
ce426f
+#else
ce426f
     sldi     0,7, 56
ce426f
     srdi     8,6, 64-56
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     ld      7,24(5)
ce426f
     std     0,8(4)
ce426f
@@ -1080,8 +1372,13 @@
ce426f
     blt     cr6,L(du7_fini)  /* if total DWs = 3, then bypass loop */
ce426f
     bf      31,L(du7_loop)
ce426f
     /* there is a third DW to copy */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 56
ce426f
+    sldi     8,7, 64-56
ce426f
+#else
ce426f
     sldi     0,6, 56
ce426f
     srdi     8,7, 64-56
ce426f
+#endif
ce426f
     or      0,0,8
ce426f
     std     0,0(4)
ce426f
     mr      6,7
ce426f
@@ -1092,8 +1389,13 @@
ce426f
     b       L(du7_loop)
ce426f
     .align 4
ce426f
 L(du7_1dw):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi     0,6, 56
ce426f
+    sldi     8,7, 64-56
ce426f
+#else
ce426f
     sldi     0,6, 56
ce426f
     srdi     8,7, 64-56
ce426f
+#endif
ce426f
     addi    5,5,16
ce426f
     or      0,0,8
ce426f
     bf      31,L(du7_loop)
ce426f
@@ -1105,23 +1407,43 @@
ce426f
     .align 4
ce426f
 /* copy 32 bytes at a time */
ce426f
 L(du7_loop):
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 56
ce426f
+    sldi   8,7, 64-56
ce426f
+#else
ce426f
     sldi   0,6, 56
ce426f
     srdi   8,7, 64-56
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    6,0(5)
ce426f
     std   0,0(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,7, 56
ce426f
+    sldi   8,6, 64-56
ce426f
+#else
ce426f
     sldi   0,7, 56
ce426f
     srdi   8,6, 64-56
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    7,8(5)
ce426f
     std   0,8(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 56
ce426f
+    sldi   8,7, 64-56
ce426f
+#else
ce426f
     sldi   0,6, 56
ce426f
     srdi   8,7, 64-56
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    6,16(5)
ce426f
     std   0,16(4)
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,7, 56
ce426f
+    sldi   8,6, 64-56
ce426f
+#else
ce426f
     sldi   0,7, 56
ce426f
     srdi   8,6, 64-56
ce426f
+#endif
ce426f
     or    0,0,8
ce426f
     ld    7,24(5)
ce426f
     std   0,24(4)
ce426f
@@ -1131,12 +1453,17 @@
ce426f
     .align 4
ce426f
 L(du7_fini):
ce426f
     /* calculate and store the final DW */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+    srdi   0,6, 56
ce426f
+    sldi   8,7, 64-56
ce426f
+#else
ce426f
     sldi   0,6, 56
ce426f
     srdi   8,7, 64-56
ce426f
-    or    0,0,8  
ce426f
+#endif
ce426f
+    or    0,0,8
ce426f
     std   0,0(4)
ce426f
     b     L(du_done)
ce426f
-    
ce426f
+
ce426f
     .align 4
ce426f
 L(du_done):
ce426f
     rldicr 0,31,0,60
ce426f
@@ -1144,9 +1471,9 @@
ce426f
     beq   cr1,0f	/* If the tail is 0 bytes we are done!  */
ce426f
 
ce426f
     add   3,3,0
ce426f
-    add   12,12,0    
ce426f
+    add   12,12,0
ce426f
 /*  At this point we have a tail of 0-7 bytes and we know that the
ce426f
-    destiniation is double word aligned.  */
ce426f
+    destination is double word aligned.  */
ce426f
 4:  bf    29,2f
ce426f
     lwz   6,0(12)
ce426f
     addi  12,12,4
ce426f
@@ -1165,5 +1492,5 @@
ce426f
     ld 31,-8(1)
ce426f
     ld 3,-16(1)
ce426f
     blr
ce426f
-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
ce426f
+END_GEN_TB (memcpy,TB_TOCLESS)
ce426f
 libc_hidden_builtin_def (memcpy)
ce426f
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S
ce426f
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S	2014-05-29 13:04:56.000000000 -0500
ce426f
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S	2014-05-29 13:05:40.000000000 -0500
ce426f
@@ -1,5 +1,5 @@
ce426f
 /* Optimized memcpy implementation for PowerPC64/POWER7.
ce426f
-   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
ce426f
+   Copyright (C) 2010-2014 Free Software Foundation, Inc.
ce426f
    Contributed by Luis Machado <luisgpm@br.ibm.com>.
ce426f
    This file is part of the GNU C Library.
ce426f
 
ce426f
@@ -18,425 +18,366 @@
ce426f
    <http://www.gnu.org/licenses/>.  */
ce426f
 
ce426f
 #include <sysdep.h>
ce426f
-#include <bp-sym.h>
ce426f
-#include <bp-asm.h>
ce426f
 
ce426f
 
ce426f
 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
ce426f
    Returns 'dst'.  */
ce426f
 
ce426f
+#define dst 11		/* Use r11 so r3 kept unchanged.  */
ce426f
+#define src 4
ce426f
+#define cnt 5
ce426f
+
ce426f
 	.machine power7
ce426f
-EALIGN (BP_SYM (memcpy), 5, 0)
ce426f
+EALIGN (memcpy, 5, 0)
ce426f
 	CALL_MCOUNT 3
ce426f
 
ce426f
-	cmpldi  cr1,5,31
ce426f
+	cmpldi	cr1,cnt,31
ce426f
 	neg	0,3
ce426f
-	std	3,-16(1)
ce426f
-	std	31,-8(1)
ce426f
-	cfi_offset(31,-8)
ce426f
 	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
ce426f
 				    code.  */
ce426f
 
ce426f
-	andi.   11,3,7	      /* Check alignment of DST.  */
ce426f
-
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+/* In little-endian mode, power7 takes an alignment trap on any lxvd2x
ce426f
+   or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy
ce426f
+   loop is only used for quadword aligned copies.  */
ce426f
+	andi.	10,3,15
ce426f
+	clrldi	11,4,60
ce426f
+#else
ce426f
+	andi.	10,3,7		/* Check alignment of DST.  */
ce426f
+	clrldi	11,4,61		/* Check alignment of SRC.  */
ce426f
+#endif
ce426f
+	cmpld	cr6,10,11	/* SRC and DST alignments match?  */
ce426f
 
ce426f
-	clrldi  10,4,61       /* Check alignment of SRC.  */
ce426f
-	cmpld   cr6,10,11     /* SRC and DST alignments match?  */
ce426f
-	mr	12,4
ce426f
-	mr	31,5
ce426f
+	mr	dst,3
ce426f
 	bne	cr6,L(copy_GE_32_unaligned)
ce426f
+	beq	L(aligned_copy)
ce426f
 
ce426f
-	srdi    9,5,3	      /* Number of full quadwords remaining.  */
ce426f
-
ce426f
-	beq    L(copy_GE_32_aligned_cont)
ce426f
-
ce426f
-	clrldi  0,0,61
ce426f
-	mtcrf   0x01,0
ce426f
-	subf    31,0,5
ce426f
-
ce426f
-	/* Get the SRC aligned to 8 bytes.  */
ce426f
-
ce426f
-1:	bf	31,2f
ce426f
-	lbz	6,0(12)
ce426f
-	addi    12,12,1
ce426f
-	stb	6,0(3)
ce426f
-	addi    3,3,1
ce426f
-2:	bf      30,4f
ce426f
-	lhz     6,0(12)
ce426f
-	addi    12,12,2
ce426f
-	sth     6,0(3)
ce426f
-	addi    3,3,2
ce426f
-4:	bf      29,0f
ce426f
-	lwz     6,0(12)
ce426f
-	addi    12,12,4
ce426f
-	stw     6,0(3)
ce426f
-	addi    3,3,4
ce426f
-0:
ce426f
-	clrldi  10,12,61      /* Check alignment of SRC again.  */
ce426f
-	srdi    9,31,3	      /* Number of full doublewords remaining.  */
ce426f
-
ce426f
-L(copy_GE_32_aligned_cont):
ce426f
-
ce426f
-	clrldi  11,31,61
ce426f
-	mtcrf   0x01,9
ce426f
-
ce426f
-	srdi    8,31,5
ce426f
-	cmpldi  cr1,9,4
ce426f
-	cmpldi  cr6,11,0
ce426f
-	mr	11,12
ce426f
+	mtocrf	0x01,0
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	clrldi	0,0,60
ce426f
+#else
ce426f
+	clrldi	0,0,61
ce426f
+#endif
ce426f
 
ce426f
-	/* Copy 1~3 doublewords so the main loop starts
ce426f
-	at a multiple of 32 bytes.  */
ce426f
-
ce426f
-	bf	30,1f
ce426f
-	ld      6,0(12)
ce426f
-	ld      7,8(12)
ce426f
-	addi    11,12,16
ce426f
-	mtctr   8
ce426f
-	std     6,0(3)
ce426f
-	std     7,8(3)
ce426f
-	addi    10,3,16
ce426f
-	bf      31,4f
ce426f
-	ld      0,16(12)
ce426f
-	std     0,16(3)
ce426f
-	blt     cr1,3f
ce426f
-	addi    11,12,24
ce426f
-	addi    10,3,24
ce426f
-	b       4f
ce426f
-
ce426f
-	.align  4
ce426f
-1:	/* Copy 1 doubleword and set the counter.  */
ce426f
-	mr	10,3
ce426f
-	mtctr   8
ce426f
-	bf      31,4f
ce426f
-	ld      6,0(12)
ce426f
-	addi    11,12,8
ce426f
-	std     6,0(3)
ce426f
-	addi    10,3,8
ce426f
+/* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
ce426f
+1:
ce426f
+	bf	31,2f
ce426f
+	lbz	6,0(src)
ce426f
+	addi	src,src,1
ce426f
+	stb	6,0(dst)
ce426f
+	addi	dst,dst,1
ce426f
+2:
ce426f
+	bf	30,4f
ce426f
+	lhz	6,0(src)
ce426f
+	addi	src,src,2
ce426f
+	sth	6,0(dst)
ce426f
+	addi	dst,dst,2
ce426f
+4:
ce426f
+	bf	29,8f
ce426f
+	lwz	6,0(src)
ce426f
+	addi	src,src,4
ce426f
+	stw	6,0(dst)
ce426f
+	addi	dst,dst,4
ce426f
+8:
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	bf	28,16f
ce426f
+	ld	6,0(src)
ce426f
+	addi	src,src,8
ce426f
+	std	6,0(dst)
ce426f
+	addi	dst,dst,8
ce426f
+16:
ce426f
+#endif
ce426f
+	subf	cnt,0,cnt
ce426f
 
ce426f
+/* Main aligned copy loop. Copies 128 bytes at a time. */
ce426f
 L(aligned_copy):
ce426f
-	/* Main aligned copy loop. Copies up to 128-bytes at a time. */
ce426f
-	.align  4
ce426f
-4:
ce426f
-	/* check for any 32-byte or 64-byte lumps that are outside of a
ce426f
-	   nice 128-byte range.  R8 contains the number of 32-byte
ce426f
-	   lumps, so drop this into the CR, and use the SO/EQ bits to help
ce426f
-	   handle the 32- or 64- byte lumps.  Then handle the rest with an
ce426f
-	   unrolled 128-bytes-at-a-time copy loop. */
ce426f
-	mtocrf	1,8
ce426f
-	li	6,16	# 16() index
ce426f
-	li	7,32	# 32() index
ce426f
-	li	8,48	# 48() index
ce426f
-
ce426f
-L(aligned_32byte):
ce426f
-	/* if the SO bit (indicating a 32-byte lump) is not set, move along. */
ce426f
-	bns	cr7,L(aligned_64byte)
ce426f
-	lxvd2x	6,0,11
ce426f
-	lxvd2x	7,11,6
ce426f
-	addi	11,11,32
ce426f
-	stxvd2x	6,0,10
ce426f
-	stxvd2x	7,10,6
ce426f
-	addi	10,10,32
ce426f
-
ce426f
-L(aligned_64byte):
ce426f
-	/* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
ce426f
-	bne	cr7,L(aligned_128setup)
ce426f
-	lxvd2x	6,0,11
ce426f
-	lxvd2x	7,11,6
ce426f
-	lxvd2x	8,11,7
ce426f
-	lxvd2x	9,11,8
ce426f
-	addi	11,11,64
ce426f
-	stxvd2x	6,0,10
ce426f
-	stxvd2x	7,10,6
ce426f
-	stxvd2x	8,10,7
ce426f
-	stxvd2x	9,10,8
ce426f
-	addi	10,10,64
ce426f
-
ce426f
-L(aligned_128setup):
ce426f
-	/* Set up for the 128-byte at a time copy loop.  */
ce426f
-	srdi	8,31,7
ce426f
-	cmpdi	8,0	# Any 4x lumps left?
ce426f
-	beq	3f	# if not, move along.
ce426f
-	lxvd2x	6,0,11
ce426f
-	lxvd2x	7,11,6
ce426f
-	mtctr	8	# otherwise, load the ctr and begin.
ce426f
-	li	8,48	# 48() index
ce426f
+	li	6,16
ce426f
+	li	7,32
ce426f
+	li	8,48
ce426f
+	mtocrf	0x02,cnt
ce426f
+	srdi	12,cnt,7
ce426f
+	cmpdi	12,0
ce426f
+	beq	L(aligned_tail)
ce426f
+	lxvd2x	6,0,src
ce426f
+	lxvd2x	7,src,6
ce426f
+	mtctr	12
ce426f
 	b	L(aligned_128loop)
ce426f
 
ce426f
+	.align  4
ce426f
 L(aligned_128head):
ce426f
 	/* for the 2nd + iteration of this loop. */
ce426f
-	lxvd2x	6,0,11
ce426f
-	lxvd2x	7,11,6
ce426f
+	lxvd2x	6,0,src
ce426f
+	lxvd2x	7,src,6
ce426f
 L(aligned_128loop):
ce426f
-	lxvd2x	8,11,7
ce426f
-	lxvd2x	9,11,8
ce426f
-	stxvd2x	6,0,10
ce426f
-	addi	11,11,64
ce426f
-	stxvd2x	7,10,6
ce426f
-	stxvd2x	8,10,7
ce426f
-	stxvd2x	9,10,8
ce426f
-	lxvd2x	6,0,11
ce426f
-	lxvd2x	7,11,6
ce426f
-	addi	10,10,64
ce426f
-	lxvd2x	8,11,7
ce426f
-	lxvd2x	9,11,8
ce426f
-	addi	11,11,64
ce426f
-	stxvd2x	6,0,10
ce426f
-	stxvd2x	7,10,6
ce426f
-	stxvd2x	8,10,7
ce426f
-	stxvd2x	9,10,8
ce426f
-	addi	10,10,64
ce426f
+	lxvd2x	8,src,7
ce426f
+	lxvd2x	9,src,8
ce426f
+	stxvd2x	6,0,dst
ce426f
+	addi	src,src,64
ce426f
+	stxvd2x	7,dst,6
ce426f
+	stxvd2x	8,dst,7
ce426f
+	stxvd2x	9,dst,8
ce426f
+	lxvd2x	6,0,src
ce426f
+	lxvd2x	7,src,6
ce426f
+	addi	dst,dst,64
ce426f
+	lxvd2x	8,src,7
ce426f
+	lxvd2x	9,src,8
ce426f
+	addi	src,src,64
ce426f
+	stxvd2x	6,0,dst
ce426f
+	stxvd2x	7,dst,6
ce426f
+	stxvd2x	8,dst,7
ce426f
+	stxvd2x	9,dst,8
ce426f
+	addi	dst,dst,64
ce426f
 	bdnz	L(aligned_128head)
ce426f
 
ce426f
-3:
ce426f
-	/* Check for tail bytes.  */
ce426f
-	rldicr  0,31,0,60
ce426f
-	mtcrf   0x01,31
ce426f
-	beq	cr6,0f
ce426f
-
ce426f
-.L9:
ce426f
-	add	3,3,0
ce426f
-	add	12,12,0
ce426f
-
ce426f
-	/*  At this point we have a tail of 0-7 bytes and we know that the
ce426f
-	destination is doubleword-aligned.  */
ce426f
-4:	/* Copy 4 bytes.  */
ce426f
-	bf	29,2f
ce426f
-
ce426f
-	lwz     6,0(12)
ce426f
-	addi    12,12,4
ce426f
-	stw     6,0(3)
ce426f
-	addi    3,3,4
ce426f
-2:	/* Copy 2 bytes.  */
ce426f
-	bf	30,1f
ce426f
-
ce426f
-	lhz     6,0(12)
ce426f
-	addi    12,12,2
ce426f
-	sth     6,0(3)
ce426f
-	addi    3,3,2
ce426f
-1:	/* Copy 1 byte.  */
ce426f
-	bf	31,0f
ce426f
-
ce426f
-	lbz	6,0(12)
ce426f
-	stb	6,0(3)
ce426f
-0:	/* Return original DST pointer.  */
ce426f
-	ld	31,-8(1)
ce426f
-	ld	3,-16(1)
ce426f
+L(aligned_tail):
ce426f
+	mtocrf	0x01,cnt
ce426f
+	bf	25,32f
ce426f
+	lxvd2x	6,0,src
ce426f
+	lxvd2x	7,src,6
ce426f
+	lxvd2x	8,src,7
ce426f
+	lxvd2x	9,src,8
ce426f
+	addi	src,src,64
ce426f
+	stxvd2x	6,0,dst
ce426f
+	stxvd2x	7,dst,6
ce426f
+	stxvd2x	8,dst,7
ce426f
+	stxvd2x	9,dst,8
ce426f
+	addi	dst,dst,64
ce426f
+32:
ce426f
+	bf	26,16f
ce426f
+	lxvd2x	6,0,src
ce426f
+	lxvd2x	7,src,6
ce426f
+	addi	src,src,32
ce426f
+	stxvd2x	6,0,dst
ce426f
+	stxvd2x	7,dst,6
ce426f
+	addi	dst,dst,32
ce426f
+16:
ce426f
+	bf	27,8f
ce426f
+	lxvd2x	6,0,src
ce426f
+	addi	src,src,16
ce426f
+	stxvd2x	6,0,dst
ce426f
+	addi	dst,dst,16
ce426f
+8:
ce426f
+	bf	28,4f
ce426f
+	ld	6,0(src)
ce426f
+	addi	src,src,8
ce426f
+	std     6,0(dst)
ce426f
+	addi	dst,dst,8
ce426f
+4:	/* Copies 4~7 bytes.  */
ce426f
+	bf	29,L(tail2)
ce426f
+	lwz	6,0(src)
ce426f
+	stw     6,0(dst)
ce426f
+	bf      30,L(tail5)
ce426f
+	lhz     7,4(src)
ce426f
+	sth     7,4(dst)
ce426f
+	bflr	31
ce426f
+	lbz     8,6(src)
ce426f
+	stb     8,6(dst)
ce426f
+	/* Return original DST pointer.  */
ce426f
 	blr
ce426f
 
ce426f
-	/* Handle copies of 0~31 bytes.  */
ce426f
-	.align  4
ce426f
+
ce426f
+/* Handle copies of 0~31 bytes.  */
ce426f
+	.align	4
ce426f
 L(copy_LT_32):
ce426f
-	cmpldi  cr6,5,8
ce426f
-	mr	12,4
ce426f
-	mtcrf   0x01,5
ce426f
+	mr	dst,3
ce426f
+	cmpldi	cr6,cnt,8
ce426f
+	mtocrf	0x01,cnt
ce426f
 	ble	cr6,L(copy_LE_8)
ce426f
 
ce426f
 	/* At least 9 bytes to go.  */
ce426f
 	neg	8,4
ce426f
-	clrrdi  11,4,2
ce426f
-	andi.   0,8,3
ce426f
-	cmpldi  cr1,5,16
ce426f
-	mr	10,5
ce426f
+	andi.	0,8,3
ce426f
+	cmpldi	cr1,cnt,16
ce426f
 	beq	L(copy_LT_32_aligned)
ce426f
 
ce426f
-	/* Force 4-bytes alignment for SRC.  */
ce426f
-	mtocrf  0x01,0
ce426f
-	subf    10,0,5
ce426f
-2:	bf	30,1f
ce426f
-
ce426f
-	lhz	6,0(12)
ce426f
-	addi    12,12,2
ce426f
-	sth	6,0(3)
ce426f
-	addi    3,3,2
ce426f
-1:	bf	31,L(end_4bytes_alignment)
ce426f
-
ce426f
-	lbz	6,0(12)
ce426f
-	addi    12,12,1
ce426f
-	stb	6,0(3)
ce426f
-	addi    3,3,1
ce426f
+	/* Force 4-byte alignment for SRC.  */
ce426f
+	mtocrf	0x01,0
ce426f
+	subf	cnt,0,cnt
ce426f
+2:
ce426f
+	bf	30,1f
ce426f
+	lhz	6,0(src)
ce426f
+	addi	src,src,2
ce426f
+	sth	6,0(dst)
ce426f
+	addi	dst,dst,2
ce426f
+1:
ce426f
+	bf	31,L(end_4bytes_alignment)
ce426f
+	lbz	6,0(src)
ce426f
+	addi	src,src,1
ce426f
+	stb	6,0(dst)
ce426f
+	addi	dst,dst,1
ce426f
 
ce426f
-	.align  4
ce426f
+	.align	4
ce426f
 L(end_4bytes_alignment):
ce426f
-	cmpldi  cr1,10,16
ce426f
-	mtcrf   0x01,10
ce426f
+	cmpldi	cr1,cnt,16
ce426f
+	mtocrf	0x01,cnt
ce426f
 
ce426f
 L(copy_LT_32_aligned):
ce426f
 	/* At least 6 bytes to go, and SRC is word-aligned.  */
ce426f
 	blt	cr1,8f
ce426f
 
ce426f
 	/* Copy 16 bytes.  */
ce426f
-	lwz	6,0(12)
ce426f
-	lwz     7,4(12)
ce426f
-	stw     6,0(3)
ce426f
-	lwz     8,8(12)
ce426f
-	stw     7,4(3)
ce426f
-	lwz     6,12(12)
ce426f
-	addi    12,12,16
ce426f
-	stw     8,8(3)
ce426f
-	stw     6,12(3)
ce426f
-	addi    3,3,16
ce426f
+	lwz	6,0(src)
ce426f
+	lwz	7,4(src)
ce426f
+	stw	6,0(dst)
ce426f
+	lwz	8,8(src)
ce426f
+	stw	7,4(dst)
ce426f
+	lwz	6,12(src)
ce426f
+	addi	src,src,16
ce426f
+	stw	8,8(dst)
ce426f
+	stw	6,12(dst)
ce426f
+	addi	dst,dst,16
ce426f
 8:	/* Copy 8 bytes.  */
ce426f
-	bf	28,4f
ce426f
+	bf	28,L(tail4)
ce426f
+	lwz	6,0(src)
ce426f
+	lwz	7,4(src)
ce426f
+	addi	src,src,8
ce426f
+	stw	6,0(dst)
ce426f
+	stw	7,4(dst)
ce426f
+	addi	dst,dst,8
ce426f
+
ce426f
+	.align	4
ce426f
+/* Copies 4~7 bytes.  */
ce426f
+L(tail4):
ce426f
+	bf	29,L(tail2)
ce426f
+	lwz	6,0(src)
ce426f
+	stw	6,0(dst)
ce426f
+	bf	30,L(tail5)
ce426f
+	lhz	7,4(src)
ce426f
+	sth	7,4(dst)
ce426f
+	bflr	31
ce426f
+	lbz	8,6(src)
ce426f
+	stb	8,6(dst)
ce426f
+	/* Return original DST pointer.  */
ce426f
+	blr
ce426f
 
ce426f
-	lwz     6,0(12)
ce426f
-	lwz     7,4(12)
ce426f
-	addi    12,12,8
ce426f
-	stw     6,0(3)
ce426f
-	stw     7,4(3)
ce426f
-	addi    3,3,8
ce426f
-4:	/* Copy 4 bytes.  */
ce426f
-	bf	29,2f
ce426f
-
ce426f
-	lwz     6,0(12)
ce426f
-	addi    12,12,4
ce426f
-	stw     6,0(3)
ce426f
-	addi    3,3,4
ce426f
-2:	/* Copy 2-3 bytes.  */
ce426f
+	.align	4
ce426f
+/* Copies 2~3 bytes.  */
ce426f
+L(tail2):
ce426f
 	bf	30,1f
ce426f
-
ce426f
-	lhz     6,0(12)
ce426f
-	sth     6,0(3)
ce426f
-	bf      31,0f
ce426f
-	lbz     7,2(12)
ce426f
-	stb     7,2(3)
ce426f
-	ld	3,-16(1)
ce426f
+	lhz	6,0(src)
ce426f
+	sth	6,0(dst)
ce426f
+	bflr	31
ce426f
+	lbz	7,2(src)
ce426f
+	stb	7,2(dst)
ce426f
 	blr
ce426f
 
ce426f
-	.align  4
ce426f
-1:	/* Copy 1 byte.  */
ce426f
-	bf	31,0f
ce426f
+	.align	4
ce426f
+L(tail5):
ce426f
+	bflr	31
ce426f
+	lbz	6,4(src)
ce426f
+	stb	6,4(dst)
ce426f
+	blr
ce426f
 
ce426f
-	lbz	6,0(12)
ce426f
-	stb	6,0(3)
ce426f
-0:	/* Return original DST pointer.  */
ce426f
-	ld	3,-16(1)
ce426f
+	.align	4
ce426f
+1:
ce426f
+	bflr	31
ce426f
+	lbz	6,0(src)
ce426f
+	stb	6,0(dst)
ce426f
+	/* Return original DST pointer.  */
ce426f
 	blr
ce426f
 
ce426f
-	/* Handles copies of 0~8 bytes.  */
ce426f
-	.align  4
ce426f
+
ce426f
+/* Handles copies of 0~8 bytes.  */
ce426f
+	.align	4
ce426f
 L(copy_LE_8):
ce426f
-	bne	cr6,4f
ce426f
+	bne	cr6,L(tail4)
ce426f
 
ce426f
 	/* Though we could've used ld/std here, they are still
ce426f
 	slow for unaligned cases.  */
ce426f
 
ce426f
-	lwz	6,0(4)
ce426f
-	lwz     7,4(4)
ce426f
-	stw     6,0(3)
ce426f
-	stw     7,4(3)
ce426f
-	ld      3,-16(1)      /* Return original DST pointers.  */
ce426f
+	lwz	6,0(src)
ce426f
+	lwz	7,4(src)
ce426f
+	stw	6,0(dst)
ce426f
+	stw	7,4(dst)
ce426f
 	blr
ce426f
 
ce426f
-	.align  4
ce426f
-4:	/* Copies 4~7 bytes.  */
ce426f
-	bf	29,2b
ce426f
 
ce426f
-	lwz	6,0(4)
ce426f
-	stw     6,0(3)
ce426f
-	bf      30,5f
ce426f
-	lhz     7,4(4)
ce426f
-	sth     7,4(3)
ce426f
-	bf      31,0f
ce426f
-	lbz     8,6(4)
ce426f
-	stb     8,6(3)
ce426f
-	ld	3,-16(1)
ce426f
-	blr
ce426f
-
ce426f
-	.align  4
ce426f
-5:	/* Copy 1 byte.  */
ce426f
-	bf	31,0f
ce426f
-
ce426f
-	lbz	6,4(4)
ce426f
-	stb	6,4(3)
ce426f
-
ce426f
-0:	/* Return original DST pointer.  */
ce426f
-	ld	3,-16(1)
ce426f
-	blr
ce426f
-
ce426f
-	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
ce426f
-	SRC is not.  Use aligned quadword loads from SRC, shifted to realign
ce426f
-	the data, allowing for aligned DST stores.  */
ce426f
-	.align  4
ce426f
+/* Handle copies of 32+ bytes where SRC and DST alignments differ.  DST
ce426f
+   is first quadword aligned, then aligned quadword loads from SRC are
ce426f
+   shifted to realign the data, allowing for aligned DST stores.  */
ce426f
+	.align	4
ce426f
 L(copy_GE_32_unaligned):
ce426f
-	clrldi  0,0,60	      /* Number of bytes until the 1st
ce426f
-			      quadword.  */
ce426f
-	andi.   11,3,15       /* Check alignment of DST (against
ce426f
-			      quadwords).  */
ce426f
-	srdi    9,5,4	      /* Number of full quadwords remaining.  */
ce426f
+	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
ce426f
+#ifndef __LITTLE_ENDIAN__
ce426f
+	andi.	10,3,15	      /* Check alignment of DST (against quadwords).  */
ce426f
+#endif
ce426f
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
ce426f
 
ce426f
 	beq	L(copy_GE_32_unaligned_cont)
ce426f
 
ce426f
-	/* SRC is not quadword aligned, get it aligned.  */
ce426f
+	/* DST is not quadword aligned, get it aligned.  */
ce426f
 
ce426f
-	mtcrf   0x01,0
ce426f
-	subf    31,0,5
ce426f
+	mtocrf	0x01,0
ce426f
+	subf	cnt,0,cnt
ce426f
 
ce426f
 	/* Vector instructions work best when proper alignment (16-bytes)
ce426f
 	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
ce426f
-1:	/* Copy 1 byte.  */
ce426f
+1:
ce426f
 	bf	31,2f
ce426f
-
ce426f
-	lbz	6,0(12)
ce426f
-	addi    12,12,1
ce426f
-	stb	6,0(3)
ce426f
-	addi    3,3,1
ce426f
-2:	/* Copy 2 bytes.  */
ce426f
+	lbz	6,0(src)
ce426f
+	addi	src,src,1
ce426f
+	stb	6,0(dst)
ce426f
+	addi	dst,dst,1
ce426f
+2:
ce426f
 	bf	30,4f
ce426f
-
ce426f
-	lhz     6,0(12)
ce426f
-	addi    12,12,2
ce426f
-	sth     6,0(3)
ce426f
-	addi    3,3,2
ce426f
-4:	/* Copy 4 bytes.  */
ce426f
+	lhz	6,0(src)
ce426f
+	addi	src,src,2
ce426f
+	sth	6,0(dst)
ce426f
+	addi	dst,dst,2
ce426f
+4:
ce426f
 	bf	29,8f
ce426f
-
ce426f
-	lwz     6,0(12)
ce426f
-	addi    12,12,4
ce426f
-	stw     6,0(3)
ce426f
-	addi    3,3,4
ce426f
-8:	/* Copy 8 bytes.  */
ce426f
+	lwz	6,0(src)
ce426f
+	addi	src,src,4
ce426f
+	stw	6,0(dst)
ce426f
+	addi	dst,dst,4
ce426f
+8:
ce426f
 	bf	28,0f
ce426f
-
ce426f
-	ld	6,0(12)
ce426f
-	addi    12,12,8
ce426f
-	std	6,0(3)
ce426f
-	addi    3,3,8
ce426f
+	ld	6,0(src)
ce426f
+	addi	src,src,8
ce426f
+	std	6,0(dst)
ce426f
+	addi	dst,dst,8
ce426f
 0:
ce426f
-	clrldi  10,12,60      /* Check alignment of SRC.  */
ce426f
-	srdi    9,31,4	      /* Number of full quadwords remaining.  */
ce426f
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
ce426f
 
ce426f
 	/* The proper alignment is present, it is OK to copy the bytes now.  */
ce426f
 L(copy_GE_32_unaligned_cont):
ce426f
 
ce426f
 	/* Setup two indexes to speed up the indexed vector operations.  */
ce426f
-	clrldi  11,31,60
ce426f
-	li      6,16	      /* Index for 16-bytes offsets.  */
ce426f
+	clrldi	10,cnt,60
ce426f
+	li	6,16	      /* Index for 16-bytes offsets.  */
ce426f
 	li	7,32	      /* Index for 32-bytes offsets.  */
ce426f
-	cmpldi  cr1,11,0
ce426f
-	srdi    8,31,5	      /* Setup the loop counter.  */
ce426f
-	mr      10,3
ce426f
-	mr      11,12
ce426f
-	mtcrf   0x01,9
ce426f
-	cmpldi  cr6,9,1
ce426f
-	lvsl    5,0,12
ce426f
-	lvx     3,0,12
ce426f
-	bf      31,L(setup_unaligned_loop)
ce426f
-
ce426f
-	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
ce426f
-	lvx     4,12,6
ce426f
-	vperm   6,3,4,5
ce426f
-	addi    11,12,16
ce426f
-	addi    10,3,16
ce426f
-	stvx    6,0,3
ce426f
+	cmpldi	cr1,10,0
ce426f
+	srdi	8,cnt,5	      /* Setup the loop counter.  */
ce426f
+	mtocrf	0x01,9
ce426f
+	cmpldi	cr6,9,1
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	lvsr	5,0,src
ce426f
+#else
ce426f
+	lvsl	5,0,src
ce426f
+#endif
ce426f
+	lvx	3,0,src
ce426f
+	li	0,0
ce426f
+	bf	31,L(setup_unaligned_loop)
ce426f
+
ce426f
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
ce426f
+	lvx	4,src,6
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	vperm	6,4,3,5
ce426f
+#else
ce426f
+	vperm	6,3,4,5
ce426f
+#endif
ce426f
+	addi	src,src,16
ce426f
+	stvx	6,0,dst
ce426f
+	addi	dst,dst,16
ce426f
 	vor	3,4,4
ce426f
+	clrrdi	0,src,60
ce426f
 
ce426f
 L(setup_unaligned_loop):
ce426f
-	mtctr   8
ce426f
-	ble     cr6,L(end_unaligned_loop)
ce426f
+	mtctr	8
ce426f
+	ble	cr6,L(end_unaligned_loop)
ce426f
 
ce426f
 	/* Copy 32 bytes at a time using vector instructions.  */
ce426f
-	.align  4
ce426f
+	.align	4
ce426f
 L(unaligned_loop):
ce426f
 
ce426f
 	/* Note: vr6/vr10 may contain data that was already copied,
ce426f
@@ -444,63 +385,56 @@
ce426f
 	some portions again. This is faster than having unaligned
ce426f
 	vector instructions though.  */
ce426f
 
ce426f
-	lvx	4,11,6	      /* vr4 = r11+16.  */
ce426f
-	vperm   6,3,4,5	      /* Merge the correctly-aligned portions
ce426f
-			      of vr3/vr4 into vr6.  */
ce426f
-	lvx	3,11,7	      /* vr3 = r11+32.  */
ce426f
-	vperm   10,4,3,5      /* Merge the correctly-aligned portions
ce426f
-			      of vr3/vr4 into vr10.  */
ce426f
-	addi    11,11,32
ce426f
-	stvx    6,0,10
ce426f
-	stvx    10,10,6
ce426f
-	addi    10,10,32
ce426f
-
ce426f
+	lvx	4,src,6
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	vperm	6,4,3,5
ce426f
+#else
ce426f
+	vperm	6,3,4,5
ce426f
+#endif
ce426f
+	lvx	3,src,7
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	vperm	10,3,4,5
ce426f
+#else
ce426f
+	vperm	10,4,3,5
ce426f
+#endif
ce426f
+	addi	src,src,32
ce426f
+	stvx	6,0,dst
ce426f
+	stvx	10,dst,6
ce426f
+	addi	dst,dst,32
ce426f
 	bdnz	L(unaligned_loop)
ce426f
 
ce426f
-	.align  4
ce426f
+	clrrdi	0,src,60
ce426f
+
ce426f
+	.align	4
ce426f
 L(end_unaligned_loop):
ce426f
 
ce426f
 	/* Check for tail bytes.  */
ce426f
-	rldicr  0,31,0,59
ce426f
-	mtcrf   0x01,31
ce426f
-	beq	cr1,0f
ce426f
+	mtocrf	0x01,cnt
ce426f
+	beqlr	cr1
ce426f
 
ce426f
-	add	3,3,0
ce426f
-	add	12,12,0
ce426f
+	add	src,src,0
ce426f
 
ce426f
 	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
ce426f
-8:	/* Copy 8 bytes.  */
ce426f
+	/* Copy 8 bytes.  */
ce426f
 	bf	28,4f
ce426f
-
ce426f
-	lwz	6,0(12)
ce426f
-	lwz	7,4(12)
ce426f
-	addi    12,12,8
ce426f
-	stw	6,0(3)
ce426f
-	stw	7,4(3)
ce426f
-	addi    3,3,8
ce426f
-4:	/* Copy 4 bytes.  */
ce426f
-	bf	29,2f
ce426f
-
ce426f
-	lwz	6,0(12)
ce426f
-	addi    12,12,4
ce426f
-	stw	6,0(3)
ce426f
-	addi    3,3,4
ce426f
-2:	/* Copy 2~3 bytes.  */
ce426f
-	bf	30,1f
ce426f
-
ce426f
-	lhz	6,0(12)
ce426f
-	addi    12,12,2
ce426f
-	sth	6,0(3)
ce426f
-	addi    3,3,2
ce426f
-1:	/* Copy 1 byte.  */
ce426f
-	bf	31,0f
ce426f
-
ce426f
-	lbz	6,0(12)
ce426f
-	stb	6,0(3)
ce426f
-0:	/* Return original DST pointer.  */
ce426f
-	ld	31,-8(1)
ce426f
-	ld	3,-16(1)
ce426f
+	lwz	6,0(src)
ce426f
+	lwz	7,4(src)
ce426f
+	addi	src,src,8
ce426f
+	stw	6,0(dst)
ce426f
+	stw	7,4(dst)
ce426f
+	addi	dst,dst,8
ce426f
+4:	/* Copy 4~7 bytes.  */
ce426f
+	bf	29,L(tail2)
ce426f
+	lwz	6,0(src)
ce426f
+	stw	6,0(dst)
ce426f
+	bf	30,L(tail5)
ce426f
+	lhz	7,4(src)
ce426f
+	sth	7,4(dst)
ce426f
+	bflr	31
ce426f
+	lbz	8,6(src)
ce426f
+	stb	8,6(dst)
ce426f
+	/* Return original DST pointer.  */
ce426f
 	blr
ce426f
 
ce426f
-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
ce426f
+END_GEN_TB (memcpy,TB_TOCLESS)
ce426f
 libc_hidden_builtin_def (memcpy)
ce426f
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S
ce426f
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S	2014-05-29 13:04:56.000000000 -0500
ce426f
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S	2014-05-29 13:04:56.000000000 -0500
ce426f
@@ -367,13 +367,21 @@
ce426f
 	mr	11,12
ce426f
 	mtcrf	0x01,9
ce426f
 	cmpldi	cr6,9,1
ce426f
-	lvsl	5,0,12
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	lvsr    5,0,12
ce426f
+#else
ce426f
+	lvsl    5,0,12
ce426f
+#endif
ce426f
 	lvx	3,0,12
ce426f
 	bf	31,L(setup_unaligned_loop)
ce426f
 
ce426f
 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
ce426f
 	lvx	4,12,6
ce426f
-	vperm	6,3,4,5
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	vperm   6,4,3,5
ce426f
+#else
ce426f
+	vperm   6,3,4,5
ce426f
+#endif
ce426f
 	addi	11,12,16
ce426f
 	addi	10,3,16
ce426f
 	stvx	6,0,3
ce426f
@@ -393,11 +401,17 @@
ce426f
 	vector instructions though.  */
ce426f
 
ce426f
 	lvx	4,11,6	      /* vr4 = r11+16.  */
ce426f
-	vperm	6,3,4,5	      /* Merge the correctly-aligned portions
ce426f
-				 of vr3/vr4 into vr6.  */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	vperm   6,4,3,5
ce426f
+#else
ce426f
+	vperm   6,3,4,5
ce426f
+#endif
ce426f
 	lvx	3,11,7	      /* vr3 = r11+32.  */
ce426f
-	vperm	10,4,3,5      /* Merge the correctly-aligned portions
ce426f
-				 of vr3/vr4 into vr10.  */
ce426f
+#ifdef __LITTLE_ENDIAN__
ce426f
+	vperm   10,3,4,5
ce426f
+#else
ce426f
+	vperm   10,4,3,5
ce426f
+#endif
ce426f
 	addi	11,11,32
ce426f
 	stvx	6,0,10
ce426f
 	stvx	10,10,6