Tree - rpms/glibc - CentOS Git server

rpms / glibc

Blame SOURCES/glibc-ppc64le-31.patch

Blob History Raw

		5de29b	`# commit 759cfef3ac4c07dba1ece0bbc1207e099348816d`
		5de29b	`# Author: Alan Modra <amodra@gmail.com>`
		5de29b	`# Date: Sat Aug 17 18:47:22 2013 +0930`
		5de29b	`#`
		5de29b	`# PowerPC LE memcpy`
		5de29b	`# http://sourceware.org/ml/libc-alpha/2013-08/msg00103.html`
		5de29b	`#`
		5de29b	`# Little-endian support for memcpy. I spent some time cleaning up the`
		5de29b	`# 64-bit power7 memcpy, in order to avoid the extra alignment traps`
		5de29b	`# power7 takes for little-endian. It probably would have been better`
		5de29b	`# to copy the linux kernel version of memcpy.`
		5de29b	`#`
		5de29b	`# * sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support.`
		5de29b	`# * sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.`
		5de29b	`# * sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.`
		5de29b	`# * sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise.`
		5de29b	`# * sysdeps/powerpc/powerpc64/memcpy.S: Likewise.`
		5de29b	`# * sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise.`
		5de29b	`# * sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise.`
		5de29b	`# * sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.`
		5de29b	`# * sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise. Make better`
		5de29b	`# use of regs. Use power7 mtocrf. Tidy function tails.`
		5de29b	`#`
		12745e	`diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S`
		12745e	`--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		12745e	`+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		5de29b	`@@ -205,15 +205,28 @@`
		5de29b	`blt cr6,5f`
		5de29b	`srwi 7,6,16`
		5de29b	`bgt cr6,3f`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ sth 7,0(3)`
		5de29b	`+#else`
		5de29b	`sth 6,0(3)`
		5de29b	`+#endif`
		5de29b	`b 7f`
		5de29b	`.align 4`
		5de29b	`3:`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ rotlwi 6,6,24`
		5de29b	`+ stb 6,0(3)`
		5de29b	`+ sth 7,1(3)`
		5de29b	`+#else`
		5de29b	`stb 7,0(3)`
		5de29b	`sth 6,1(3)`
		5de29b	`+#endif`
		5de29b	`b 7f`
		5de29b	`.align 4`
		5de29b	`5:`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ rotlwi 6,6,8`
		5de29b	`+#endif`
		5de29b	`stb 6,0(3)`
		5de29b	`7:`
		5de29b	`cmplwi cr1,10,16`
		5de29b	`@@ -341,13 +354,23 @@`
		5de29b	`bf 30,1f`
		5de29b
		5de29b	`/* there are at least two words to copy, so copy them */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srw 0,6,10`
		5de29b	`+ slw 8,7,9`
		5de29b	`+#else`
		5de29b	`slw 0,6,10 /* shift 1st src word to left align it in R0 */`
		5de29b	`srw 8,7,9 /* shift 2nd src word to right align it in R8 */`
		5de29b	`+#endif`
		5de29b	`or 0,0,8 /* or them to get word to store */`
		5de29b	`lwz 6,8(5) /* load the 3rd src word */`
		5de29b	`stw 0,0(4) /* store the 1st dst word */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srw 0,7,10`
		5de29b	`+ slw 8,6,9`
		5de29b	`+#else`
		5de29b	`slw 0,7,10 /* now left align 2nd src word into R0 */`
		5de29b	`srw 8,6,9 /* shift 3rd src word to right align it in R8 */`
		5de29b	`+#endif`
		5de29b	`or 0,0,8 /* or them to get word to store */`
		5de29b	`lwz 7,12(5)`
		5de29b	`stw 0,4(4) /* store the 2nd dst word */`
		5de29b	`@@ -355,8 +378,13 @@`
		5de29b	`addi 5,5,16`
		5de29b	`bf 31,4f`
		5de29b	`/* there is a third word to copy, so copy it */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srw 0,6,10`
		5de29b	`+ slw 8,7,9`
		5de29b	`+#else`
		5de29b	`slw 0,6,10 /* shift 3rd src word to left align it in R0 */`
		5de29b	`srw 8,7,9 /* shift 4th src word to right align it in R8 */`
		5de29b	`+#endif`
		5de29b	`or 0,0,8 /* or them to get word to store */`
		5de29b	`stw 0,0(4) /* store 3rd dst word */`
		5de29b	`mr 6,7`
		5de29b	`@@ -366,8 +394,13 @@`
		5de29b	`b 4f`
		5de29b	`.align 4`
		5de29b	`1:`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srw 0,6,10`
		5de29b	`+ slw 8,7,9`
		5de29b	`+#else`
		5de29b	`slw 0,6,10 /* shift 1st src word to left align it in R0 */`
		5de29b	`srw 8,7,9 /* shift 2nd src word to right align it in R8 */`
		5de29b	`+#endif`
		5de29b	`addi 5,5,8`
		5de29b	`or 0,0,8 /* or them to get word to store */`
		5de29b	`bf 31,4f`
		5de29b	`@@ -380,23 +413,43 @@`
		5de29b	`.align 4`
		5de29b	`4:`
		5de29b	`/* copy 16 bytes at a time */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srw 0,6,10`
		5de29b	`+ slw 8,7,9`
		5de29b	`+#else`
		5de29b	`slw 0,6,10`
		5de29b	`srw 8,7,9`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`lwz 6,0(5)`
		5de29b	`stw 0,0(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srw 0,7,10`
		5de29b	`+ slw 8,6,9`
		5de29b	`+#else`
		5de29b	`slw 0,7,10`
		5de29b	`srw 8,6,9`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`lwz 7,4(5)`
		5de29b	`stw 0,4(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srw 0,6,10`
		5de29b	`+ slw 8,7,9`
		5de29b	`+#else`
		5de29b	`slw 0,6,10`
		5de29b	`srw 8,7,9`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`lwz 6,8(5)`
		5de29b	`stw 0,8(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srw 0,7,10`
		5de29b	`+ slw 8,6,9`
		5de29b	`+#else`
		5de29b	`slw 0,7,10`
		5de29b	`srw 8,6,9`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`lwz 7,12(5)`
		5de29b	`stw 0,12(4)`
		5de29b	`@@ -405,8 +458,13 @@`
		5de29b	`bdnz+ 4b`
		5de29b	`8:`
		5de29b	`/* calculate and store the final word */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srw 0,6,10`
		5de29b	`+ slw 8,7,9`
		5de29b	`+#else`
		5de29b	`slw 0,6,10`
		5de29b	`srw 8,7,9`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`stw 0,0(4)`
		5de29b	`3:`
		12745e	`diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S`
		12745e	`--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		12745e	`+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		5de29b	`@@ -221,15 +221,28 @@`
		5de29b	`blt cr6,5f`
		5de29b	`srwi 7,6,16`
		5de29b	`bgt cr6,3f`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ sth 7,0(3)`
		5de29b	`+#else`
		5de29b	`sth 6,0(3)`
		5de29b	`+#endif`
		5de29b	`b 7f`
		5de29b	`.align 4`
		5de29b	`3:`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ rotlwi 6,6,24`
		5de29b	`+ stb 6,0(3)`
		5de29b	`+ sth 7,1(3)`
		5de29b	`+#else`
		5de29b	`stb 7,0(3)`
		5de29b	`sth 6,1(3)`
		5de29b	`+#endif`
		5de29b	`b 7f`
		5de29b	`.align 4`
		5de29b	`5:`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ rotlwi 6,6,8`
		5de29b	`+#endif`
		5de29b	`stb 6,0(3)`
		5de29b	`7:`
		5de29b	`cmplwi cr1,10,16`
		5de29b	`@@ -579,7 +592,11 @@`
		5de29b	`lwz 6,-1(4)`
		5de29b	`cmplwi cr6,31,4`
		5de29b	`srwi 8,31,5 /* calculate the 32 byte loop count */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srwi 6,6,8`
		5de29b	`+#else`
		5de29b	`slwi 6,6,8`
		5de29b	`+#endif`
		5de29b	`clrlwi 31,31,27 /* The remaining bytes, < 32. */`
		5de29b	`blt cr5,L(wdu1_32tail)`
		5de29b	`mtctr 8`
		5de29b	`@@ -587,8 +604,12 @@`
		5de29b
		5de29b	`lwz 8,3(4)`
		5de29b	`lwz 7,4(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ rldimi 6,8,24,32`
		5de29b	`+#else`
		5de29b	`/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */`
		5de29b	`rlwimi 6,8,8,(32-8),31`
		5de29b	`+#endif`
		5de29b	`b L(wdu1_loop32x)`
		5de29b	`.align 4`
		5de29b	`L(wdu1_loop32):`
		5de29b	`@@ -597,8 +618,12 @@`
		5de29b	`lwz 7,4(4)`
		5de29b	`stw 10,-8(3)`
		5de29b	`stw 11,-4(3)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ rldimi 6,8,24,32`
		5de29b	`+#else`
		5de29b	`/* Equivalent to srwi 8,8,32-8; or 6,6,8 */`
		5de29b	`rlwimi 6,8,8,(32-8),31`
		5de29b	`+#endif`
		5de29b	`L(wdu1_loop32x):`
		5de29b	`lwz 10,8(4)`
		5de29b	`lwz 11,12(4)`
		5de29b	`@@ -615,7 +640,11 @@`
		5de29b	`stw 6,16(3)`
		5de29b	`stw 7,20(3)`
		5de29b	`addi 3,3,32`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srwi 6,8,8`
		5de29b	`+#else`
		5de29b	`slwi 6,8,8`
		5de29b	`+#endif`
		5de29b	`bdnz+ L(wdu1_loop32)`
		5de29b	`stw 10,-8(3)`
		5de29b	`stw 11,-4(3)`
		5de29b	`@@ -626,8 +655,12 @@`
		5de29b	`blt cr6,L(wdu_4tail)`
		5de29b	`/* calculate and store the final word */`
		5de29b	`lwz 8,3(4)`
		5de29b	`-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ rldimi 6,8,24,32`
		5de29b	`+#else`
		5de29b	`+/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */`
		5de29b	`rlwimi 6,8,8,(32-8),31`
		5de29b	`+#endif`
		5de29b	`b L(wdu_32tailx)`
		5de29b
		5de29b	`L(wdu2_32):`
		5de29b	`@@ -635,7 +668,11 @@`
		5de29b	`lwz 6,-2(4)`
		5de29b	`cmplwi cr6,31,4`
		5de29b	`srwi 8,31,5 /* calculate the 32 byte loop count */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srwi 6,6,16`
		5de29b	`+#else`
		5de29b	`slwi 6,6,16`
		5de29b	`+#endif`
		5de29b	`clrlwi 31,31,27 /* The remaining bytes, < 32. */`
		5de29b	`blt cr5,L(wdu2_32tail)`
		5de29b	`mtctr 8`
		5de29b	`@@ -643,8 +680,11 @@`
		5de29b
		5de29b	`lwz 8,2(4)`
		5de29b	`lwz 7,4(4)`
		5de29b	`-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ rldimi 6,8,16,32`
		5de29b	`+#else`
		5de29b	`rlwimi 6,8,16,(32-16),31`
		5de29b	`+#endif`
		5de29b	`b L(wdu2_loop32x)`
		5de29b	`.align 4`
		5de29b	`L(wdu2_loop32):`
		5de29b	`@@ -653,8 +693,11 @@`
		5de29b	`lwz 7,4(4)`
		5de29b	`stw 10,-8(3)`
		5de29b	`stw 11,-4(3)`
		5de29b	`-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ rldimi 6,8,16,32`
		5de29b	`+#else`
		5de29b	`rlwimi 6,8,16,(32-16),31`
		5de29b	`+#endif`
		5de29b	`L(wdu2_loop32x):`
		5de29b	`lwz 10,8(4)`
		5de29b	`lwz 11,12(4)`
		5de29b	`@@ -672,7 +715,11 @@`
		5de29b	`stw 6,16(3)`
		5de29b	`stw 7,20(3)`
		5de29b	`addi 3,3,32`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srwi 6,8,16`
		5de29b	`+#else`
		5de29b	`slwi 6,8,16`
		5de29b	`+#endif`
		5de29b	`bdnz+ L(wdu2_loop32)`
		5de29b	`stw 10,-8(3)`
		5de29b	`stw 11,-4(3)`
		5de29b	`@@ -683,8 +730,11 @@`
		5de29b	`blt cr6,L(wdu_4tail)`
		5de29b	`/* calculate and store the final word */`
		5de29b	`lwz 8,2(4)`
		5de29b	`-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ rldimi 6,8,16,32`
		5de29b	`+#else`
		5de29b	`rlwimi 6,8,16,(32-16),31`
		5de29b	`+#endif`
		5de29b	`b L(wdu_32tailx)`
		5de29b
		5de29b	`L(wdu3_32):`
		5de29b	`@@ -692,7 +742,11 @@`
		5de29b	`lwz 6,-3(4)`
		5de29b	`cmplwi cr6,31,4`
		5de29b	`srwi 8,31,5 /* calculate the 32 byte loop count */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srwi 6,6,24`
		5de29b	`+#else`
		5de29b	`slwi 6,6,24`
		5de29b	`+#endif`
		5de29b	`clrlwi 31,31,27 /* The remaining bytes, < 32. */`
		5de29b	`blt cr5,L(wdu3_32tail)`
		5de29b	`mtctr 8`
		5de29b	`@@ -700,8 +754,11 @@`
		5de29b
		5de29b	`lwz 8,1(4)`
		5de29b	`lwz 7,4(4)`
		5de29b	`-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ rldimi 6,8,8,32`
		5de29b	`+#else`
		5de29b	`rlwimi 6,8,24,(32-24),31`
		5de29b	`+#endif`
		5de29b	`b L(wdu3_loop32x)`
		5de29b	`.align 4`
		5de29b	`L(wdu3_loop32):`
		5de29b	`@@ -710,8 +767,11 @@`
		5de29b	`lwz 7,4(4)`
		5de29b	`stw 10,-8(3)`
		5de29b	`stw 11,-4(3)`
		5de29b	`-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ rldimi 6,8,8,32`
		5de29b	`+#else`
		5de29b	`rlwimi 6,8,24,(32-24),31`
		5de29b	`+#endif`
		5de29b	`L(wdu3_loop32x):`
		5de29b	`lwz 10,8(4)`
		5de29b	`lwz 11,12(4)`
		5de29b	`@@ -728,7 +788,11 @@`
		5de29b	`stw 6,16(3)`
		5de29b	`stw 7,20(3)`
		5de29b	`addi 3,3,32`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srwi 6,8,24`
		5de29b	`+#else`
		5de29b	`slwi 6,8,24`
		5de29b	`+#endif`
		5de29b	`bdnz+ L(wdu3_loop32)`
		5de29b	`stw 10,-8(3)`
		5de29b	`stw 11,-4(3)`
		5de29b	`@@ -739,8 +803,11 @@`
		5de29b	`blt cr6,L(wdu_4tail)`
		5de29b	`/* calculate and store the final word */`
		5de29b	`lwz 8,1(4)`
		5de29b	`-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ rldimi 6,8,8,32`
		5de29b	`+#else`
		5de29b	`rlwimi 6,8,24,(32-24),31`
		5de29b	`+#endif`
		5de29b	`b L(wdu_32tailx)`
		5de29b	`.align 4`
		5de29b	`L(wdu_32tailx):`
		12745e	`diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S`
		12745e	`--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		12745e	`+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		5de29b	`@@ -385,7 +385,7 @@`
		5de29b
		5de29b	`beq L(copy_GE_32_unaligned_cont)`
		5de29b
		5de29b	`- /* SRC is not quadword aligned, get it aligned. */`
		5de29b	`+ /* DST is not quadword aligned, get it aligned. */`
		5de29b
		5de29b	`mtcrf 0x01,0`
		5de29b	`subf 31,0,5`
		5de29b	`@@ -437,13 +437,21 @@`
		5de29b	`mr 11,12`
		5de29b	`mtcrf 0x01,9`
		5de29b	`cmplwi cr6,9,1`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ lvsr 5,0,12`
		5de29b	`+#else`
		5de29b	`lvsl 5,0,12`
		5de29b	`+#endif`
		5de29b	`lvx 3,0,12`
		5de29b	`bf 31,L(setup_unaligned_loop)`
		5de29b
		5de29b	`/* Copy another 16 bytes to align to 32-bytes due to the loop . */`
		5de29b	`lvx 4,12,6`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ vperm 6,4,3,5`
		5de29b	`+#else`
		5de29b	`vperm 6,3,4,5`
		5de29b	`+#endif`
		5de29b	`addi 11,12,16`
		5de29b	`addi 10,3,16`
		5de29b	`stvx 6,0,3`
		5de29b	`@@ -463,11 +471,17 @@`
		5de29b	`vector instructions though. */`
		5de29b
		5de29b	`lvx 4,11,6 /* vr4 = r11+16. */`
		5de29b	`- vperm 6,3,4,5 /* Merge the correctly-aligned portions`
		5de29b	`- of vr3/vr4 into vr6. */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ vperm 6,4,3,5`
		5de29b	`+#else`
		5de29b	`+ vperm 6,3,4,5`
		5de29b	`+#endif`
		5de29b	`lvx 3,11,7 /* vr3 = r11+32. */`
		5de29b	`- vperm 10,4,3,5 /* Merge the correctly-aligned portions`
		5de29b	`- of vr3/vr4 into vr10. */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ vperm 10,3,4,5`
		5de29b	`+#else`
		5de29b	`+ vperm 10,4,3,5`
		5de29b	`+#endif`
		5de29b	`addi 11,11,32`
		5de29b	`stvx 6,0,10`
		5de29b	`stvx 10,10,6`
		12745e	`diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S`
		12745e	`--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500`
		12745e	`+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500`
		5de29b	`@@ -327,7 +327,7 @@`
		5de29b
		5de29b	`beq L(copy_GE_32_unaligned_cont)`
		5de29b
		5de29b	`- /* SRC is not quadword aligned, get it aligned. */`
		5de29b	`+ /* DST is not quadword aligned, get it aligned. */`
		5de29b
		5de29b	`mtcrf 0x01,0`
		5de29b	`subf 31,0,5`
		5de29b	`@@ -379,13 +379,21 @@`
		5de29b	`mr 11,12`
		5de29b	`mtcrf 0x01,9`
		5de29b	`cmplwi cr6,9,1`
		5de29b	`- lvsl 5,0,12`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ lvsr 5,0,12`
		5de29b	`+#else`
		5de29b	`+ lvsl 5,0,12`
		5de29b	`+#endif`
		5de29b	`lvx 3,0,12`
		5de29b	`bf 31,L(setup_unaligned_loop)`
		5de29b
		5de29b	`/* Copy another 16 bytes to align to 32-bytes due to the loop . */`
		5de29b	`lvx 4,12,6`
		5de29b	`- vperm 6,3,4,5`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ vperm 6,4,3,5`
		5de29b	`+#else`
		5de29b	`+ vperm 6,3,4,5`
		5de29b	`+#endif`
		5de29b	`addi 11,12,16`
		5de29b	`addi 10,3,16`
		5de29b	`stvx 6,0,3`
		5de29b	`@@ -405,11 +413,17 @@`
		5de29b	`vector instructions though. */`
		5de29b
		5de29b	`lvx 4,11,6 /* vr4 = r11+16. */`
		5de29b	`- vperm 6,3,4,5 /* Merge the correctly-aligned portions`
		5de29b	`- of vr3/vr4 into vr6. */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ vperm 6,4,3,5`
		5de29b	`+#else`
		5de29b	`+ vperm 6,3,4,5`
		5de29b	`+#endif`
		5de29b	`lvx 3,11,7 /* vr3 = r11+32. */`
		5de29b	`- vperm 10,4,3,5 /* Merge the correctly-aligned portions`
		5de29b	`- of vr3/vr4 into vr10. */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ vperm 10,3,4,5`
		5de29b	`+#else`
		5de29b	`+ vperm 10,4,3,5`
		5de29b	`+#endif`
		5de29b	`addi 11,11,32`
		5de29b	`stvx 6,0,10`
		5de29b	`stvx 10,10,6`
		12745e	`diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S`
		12745e	`--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		12745e	`+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		5de29b	`@@ -214,15 +214,28 @@`
		5de29b	`blt cr6,5f`
		5de29b	`srdi 7,6,16`
		5de29b	`bgt cr6,3f`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ sth 7,0(3)`
		5de29b	`+#else`
		5de29b	`sth 6,0(3)`
		5de29b	`+#endif`
		5de29b	`b 7f`
		5de29b	`.align 4`
		5de29b	`3:`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ rotlwi 6,6,24`
		5de29b	`+ stb 6,0(3)`
		5de29b	`+ sth 7,1(3)`
		5de29b	`+#else`
		5de29b	`stb 7,0(3)`
		5de29b	`sth 6,1(3)`
		5de29b	`+#endif`
		5de29b	`b 7f`
		5de29b	`.align 4`
		5de29b	`5:`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ rotlwi 6,6,8`
		5de29b	`+#endif`
		5de29b	`stb 6,0(3)`
		5de29b	`7:`
		5de29b	`cmpldi cr1,10,16`
		5de29b	`@@ -330,7 +343,11 @@`
		5de29b	`ld 7,8(5)`
		5de29b	`subfic 9,10,64`
		5de29b	`beq 2f`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srd 0,6,10`
		5de29b	`+#else`
		5de29b	`sld 0,6,10`
		5de29b	`+#endif`
		5de29b	`cmpldi 11,1`
		5de29b	`mr 6,7`
		5de29b	`addi 4,4,-8`
		5de29b	`@@ -338,15 +355,25 @@`
		5de29b	`b 1f`
		5de29b	`2: addi 5,5,8`
		5de29b	`.align 4`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+0: srd 0,6,10`
		5de29b	`+ sld 8,7,9`
		5de29b	`+#else`
		5de29b	`0: sld 0,6,10`
		5de29b	`srd 8,7,9`
		5de29b	`+#endif`
		5de29b	`cmpldi 11,2`
		5de29b	`ld 6,8(5)`
		5de29b	`or 0,0,8`
		5de29b	`addi 11,11,-2`
		5de29b	`std 0,0(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srd 0,7,10`
		5de29b	`+1: sld 8,6,9`
		5de29b	`+#else`
		5de29b	`sld 0,7,10`
		5de29b	`1: srd 8,6,9`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`beq 8f`
		5de29b	`ld 7,16(5)`
		12745e	`diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S`
		12745e	`--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		12745e	`+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S 2014-05-29 13:05:51.000000000 -0500`
		5de29b	`@@ -1,5 +1,5 @@`
		5de29b	`/* Optimized memcpy implementation for PowerPC64.`
		5de29b	`- Copyright (C) 2003, 2006, 2011 Free Software Foundation, Inc.`
		5de29b	`+ Copyright (C) 2003-2014 Free Software Foundation, Inc.`
		5de29b	`This file is part of the GNU C Library.`
		5de29b
		5de29b	`The GNU C Library is free software; you can redistribute it and/or`
		5de29b	`@@ -17,26 +17,24 @@`
		5de29b	`<http://www.gnu.org/licenses/>. */`
		5de29b
		5de29b	`#include <sysdep.h>`
		5de29b	`-#include <bp-sym.h>`
		5de29b	`-#include <bp-asm.h>`
		5de29b
		5de29b	`/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);`
		5de29b	`Returns 'dst'.`
		5de29b
		5de29b	`- Memcpy handles short copies (< 32-bytes) using a binary move blocks`
		5de29b	`- (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled`
		5de29b	`- with the appropriate combination of byte and halfword load/stores.`
		5de29b	`- There is minimal effort to optimize the alignment of short moves.`
		5de29b	`+ Memcpy handles short copies (< 32-bytes) using a binary move blocks`
		5de29b	`+ (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled`
		5de29b	`+ with the appropriate combination of byte and halfword load/stores.`
		5de29b	`+ There is minimal effort to optimize the alignment of short moves.`
		5de29b	`The 64-bit implementations of POWER3 and POWER4 do a reasonable job`
		5de29b	`- of handling unligned load/stores that do not cross 32-byte boundries.`
		5de29b	`+ of handling unaligned load/stores that do not cross 32-byte boundaries.`
		5de29b
		5de29b	`Longer moves (>= 32-bytes) justify the effort to get at least the`
		5de29b	`destination doubleword (8-byte) aligned. Further optimization is`
		5de29b	`- posible when both source and destination are doubleword aligned.`
		5de29b	`+ possible when both source and destination are doubleword aligned.`
		5de29b	`Each case has a optimized unrolled loop. */`
		5de29b
		5de29b	`.machine power4`
		5de29b	`-EALIGN (BP_SYM (memcpy), 5, 0)`
		5de29b	`+EALIGN (memcpy, 5, 0)`
		5de29b	`CALL_MCOUNT 3`
		5de29b
		5de29b	`cmpldi cr1,5,31`
		5de29b	`@@ -44,20 +42,20 @@`
		5de29b	`std 3,-16(1)`
		5de29b	`std 31,-8(1)`
		5de29b	`cfi_offset(31,-8)`
		5de29b	`- andi. 11,3,7 /* check alignement of dst. */`
		5de29b	`+ andi. 11,3,7 /* check alignment of dst. */`
		5de29b	`clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */`
		5de29b	`- clrldi 10,4,61 /* check alignement of src. */`
		5de29b	`+ clrldi 10,4,61 /* check alignment of src. */`
		5de29b	`cmpldi cr6,5,8`
		5de29b	`ble- cr1,.L2 /* If move < 32 bytes use short move code. */`
		5de29b	`- cmpld cr6,10,11`
		5de29b	`+ cmpld cr6,10,11`
		5de29b	`mr 12,4`
		5de29b	`srdi 9,5,3 /* Number of full double words remaining. */`
		5de29b	`mtcrf 0x01,0`
		5de29b	`mr 31,5`
		5de29b	`beq .L0`
		5de29b	`-`
		5de29b	`+`
		5de29b	`subf 31,0,5`
		5de29b	`- /* Move 0-7 bytes as needed to get the destination doubleword alligned. */`
		5de29b	`+ /* Move 0-7 bytes as needed to get the destination doubleword aligned. */`
		5de29b	`1: bf 31,2f`
		5de29b	`lbz 6,0(12)`
		5de29b	`addi 12,12,1`
		5de29b	`@@ -74,17 +72,17 @@`
		5de29b	`stw 6,0(3)`
		5de29b	`addi 3,3,4`
		5de29b	`0:`
		5de29b	`- clrldi 10,12,61 /* check alignement of src again. */`
		5de29b	`+ clrldi 10,12,61 /* check alignment of src again. */`
		5de29b	`srdi 9,31,3 /* Number of full double words remaining. */`
		5de29b	`-`
		5de29b	`- /* Copy doublewords from source to destination, assumpting the`
		5de29b	`+`
		5de29b	`+ /* Copy doublewords from source to destination, assuming the`
		5de29b	`destination is aligned on a doubleword boundary.`
		5de29b
		5de29b	`At this point we know there are at least 25 bytes left (32-7) to copy.`
		5de29b	`- The next step is to determine if the source is also doubleword aligned.`
		5de29b	`+ The next step is to determine if the source is also doubleword aligned.`
		5de29b	`If not branch to the unaligned move code at .L6. which uses`
		5de29b	`a load, shift, store strategy.`
		5de29b	`-`
		5de29b	`+`
		5de29b	`Otherwise source and destination are doubleword aligned, and we can`
		5de29b	`the optimized doubleword copy loop. */`
		5de29b	`.L0:`
		5de29b	`@@ -97,14 +95,14 @@`
		5de29b	`Use a unrolled loop to copy 4 doubleword (32-bytes) per iteration.`
		5de29b	`If the copy is not an exact multiple of 32 bytes, 1-3`
		5de29b	`doublewords are copied as needed to set up the main loop. After`
		5de29b	`- the main loop exits there may be a tail of 1-7 bytes. These byte are`
		5de29b	`+ the main loop exits there may be a tail of 1-7 bytes. These byte are`
		5de29b	`copied a word/halfword/byte at a time as needed to preserve alignment. */`
		5de29b
		5de29b	`srdi 8,31,5`
		5de29b	`cmpldi cr1,9,4`
		5de29b	`cmpldi cr6,11,0`
		5de29b	`mr 11,12`
		5de29b	`-`
		5de29b	`+`
		5de29b	`bf 30,1f`
		5de29b	`ld 6,0(12)`
		5de29b	`ld 7,8(12)`
		5de29b	`@@ -115,7 +113,7 @@`
		5de29b	`addi 10,3,16`
		5de29b	`bf 31,4f`
		5de29b	`ld 0,16(12)`
		5de29b	`- std 0,16(3)`
		5de29b	`+ std 0,16(3)`
		5de29b	`blt cr1,3f`
		5de29b	`addi 11,12,24`
		5de29b	`addi 10,3,24`
		5de29b	`@@ -129,7 +127,7 @@`
		5de29b	`addi 11,12,8`
		5de29b	`std 6,0(3)`
		5de29b	`addi 10,3,8`
		5de29b	`-`
		5de29b	`+`
		5de29b	`.align 4`
		5de29b	`4:`
		5de29b	`ld 6,0(11)`
		5de29b	`@@ -144,7 +142,7 @@`
		5de29b	`std 0,24(10)`
		5de29b	`addi 10,10,32`
		5de29b	`bdnz 4b`
		5de29b	`-3:`
		5de29b	`+3:`
		5de29b
		5de29b	`rldicr 0,31,0,60`
		5de29b	`mtcrf 0x01,31`
		5de29b	`@@ -152,9 +150,9 @@`
		5de29b	`.L9:`
		5de29b	`add 3,3,0`
		5de29b	`add 12,12,0`
		5de29b	`-`
		5de29b	`+`
		5de29b	`/* At this point we have a tail of 0-7 bytes and we know that the`
		5de29b	`- destiniation is double word aligned. */`
		5de29b	`+ destination is double word aligned. */`
		5de29b	`4: bf 29,2f`
		5de29b	`lwz 6,0(12)`
		5de29b	`addi 12,12,4`
		5de29b	`@@ -173,29 +171,29 @@`
		5de29b	`ld 31,-8(1)`
		5de29b	`ld 3,-16(1)`
		5de29b	`blr`
		5de29b	`-`
		5de29b	`-/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31`
		5de29b	`- bytes. Each case is handled without loops, using binary (1,2,4,8)`
		5de29b	`- tests.`
		5de29b	`-`
		5de29b	`+`
		5de29b	`+/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31`
		5de29b	`+ bytes. Each case is handled without loops, using binary (1,2,4,8)`
		5de29b	`+ tests.`
		5de29b	`+`
		5de29b	`In the short (0-8 byte) case no attempt is made to force alignment`
		5de29b	`- of either source or destination. The hardware will handle the`
		5de29b	`- unaligned load/stores with small delays for crossing 32- 64-byte, and`
		5de29b	`+ of either source or destination. The hardware will handle the`
		5de29b	`+ unaligned load/stores with small delays for crossing 32- 64-byte, and`
		5de29b	`4096-byte boundaries. Since these short moves are unlikely to be`
		5de29b	`- unaligned or cross these boundaries, the overhead to force`
		5de29b	`+ unaligned or cross these boundaries, the overhead to force`
		5de29b	`alignment is not justified.`
		5de29b	`-`
		5de29b	`+`
		5de29b	`The longer (9-31 byte) move is more likely to cross 32- or 64-byte`
		5de29b	`boundaries. Since only loads are sensitive to the 32-/64-byte`
		5de29b	`- boundaries it is more important to align the source then the`
		5de29b	`+ boundaries it is more important to align the source then the`
		5de29b	`destination. If the source is not already word aligned, we first`
		5de29b	`- move 1-3 bytes as needed. Since we are only word aligned we don't`
		5de29b	`- use double word load/stores to insure that all loads are aligned.`
		5de29b	`+ move 1-3 bytes as needed. Since we are only word aligned we don't`
		5de29b	`+ use double word load/stores to insure that all loads are aligned.`
		5de29b	`While the destination and stores may still be unaligned, this`
		5de29b	`is only an issue for page (4096 byte boundary) crossing, which`
		5de29b	`should be rare for these short moves. The hardware handles this`
		5de29b	`- case automatically with a small delay. */`
		5de29b	`-`
		5de29b	`+ case automatically with a small delay. */`
		5de29b	`+`
		5de29b	`.align 4`
		5de29b	`.L2:`
		5de29b	`mtcrf 0x01,5`
		5de29b	`@@ -216,15 +214,28 @@`
		5de29b	`blt cr6,5f`
		5de29b	`srdi 7,6,16`
		5de29b	`bgt cr6,3f`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ sth 7,0(3)`
		5de29b	`+#else`
		5de29b	`sth 6,0(3)`
		5de29b	`+#endif`
		5de29b	`b 7f`
		5de29b	`.align 4`
		5de29b	`3:`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ rotlwi 6,6,24`
		5de29b	`+ stb 6,0(3)`
		5de29b	`+ sth 7,1(3)`
		5de29b	`+#else`
		5de29b	`stb 7,0(3)`
		5de29b	`sth 6,1(3)`
		5de29b	`+#endif`
		5de29b	`b 7f`
		5de29b	`.align 4`
		5de29b	`5:`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ rotlwi 6,6,8`
		5de29b	`+#endif`
		5de29b	`stb 6,0(3)`
		5de29b	`7:`
		5de29b	`cmpldi cr1,10,16`
		5de29b	`@@ -258,11 +269,11 @@`
		5de29b	`lwz 6,0(12)`
		5de29b	`addi 12,12,4`
		5de29b	`stw 6,0(3)`
		5de29b	`- addi 3,3,4`
		5de29b	`+ addi 3,3,4`
		5de29b	`2: /* Move 2-3 bytes. */`
		5de29b	`bf 30,1f`
		5de29b	`lhz 6,0(12)`
		5de29b	`- sth 6,0(3)`
		5de29b	`+ sth 6,0(3)`
		5de29b	`bf 31,0f`
		5de29b	`lbz 7,2(12)`
		5de29b	`stb 7,2(3)`
		5de29b	`@@ -283,8 +294,8 @@`
		5de29b	`mr 12,4`
		5de29b	`bne cr6,4f`
		5de29b	`/* Would have liked to use use ld/std here but the 630 processors are`
		5de29b	`- slow for load/store doubles that are not at least word aligned.`
		5de29b	`- Unaligned Load/Store word execute with only a 1 cycle penaltity. */`
		5de29b	`+ slow for load/store doubles that are not at least word aligned.`
		5de29b	`+ Unaligned Load/Store word execute with only a 1 cycle penalty. */`
		5de29b	`lwz 6,0(4)`
		5de29b	`lwz 7,4(4)`
		5de29b	`stw 6,0(3)`
		5de29b	`@@ -299,14 +310,14 @@`
		5de29b	`6:`
		5de29b	`bf 30,5f`
		5de29b	`lhz 7,4(4)`
		5de29b	`- sth 7,4(3)`
		5de29b	`+ sth 7,4(3)`
		5de29b	`bf 31,0f`
		5de29b	`lbz 8,6(4)`
		5de29b	`stb 8,6(3)`
		5de29b	`ld 3,-16(1)`
		5de29b	`blr`
		5de29b	`.align 4`
		5de29b	`-5:`
		5de29b	`+5:`
		5de29b	`bf 31,0f`
		5de29b	`lbz 6,4(4)`
		5de29b	`stb 6,4(3)`
		5de29b	`@@ -336,13 +347,23 @@`
		5de29b	`bf 30,1f`
		5de29b
		5de29b	`/* there are at least two DWs to copy */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srd 0,6,10`
		5de29b	`+ sld 8,7,9`
		5de29b	`+#else`
		5de29b	`sld 0,6,10`
		5de29b	`srd 8,7,9`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,16(5)`
		5de29b	`std 0,0(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srd 0,7,10`
		5de29b	`+ sld 8,6,9`
		5de29b	`+#else`
		5de29b	`sld 0,7,10`
		5de29b	`srd 8,6,9`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,24(5)`
		5de29b	`std 0,8(4)`
		5de29b	`@@ -351,8 +372,13 @@`
		5de29b	`blt cr6,8f /* if total DWs = 3, then bypass loop */`
		5de29b	`bf 31,4f`
		5de29b	`/* there is a third DW to copy */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srd 0,6,10`
		5de29b	`+ sld 8,7,9`
		5de29b	`+#else`
		5de29b	`sld 0,6,10`
		5de29b	`srd 8,7,9`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`std 0,0(4)`
		5de29b	`mr 6,7`
		5de29b	`@@ -363,8 +389,13 @@`
		5de29b	`b 4f`
		5de29b	`.align 4`
		5de29b	`1:`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srd 0,6,10`
		5de29b	`+ sld 8,7,9`
		5de29b	`+#else`
		5de29b	`sld 0,6,10`
		5de29b	`srd 8,7,9`
		5de29b	`+#endif`
		5de29b	`addi 5,5,16`
		5de29b	`or 0,0,8`
		5de29b	`bf 31,4f`
		5de29b	`@@ -375,23 +406,44 @@`
		5de29b	`addi 4,4,8`
		5de29b	`.align 4`
		5de29b	`/* copy 32 bytes at a time */`
		5de29b	`-4: sld 0,6,10`
		5de29b	`+4:`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srd 0,6,10`
		5de29b	`+ sld 8,7,9`
		5de29b	`+#else`
		5de29b	`+ sld 0,6,10`
		5de29b	`srd 8,7,9`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,0(5)`
		5de29b	`std 0,0(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srd 0,7,10`
		5de29b	`+ sld 8,6,9`
		5de29b	`+#else`
		5de29b	`sld 0,7,10`
		5de29b	`srd 8,6,9`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,8(5)`
		5de29b	`std 0,8(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srd 0,6,10`
		5de29b	`+ sld 8,7,9`
		5de29b	`+#else`
		5de29b	`sld 0,6,10`
		5de29b	`srd 8,7,9`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,16(5)`
		5de29b	`std 0,16(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srd 0,7,10`
		5de29b	`+ sld 8,6,9`
		5de29b	`+#else`
		5de29b	`sld 0,7,10`
		5de29b	`srd 8,6,9`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,24(5)`
		5de29b	`std 0,24(4)`
		5de29b	`@@ -401,9 +453,14 @@`
		5de29b	`.align 4`
		5de29b	`8:`
		5de29b	`/* calculate and store the final DW */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srd 0,6,10`
		5de29b	`+ sld 8,7,9`
		5de29b	`+#else`
		5de29b	`sld 0,6,10`
		5de29b	`srd 8,7,9`
		5de29b	`- or 0,0,8`
		5de29b	`+#endif`
		5de29b	`+ or 0,0,8`
		5de29b	`std 0,0(4)`
		5de29b	`3:`
		5de29b	`rldicr 0,31,0,60`
		5de29b	`@@ -413,5 +470,5 @@`
		5de29b	`ld 31,-8(1)`
		5de29b	`ld 3,-16(1)`
		5de29b	`blr`
		5de29b	`-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)`
		5de29b	`+END_GEN_TB (memcpy,TB_TOCLESS)`
		5de29b	`libc_hidden_builtin_def (memcpy)`
		12745e	`diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S`
		12745e	`--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		12745e	`+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S 2014-05-29 13:05:27.000000000 -0500`
		5de29b	`@@ -1,5 +1,5 @@`
		5de29b	`/* Optimized memcpy implementation for PowerPC64.`
		5de29b	`- Copyright (C) 2003, 2006, 2007, 2011 Free Software Foundation, Inc.`
		5de29b	`+ Copyright (C) 2003-2014 Free Software Foundation, Inc.`
		5de29b	`This file is part of the GNU C Library.`
		5de29b
		5de29b	`The GNU C Library is free software; you can redistribute it and/or`
		5de29b	`@@ -17,52 +17,50 @@`
		5de29b	`<http://www.gnu.org/licenses/>. */`
		5de29b
		5de29b	`#include <sysdep.h>`
		5de29b	`-#include <bp-sym.h>`
		5de29b	`-#include <bp-asm.h>`
		5de29b
		5de29b	`/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);`
		5de29b	`Returns 'dst'.`
		5de29b
		5de29b	`- Memcpy handles short copies (< 32-bytes) using a binary move blocks`
		5de29b	`- (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled`
		5de29b	`- with the appropriate combination of byte and halfword load/stores.`
		5de29b	`- There is minimal effort to optimize the alignment of short moves.`
		5de29b	`+ Memcpy handles short copies (< 32-bytes) using a binary move blocks`
		5de29b	`+ (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled`
		5de29b	`+ with the appropriate combination of byte and halfword load/stores.`
		5de29b	`+ There is minimal effort to optimize the alignment of short moves.`
		5de29b	`The 64-bit implementations of POWER3 and POWER4 do a reasonable job`
		5de29b	`- of handling unligned load/stores that do not cross 32-byte boundries.`
		5de29b	`+ of handling unaligned load/stores that do not cross 32-byte boundaries.`
		5de29b
		5de29b	`Longer moves (>= 32-bytes) justify the effort to get at least the`
		5de29b	`destination doubleword (8-byte) aligned. Further optimization is`
		5de29b	`- posible when both source and destination are doubleword aligned.`
		5de29b	`- Each case has a optimized unrolled loop.`
		5de29b	`-`
		5de29b	`- For POWER6 unaligned loads will take a 20+ cycle hicup for any`
		5de29b	`+ possible when both source and destination are doubleword aligned.`
		5de29b	`+ Each case has a optimized unrolled loop.`
		5de29b	`+`
		5de29b	`+ For POWER6 unaligned loads will take a 20+ cycle hiccup for any`
		5de29b	`L1 cache miss that crosses a 32- or 128-byte boundary. Store`
		5de29b	`- is more forgiving and does not take a hicup until page or`
		5de29b	`- segment boundaries. So we require doubleword alignment for`
		5de29b	`+ is more forgiving and does not take a hiccup until page or`
		5de29b	`+ segment boundaries. So we require doubleword alignment for`
		5de29b	`the source but may take a risk and only require word alignment`
		5de29b	`for the destination. */`
		5de29b
		5de29b	`.machine "power6"`
		5de29b	`-EALIGN (BP_SYM (memcpy), 7, 0)`
		5de29b	`+EALIGN (memcpy, 7, 0)`
		5de29b	`CALL_MCOUNT 3`
		5de29b
		5de29b	`cmpldi cr1,5,31`
		5de29b	`neg 0,3`
		5de29b	`std 3,-16(1)`
		5de29b	`std 31,-8(1)`
		5de29b	`- andi. 11,3,7 /* check alignement of dst. */`
		5de29b	`+ andi. 11,3,7 /* check alignment of dst. */`
		5de29b	`clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */`
		5de29b	`- clrldi 10,4,61 /* check alignement of src. */`
		5de29b	`+ clrldi 10,4,61 /* check alignment of src. */`
		5de29b	`cmpldi cr6,5,8`
		5de29b	`ble- cr1,.L2 /* If move < 32 bytes use short move code. */`
		5de29b	`mtcrf 0x01,0`
		5de29b	`- cmpld cr6,10,11`
		5de29b	`+ cmpld cr6,10,11`
		5de29b	`srdi 9,5,3 /* Number of full double words remaining. */`
		5de29b	`beq .L0`
		5de29b	`-`
		5de29b	`+`
		5de29b	`subf 5,0,5`
		5de29b	`- /* Move 0-7 bytes as needed to get the destination doubleword alligned.`
		5de29b	`- Duplicate some code to maximize fall-throught and minimize agen delays. */`
		5de29b	`+ /* Move 0-7 bytes as needed to get the destination doubleword aligned.`
		5de29b	`+ Duplicate some code to maximize fall-through and minimize agen delays. */`
		5de29b	`1: bf 31,2f`
		5de29b	`lbz 6,0(4)`
		5de29b	`stb 6,0(3)`
		5de29b	`@@ -78,7 +76,7 @@`
		5de29b	`lwz 6,1(4)`
		5de29b	`stw 6,1(3)`
		5de29b	`b 0f`
		5de29b	`-`
		5de29b	`+`
		5de29b	`2: bf 30,4f`
		5de29b	`lhz 6,0(4)`
		5de29b	`sth 6,0(3)`
		5de29b	`@@ -86,26 +84,26 @@`
		5de29b	`lwz 6,2(4)`
		5de29b	`stw 6,2(3)`
		5de29b	`b 0f`
		5de29b	`-`
		5de29b	`+`
		5de29b	`4: bf 29,0f`
		5de29b	`lwz 6,0(4)`
		5de29b	`stw 6,0(3)`
		5de29b	`-0:`
		5de29b	`+0:`
		5de29b	`/* Add the number of bytes until the 1st doubleword of dst to src and dst. */`
		5de29b	`add 4,4,0`
		5de29b	`add 3,3,0`
		5de29b	`-`
		5de29b	`- clrldi 10,4,61 /* check alignement of src again. */`
		5de29b	`+`
		5de29b	`+ clrldi 10,4,61 /* check alignment of src again. */`
		5de29b	`srdi 9,5,3 /* Number of full double words remaining. */`
		5de29b	`-`
		5de29b	`- /* Copy doublewords from source to destination, assumpting the`
		5de29b	`+`
		5de29b	`+ /* Copy doublewords from source to destination, assuming the`
		5de29b	`destination is aligned on a doubleword boundary.`
		5de29b
		5de29b	`At this point we know there are at least 25 bytes left (32-7) to copy.`
		5de29b	`- The next step is to determine if the source is also doubleword aligned.`
		5de29b	`+ The next step is to determine if the source is also doubleword aligned.`
		5de29b	`If not branch to the unaligned move code at .L6. which uses`
		5de29b	`a load, shift, store strategy.`
		5de29b	`-`
		5de29b	`+`
		5de29b	`Otherwise source and destination are doubleword aligned, and we can`
		5de29b	`the optimized doubleword copy loop. */`
		5de29b	`.align 4`
		5de29b	`@@ -123,14 +121,14 @@`
		5de29b	`the main loop exits there may be a tail of 1-7 bytes. These byte`
		5de29b	`are copied a word/halfword/byte at a time as needed to preserve`
		5de29b	`alignment.`
		5de29b	`-`
		5de29b	`+`
		5de29b	`For POWER6 the L1 is store-through and the L2 is store-in. The`
		5de29b	`L2 is clocked at half CPU clock so we can store 16 bytes every`
		5de29b	`other cycle. POWER6 also has a load/store bypass so we can do`
		5de29b	`- load, load, store, store every 2 cycles.`
		5de29b	`-`
		5de29b	`+ load, load, store, store every 2 cycles.`
		5de29b	`+`
		5de29b	`The following code is sensitive to cache line alignment. Do not`
		5de29b	`- make any change with out first making sure thay don't result in`
		5de29b	`+ make any change with out first making sure they don't result in`
		5de29b	`splitting ld/std pairs across a cache line. */`
		5de29b
		5de29b	`mtcrf 0x02,5`
		5de29b	`@@ -273,7 +271,7 @@`
		5de29b	`std 8,16+96(10)`
		5de29b	`std 0,24+96(10)`
		5de29b	`ble cr5,L(das_loop_e)`
		5de29b	`-`
		5de29b	`+`
		5de29b	`mtctr 12`
		5de29b	`.align 4`
		5de29b	`L(das_loop2):`
		5de29b	`@@ -326,10 +324,10 @@`
		5de29b	`.align 4`
		5de29b	`L(das_tail):`
		5de29b	`beq cr1,0f`
		5de29b	`-`
		5de29b	`+`
		5de29b	`L(das_tail2):`
		5de29b	`/* At this point we have a tail of 0-7 bytes and we know that the`
		5de29b	`- destiniation is double word aligned. */`
		5de29b	`+ destination is double word aligned. */`
		5de29b	`4: bf 29,2f`
		5de29b	`lwz 6,0(4)`
		5de29b	`stw 6,0(3)`
		5de29b	`@@ -344,7 +342,7 @@`
		5de29b	`lbz 6,4(4)`
		5de29b	`stb 6,4(3)`
		5de29b	`b 0f`
		5de29b	`-`
		5de29b	`+`
		5de29b	`2: bf 30,1f`
		5de29b	`lhz 6,0(4)`
		5de29b	`sth 6,0(3)`
		5de29b	`@@ -352,7 +350,7 @@`
		5de29b	`lbz 6,2(4)`
		5de29b	`stb 6,2(3)`
		5de29b	`b 0f`
		5de29b	`-`
		5de29b	`+`
		5de29b	`1: bf 31,0f`
		5de29b	`lbz 6,0(4)`
		5de29b	`stb 6,0(3)`
		5de29b	`@@ -361,7 +359,7 @@`
		5de29b	`ld 3,-16(1)`
		5de29b	`blr`
		5de29b
		5de29b	`-/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31`
		5de29b	`+/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31`
		5de29b	`bytes. Each case is handled without loops, using binary (1,2,4,8)`
		5de29b	`tests.`
		5de29b
		5de29b	`@@ -402,15 +400,28 @@`
		5de29b	`blt cr6,5f`
		5de29b	`srdi 7,6,16`
		5de29b	`bgt cr6,3f`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ sth 7,0(3)`
		5de29b	`+#else`
		5de29b	`sth 6,0(3)`
		5de29b	`+#endif`
		5de29b	`b 7f`
		5de29b	`.align 4`
		5de29b	`3:`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ rotlwi 6,6,24`
		5de29b	`+ stb 6,0(3)`
		5de29b	`+ sth 7,1(3)`
		5de29b	`+#else`
		5de29b	`stb 7,0(3)`
		5de29b	`sth 6,1(3)`
		5de29b	`+#endif`
		5de29b	`b 7f`
		5de29b	`.align 4`
		5de29b	`5:`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ rotlwi 6,6,8`
		5de29b	`+#endif`
		5de29b	`stb 6,0(3)`
		5de29b	`7:`
		5de29b	`cmpldi cr1,10,16`
		5de29b	`@@ -421,7 +432,7 @@`
		5de29b	`/* At least 6 bytes left and the source is word aligned. This allows`
		5de29b	`some speculative loads up front. */`
		5de29b	`/* We need to special case the fall-through because the biggest delays`
		5de29b	`- are due to address computation not being ready in time for the`
		5de29b	`+ are due to address computation not being ready in time for the`
		5de29b	`AGEN. */`
		5de29b	`lwz 6,0(12)`
		5de29b	`lwz 7,4(12)`
		5de29b	`@@ -452,7 +463,7 @@`
		5de29b	`ld 3,-16(1)`
		5de29b	`blr`
		5de29b	`.align 4`
		5de29b	`-L(dus_tail16p8): /* less then 8 bytes left. */`
		5de29b	`+L(dus_tail16p8): /* less than 8 bytes left. */`
		5de29b	`beq cr1,L(dus_tailX) /* exactly 16 bytes, early exit. */`
		5de29b	`cmpldi cr1,10,20`
		5de29b	`bf 29,L(dus_tail16p2)`
		5de29b	`@@ -466,7 +477,7 @@`
		5de29b	`ld 3,-16(1)`
		5de29b	`blr`
		5de29b	`.align 4`
		5de29b	`-L(dus_tail16p4): /* less then 4 bytes left. */`
		5de29b	`+L(dus_tail16p4): /* less than 4 bytes left. */`
		5de29b	`addi 12,12,24`
		5de29b	`addi 3,3,24`
		5de29b	`bgt cr0,L(dus_tail2)`
		5de29b	`@@ -474,7 +485,7 @@`
		5de29b	`ld 3,-16(1)`
		5de29b	`blr`
		5de29b	`.align 4`
		5de29b	`-L(dus_tail16p2): /* 16 bytes moved, less then 4 bytes left. */`
		5de29b	`+L(dus_tail16p2): /* 16 bytes moved, less than 4 bytes left. */`
		5de29b	`addi 12,12,16`
		5de29b	`addi 3,3,16`
		5de29b	`b L(dus_tail2)`
		5de29b	`@@ -499,7 +510,7 @@`
		5de29b	`ld 3,-16(1)`
		5de29b	`blr`
		5de29b	`.align 4`
		5de29b	`-L(dus_tail8p4): /* less then 4 bytes left. */`
		5de29b	`+L(dus_tail8p4): /* less than 4 bytes left. */`
		5de29b	`addi 12,12,8`
		5de29b	`addi 3,3,8`
		5de29b	`bgt cr1,L(dus_tail2)`
		5de29b	`@@ -510,14 +521,14 @@`
		5de29b	`.align 4`
		5de29b	`L(dus_tail4): /* Move 4 bytes. */`
		5de29b	`/* r6 already loaded speculatively. If we are here we know there is`
		5de29b	`- more then 4 bytes left. So there is no need to test. */`
		5de29b	`+ more than 4 bytes left. So there is no need to test. */`
		5de29b	`addi 12,12,4`
		5de29b	`stw 6,0(3)`
		5de29b	`addi 3,3,4`
		5de29b	`L(dus_tail2): /* Move 2-3 bytes. */`
		5de29b	`bf 30,L(dus_tail1)`
		5de29b	`lhz 6,0(12)`
		5de29b	`- sth 6,0(3)`
		5de29b	`+ sth 6,0(3)`
		5de29b	`bf 31,L(dus_tailX)`
		5de29b	`lbz 7,2(12)`
		5de29b	`stb 7,2(3)`
		5de29b	`@@ -537,7 +548,7 @@`
		5de29b	`.LE8:`
		5de29b	`mr 12,4`
		5de29b	`bne cr6,L(dus_4)`
		5de29b	`-/* Exactly 8 bytes. We may cross a 32-/128-byte boundry and take a ~20`
		5de29b	`+/* Exactly 8 bytes. We may cross a 32-/128-byte boundary and take a ~20`
		5de29b	`cycle delay. This case should be rare and any attempt to avoid this`
		5de29b	`would take most of 20 cycles any way. */`
		5de29b	`ld 6,0(4)`
		5de29b	`@@ -552,7 +563,7 @@`
		5de29b	`stw 6,0(3)`
		5de29b	`bf 30,L(dus_5)`
		5de29b	`lhz 7,4(4)`
		5de29b	`- sth 7,4(3)`
		5de29b	`+ sth 7,4(3)`
		5de29b	`bf 31,L(dus_0)`
		5de29b	`lbz 8,6(4)`
		5de29b	`stb 8,6(3)`
		5de29b	`@@ -590,20 +601,31 @@`
		5de29b	`bge cr0, L(du4_do)`
		5de29b	`blt cr5, L(du1_do)`
		5de29b	`beq cr5, L(du2_do)`
		5de29b	`- b L(du3_do)`
		5de29b	`-`
		5de29b	`+ b L(du3_do)`
		5de29b	`+`
		5de29b	`.align 4`
		5de29b	`L(du1_do):`
		5de29b	`bf 30,L(du1_1dw)`
		5de29b
		5de29b	`/* there are at least two DWs to copy */`
		5de29b	`+ /* FIXME: can combine last shift and "or" into "rldimi" */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 8`
		5de29b	`+ sldi 8,7, 64-8`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 8`
		5de29b	`srdi 8,7, 64-8`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,16(5)`
		5de29b	`std 0,0(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 8`
		5de29b	`+ sldi 8,6, 64-8`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 8`
		5de29b	`srdi 8,6, 64-8`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,24(5)`
		5de29b	`std 0,8(4)`
		5de29b	`@@ -612,8 +634,13 @@`
		5de29b	`blt cr6,L(du1_fini) /* if total DWs = 3, then bypass loop */`
		5de29b	`bf 31,L(du1_loop)`
		5de29b	`/* there is a third DW to copy */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 8`
		5de29b	`+ sldi 8,7, 64-8`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 8`
		5de29b	`srdi 8,7, 64-8`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`std 0,0(4)`
		5de29b	`mr 6,7`
		5de29b	`@@ -624,8 +651,13 @@`
		5de29b	`b L(du1_loop)`
		5de29b	`.align 4`
		5de29b	`L(du1_1dw):`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 8`
		5de29b	`+ sldi 8,7, 64-8`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 8`
		5de29b	`srdi 8,7, 64-8`
		5de29b	`+#endif`
		5de29b	`addi 5,5,16`
		5de29b	`or 0,0,8`
		5de29b	`bf 31,L(du1_loop)`
		5de29b	`@@ -637,23 +669,43 @@`
		5de29b	`.align 4`
		5de29b	`/* copy 32 bytes at a time */`
		5de29b	`L(du1_loop):`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 8`
		5de29b	`+ sldi 8,7, 64-8`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 8`
		5de29b	`srdi 8,7, 64-8`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,0(5)`
		5de29b	`std 0,0(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 8`
		5de29b	`+ sldi 8,6, 64-8`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 8`
		5de29b	`srdi 8,6, 64-8`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,8(5)`
		5de29b	`std 0,8(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 8`
		5de29b	`+ sldi 8,7, 64-8`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 8`
		5de29b	`srdi 8,7, 64-8`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,16(5)`
		5de29b	`std 0,16(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 8`
		5de29b	`+ sldi 8,6, 64-8`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 8`
		5de29b	`srdi 8,6, 64-8`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,24(5)`
		5de29b	`std 0,24(4)`
		5de29b	`@@ -663,9 +715,14 @@`
		5de29b	`.align 4`
		5de29b	`L(du1_fini):`
		5de29b	`/* calculate and store the final DW */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 8`
		5de29b	`+ sldi 8,7, 64-8`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 8`
		5de29b	`srdi 8,7, 64-8`
		5de29b	`- or 0,0,8`
		5de29b	`+#endif`
		5de29b	`+ or 0,0,8`
		5de29b	`std 0,0(4)`
		5de29b	`b L(du_done)`
		5de29b
		5de29b	`@@ -674,13 +731,23 @@`
		5de29b	`bf 30,L(du2_1dw)`
		5de29b
		5de29b	`/* there are at least two DWs to copy */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 16`
		5de29b	`+ sldi 8,7, 64-16`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 16`
		5de29b	`srdi 8,7, 64-16`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,16(5)`
		5de29b	`std 0,0(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 16`
		5de29b	`+ sldi 8,6, 64-16`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 16`
		5de29b	`srdi 8,6, 64-16`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,24(5)`
		5de29b	`std 0,8(4)`
		5de29b	`@@ -689,8 +756,13 @@`
		5de29b	`blt cr6,L(du2_fini) /* if total DWs = 3, then bypass loop */`
		5de29b	`bf 31,L(du2_loop)`
		5de29b	`/* there is a third DW to copy */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 16`
		5de29b	`+ sldi 8,7, 64-16`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 16`
		5de29b	`srdi 8,7, 64-16`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`std 0,0(4)`
		5de29b	`mr 6,7`
		5de29b	`@@ -701,8 +773,13 @@`
		5de29b	`b L(du2_loop)`
		5de29b	`.align 4`
		5de29b	`L(du2_1dw):`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 16`
		5de29b	`+ sldi 8,7, 64-16`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 16`
		5de29b	`srdi 8,7, 64-16`
		5de29b	`+#endif`
		5de29b	`addi 5,5,16`
		5de29b	`or 0,0,8`
		5de29b	`bf 31,L(du2_loop)`
		5de29b	`@@ -714,23 +791,43 @@`
		5de29b	`.align 4`
		5de29b	`/* copy 32 bytes at a time */`
		5de29b	`L(du2_loop):`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 16`
		5de29b	`+ sldi 8,7, 64-16`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 16`
		5de29b	`srdi 8,7, 64-16`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,0(5)`
		5de29b	`std 0,0(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 16`
		5de29b	`+ sldi 8,6, 64-16`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 16`
		5de29b	`srdi 8,6, 64-16`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,8(5)`
		5de29b	`std 0,8(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 16`
		5de29b	`+ sldi 8,7, 64-16`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 16`
		5de29b	`srdi 8,7, 64-16`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,16(5)`
		5de29b	`std 0,16(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 16`
		5de29b	`+ sldi 8,6, 64-16`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 16`
		5de29b	`srdi 8,6, 64-16`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,24(5)`
		5de29b	`std 0,24(4)`
		5de29b	`@@ -740,9 +837,14 @@`
		5de29b	`.align 4`
		5de29b	`L(du2_fini):`
		5de29b	`/* calculate and store the final DW */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 16`
		5de29b	`+ sldi 8,7, 64-16`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 16`
		5de29b	`srdi 8,7, 64-16`
		5de29b	`- or 0,0,8`
		5de29b	`+#endif`
		5de29b	`+ or 0,0,8`
		5de29b	`std 0,0(4)`
		5de29b	`b L(du_done)`
		5de29b
		5de29b	`@@ -751,13 +853,23 @@`
		5de29b	`bf 30,L(du3_1dw)`
		5de29b
		5de29b	`/* there are at least two DWs to copy */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 24`
		5de29b	`+ sldi 8,7, 64-24`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 24`
		5de29b	`srdi 8,7, 64-24`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,16(5)`
		5de29b	`std 0,0(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 24`
		5de29b	`+ sldi 8,6, 64-24`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 24`
		5de29b	`srdi 8,6, 64-24`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,24(5)`
		5de29b	`std 0,8(4)`
		5de29b	`@@ -766,8 +878,13 @@`
		5de29b	`blt cr6,L(du3_fini) /* if total DWs = 3, then bypass loop */`
		5de29b	`bf 31,L(du3_loop)`
		5de29b	`/* there is a third DW to copy */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 24`
		5de29b	`+ sldi 8,7, 64-24`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 24`
		5de29b	`srdi 8,7, 64-24`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`std 0,0(4)`
		5de29b	`mr 6,7`
		5de29b	`@@ -778,8 +895,13 @@`
		5de29b	`b L(du3_loop)`
		5de29b	`.align 4`
		5de29b	`L(du3_1dw):`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 24`
		5de29b	`+ sldi 8,7, 64-24`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 24`
		5de29b	`srdi 8,7, 64-24`
		5de29b	`+#endif`
		5de29b	`addi 5,5,16`
		5de29b	`or 0,0,8`
		5de29b	`bf 31,L(du3_loop)`
		5de29b	`@@ -791,23 +913,43 @@`
		5de29b	`.align 4`
		5de29b	`/* copy 32 bytes at a time */`
		5de29b	`L(du3_loop):`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 24`
		5de29b	`+ sldi 8,7, 64-24`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 24`
		5de29b	`srdi 8,7, 64-24`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,0(5)`
		5de29b	`std 0,0(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 24`
		5de29b	`+ sldi 8,6, 64-24`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 24`
		5de29b	`srdi 8,6, 64-24`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,8(5)`
		5de29b	`std 0,8(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 24`
		5de29b	`+ sldi 8,7, 64-24`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 24`
		5de29b	`srdi 8,7, 64-24`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,16(5)`
		5de29b	`std 0,16(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 24`
		5de29b	`+ sldi 8,6, 64-24`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 24`
		5de29b	`srdi 8,6, 64-24`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,24(5)`
		5de29b	`std 0,24(4)`
		5de29b	`@@ -817,9 +959,14 @@`
		5de29b	`.align 4`
		5de29b	`L(du3_fini):`
		5de29b	`/* calculate and store the final DW */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 24`
		5de29b	`+ sldi 8,7, 64-24`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 24`
		5de29b	`srdi 8,7, 64-24`
		5de29b	`- or 0,0,8`
		5de29b	`+#endif`
		5de29b	`+ or 0,0,8`
		5de29b	`std 0,0(4)`
		5de29b	`b L(du_done)`
		5de29b
		5de29b	`@@ -834,13 +981,23 @@`
		5de29b	`bf 30,L(du4_1dw)`
		5de29b
		5de29b	`/* there are at least two DWs to copy */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 32`
		5de29b	`+ sldi 8,7, 64-32`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 32`
		5de29b	`srdi 8,7, 64-32`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,16(5)`
		5de29b	`std 0,0(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 32`
		5de29b	`+ sldi 8,6, 64-32`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 32`
		5de29b	`srdi 8,6, 64-32`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,24(5)`
		5de29b	`std 0,8(4)`
		5de29b	`@@ -849,8 +1006,13 @@`
		5de29b	`blt cr6,L(du4_fini) /* if total DWs = 3, then bypass loop */`
		5de29b	`bf 31,L(du4_loop)`
		5de29b	`/* there is a third DW to copy */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 32`
		5de29b	`+ sldi 8,7, 64-32`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 32`
		5de29b	`srdi 8,7, 64-32`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`std 0,0(4)`
		5de29b	`mr 6,7`
		5de29b	`@@ -861,8 +1023,13 @@`
		5de29b	`b L(du4_loop)`
		5de29b	`.align 4`
		5de29b	`L(du4_1dw):`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 32`
		5de29b	`+ sldi 8,7, 64-32`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 32`
		5de29b	`srdi 8,7, 64-32`
		5de29b	`+#endif`
		5de29b	`addi 5,5,16`
		5de29b	`or 0,0,8`
		5de29b	`bf 31,L(du4_loop)`
		5de29b	`@@ -874,23 +1041,43 @@`
		5de29b	`.align 4`
		5de29b	`/* copy 32 bytes at a time */`
		5de29b	`L(du4_loop):`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 32`
		5de29b	`+ sldi 8,7, 64-32`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 32`
		5de29b	`srdi 8,7, 64-32`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,0(5)`
		5de29b	`std 0,0(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 32`
		5de29b	`+ sldi 8,6, 64-32`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 32`
		5de29b	`srdi 8,6, 64-32`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,8(5)`
		5de29b	`std 0,8(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 32`
		5de29b	`+ sldi 8,7, 64-32`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 32`
		5de29b	`srdi 8,7, 64-32`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,16(5)`
		5de29b	`std 0,16(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 32`
		5de29b	`+ sldi 8,6, 64-32`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 32`
		5de29b	`srdi 8,6, 64-32`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,24(5)`
		5de29b	`std 0,24(4)`
		5de29b	`@@ -900,9 +1087,14 @@`
		5de29b	`.align 4`
		5de29b	`L(du4_fini):`
		5de29b	`/* calculate and store the final DW */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 32`
		5de29b	`+ sldi 8,7, 64-32`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 32`
		5de29b	`srdi 8,7, 64-32`
		5de29b	`- or 0,0,8`
		5de29b	`+#endif`
		5de29b	`+ or 0,0,8`
		5de29b	`std 0,0(4)`
		5de29b	`b L(du_done)`
		5de29b
		5de29b	`@@ -911,13 +1103,23 @@`
		5de29b	`bf 30,L(du5_1dw)`
		5de29b
		5de29b	`/* there are at least two DWs to copy */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 40`
		5de29b	`+ sldi 8,7, 64-40`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 40`
		5de29b	`srdi 8,7, 64-40`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,16(5)`
		5de29b	`std 0,0(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 40`
		5de29b	`+ sldi 8,6, 64-40`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 40`
		5de29b	`srdi 8,6, 64-40`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,24(5)`
		5de29b	`std 0,8(4)`
		5de29b	`@@ -926,8 +1128,13 @@`
		5de29b	`blt cr6,L(du5_fini) /* if total DWs = 3, then bypass loop */`
		5de29b	`bf 31,L(du5_loop)`
		5de29b	`/* there is a third DW to copy */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 40`
		5de29b	`+ sldi 8,7, 64-40`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 40`
		5de29b	`srdi 8,7, 64-40`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`std 0,0(4)`
		5de29b	`mr 6,7`
		5de29b	`@@ -938,8 +1145,13 @@`
		5de29b	`b L(du5_loop)`
		5de29b	`.align 4`
		5de29b	`L(du5_1dw):`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 40`
		5de29b	`+ sldi 8,7, 64-40`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 40`
		5de29b	`srdi 8,7, 64-40`
		5de29b	`+#endif`
		5de29b	`addi 5,5,16`
		5de29b	`or 0,0,8`
		5de29b	`bf 31,L(du5_loop)`
		5de29b	`@@ -951,23 +1163,43 @@`
		5de29b	`.align 4`
		5de29b	`/* copy 32 bytes at a time */`
		5de29b	`L(du5_loop):`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 40`
		5de29b	`+ sldi 8,7, 64-40`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 40`
		5de29b	`srdi 8,7, 64-40`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,0(5)`
		5de29b	`std 0,0(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 40`
		5de29b	`+ sldi 8,6, 64-40`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 40`
		5de29b	`srdi 8,6, 64-40`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,8(5)`
		5de29b	`std 0,8(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 40`
		5de29b	`+ sldi 8,7, 64-40`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 40`
		5de29b	`srdi 8,7, 64-40`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,16(5)`
		5de29b	`std 0,16(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 40`
		5de29b	`+ sldi 8,6, 64-40`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 40`
		5de29b	`srdi 8,6, 64-40`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,24(5)`
		5de29b	`std 0,24(4)`
		5de29b	`@@ -977,9 +1209,14 @@`
		5de29b	`.align 4`
		5de29b	`L(du5_fini):`
		5de29b	`/* calculate and store the final DW */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 40`
		5de29b	`+ sldi 8,7, 64-40`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 40`
		5de29b	`srdi 8,7, 64-40`
		5de29b	`- or 0,0,8`
		5de29b	`+#endif`
		5de29b	`+ or 0,0,8`
		5de29b	`std 0,0(4)`
		5de29b	`b L(du_done)`
		5de29b
		5de29b	`@@ -988,13 +1225,23 @@`
		5de29b	`bf 30,L(du6_1dw)`
		5de29b
		5de29b	`/* there are at least two DWs to copy */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 48`
		5de29b	`+ sldi 8,7, 64-48`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 48`
		5de29b	`srdi 8,7, 64-48`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,16(5)`
		5de29b	`std 0,0(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 48`
		5de29b	`+ sldi 8,6, 64-48`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 48`
		5de29b	`srdi 8,6, 64-48`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,24(5)`
		5de29b	`std 0,8(4)`
		5de29b	`@@ -1003,8 +1250,13 @@`
		5de29b	`blt cr6,L(du6_fini) /* if total DWs = 3, then bypass loop */`
		5de29b	`bf 31,L(du6_loop)`
		5de29b	`/* there is a third DW to copy */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 48`
		5de29b	`+ sldi 8,7, 64-48`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 48`
		5de29b	`srdi 8,7, 64-48`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`std 0,0(4)`
		5de29b	`mr 6,7`
		5de29b	`@@ -1015,8 +1267,13 @@`
		5de29b	`b L(du6_loop)`
		5de29b	`.align 4`
		5de29b	`L(du6_1dw):`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 48`
		5de29b	`+ sldi 8,7, 64-48`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 48`
		5de29b	`srdi 8,7, 64-48`
		5de29b	`+#endif`
		5de29b	`addi 5,5,16`
		5de29b	`or 0,0,8`
		5de29b	`bf 31,L(du6_loop)`
		5de29b	`@@ -1028,23 +1285,43 @@`
		5de29b	`.align 4`
		5de29b	`/* copy 32 bytes at a time */`
		5de29b	`L(du6_loop):`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 48`
		5de29b	`+ sldi 8,7, 64-48`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 48`
		5de29b	`srdi 8,7, 64-48`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,0(5)`
		5de29b	`std 0,0(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 48`
		5de29b	`+ sldi 8,6, 64-48`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 48`
		5de29b	`srdi 8,6, 64-48`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,8(5)`
		5de29b	`std 0,8(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 48`
		5de29b	`+ sldi 8,7, 64-48`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 48`
		5de29b	`srdi 8,7, 64-48`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,16(5)`
		5de29b	`std 0,16(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 48`
		5de29b	`+ sldi 8,6, 64-48`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 48`
		5de29b	`srdi 8,6, 64-48`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,24(5)`
		5de29b	`std 0,24(4)`
		5de29b	`@@ -1054,9 +1331,14 @@`
		5de29b	`.align 4`
		5de29b	`L(du6_fini):`
		5de29b	`/* calculate and store the final DW */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 48`
		5de29b	`+ sldi 8,7, 64-48`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 48`
		5de29b	`srdi 8,7, 64-48`
		5de29b	`- or 0,0,8`
		5de29b	`+#endif`
		5de29b	`+ or 0,0,8`
		5de29b	`std 0,0(4)`
		5de29b	`b L(du_done)`
		5de29b
		5de29b	`@@ -1065,13 +1347,23 @@`
		5de29b	`bf 30,L(du7_1dw)`
		5de29b
		5de29b	`/* there are at least two DWs to copy */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 56`
		5de29b	`+ sldi 8,7, 64-56`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 56`
		5de29b	`srdi 8,7, 64-56`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,16(5)`
		5de29b	`std 0,0(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 56`
		5de29b	`+ sldi 8,6, 64-56`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 56`
		5de29b	`srdi 8,6, 64-56`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,24(5)`
		5de29b	`std 0,8(4)`
		5de29b	`@@ -1080,8 +1372,13 @@`
		5de29b	`blt cr6,L(du7_fini) /* if total DWs = 3, then bypass loop */`
		5de29b	`bf 31,L(du7_loop)`
		5de29b	`/* there is a third DW to copy */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 56`
		5de29b	`+ sldi 8,7, 64-56`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 56`
		5de29b	`srdi 8,7, 64-56`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`std 0,0(4)`
		5de29b	`mr 6,7`
		5de29b	`@@ -1092,8 +1389,13 @@`
		5de29b	`b L(du7_loop)`
		5de29b	`.align 4`
		5de29b	`L(du7_1dw):`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 56`
		5de29b	`+ sldi 8,7, 64-56`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 56`
		5de29b	`srdi 8,7, 64-56`
		5de29b	`+#endif`
		5de29b	`addi 5,5,16`
		5de29b	`or 0,0,8`
		5de29b	`bf 31,L(du7_loop)`
		5de29b	`@@ -1105,23 +1407,43 @@`
		5de29b	`.align 4`
		5de29b	`/* copy 32 bytes at a time */`
		5de29b	`L(du7_loop):`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 56`
		5de29b	`+ sldi 8,7, 64-56`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 56`
		5de29b	`srdi 8,7, 64-56`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,0(5)`
		5de29b	`std 0,0(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 56`
		5de29b	`+ sldi 8,6, 64-56`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 56`
		5de29b	`srdi 8,6, 64-56`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,8(5)`
		5de29b	`std 0,8(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 56`
		5de29b	`+ sldi 8,7, 64-56`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 56`
		5de29b	`srdi 8,7, 64-56`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 6,16(5)`
		5de29b	`std 0,16(4)`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,7, 56`
		5de29b	`+ sldi 8,6, 64-56`
		5de29b	`+#else`
		5de29b	`sldi 0,7, 56`
		5de29b	`srdi 8,6, 64-56`
		5de29b	`+#endif`
		5de29b	`or 0,0,8`
		5de29b	`ld 7,24(5)`
		5de29b	`std 0,24(4)`
		5de29b	`@@ -1131,12 +1453,17 @@`
		5de29b	`.align 4`
		5de29b	`L(du7_fini):`
		5de29b	`/* calculate and store the final DW */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ srdi 0,6, 56`
		5de29b	`+ sldi 8,7, 64-56`
		5de29b	`+#else`
		5de29b	`sldi 0,6, 56`
		5de29b	`srdi 8,7, 64-56`
		5de29b	`- or 0,0,8`
		5de29b	`+#endif`
		5de29b	`+ or 0,0,8`
		5de29b	`std 0,0(4)`
		5de29b	`b L(du_done)`
		5de29b	`-`
		5de29b	`+`
		5de29b	`.align 4`
		5de29b	`L(du_done):`
		5de29b	`rldicr 0,31,0,60`
		5de29b	`@@ -1144,9 +1471,9 @@`
		5de29b	`beq cr1,0f /* If the tail is 0 bytes we are done! */`
		5de29b
		5de29b	`add 3,3,0`
		5de29b	`- add 12,12,0`
		5de29b	`+ add 12,12,0`
		5de29b	`/* At this point we have a tail of 0-7 bytes and we know that the`
		5de29b	`- destiniation is double word aligned. */`
		5de29b	`+ destination is double word aligned. */`
		5de29b	`4: bf 29,2f`
		5de29b	`lwz 6,0(12)`
		5de29b	`addi 12,12,4`
		5de29b	`@@ -1165,5 +1492,5 @@`
		5de29b	`ld 31,-8(1)`
		5de29b	`ld 3,-16(1)`
		5de29b	`blr`
		5de29b	`-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)`
		5de29b	`+END_GEN_TB (memcpy,TB_TOCLESS)`
		5de29b	`libc_hidden_builtin_def (memcpy)`
		12745e	`diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S`
		12745e	`--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		12745e	`+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S 2014-05-29 13:05:40.000000000 -0500`
		5de29b	`@@ -1,5 +1,5 @@`
		5de29b	`/* Optimized memcpy implementation for PowerPC64/POWER7.`
		5de29b	`- Copyright (C) 2010, 2011 Free Software Foundation, Inc.`
		5de29b	`+ Copyright (C) 2010-2014 Free Software Foundation, Inc.`
		5de29b	`Contributed by Luis Machado <luisgpm@br.ibm.com>.`
		5de29b	`This file is part of the GNU C Library.`
		5de29b
		5de29b	`@@ -18,425 +18,366 @@`
		5de29b	`<http://www.gnu.org/licenses/>. */`
		5de29b
		5de29b	`#include <sysdep.h>`
		5de29b	`-#include <bp-sym.h>`
		5de29b	`-#include <bp-asm.h>`
		5de29b
		5de29b
		5de29b	`/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);`
		5de29b	`Returns 'dst'. */`
		5de29b
		5de29b	`+#define dst 11 /* Use r11 so r3 kept unchanged. */`
		5de29b	`+#define src 4`
		5de29b	`+#define cnt 5`
		5de29b	`+`
		5de29b	`.machine power7`
		5de29b	`-EALIGN (BP_SYM (memcpy), 5, 0)`
		5de29b	`+EALIGN (memcpy, 5, 0)`
		5de29b	`CALL_MCOUNT 3`
		5de29b
		5de29b	`- cmpldi cr1,5,31`
		5de29b	`+ cmpldi cr1,cnt,31`
		5de29b	`neg 0,3`
		5de29b	`- std 3,-16(1)`
		5de29b	`- std 31,-8(1)`
		5de29b	`- cfi_offset(31,-8)`
		5de29b	`ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move`
		5de29b	`code. */`
		5de29b
		5de29b	`- andi. 11,3,7 /* Check alignment of DST. */`
		5de29b	`-`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+/* In little-endian mode, power7 takes an alignment trap on any lxvd2x`
		5de29b	`+ or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy`
		5de29b	`+ loop is only used for quadword aligned copies. */`
		5de29b	`+ andi. 10,3,15`
		5de29b	`+ clrldi 11,4,60`
		5de29b	`+#else`
		5de29b	`+ andi. 10,3,7 /* Check alignment of DST. */`
		5de29b	`+ clrldi 11,4,61 /* Check alignment of SRC. */`
		5de29b	`+#endif`
		5de29b	`+ cmpld cr6,10,11 /* SRC and DST alignments match? */`
		5de29b
		5de29b	`- clrldi 10,4,61 /* Check alignment of SRC. */`
		5de29b	`- cmpld cr6,10,11 /* SRC and DST alignments match? */`
		5de29b	`- mr 12,4`
		5de29b	`- mr 31,5`
		5de29b	`+ mr dst,3`
		5de29b	`bne cr6,L(copy_GE_32_unaligned)`
		5de29b	`+ beq L(aligned_copy)`
		5de29b
		5de29b	`- srdi 9,5,3 /* Number of full quadwords remaining. */`
		5de29b	`-`
		5de29b	`- beq L(copy_GE_32_aligned_cont)`
		5de29b	`-`
		5de29b	`- clrldi 0,0,61`
		5de29b	`- mtcrf 0x01,0`
		5de29b	`- subf 31,0,5`
		5de29b	`-`
		5de29b	`- /* Get the SRC aligned to 8 bytes. */`
		5de29b	`-`
		5de29b	`-1: bf 31,2f`
		5de29b	`- lbz 6,0(12)`
		5de29b	`- addi 12,12,1`
		5de29b	`- stb 6,0(3)`
		5de29b	`- addi 3,3,1`
		5de29b	`-2: bf 30,4f`
		5de29b	`- lhz 6,0(12)`
		5de29b	`- addi 12,12,2`
		5de29b	`- sth 6,0(3)`
		5de29b	`- addi 3,3,2`
		5de29b	`-4: bf 29,0f`
		5de29b	`- lwz 6,0(12)`
		5de29b	`- addi 12,12,4`
		5de29b	`- stw 6,0(3)`
		5de29b	`- addi 3,3,4`
		5de29b	`-0:`
		5de29b	`- clrldi 10,12,61 /* Check alignment of SRC again. */`
		5de29b	`- srdi 9,31,3 /* Number of full doublewords remaining. */`
		5de29b	`-`
		5de29b	`-L(copy_GE_32_aligned_cont):`
		5de29b	`-`
		5de29b	`- clrldi 11,31,61`
		5de29b	`- mtcrf 0x01,9`
		5de29b	`-`
		5de29b	`- srdi 8,31,5`
		5de29b	`- cmpldi cr1,9,4`
		5de29b	`- cmpldi cr6,11,0`
		5de29b	`- mr 11,12`
		5de29b	`+ mtocrf 0x01,0`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ clrldi 0,0,60`
		5de29b	`+#else`
		5de29b	`+ clrldi 0,0,61`
		5de29b	`+#endif`
		5de29b
		5de29b	`- /* Copy 1~3 doublewords so the main loop starts`
		5de29b	`- at a multiple of 32 bytes. */`
		5de29b	`-`
		5de29b	`- bf 30,1f`
		5de29b	`- ld 6,0(12)`
		5de29b	`- ld 7,8(12)`
		5de29b	`- addi 11,12,16`
		5de29b	`- mtctr 8`
		5de29b	`- std 6,0(3)`
		5de29b	`- std 7,8(3)`
		5de29b	`- addi 10,3,16`
		5de29b	`- bf 31,4f`
		5de29b	`- ld 0,16(12)`
		5de29b	`- std 0,16(3)`
		5de29b	`- blt cr1,3f`
		5de29b	`- addi 11,12,24`
		5de29b	`- addi 10,3,24`
		5de29b	`- b 4f`
		5de29b	`-`
		5de29b	`- .align 4`
		5de29b	`-1: /* Copy 1 doubleword and set the counter. */`
		5de29b	`- mr 10,3`
		5de29b	`- mtctr 8`
		5de29b	`- bf 31,4f`
		5de29b	`- ld 6,0(12)`
		5de29b	`- addi 11,12,8`
		5de29b	`- std 6,0(3)`
		5de29b	`- addi 10,3,8`
		5de29b	`+/* Get the DST and SRC aligned to 8 bytes (16 for little-endian). */`
		5de29b	`+1:`
		5de29b	`+ bf 31,2f`
		5de29b	`+ lbz 6,0(src)`
		5de29b	`+ addi src,src,1`
		5de29b	`+ stb 6,0(dst)`
		5de29b	`+ addi dst,dst,1`
		5de29b	`+2:`
		5de29b	`+ bf 30,4f`
		5de29b	`+ lhz 6,0(src)`
		5de29b	`+ addi src,src,2`
		5de29b	`+ sth 6,0(dst)`
		5de29b	`+ addi dst,dst,2`
		5de29b	`+4:`
		5de29b	`+ bf 29,8f`
		5de29b	`+ lwz 6,0(src)`
		5de29b	`+ addi src,src,4`
		5de29b	`+ stw 6,0(dst)`
		5de29b	`+ addi dst,dst,4`
		5de29b	`+8:`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ bf 28,16f`
		5de29b	`+ ld 6,0(src)`
		5de29b	`+ addi src,src,8`
		5de29b	`+ std 6,0(dst)`
		5de29b	`+ addi dst,dst,8`
		5de29b	`+16:`
		5de29b	`+#endif`
		5de29b	`+ subf cnt,0,cnt`
		5de29b
		5de29b	`+/* Main aligned copy loop. Copies 128 bytes at a time. */`
		5de29b	`L(aligned_copy):`
		5de29b	`- /* Main aligned copy loop. Copies up to 128-bytes at a time. */`
		5de29b	`- .align 4`
		5de29b	`-4:`
		5de29b	`- /* check for any 32-byte or 64-byte lumps that are outside of a`
		5de29b	`- nice 128-byte range. R8 contains the number of 32-byte`
		5de29b	`- lumps, so drop this into the CR, and use the SO/EQ bits to help`
		5de29b	`- handle the 32- or 64- byte lumps. Then handle the rest with an`
		5de29b	`- unrolled 128-bytes-at-a-time copy loop. */`
		5de29b	`- mtocrf 1,8`
		5de29b	`- li 6,16 # 16() index`
		5de29b	`- li 7,32 # 32() index`
		5de29b	`- li 8,48 # 48() index`
		5de29b	`-`
		5de29b	`-L(aligned_32byte):`
		5de29b	`- /* if the SO bit (indicating a 32-byte lump) is not set, move along. */`
		5de29b	`- bns cr7,L(aligned_64byte)`
		5de29b	`- lxvd2x 6,0,11`
		5de29b	`- lxvd2x 7,11,6`
		5de29b	`- addi 11,11,32`
		5de29b	`- stxvd2x 6,0,10`
		5de29b	`- stxvd2x 7,10,6`
		5de29b	`- addi 10,10,32`
		5de29b	`-`
		5de29b	`-L(aligned_64byte):`
		5de29b	`- /* if the EQ bit (indicating a 64-byte lump) is not set, move along. */`
		5de29b	`- bne cr7,L(aligned_128setup)`
		5de29b	`- lxvd2x 6,0,11`
		5de29b	`- lxvd2x 7,11,6`
		5de29b	`- lxvd2x 8,11,7`
		5de29b	`- lxvd2x 9,11,8`
		5de29b	`- addi 11,11,64`
		5de29b	`- stxvd2x 6,0,10`
		5de29b	`- stxvd2x 7,10,6`
		5de29b	`- stxvd2x 8,10,7`
		5de29b	`- stxvd2x 9,10,8`
		5de29b	`- addi 10,10,64`
		5de29b	`-`
		5de29b	`-L(aligned_128setup):`
		5de29b	`- /* Set up for the 128-byte at a time copy loop. */`
		5de29b	`- srdi 8,31,7`
		5de29b	`- cmpdi 8,0 # Any 4x lumps left?`
		5de29b	`- beq 3f # if not, move along.`
		5de29b	`- lxvd2x 6,0,11`
		5de29b	`- lxvd2x 7,11,6`
		5de29b	`- mtctr 8 # otherwise, load the ctr and begin.`
		5de29b	`- li 8,48 # 48() index`
		5de29b	`+ li 6,16`
		5de29b	`+ li 7,32`
		5de29b	`+ li 8,48`
		5de29b	`+ mtocrf 0x02,cnt`
		5de29b	`+ srdi 12,cnt,7`
		5de29b	`+ cmpdi 12,0`
		5de29b	`+ beq L(aligned_tail)`
		5de29b	`+ lxvd2x 6,0,src`
		5de29b	`+ lxvd2x 7,src,6`
		5de29b	`+ mtctr 12`
		5de29b	`b L(aligned_128loop)`
		5de29b
		5de29b	`+ .align 4`
		5de29b	`L(aligned_128head):`
		5de29b	`/* for the 2nd + iteration of this loop. */`
		5de29b	`- lxvd2x 6,0,11`
		5de29b	`- lxvd2x 7,11,6`
		5de29b	`+ lxvd2x 6,0,src`
		5de29b	`+ lxvd2x 7,src,6`
		5de29b	`L(aligned_128loop):`
		5de29b	`- lxvd2x 8,11,7`
		5de29b	`- lxvd2x 9,11,8`
		5de29b	`- stxvd2x 6,0,10`
		5de29b	`- addi 11,11,64`
		5de29b	`- stxvd2x 7,10,6`
		5de29b	`- stxvd2x 8,10,7`
		5de29b	`- stxvd2x 9,10,8`
		5de29b	`- lxvd2x 6,0,11`
		5de29b	`- lxvd2x 7,11,6`
		5de29b	`- addi 10,10,64`
		5de29b	`- lxvd2x 8,11,7`
		5de29b	`- lxvd2x 9,11,8`
		5de29b	`- addi 11,11,64`
		5de29b	`- stxvd2x 6,0,10`
		5de29b	`- stxvd2x 7,10,6`
		5de29b	`- stxvd2x 8,10,7`
		5de29b	`- stxvd2x 9,10,8`
		5de29b	`- addi 10,10,64`
		5de29b	`+ lxvd2x 8,src,7`
		5de29b	`+ lxvd2x 9,src,8`
		5de29b	`+ stxvd2x 6,0,dst`
		5de29b	`+ addi src,src,64`
		5de29b	`+ stxvd2x 7,dst,6`
		5de29b	`+ stxvd2x 8,dst,7`
		5de29b	`+ stxvd2x 9,dst,8`
		5de29b	`+ lxvd2x 6,0,src`
		5de29b	`+ lxvd2x 7,src,6`
		5de29b	`+ addi dst,dst,64`
		5de29b	`+ lxvd2x 8,src,7`
		5de29b	`+ lxvd2x 9,src,8`
		5de29b	`+ addi src,src,64`
		5de29b	`+ stxvd2x 6,0,dst`
		5de29b	`+ stxvd2x 7,dst,6`
		5de29b	`+ stxvd2x 8,dst,7`
		5de29b	`+ stxvd2x 9,dst,8`
		5de29b	`+ addi dst,dst,64`
		5de29b	`bdnz L(aligned_128head)`
		5de29b
		5de29b	`-3:`
		5de29b	`- /* Check for tail bytes. */`
		5de29b	`- rldicr 0,31,0,60`
		5de29b	`- mtcrf 0x01,31`
		5de29b	`- beq cr6,0f`
		5de29b	`-`
		5de29b	`-.L9:`
		5de29b	`- add 3,3,0`
		5de29b	`- add 12,12,0`
		5de29b	`-`
		5de29b	`- /* At this point we have a tail of 0-7 bytes and we know that the`
		5de29b	`- destination is doubleword-aligned. */`
		5de29b	`-4: /* Copy 4 bytes. */`
		5de29b	`- bf 29,2f`
		5de29b	`-`
		5de29b	`- lwz 6,0(12)`
		5de29b	`- addi 12,12,4`
		5de29b	`- stw 6,0(3)`
		5de29b	`- addi 3,3,4`
		5de29b	`-2: /* Copy 2 bytes. */`
		5de29b	`- bf 30,1f`
		5de29b	`-`
		5de29b	`- lhz 6,0(12)`
		5de29b	`- addi 12,12,2`
		5de29b	`- sth 6,0(3)`
		5de29b	`- addi 3,3,2`
		5de29b	`-1: /* Copy 1 byte. */`
		5de29b	`- bf 31,0f`
		5de29b	`-`
		5de29b	`- lbz 6,0(12)`
		5de29b	`- stb 6,0(3)`
		5de29b	`-0: /* Return original DST pointer. */`
		5de29b	`- ld 31,-8(1)`
		5de29b	`- ld 3,-16(1)`
		5de29b	`+L(aligned_tail):`
		5de29b	`+ mtocrf 0x01,cnt`
		5de29b	`+ bf 25,32f`
		5de29b	`+ lxvd2x 6,0,src`
		5de29b	`+ lxvd2x 7,src,6`
		5de29b	`+ lxvd2x 8,src,7`
		5de29b	`+ lxvd2x 9,src,8`
		5de29b	`+ addi src,src,64`
		5de29b	`+ stxvd2x 6,0,dst`
		5de29b	`+ stxvd2x 7,dst,6`
		5de29b	`+ stxvd2x 8,dst,7`
		5de29b	`+ stxvd2x 9,dst,8`
		5de29b	`+ addi dst,dst,64`
		5de29b	`+32:`
		5de29b	`+ bf 26,16f`
		5de29b	`+ lxvd2x 6,0,src`
		5de29b	`+ lxvd2x 7,src,6`
		5de29b	`+ addi src,src,32`
		5de29b	`+ stxvd2x 6,0,dst`
		5de29b	`+ stxvd2x 7,dst,6`
		5de29b	`+ addi dst,dst,32`
		5de29b	`+16:`
		5de29b	`+ bf 27,8f`
		5de29b	`+ lxvd2x 6,0,src`
		5de29b	`+ addi src,src,16`
		5de29b	`+ stxvd2x 6,0,dst`
		5de29b	`+ addi dst,dst,16`
		5de29b	`+8:`
		5de29b	`+ bf 28,4f`
		5de29b	`+ ld 6,0(src)`
		5de29b	`+ addi src,src,8`
		5de29b	`+ std 6,0(dst)`
		5de29b	`+ addi dst,dst,8`
		5de29b	`+4: /* Copies 4~7 bytes. */`
		5de29b	`+ bf 29,L(tail2)`
		5de29b	`+ lwz 6,0(src)`
		5de29b	`+ stw 6,0(dst)`
		5de29b	`+ bf 30,L(tail5)`
		5de29b	`+ lhz 7,4(src)`
		5de29b	`+ sth 7,4(dst)`
		5de29b	`+ bflr 31`
		5de29b	`+ lbz 8,6(src)`
		5de29b	`+ stb 8,6(dst)`
		5de29b	`+ /* Return original DST pointer. */`
		5de29b	`blr`
		5de29b
		5de29b	`- /* Handle copies of 0~31 bytes. */`
		5de29b	`- .align 4`
		5de29b	`+`
		5de29b	`+/* Handle copies of 0~31 bytes. */`
		5de29b	`+ .align 4`
		5de29b	`L(copy_LT_32):`
		5de29b	`- cmpldi cr6,5,8`
		5de29b	`- mr 12,4`
		5de29b	`- mtcrf 0x01,5`
		5de29b	`+ mr dst,3`
		5de29b	`+ cmpldi cr6,cnt,8`
		5de29b	`+ mtocrf 0x01,cnt`
		5de29b	`ble cr6,L(copy_LE_8)`
		5de29b
		5de29b	`/* At least 9 bytes to go. */`
		5de29b	`neg 8,4`
		5de29b	`- clrrdi 11,4,2`
		5de29b	`- andi. 0,8,3`
		5de29b	`- cmpldi cr1,5,16`
		5de29b	`- mr 10,5`
		5de29b	`+ andi. 0,8,3`
		5de29b	`+ cmpldi cr1,cnt,16`
		5de29b	`beq L(copy_LT_32_aligned)`
		5de29b
		5de29b	`- /* Force 4-bytes alignment for SRC. */`
		5de29b	`- mtocrf 0x01,0`
		5de29b	`- subf 10,0,5`
		5de29b	`-2: bf 30,1f`
		5de29b	`-`
		5de29b	`- lhz 6,0(12)`
		5de29b	`- addi 12,12,2`
		5de29b	`- sth 6,0(3)`
		5de29b	`- addi 3,3,2`
		5de29b	`-1: bf 31,L(end_4bytes_alignment)`
		5de29b	`-`
		5de29b	`- lbz 6,0(12)`
		5de29b	`- addi 12,12,1`
		5de29b	`- stb 6,0(3)`
		5de29b	`- addi 3,3,1`
		5de29b	`+ /* Force 4-byte alignment for SRC. */`
		5de29b	`+ mtocrf 0x01,0`
		5de29b	`+ subf cnt,0,cnt`
		5de29b	`+2:`
		5de29b	`+ bf 30,1f`
		5de29b	`+ lhz 6,0(src)`
		5de29b	`+ addi src,src,2`
		5de29b	`+ sth 6,0(dst)`
		5de29b	`+ addi dst,dst,2`
		5de29b	`+1:`
		5de29b	`+ bf 31,L(end_4bytes_alignment)`
		5de29b	`+ lbz 6,0(src)`
		5de29b	`+ addi src,src,1`
		5de29b	`+ stb 6,0(dst)`
		5de29b	`+ addi dst,dst,1`
		5de29b
		5de29b	`- .align 4`
		5de29b	`+ .align 4`
		5de29b	`L(end_4bytes_alignment):`
		5de29b	`- cmpldi cr1,10,16`
		5de29b	`- mtcrf 0x01,10`
		5de29b	`+ cmpldi cr1,cnt,16`
		5de29b	`+ mtocrf 0x01,cnt`
		5de29b
		5de29b	`L(copy_LT_32_aligned):`
		5de29b	`/* At least 6 bytes to go, and SRC is word-aligned. */`
		5de29b	`blt cr1,8f`
		5de29b
		5de29b	`/* Copy 16 bytes. */`
		5de29b	`- lwz 6,0(12)`
		5de29b	`- lwz 7,4(12)`
		5de29b	`- stw 6,0(3)`
		5de29b	`- lwz 8,8(12)`
		5de29b	`- stw 7,4(3)`
		5de29b	`- lwz 6,12(12)`
		5de29b	`- addi 12,12,16`
		5de29b	`- stw 8,8(3)`
		5de29b	`- stw 6,12(3)`
		5de29b	`- addi 3,3,16`
		5de29b	`+ lwz 6,0(src)`
		5de29b	`+ lwz 7,4(src)`
		5de29b	`+ stw 6,0(dst)`
		5de29b	`+ lwz 8,8(src)`
		5de29b	`+ stw 7,4(dst)`
		5de29b	`+ lwz 6,12(src)`
		5de29b	`+ addi src,src,16`
		5de29b	`+ stw 8,8(dst)`
		5de29b	`+ stw 6,12(dst)`
		5de29b	`+ addi dst,dst,16`
		5de29b	`8: /* Copy 8 bytes. */`
		5de29b	`- bf 28,4f`
		5de29b	`+ bf 28,L(tail4)`
		5de29b	`+ lwz 6,0(src)`
		5de29b	`+ lwz 7,4(src)`
		5de29b	`+ addi src,src,8`
		5de29b	`+ stw 6,0(dst)`
		5de29b	`+ stw 7,4(dst)`
		5de29b	`+ addi dst,dst,8`
		5de29b	`+`
		5de29b	`+ .align 4`
		5de29b	`+/* Copies 4~7 bytes. */`
		5de29b	`+L(tail4):`
		5de29b	`+ bf 29,L(tail2)`
		5de29b	`+ lwz 6,0(src)`
		5de29b	`+ stw 6,0(dst)`
		5de29b	`+ bf 30,L(tail5)`
		5de29b	`+ lhz 7,4(src)`
		5de29b	`+ sth 7,4(dst)`
		5de29b	`+ bflr 31`
		5de29b	`+ lbz 8,6(src)`
		5de29b	`+ stb 8,6(dst)`
		5de29b	`+ /* Return original DST pointer. */`
		5de29b	`+ blr`
		5de29b
		5de29b	`- lwz 6,0(12)`
		5de29b	`- lwz 7,4(12)`
		5de29b	`- addi 12,12,8`
		5de29b	`- stw 6,0(3)`
		5de29b	`- stw 7,4(3)`
		5de29b	`- addi 3,3,8`
		5de29b	`-4: /* Copy 4 bytes. */`
		5de29b	`- bf 29,2f`
		5de29b	`-`
		5de29b	`- lwz 6,0(12)`
		5de29b	`- addi 12,12,4`
		5de29b	`- stw 6,0(3)`
		5de29b	`- addi 3,3,4`
		5de29b	`-2: /* Copy 2-3 bytes. */`
		5de29b	`+ .align 4`
		5de29b	`+/* Copies 2~3 bytes. */`
		5de29b	`+L(tail2):`
		5de29b	`bf 30,1f`
		5de29b	`-`
		5de29b	`- lhz 6,0(12)`
		5de29b	`- sth 6,0(3)`
		5de29b	`- bf 31,0f`
		5de29b	`- lbz 7,2(12)`
		5de29b	`- stb 7,2(3)`
		5de29b	`- ld 3,-16(1)`
		5de29b	`+ lhz 6,0(src)`
		5de29b	`+ sth 6,0(dst)`
		5de29b	`+ bflr 31`
		5de29b	`+ lbz 7,2(src)`
		5de29b	`+ stb 7,2(dst)`
		5de29b	`blr`
		5de29b
		5de29b	`- .align 4`
		5de29b	`-1: /* Copy 1 byte. */`
		5de29b	`- bf 31,0f`
		5de29b	`+ .align 4`
		5de29b	`+L(tail5):`
		5de29b	`+ bflr 31`
		5de29b	`+ lbz 6,4(src)`
		5de29b	`+ stb 6,4(dst)`
		5de29b	`+ blr`
		5de29b
		5de29b	`- lbz 6,0(12)`
		5de29b	`- stb 6,0(3)`
		5de29b	`-0: /* Return original DST pointer. */`
		5de29b	`- ld 3,-16(1)`
		5de29b	`+ .align 4`
		5de29b	`+1:`
		5de29b	`+ bflr 31`
		5de29b	`+ lbz 6,0(src)`
		5de29b	`+ stb 6,0(dst)`
		5de29b	`+ /* Return original DST pointer. */`
		5de29b	`blr`
		5de29b
		5de29b	`- /* Handles copies of 0~8 bytes. */`
		5de29b	`- .align 4`
		5de29b	`+`
		5de29b	`+/* Handles copies of 0~8 bytes. */`
		5de29b	`+ .align 4`
		5de29b	`L(copy_LE_8):`
		5de29b	`- bne cr6,4f`
		5de29b	`+ bne cr6,L(tail4)`
		5de29b
		5de29b	`/* Though we could've used ld/std here, they are still`
		5de29b	`slow for unaligned cases. */`
		5de29b
		5de29b	`- lwz 6,0(4)`
		5de29b	`- lwz 7,4(4)`
		5de29b	`- stw 6,0(3)`
		5de29b	`- stw 7,4(3)`
		5de29b	`- ld 3,-16(1) /* Return original DST pointers. */`
		5de29b	`+ lwz 6,0(src)`
		5de29b	`+ lwz 7,4(src)`
		5de29b	`+ stw 6,0(dst)`
		5de29b	`+ stw 7,4(dst)`
		5de29b	`blr`
		5de29b
		5de29b	`- .align 4`
		5de29b	`-4: /* Copies 4~7 bytes. */`
		5de29b	`- bf 29,2b`
		5de29b
		5de29b	`- lwz 6,0(4)`
		5de29b	`- stw 6,0(3)`
		5de29b	`- bf 30,5f`
		5de29b	`- lhz 7,4(4)`
		5de29b	`- sth 7,4(3)`
		5de29b	`- bf 31,0f`
		5de29b	`- lbz 8,6(4)`
		5de29b	`- stb 8,6(3)`
		5de29b	`- ld 3,-16(1)`
		5de29b	`- blr`
		5de29b	`-`
		5de29b	`- .align 4`
		5de29b	`-5: /* Copy 1 byte. */`
		5de29b	`- bf 31,0f`
		5de29b	`-`
		5de29b	`- lbz 6,4(4)`
		5de29b	`- stb 6,4(3)`
		5de29b	`-`
		5de29b	`-0: /* Return original DST pointer. */`
		5de29b	`- ld 3,-16(1)`
		5de29b	`- blr`
		5de29b	`-`
		5de29b	`- /* Handle copies of 32+ bytes where DST is aligned (to quadword) but`
		5de29b	`- SRC is not. Use aligned quadword loads from SRC, shifted to realign`
		5de29b	`- the data, allowing for aligned DST stores. */`
		5de29b	`- .align 4`
		5de29b	`+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but`
		5de29b	`+ SRC is not. Use aligned quadword loads from SRC, shifted to realign`
		5de29b	`+ the data, allowing for aligned DST stores. */`
		5de29b	`+ .align 4`
		5de29b	`L(copy_GE_32_unaligned):`
		5de29b	`- clrldi 0,0,60 /* Number of bytes until the 1st`
		5de29b	`- quadword. */`
		5de29b	`- andi. 11,3,15 /* Check alignment of DST (against`
		5de29b	`- quadwords). */`
		5de29b	`- srdi 9,5,4 /* Number of full quadwords remaining. */`
		5de29b	`+ clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */`
		5de29b	`+#ifndef __LITTLE_ENDIAN__`
		5de29b	`+ andi. 10,3,15 /* Check alignment of DST (against quadwords). */`
		5de29b	`+#endif`
		5de29b	`+ srdi 9,cnt,4 /* Number of full quadwords remaining. */`
		5de29b
		5de29b	`beq L(copy_GE_32_unaligned_cont)`
		5de29b
		5de29b	`- /* SRC is not quadword aligned, get it aligned. */`
		5de29b	`+ /* DST is not quadword aligned, get it aligned. */`
		5de29b
		5de29b	`- mtcrf 0x01,0`
		5de29b	`- subf 31,0,5`
		5de29b	`+ mtocrf 0x01,0`
		5de29b	`+ subf cnt,0,cnt`
		5de29b
		5de29b	`/* Vector instructions work best when proper alignment (16-bytes)`
		5de29b	`is present. Move 0~15 bytes as needed to get DST quadword-aligned. */`
		5de29b	`-1: /* Copy 1 byte. */`
		5de29b	`+1:`
		5de29b	`bf 31,2f`
		5de29b	`-`
		5de29b	`- lbz 6,0(12)`
		5de29b	`- addi 12,12,1`
		5de29b	`- stb 6,0(3)`
		5de29b	`- addi 3,3,1`
		5de29b	`-2: /* Copy 2 bytes. */`
		5de29b	`+ lbz 6,0(src)`
		5de29b	`+ addi src,src,1`
		5de29b	`+ stb 6,0(dst)`
		5de29b	`+ addi dst,dst,1`
		5de29b	`+2:`
		5de29b	`bf 30,4f`
		5de29b	`-`
		5de29b	`- lhz 6,0(12)`
		5de29b	`- addi 12,12,2`
		5de29b	`- sth 6,0(3)`
		5de29b	`- addi 3,3,2`
		5de29b	`-4: /* Copy 4 bytes. */`
		5de29b	`+ lhz 6,0(src)`
		5de29b	`+ addi src,src,2`
		5de29b	`+ sth 6,0(dst)`
		5de29b	`+ addi dst,dst,2`
		5de29b	`+4:`
		5de29b	`bf 29,8f`
		5de29b	`-`
		5de29b	`- lwz 6,0(12)`
		5de29b	`- addi 12,12,4`
		5de29b	`- stw 6,0(3)`
		5de29b	`- addi 3,3,4`
		5de29b	`-8: /* Copy 8 bytes. */`
		5de29b	`+ lwz 6,0(src)`
		5de29b	`+ addi src,src,4`
		5de29b	`+ stw 6,0(dst)`
		5de29b	`+ addi dst,dst,4`
		5de29b	`+8:`
		5de29b	`bf 28,0f`
		5de29b	`-`
		5de29b	`- ld 6,0(12)`
		5de29b	`- addi 12,12,8`
		5de29b	`- std 6,0(3)`
		5de29b	`- addi 3,3,8`
		5de29b	`+ ld 6,0(src)`
		5de29b	`+ addi src,src,8`
		5de29b	`+ std 6,0(dst)`
		5de29b	`+ addi dst,dst,8`
		5de29b	`0:`
		5de29b	`- clrldi 10,12,60 /* Check alignment of SRC. */`
		5de29b	`- srdi 9,31,4 /* Number of full quadwords remaining. */`
		5de29b	`+ srdi 9,cnt,4 /* Number of full quadwords remaining. */`
		5de29b
		5de29b	`/* The proper alignment is present, it is OK to copy the bytes now. */`
		5de29b	`L(copy_GE_32_unaligned_cont):`
		5de29b
		5de29b	`/* Setup two indexes to speed up the indexed vector operations. */`
		5de29b	`- clrldi 11,31,60`
		5de29b	`- li 6,16 /* Index for 16-bytes offsets. */`
		5de29b	`+ clrldi 10,cnt,60`
		5de29b	`+ li 6,16 /* Index for 16-bytes offsets. */`
		5de29b	`li 7,32 /* Index for 32-bytes offsets. */`
		5de29b	`- cmpldi cr1,11,0`
		5de29b	`- srdi 8,31,5 /* Setup the loop counter. */`
		5de29b	`- mr 10,3`
		5de29b	`- mr 11,12`
		5de29b	`- mtcrf 0x01,9`
		5de29b	`- cmpldi cr6,9,1`
		5de29b	`- lvsl 5,0,12`
		5de29b	`- lvx 3,0,12`
		5de29b	`- bf 31,L(setup_unaligned_loop)`
		5de29b	`-`
		5de29b	`- /* Copy another 16 bytes to align to 32-bytes due to the loop . */`
		5de29b	`- lvx 4,12,6`
		5de29b	`- vperm 6,3,4,5`
		5de29b	`- addi 11,12,16`
		5de29b	`- addi 10,3,16`
		5de29b	`- stvx 6,0,3`
		5de29b	`+ cmpldi cr1,10,0`
		5de29b	`+ srdi 8,cnt,5 /* Setup the loop counter. */`
		5de29b	`+ mtocrf 0x01,9`
		5de29b	`+ cmpldi cr6,9,1`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ lvsr 5,0,src`
		5de29b	`+#else`
		5de29b	`+ lvsl 5,0,src`
		5de29b	`+#endif`
		5de29b	`+ lvx 3,0,src`
		5de29b	`+ li 0,0`
		5de29b	`+ bf 31,L(setup_unaligned_loop)`
		5de29b	`+`
		5de29b	`+ /* Copy another 16 bytes to align to 32-bytes due to the loop. */`
		5de29b	`+ lvx 4,src,6`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ vperm 6,4,3,5`
		5de29b	`+#else`
		5de29b	`+ vperm 6,3,4,5`
		5de29b	`+#endif`
		5de29b	`+ addi src,src,16`
		5de29b	`+ stvx 6,0,dst`
		5de29b	`+ addi dst,dst,16`
		5de29b	`vor 3,4,4`
		5de29b	`+ clrrdi 0,src,60`
		5de29b
		5de29b	`L(setup_unaligned_loop):`
		5de29b	`- mtctr 8`
		5de29b	`- ble cr6,L(end_unaligned_loop)`
		5de29b	`+ mtctr 8`
		5de29b	`+ ble cr6,L(end_unaligned_loop)`
		5de29b
		5de29b	`/* Copy 32 bytes at a time using vector instructions. */`
		5de29b	`- .align 4`
		5de29b	`+ .align 4`
		5de29b	`L(unaligned_loop):`
		5de29b
		5de29b	`/* Note: vr6/vr10 may contain data that was already copied,`
		5de29b	`@@ -444,63 +385,56 @@`
		5de29b	`some portions again. This is faster than having unaligned`
		5de29b	`vector instructions though. */`
		5de29b
		5de29b	`- lvx 4,11,6 /* vr4 = r11+16. */`
		5de29b	`- vperm 6,3,4,5 /* Merge the correctly-aligned portions`
		5de29b	`- of vr3/vr4 into vr6. */`
		5de29b	`- lvx 3,11,7 /* vr3 = r11+32. */`
		5de29b	`- vperm 10,4,3,5 /* Merge the correctly-aligned portions`
		5de29b	`- of vr3/vr4 into vr10. */`
		5de29b	`- addi 11,11,32`
		5de29b	`- stvx 6,0,10`
		5de29b	`- stvx 10,10,6`
		5de29b	`- addi 10,10,32`
		5de29b	`-`
		5de29b	`+ lvx 4,src,6`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ vperm 6,4,3,5`
		5de29b	`+#else`
		5de29b	`+ vperm 6,3,4,5`
		5de29b	`+#endif`
		5de29b	`+ lvx 3,src,7`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ vperm 10,3,4,5`
		5de29b	`+#else`
		5de29b	`+ vperm 10,4,3,5`
		5de29b	`+#endif`
		5de29b	`+ addi src,src,32`
		5de29b	`+ stvx 6,0,dst`
		5de29b	`+ stvx 10,dst,6`
		5de29b	`+ addi dst,dst,32`
		5de29b	`bdnz L(unaligned_loop)`
		5de29b
		5de29b	`- .align 4`
		5de29b	`+ clrrdi 0,src,60`
		5de29b	`+`
		5de29b	`+ .align 4`
		5de29b	`L(end_unaligned_loop):`
		5de29b
		5de29b	`/* Check for tail bytes. */`
		5de29b	`- rldicr 0,31,0,59`
		5de29b	`- mtcrf 0x01,31`
		5de29b	`- beq cr1,0f`
		5de29b	`+ mtocrf 0x01,cnt`
		5de29b	`+ beqlr cr1`
		5de29b
		5de29b	`- add 3,3,0`
		5de29b	`- add 12,12,0`
		5de29b	`+ add src,src,0`
		5de29b
		5de29b	`/* We have 1~15 tail bytes to copy, and DST is quadword aligned. */`
		5de29b	`-8: /* Copy 8 bytes. */`
		5de29b	`+ /* Copy 8 bytes. */`
		5de29b	`bf 28,4f`
		5de29b	`-`
		5de29b	`- lwz 6,0(12)`
		5de29b	`- lwz 7,4(12)`
		5de29b	`- addi 12,12,8`
		5de29b	`- stw 6,0(3)`
		5de29b	`- stw 7,4(3)`
		5de29b	`- addi 3,3,8`
		5de29b	`-4: /* Copy 4 bytes. */`
		5de29b	`- bf 29,2f`
		5de29b	`-`
		5de29b	`- lwz 6,0(12)`
		5de29b	`- addi 12,12,4`
		5de29b	`- stw 6,0(3)`
		5de29b	`- addi 3,3,4`
		5de29b	`-2: /* Copy 2~3 bytes. */`
		5de29b	`- bf 30,1f`
		5de29b	`-`
		5de29b	`- lhz 6,0(12)`
		5de29b	`- addi 12,12,2`
		5de29b	`- sth 6,0(3)`
		5de29b	`- addi 3,3,2`
		5de29b	`-1: /* Copy 1 byte. */`
		5de29b	`- bf 31,0f`
		5de29b	`-`
		5de29b	`- lbz 6,0(12)`
		5de29b	`- stb 6,0(3)`
		5de29b	`-0: /* Return original DST pointer. */`
		5de29b	`- ld 31,-8(1)`
		5de29b	`- ld 3,-16(1)`
		5de29b	`+ lwz 6,0(src)`
		5de29b	`+ lwz 7,4(src)`
		5de29b	`+ addi src,src,8`
		5de29b	`+ stw 6,0(dst)`
		5de29b	`+ stw 7,4(dst)`
		5de29b	`+ addi dst,dst,8`
		5de29b	`+4: /* Copy 4~7 bytes. */`
		5de29b	`+ bf 29,L(tail2)`
		5de29b	`+ lwz 6,0(src)`
		5de29b	`+ stw 6,0(dst)`
		5de29b	`+ bf 30,L(tail5)`
		5de29b	`+ lhz 7,4(src)`
		5de29b	`+ sth 7,4(dst)`
		5de29b	`+ bflr 31`
		5de29b	`+ lbz 8,6(src)`
		5de29b	`+ stb 8,6(dst)`
		5de29b	`+ /* Return original DST pointer. */`
		5de29b	`blr`
		5de29b
		5de29b	`-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)`
		5de29b	`+END_GEN_TB (memcpy,TB_TOCLESS)`
		5de29b	`libc_hidden_builtin_def (memcpy)`
		12745e	`diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S`
		12745e	`--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500`
		12745e	`+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500`
		5de29b	`@@ -367,13 +367,21 @@`
		5de29b	`mr 11,12`
		5de29b	`mtcrf 0x01,9`
		5de29b	`cmpldi cr6,9,1`
		5de29b	`- lvsl 5,0,12`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ lvsr 5,0,12`
		5de29b	`+#else`
		5de29b	`+ lvsl 5,0,12`
		5de29b	`+#endif`
		5de29b	`lvx 3,0,12`
		5de29b	`bf 31,L(setup_unaligned_loop)`
		5de29b
		5de29b	`/* Copy another 16 bytes to align to 32-bytes due to the loop . */`
		5de29b	`lvx 4,12,6`
		5de29b	`- vperm 6,3,4,5`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ vperm 6,4,3,5`
		5de29b	`+#else`
		5de29b	`+ vperm 6,3,4,5`
		5de29b	`+#endif`
		5de29b	`addi 11,12,16`
		5de29b	`addi 10,3,16`
		5de29b	`stvx 6,0,3`
		5de29b	`@@ -393,11 +401,17 @@`
		5de29b	`vector instructions though. */`
		5de29b
		5de29b	`lvx 4,11,6 /* vr4 = r11+16. */`
		5de29b	`- vperm 6,3,4,5 /* Merge the correctly-aligned portions`
		5de29b	`- of vr3/vr4 into vr6. */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ vperm 6,4,3,5`
		5de29b	`+#else`
		5de29b	`+ vperm 6,3,4,5`
		5de29b	`+#endif`
		5de29b	`lvx 3,11,7 /* vr3 = r11+32. */`
		5de29b	`- vperm 10,4,3,5 /* Merge the correctly-aligned portions`
		5de29b	`- of vr3/vr4 into vr10. */`
		5de29b	`+#ifdef __LITTLE_ENDIAN__`
		5de29b	`+ vperm 10,3,4,5`
		5de29b	`+#else`
		5de29b	`+ vperm 10,4,3,5`
		5de29b	`+#endif`
		5de29b	`addi 11,11,32`
		5de29b	`stvx 6,0,10`
		5de29b	`stvx 10,10,6`

rpms / glibc

Source Code

Blame SOURCES/glibc-ppc64le-31.patch