Tree - rpms/glibc - CentOS Git server

olga / rpms / glibc

Forked from rpms/glibc 5 years ago

Source
Stats

Blame SOURCES/glibc-ppc64le-31.patch

Blob History Raw

		ce426f	`# commit 759cfef3ac4c07dba1ece0bbc1207e099348816d`
		ce426f	`# Author: Alan Modra <amodra@gmail.com>`
		ce426f	`# Date: Sat Aug 17 18:47:22 2013 +0930`
		ce426f	`#`
		ce426f	`# PowerPC LE memcpy`
		ce426f	`# http://sourceware.org/ml/libc-alpha/2013-08/msg00103.html`
		ce426f	`#`
		ce426f	`# Little-endian support for memcpy. I spent some time cleaning up the`
		ce426f	`# 64-bit power7 memcpy, in order to avoid the extra alignment traps`
		ce426f	`# power7 takes for little-endian. It probably would have been better`
		ce426f	`# to copy the linux kernel version of memcpy.`
		ce426f	`#`
		ce426f	`# * sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support.`
		ce426f	`# * sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.`
		ce426f	`# * sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.`
		ce426f	`# * sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise.`
		ce426f	`# * sysdeps/powerpc/powerpc64/memcpy.S: Likewise.`
		ce426f	`# * sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise.`
		ce426f	`# * sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise.`
		ce426f	`# * sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.`
		ce426f	`# * sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise. Make better`
		ce426f	`# use of regs. Use power7 mtocrf. Tidy function tails.`
		ce426f	`#`
		ce426f	`diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S`
		ce426f	`--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		ce426f	`+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		ce426f	`@@ -205,15 +205,28 @@`
		ce426f	`blt cr6,5f`
		ce426f	`srwi 7,6,16`
		ce426f	`bgt cr6,3f`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ sth 7,0(3)`
		ce426f	`+#else`
		ce426f	`sth 6,0(3)`
		ce426f	`+#endif`
		ce426f	`b 7f`
		ce426f	`.align 4`
		ce426f	`3:`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ rotlwi 6,6,24`
		ce426f	`+ stb 6,0(3)`
		ce426f	`+ sth 7,1(3)`
		ce426f	`+#else`
		ce426f	`stb 7,0(3)`
		ce426f	`sth 6,1(3)`
		ce426f	`+#endif`
		ce426f	`b 7f`
		ce426f	`.align 4`
		ce426f	`5:`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ rotlwi 6,6,8`
		ce426f	`+#endif`
		ce426f	`stb 6,0(3)`
		ce426f	`7:`
		ce426f	`cmplwi cr1,10,16`
		ce426f	`@@ -341,13 +354,23 @@`
		ce426f	`bf 30,1f`
		ce426f
		ce426f	`/* there are at least two words to copy, so copy them */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srw 0,6,10`
		ce426f	`+ slw 8,7,9`
		ce426f	`+#else`
		ce426f	`slw 0,6,10 /* shift 1st src word to left align it in R0 */`
		ce426f	`srw 8,7,9 /* shift 2nd src word to right align it in R8 */`
		ce426f	`+#endif`
		ce426f	`or 0,0,8 /* or them to get word to store */`
		ce426f	`lwz 6,8(5) /* load the 3rd src word */`
		ce426f	`stw 0,0(4) /* store the 1st dst word */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srw 0,7,10`
		ce426f	`+ slw 8,6,9`
		ce426f	`+#else`
		ce426f	`slw 0,7,10 /* now left align 2nd src word into R0 */`
		ce426f	`srw 8,6,9 /* shift 3rd src word to right align it in R8 */`
		ce426f	`+#endif`
		ce426f	`or 0,0,8 /* or them to get word to store */`
		ce426f	`lwz 7,12(5)`
		ce426f	`stw 0,4(4) /* store the 2nd dst word */`
		ce426f	`@@ -355,8 +378,13 @@`
		ce426f	`addi 5,5,16`
		ce426f	`bf 31,4f`
		ce426f	`/* there is a third word to copy, so copy it */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srw 0,6,10`
		ce426f	`+ slw 8,7,9`
		ce426f	`+#else`
		ce426f	`slw 0,6,10 /* shift 3rd src word to left align it in R0 */`
		ce426f	`srw 8,7,9 /* shift 4th src word to right align it in R8 */`
		ce426f	`+#endif`
		ce426f	`or 0,0,8 /* or them to get word to store */`
		ce426f	`stw 0,0(4) /* store 3rd dst word */`
		ce426f	`mr 6,7`
		ce426f	`@@ -366,8 +394,13 @@`
		ce426f	`b 4f`
		ce426f	`.align 4`
		ce426f	`1:`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srw 0,6,10`
		ce426f	`+ slw 8,7,9`
		ce426f	`+#else`
		ce426f	`slw 0,6,10 /* shift 1st src word to left align it in R0 */`
		ce426f	`srw 8,7,9 /* shift 2nd src word to right align it in R8 */`
		ce426f	`+#endif`
		ce426f	`addi 5,5,8`
		ce426f	`or 0,0,8 /* or them to get word to store */`
		ce426f	`bf 31,4f`
		ce426f	`@@ -380,23 +413,43 @@`
		ce426f	`.align 4`
		ce426f	`4:`
		ce426f	`/* copy 16 bytes at a time */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srw 0,6,10`
		ce426f	`+ slw 8,7,9`
		ce426f	`+#else`
		ce426f	`slw 0,6,10`
		ce426f	`srw 8,7,9`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`lwz 6,0(5)`
		ce426f	`stw 0,0(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srw 0,7,10`
		ce426f	`+ slw 8,6,9`
		ce426f	`+#else`
		ce426f	`slw 0,7,10`
		ce426f	`srw 8,6,9`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`lwz 7,4(5)`
		ce426f	`stw 0,4(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srw 0,6,10`
		ce426f	`+ slw 8,7,9`
		ce426f	`+#else`
		ce426f	`slw 0,6,10`
		ce426f	`srw 8,7,9`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`lwz 6,8(5)`
		ce426f	`stw 0,8(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srw 0,7,10`
		ce426f	`+ slw 8,6,9`
		ce426f	`+#else`
		ce426f	`slw 0,7,10`
		ce426f	`srw 8,6,9`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`lwz 7,12(5)`
		ce426f	`stw 0,12(4)`
		ce426f	`@@ -405,8 +458,13 @@`
		ce426f	`bdnz+ 4b`
		ce426f	`8:`
		ce426f	`/* calculate and store the final word */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srw 0,6,10`
		ce426f	`+ slw 8,7,9`
		ce426f	`+#else`
		ce426f	`slw 0,6,10`
		ce426f	`srw 8,7,9`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`stw 0,0(4)`
		ce426f	`3:`
		ce426f	`diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S`
		ce426f	`--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		ce426f	`+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		ce426f	`@@ -221,15 +221,28 @@`
		ce426f	`blt cr6,5f`
		ce426f	`srwi 7,6,16`
		ce426f	`bgt cr6,3f`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ sth 7,0(3)`
		ce426f	`+#else`
		ce426f	`sth 6,0(3)`
		ce426f	`+#endif`
		ce426f	`b 7f`
		ce426f	`.align 4`
		ce426f	`3:`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ rotlwi 6,6,24`
		ce426f	`+ stb 6,0(3)`
		ce426f	`+ sth 7,1(3)`
		ce426f	`+#else`
		ce426f	`stb 7,0(3)`
		ce426f	`sth 6,1(3)`
		ce426f	`+#endif`
		ce426f	`b 7f`
		ce426f	`.align 4`
		ce426f	`5:`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ rotlwi 6,6,8`
		ce426f	`+#endif`
		ce426f	`stb 6,0(3)`
		ce426f	`7:`
		ce426f	`cmplwi cr1,10,16`
		ce426f	`@@ -579,7 +592,11 @@`
		ce426f	`lwz 6,-1(4)`
		ce426f	`cmplwi cr6,31,4`
		ce426f	`srwi 8,31,5 /* calculate the 32 byte loop count */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srwi 6,6,8`
		ce426f	`+#else`
		ce426f	`slwi 6,6,8`
		ce426f	`+#endif`
		ce426f	`clrlwi 31,31,27 /* The remaining bytes, < 32. */`
		ce426f	`blt cr5,L(wdu1_32tail)`
		ce426f	`mtctr 8`
		ce426f	`@@ -587,8 +604,12 @@`
		ce426f
		ce426f	`lwz 8,3(4)`
		ce426f	`lwz 7,4(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ rldimi 6,8,24,32`
		ce426f	`+#else`
		ce426f	`/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */`
		ce426f	`rlwimi 6,8,8,(32-8),31`
		ce426f	`+#endif`
		ce426f	`b L(wdu1_loop32x)`
		ce426f	`.align 4`
		ce426f	`L(wdu1_loop32):`
		ce426f	`@@ -597,8 +618,12 @@`
		ce426f	`lwz 7,4(4)`
		ce426f	`stw 10,-8(3)`
		ce426f	`stw 11,-4(3)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ rldimi 6,8,24,32`
		ce426f	`+#else`
		ce426f	`/* Equivalent to srwi 8,8,32-8; or 6,6,8 */`
		ce426f	`rlwimi 6,8,8,(32-8),31`
		ce426f	`+#endif`
		ce426f	`L(wdu1_loop32x):`
		ce426f	`lwz 10,8(4)`
		ce426f	`lwz 11,12(4)`
		ce426f	`@@ -615,7 +640,11 @@`
		ce426f	`stw 6,16(3)`
		ce426f	`stw 7,20(3)`
		ce426f	`addi 3,3,32`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srwi 6,8,8`
		ce426f	`+#else`
		ce426f	`slwi 6,8,8`
		ce426f	`+#endif`
		ce426f	`bdnz+ L(wdu1_loop32)`
		ce426f	`stw 10,-8(3)`
		ce426f	`stw 11,-4(3)`
		ce426f	`@@ -626,8 +655,12 @@`
		ce426f	`blt cr6,L(wdu_4tail)`
		ce426f	`/* calculate and store the final word */`
		ce426f	`lwz 8,3(4)`
		ce426f	`-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ rldimi 6,8,24,32`
		ce426f	`+#else`
		ce426f	`+/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */`
		ce426f	`rlwimi 6,8,8,(32-8),31`
		ce426f	`+#endif`
		ce426f	`b L(wdu_32tailx)`
		ce426f
		ce426f	`L(wdu2_32):`
		ce426f	`@@ -635,7 +668,11 @@`
		ce426f	`lwz 6,-2(4)`
		ce426f	`cmplwi cr6,31,4`
		ce426f	`srwi 8,31,5 /* calculate the 32 byte loop count */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srwi 6,6,16`
		ce426f	`+#else`
		ce426f	`slwi 6,6,16`
		ce426f	`+#endif`
		ce426f	`clrlwi 31,31,27 /* The remaining bytes, < 32. */`
		ce426f	`blt cr5,L(wdu2_32tail)`
		ce426f	`mtctr 8`
		ce426f	`@@ -643,8 +680,11 @@`
		ce426f
		ce426f	`lwz 8,2(4)`
		ce426f	`lwz 7,4(4)`
		ce426f	`-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ rldimi 6,8,16,32`
		ce426f	`+#else`
		ce426f	`rlwimi 6,8,16,(32-16),31`
		ce426f	`+#endif`
		ce426f	`b L(wdu2_loop32x)`
		ce426f	`.align 4`
		ce426f	`L(wdu2_loop32):`
		ce426f	`@@ -653,8 +693,11 @@`
		ce426f	`lwz 7,4(4)`
		ce426f	`stw 10,-8(3)`
		ce426f	`stw 11,-4(3)`
		ce426f	`-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ rldimi 6,8,16,32`
		ce426f	`+#else`
		ce426f	`rlwimi 6,8,16,(32-16),31`
		ce426f	`+#endif`
		ce426f	`L(wdu2_loop32x):`
		ce426f	`lwz 10,8(4)`
		ce426f	`lwz 11,12(4)`
		ce426f	`@@ -672,7 +715,11 @@`
		ce426f	`stw 6,16(3)`
		ce426f	`stw 7,20(3)`
		ce426f	`addi 3,3,32`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srwi 6,8,16`
		ce426f	`+#else`
		ce426f	`slwi 6,8,16`
		ce426f	`+#endif`
		ce426f	`bdnz+ L(wdu2_loop32)`
		ce426f	`stw 10,-8(3)`
		ce426f	`stw 11,-4(3)`
		ce426f	`@@ -683,8 +730,11 @@`
		ce426f	`blt cr6,L(wdu_4tail)`
		ce426f	`/* calculate and store the final word */`
		ce426f	`lwz 8,2(4)`
		ce426f	`-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ rldimi 6,8,16,32`
		ce426f	`+#else`
		ce426f	`rlwimi 6,8,16,(32-16),31`
		ce426f	`+#endif`
		ce426f	`b L(wdu_32tailx)`
		ce426f
		ce426f	`L(wdu3_32):`
		ce426f	`@@ -692,7 +742,11 @@`
		ce426f	`lwz 6,-3(4)`
		ce426f	`cmplwi cr6,31,4`
		ce426f	`srwi 8,31,5 /* calculate the 32 byte loop count */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srwi 6,6,24`
		ce426f	`+#else`
		ce426f	`slwi 6,6,24`
		ce426f	`+#endif`
		ce426f	`clrlwi 31,31,27 /* The remaining bytes, < 32. */`
		ce426f	`blt cr5,L(wdu3_32tail)`
		ce426f	`mtctr 8`
		ce426f	`@@ -700,8 +754,11 @@`
		ce426f
		ce426f	`lwz 8,1(4)`
		ce426f	`lwz 7,4(4)`
		ce426f	`-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ rldimi 6,8,8,32`
		ce426f	`+#else`
		ce426f	`rlwimi 6,8,24,(32-24),31`
		ce426f	`+#endif`
		ce426f	`b L(wdu3_loop32x)`
		ce426f	`.align 4`
		ce426f	`L(wdu3_loop32):`
		ce426f	`@@ -710,8 +767,11 @@`
		ce426f	`lwz 7,4(4)`
		ce426f	`stw 10,-8(3)`
		ce426f	`stw 11,-4(3)`
		ce426f	`-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ rldimi 6,8,8,32`
		ce426f	`+#else`
		ce426f	`rlwimi 6,8,24,(32-24),31`
		ce426f	`+#endif`
		ce426f	`L(wdu3_loop32x):`
		ce426f	`lwz 10,8(4)`
		ce426f	`lwz 11,12(4)`
		ce426f	`@@ -728,7 +788,11 @@`
		ce426f	`stw 6,16(3)`
		ce426f	`stw 7,20(3)`
		ce426f	`addi 3,3,32`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srwi 6,8,24`
		ce426f	`+#else`
		ce426f	`slwi 6,8,24`
		ce426f	`+#endif`
		ce426f	`bdnz+ L(wdu3_loop32)`
		ce426f	`stw 10,-8(3)`
		ce426f	`stw 11,-4(3)`
		ce426f	`@@ -739,8 +803,11 @@`
		ce426f	`blt cr6,L(wdu_4tail)`
		ce426f	`/* calculate and store the final word */`
		ce426f	`lwz 8,1(4)`
		ce426f	`-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ rldimi 6,8,8,32`
		ce426f	`+#else`
		ce426f	`rlwimi 6,8,24,(32-24),31`
		ce426f	`+#endif`
		ce426f	`b L(wdu_32tailx)`
		ce426f	`.align 4`
		ce426f	`L(wdu_32tailx):`
		ce426f	`diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S`
		ce426f	`--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		ce426f	`+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		ce426f	`@@ -385,7 +385,7 @@`
		ce426f
		ce426f	`beq L(copy_GE_32_unaligned_cont)`
		ce426f
		ce426f	`- /* SRC is not quadword aligned, get it aligned. */`
		ce426f	`+ /* DST is not quadword aligned, get it aligned. */`
		ce426f
		ce426f	`mtcrf 0x01,0`
		ce426f	`subf 31,0,5`
		ce426f	`@@ -437,13 +437,21 @@`
		ce426f	`mr 11,12`
		ce426f	`mtcrf 0x01,9`
		ce426f	`cmplwi cr6,9,1`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ lvsr 5,0,12`
		ce426f	`+#else`
		ce426f	`lvsl 5,0,12`
		ce426f	`+#endif`
		ce426f	`lvx 3,0,12`
		ce426f	`bf 31,L(setup_unaligned_loop)`
		ce426f
		ce426f	`/* Copy another 16 bytes to align to 32-bytes due to the loop . */`
		ce426f	`lvx 4,12,6`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ vperm 6,4,3,5`
		ce426f	`+#else`
		ce426f	`vperm 6,3,4,5`
		ce426f	`+#endif`
		ce426f	`addi 11,12,16`
		ce426f	`addi 10,3,16`
		ce426f	`stvx 6,0,3`
		ce426f	`@@ -463,11 +471,17 @@`
		ce426f	`vector instructions though. */`
		ce426f
		ce426f	`lvx 4,11,6 /* vr4 = r11+16. */`
		ce426f	`- vperm 6,3,4,5 /* Merge the correctly-aligned portions`
		ce426f	`- of vr3/vr4 into vr6. */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ vperm 6,4,3,5`
		ce426f	`+#else`
		ce426f	`+ vperm 6,3,4,5`
		ce426f	`+#endif`
		ce426f	`lvx 3,11,7 /* vr3 = r11+32. */`
		ce426f	`- vperm 10,4,3,5 /* Merge the correctly-aligned portions`
		ce426f	`- of vr3/vr4 into vr10. */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ vperm 10,3,4,5`
		ce426f	`+#else`
		ce426f	`+ vperm 10,4,3,5`
		ce426f	`+#endif`
		ce426f	`addi 11,11,32`
		ce426f	`stvx 6,0,10`
		ce426f	`stvx 10,10,6`
		ce426f	`diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S`
		ce426f	`--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500`
		ce426f	`+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500`
		ce426f	`@@ -327,7 +327,7 @@`
		ce426f
		ce426f	`beq L(copy_GE_32_unaligned_cont)`
		ce426f
		ce426f	`- /* SRC is not quadword aligned, get it aligned. */`
		ce426f	`+ /* DST is not quadword aligned, get it aligned. */`
		ce426f
		ce426f	`mtcrf 0x01,0`
		ce426f	`subf 31,0,5`
		ce426f	`@@ -379,13 +379,21 @@`
		ce426f	`mr 11,12`
		ce426f	`mtcrf 0x01,9`
		ce426f	`cmplwi cr6,9,1`
		ce426f	`- lvsl 5,0,12`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ lvsr 5,0,12`
		ce426f	`+#else`
		ce426f	`+ lvsl 5,0,12`
		ce426f	`+#endif`
		ce426f	`lvx 3,0,12`
		ce426f	`bf 31,L(setup_unaligned_loop)`
		ce426f
		ce426f	`/* Copy another 16 bytes to align to 32-bytes due to the loop . */`
		ce426f	`lvx 4,12,6`
		ce426f	`- vperm 6,3,4,5`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ vperm 6,4,3,5`
		ce426f	`+#else`
		ce426f	`+ vperm 6,3,4,5`
		ce426f	`+#endif`
		ce426f	`addi 11,12,16`
		ce426f	`addi 10,3,16`
		ce426f	`stvx 6,0,3`
		ce426f	`@@ -405,11 +413,17 @@`
		ce426f	`vector instructions though. */`
		ce426f
		ce426f	`lvx 4,11,6 /* vr4 = r11+16. */`
		ce426f	`- vperm 6,3,4,5 /* Merge the correctly-aligned portions`
		ce426f	`- of vr3/vr4 into vr6. */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ vperm 6,4,3,5`
		ce426f	`+#else`
		ce426f	`+ vperm 6,3,4,5`
		ce426f	`+#endif`
		ce426f	`lvx 3,11,7 /* vr3 = r11+32. */`
		ce426f	`- vperm 10,4,3,5 /* Merge the correctly-aligned portions`
		ce426f	`- of vr3/vr4 into vr10. */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ vperm 10,3,4,5`
		ce426f	`+#else`
		ce426f	`+ vperm 10,4,3,5`
		ce426f	`+#endif`
		ce426f	`addi 11,11,32`
		ce426f	`stvx 6,0,10`
		ce426f	`stvx 10,10,6`
		ce426f	`diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S`
		ce426f	`--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		ce426f	`+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		ce426f	`@@ -214,15 +214,28 @@`
		ce426f	`blt cr6,5f`
		ce426f	`srdi 7,6,16`
		ce426f	`bgt cr6,3f`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ sth 7,0(3)`
		ce426f	`+#else`
		ce426f	`sth 6,0(3)`
		ce426f	`+#endif`
		ce426f	`b 7f`
		ce426f	`.align 4`
		ce426f	`3:`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ rotlwi 6,6,24`
		ce426f	`+ stb 6,0(3)`
		ce426f	`+ sth 7,1(3)`
		ce426f	`+#else`
		ce426f	`stb 7,0(3)`
		ce426f	`sth 6,1(3)`
		ce426f	`+#endif`
		ce426f	`b 7f`
		ce426f	`.align 4`
		ce426f	`5:`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ rotlwi 6,6,8`
		ce426f	`+#endif`
		ce426f	`stb 6,0(3)`
		ce426f	`7:`
		ce426f	`cmpldi cr1,10,16`
		ce426f	`@@ -330,7 +343,11 @@`
		ce426f	`ld 7,8(5)`
		ce426f	`subfic 9,10,64`
		ce426f	`beq 2f`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srd 0,6,10`
		ce426f	`+#else`
		ce426f	`sld 0,6,10`
		ce426f	`+#endif`
		ce426f	`cmpldi 11,1`
		ce426f	`mr 6,7`
		ce426f	`addi 4,4,-8`
		ce426f	`@@ -338,15 +355,25 @@`
		ce426f	`b 1f`
		ce426f	`2: addi 5,5,8`
		ce426f	`.align 4`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+0: srd 0,6,10`
		ce426f	`+ sld 8,7,9`
		ce426f	`+#else`
		ce426f	`0: sld 0,6,10`
		ce426f	`srd 8,7,9`
		ce426f	`+#endif`
		ce426f	`cmpldi 11,2`
		ce426f	`ld 6,8(5)`
		ce426f	`or 0,0,8`
		ce426f	`addi 11,11,-2`
		ce426f	`std 0,0(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srd 0,7,10`
		ce426f	`+1: sld 8,6,9`
		ce426f	`+#else`
		ce426f	`sld 0,7,10`
		ce426f	`1: srd 8,6,9`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`beq 8f`
		ce426f	`ld 7,16(5)`
		ce426f	`diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S`
		ce426f	`--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		ce426f	`+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S 2014-05-29 13:05:51.000000000 -0500`
		ce426f	`@@ -1,5 +1,5 @@`
		ce426f	`/* Optimized memcpy implementation for PowerPC64.`
		ce426f	`- Copyright (C) 2003, 2006, 2011 Free Software Foundation, Inc.`
		ce426f	`+ Copyright (C) 2003-2014 Free Software Foundation, Inc.`
		ce426f	`This file is part of the GNU C Library.`
		ce426f
		ce426f	`The GNU C Library is free software; you can redistribute it and/or`
		ce426f	`@@ -17,26 +17,24 @@`
		ce426f	`<http://www.gnu.org/licenses/>. */`
		ce426f
		ce426f	`#include <sysdep.h>`
		ce426f	`-#include <bp-sym.h>`
		ce426f	`-#include <bp-asm.h>`
		ce426f
		ce426f	`/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);`
		ce426f	`Returns 'dst'.`
		ce426f
		ce426f	`- Memcpy handles short copies (< 32-bytes) using a binary move blocks`
		ce426f	`- (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled`
		ce426f	`- with the appropriate combination of byte and halfword load/stores.`
		ce426f	`- There is minimal effort to optimize the alignment of short moves.`
		ce426f	`+ Memcpy handles short copies (< 32-bytes) using a binary move blocks`
		ce426f	`+ (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled`
		ce426f	`+ with the appropriate combination of byte and halfword load/stores.`
		ce426f	`+ There is minimal effort to optimize the alignment of short moves.`
		ce426f	`The 64-bit implementations of POWER3 and POWER4 do a reasonable job`
		ce426f	`- of handling unligned load/stores that do not cross 32-byte boundries.`
		ce426f	`+ of handling unaligned load/stores that do not cross 32-byte boundaries.`
		ce426f
		ce426f	`Longer moves (>= 32-bytes) justify the effort to get at least the`
		ce426f	`destination doubleword (8-byte) aligned. Further optimization is`
		ce426f	`- posible when both source and destination are doubleword aligned.`
		ce426f	`+ possible when both source and destination are doubleword aligned.`
		ce426f	`Each case has a optimized unrolled loop. */`
		ce426f
		ce426f	`.machine power4`
		ce426f	`-EALIGN (BP_SYM (memcpy), 5, 0)`
		ce426f	`+EALIGN (memcpy, 5, 0)`
		ce426f	`CALL_MCOUNT 3`
		ce426f
		ce426f	`cmpldi cr1,5,31`
		ce426f	`@@ -44,20 +42,20 @@`
		ce426f	`std 3,-16(1)`
		ce426f	`std 31,-8(1)`
		ce426f	`cfi_offset(31,-8)`
		ce426f	`- andi. 11,3,7 /* check alignement of dst. */`
		ce426f	`+ andi. 11,3,7 /* check alignment of dst. */`
		ce426f	`clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */`
		ce426f	`- clrldi 10,4,61 /* check alignement of src. */`
		ce426f	`+ clrldi 10,4,61 /* check alignment of src. */`
		ce426f	`cmpldi cr6,5,8`
		ce426f	`ble- cr1,.L2 /* If move < 32 bytes use short move code. */`
		ce426f	`- cmpld cr6,10,11`
		ce426f	`+ cmpld cr6,10,11`
		ce426f	`mr 12,4`
		ce426f	`srdi 9,5,3 /* Number of full double words remaining. */`
		ce426f	`mtcrf 0x01,0`
		ce426f	`mr 31,5`
		ce426f	`beq .L0`
		ce426f	`-`
		ce426f	`+`
		ce426f	`subf 31,0,5`
		ce426f	`- /* Move 0-7 bytes as needed to get the destination doubleword alligned. */`
		ce426f	`+ /* Move 0-7 bytes as needed to get the destination doubleword aligned. */`
		ce426f	`1: bf 31,2f`
		ce426f	`lbz 6,0(12)`
		ce426f	`addi 12,12,1`
		ce426f	`@@ -74,17 +72,17 @@`
		ce426f	`stw 6,0(3)`
		ce426f	`addi 3,3,4`
		ce426f	`0:`
		ce426f	`- clrldi 10,12,61 /* check alignement of src again. */`
		ce426f	`+ clrldi 10,12,61 /* check alignment of src again. */`
		ce426f	`srdi 9,31,3 /* Number of full double words remaining. */`
		ce426f	`-`
		ce426f	`- /* Copy doublewords from source to destination, assumpting the`
		ce426f	`+`
		ce426f	`+ /* Copy doublewords from source to destination, assuming the`
		ce426f	`destination is aligned on a doubleword boundary.`
		ce426f
		ce426f	`At this point we know there are at least 25 bytes left (32-7) to copy.`
		ce426f	`- The next step is to determine if the source is also doubleword aligned.`
		ce426f	`+ The next step is to determine if the source is also doubleword aligned.`
		ce426f	`If not branch to the unaligned move code at .L6. which uses`
		ce426f	`a load, shift, store strategy.`
		ce426f	`-`
		ce426f	`+`
		ce426f	`Otherwise source and destination are doubleword aligned, and we can`
		ce426f	`the optimized doubleword copy loop. */`
		ce426f	`.L0:`
		ce426f	`@@ -97,14 +95,14 @@`
		ce426f	`Use a unrolled loop to copy 4 doubleword (32-bytes) per iteration.`
		ce426f	`If the copy is not an exact multiple of 32 bytes, 1-3`
		ce426f	`doublewords are copied as needed to set up the main loop. After`
		ce426f	`- the main loop exits there may be a tail of 1-7 bytes. These byte are`
		ce426f	`+ the main loop exits there may be a tail of 1-7 bytes. These byte are`
		ce426f	`copied a word/halfword/byte at a time as needed to preserve alignment. */`
		ce426f
		ce426f	`srdi 8,31,5`
		ce426f	`cmpldi cr1,9,4`
		ce426f	`cmpldi cr6,11,0`
		ce426f	`mr 11,12`
		ce426f	`-`
		ce426f	`+`
		ce426f	`bf 30,1f`
		ce426f	`ld 6,0(12)`
		ce426f	`ld 7,8(12)`
		ce426f	`@@ -115,7 +113,7 @@`
		ce426f	`addi 10,3,16`
		ce426f	`bf 31,4f`
		ce426f	`ld 0,16(12)`
		ce426f	`- std 0,16(3)`
		ce426f	`+ std 0,16(3)`
		ce426f	`blt cr1,3f`
		ce426f	`addi 11,12,24`
		ce426f	`addi 10,3,24`
		ce426f	`@@ -129,7 +127,7 @@`
		ce426f	`addi 11,12,8`
		ce426f	`std 6,0(3)`
		ce426f	`addi 10,3,8`
		ce426f	`-`
		ce426f	`+`
		ce426f	`.align 4`
		ce426f	`4:`
		ce426f	`ld 6,0(11)`
		ce426f	`@@ -144,7 +142,7 @@`
		ce426f	`std 0,24(10)`
		ce426f	`addi 10,10,32`
		ce426f	`bdnz 4b`
		ce426f	`-3:`
		ce426f	`+3:`
		ce426f
		ce426f	`rldicr 0,31,0,60`
		ce426f	`mtcrf 0x01,31`
		ce426f	`@@ -152,9 +150,9 @@`
		ce426f	`.L9:`
		ce426f	`add 3,3,0`
		ce426f	`add 12,12,0`
		ce426f	`-`
		ce426f	`+`
		ce426f	`/* At this point we have a tail of 0-7 bytes and we know that the`
		ce426f	`- destiniation is double word aligned. */`
		ce426f	`+ destination is double word aligned. */`
		ce426f	`4: bf 29,2f`
		ce426f	`lwz 6,0(12)`
		ce426f	`addi 12,12,4`
		ce426f	`@@ -173,29 +171,29 @@`
		ce426f	`ld 31,-8(1)`
		ce426f	`ld 3,-16(1)`
		ce426f	`blr`
		ce426f	`-`
		ce426f	`-/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31`
		ce426f	`- bytes. Each case is handled without loops, using binary (1,2,4,8)`
		ce426f	`- tests.`
		ce426f	`-`
		ce426f	`+`
		ce426f	`+/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31`
		ce426f	`+ bytes. Each case is handled without loops, using binary (1,2,4,8)`
		ce426f	`+ tests.`
		ce426f	`+`
		ce426f	`In the short (0-8 byte) case no attempt is made to force alignment`
		ce426f	`- of either source or destination. The hardware will handle the`
		ce426f	`- unaligned load/stores with small delays for crossing 32- 64-byte, and`
		ce426f	`+ of either source or destination. The hardware will handle the`
		ce426f	`+ unaligned load/stores with small delays for crossing 32- 64-byte, and`
		ce426f	`4096-byte boundaries. Since these short moves are unlikely to be`
		ce426f	`- unaligned or cross these boundaries, the overhead to force`
		ce426f	`+ unaligned or cross these boundaries, the overhead to force`
		ce426f	`alignment is not justified.`
		ce426f	`-`
		ce426f	`+`
		ce426f	`The longer (9-31 byte) move is more likely to cross 32- or 64-byte`
		ce426f	`boundaries. Since only loads are sensitive to the 32-/64-byte`
		ce426f	`- boundaries it is more important to align the source then the`
		ce426f	`+ boundaries it is more important to align the source then the`
		ce426f	`destination. If the source is not already word aligned, we first`
		ce426f	`- move 1-3 bytes as needed. Since we are only word aligned we don't`
		ce426f	`- use double word load/stores to insure that all loads are aligned.`
		ce426f	`+ move 1-3 bytes as needed. Since we are only word aligned we don't`
		ce426f	`+ use double word load/stores to insure that all loads are aligned.`
		ce426f	`While the destination and stores may still be unaligned, this`
		ce426f	`is only an issue for page (4096 byte boundary) crossing, which`
		ce426f	`should be rare for these short moves. The hardware handles this`
		ce426f	`- case automatically with a small delay. */`
		ce426f	`-`
		ce426f	`+ case automatically with a small delay. */`
		ce426f	`+`
		ce426f	`.align 4`
		ce426f	`.L2:`
		ce426f	`mtcrf 0x01,5`
		ce426f	`@@ -216,15 +214,28 @@`
		ce426f	`blt cr6,5f`
		ce426f	`srdi 7,6,16`
		ce426f	`bgt cr6,3f`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ sth 7,0(3)`
		ce426f	`+#else`
		ce426f	`sth 6,0(3)`
		ce426f	`+#endif`
		ce426f	`b 7f`
		ce426f	`.align 4`
		ce426f	`3:`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ rotlwi 6,6,24`
		ce426f	`+ stb 6,0(3)`
		ce426f	`+ sth 7,1(3)`
		ce426f	`+#else`
		ce426f	`stb 7,0(3)`
		ce426f	`sth 6,1(3)`
		ce426f	`+#endif`
		ce426f	`b 7f`
		ce426f	`.align 4`
		ce426f	`5:`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ rotlwi 6,6,8`
		ce426f	`+#endif`
		ce426f	`stb 6,0(3)`
		ce426f	`7:`
		ce426f	`cmpldi cr1,10,16`
		ce426f	`@@ -258,11 +269,11 @@`
		ce426f	`lwz 6,0(12)`
		ce426f	`addi 12,12,4`
		ce426f	`stw 6,0(3)`
		ce426f	`- addi 3,3,4`
		ce426f	`+ addi 3,3,4`
		ce426f	`2: /* Move 2-3 bytes. */`
		ce426f	`bf 30,1f`
		ce426f	`lhz 6,0(12)`
		ce426f	`- sth 6,0(3)`
		ce426f	`+ sth 6,0(3)`
		ce426f	`bf 31,0f`
		ce426f	`lbz 7,2(12)`
		ce426f	`stb 7,2(3)`
		ce426f	`@@ -283,8 +294,8 @@`
		ce426f	`mr 12,4`
		ce426f	`bne cr6,4f`
		ce426f	`/* Would have liked to use use ld/std here but the 630 processors are`
		ce426f	`- slow for load/store doubles that are not at least word aligned.`
		ce426f	`- Unaligned Load/Store word execute with only a 1 cycle penaltity. */`
		ce426f	`+ slow for load/store doubles that are not at least word aligned.`
		ce426f	`+ Unaligned Load/Store word execute with only a 1 cycle penalty. */`
		ce426f	`lwz 6,0(4)`
		ce426f	`lwz 7,4(4)`
		ce426f	`stw 6,0(3)`
		ce426f	`@@ -299,14 +310,14 @@`
		ce426f	`6:`
		ce426f	`bf 30,5f`
		ce426f	`lhz 7,4(4)`
		ce426f	`- sth 7,4(3)`
		ce426f	`+ sth 7,4(3)`
		ce426f	`bf 31,0f`
		ce426f	`lbz 8,6(4)`
		ce426f	`stb 8,6(3)`
		ce426f	`ld 3,-16(1)`
		ce426f	`blr`
		ce426f	`.align 4`
		ce426f	`-5:`
		ce426f	`+5:`
		ce426f	`bf 31,0f`
		ce426f	`lbz 6,4(4)`
		ce426f	`stb 6,4(3)`
		ce426f	`@@ -336,13 +347,23 @@`
		ce426f	`bf 30,1f`
		ce426f
		ce426f	`/* there are at least two DWs to copy */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srd 0,6,10`
		ce426f	`+ sld 8,7,9`
		ce426f	`+#else`
		ce426f	`sld 0,6,10`
		ce426f	`srd 8,7,9`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,16(5)`
		ce426f	`std 0,0(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srd 0,7,10`
		ce426f	`+ sld 8,6,9`
		ce426f	`+#else`
		ce426f	`sld 0,7,10`
		ce426f	`srd 8,6,9`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,24(5)`
		ce426f	`std 0,8(4)`
		ce426f	`@@ -351,8 +372,13 @@`
		ce426f	`blt cr6,8f /* if total DWs = 3, then bypass loop */`
		ce426f	`bf 31,4f`
		ce426f	`/* there is a third DW to copy */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srd 0,6,10`
		ce426f	`+ sld 8,7,9`
		ce426f	`+#else`
		ce426f	`sld 0,6,10`
		ce426f	`srd 8,7,9`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`std 0,0(4)`
		ce426f	`mr 6,7`
		ce426f	`@@ -363,8 +389,13 @@`
		ce426f	`b 4f`
		ce426f	`.align 4`
		ce426f	`1:`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srd 0,6,10`
		ce426f	`+ sld 8,7,9`
		ce426f	`+#else`
		ce426f	`sld 0,6,10`
		ce426f	`srd 8,7,9`
		ce426f	`+#endif`
		ce426f	`addi 5,5,16`
		ce426f	`or 0,0,8`
		ce426f	`bf 31,4f`
		ce426f	`@@ -375,23 +406,44 @@`
		ce426f	`addi 4,4,8`
		ce426f	`.align 4`
		ce426f	`/* copy 32 bytes at a time */`
		ce426f	`-4: sld 0,6,10`
		ce426f	`+4:`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srd 0,6,10`
		ce426f	`+ sld 8,7,9`
		ce426f	`+#else`
		ce426f	`+ sld 0,6,10`
		ce426f	`srd 8,7,9`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,0(5)`
		ce426f	`std 0,0(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srd 0,7,10`
		ce426f	`+ sld 8,6,9`
		ce426f	`+#else`
		ce426f	`sld 0,7,10`
		ce426f	`srd 8,6,9`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,8(5)`
		ce426f	`std 0,8(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srd 0,6,10`
		ce426f	`+ sld 8,7,9`
		ce426f	`+#else`
		ce426f	`sld 0,6,10`
		ce426f	`srd 8,7,9`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,16(5)`
		ce426f	`std 0,16(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srd 0,7,10`
		ce426f	`+ sld 8,6,9`
		ce426f	`+#else`
		ce426f	`sld 0,7,10`
		ce426f	`srd 8,6,9`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,24(5)`
		ce426f	`std 0,24(4)`
		ce426f	`@@ -401,9 +453,14 @@`
		ce426f	`.align 4`
		ce426f	`8:`
		ce426f	`/* calculate and store the final DW */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srd 0,6,10`
		ce426f	`+ sld 8,7,9`
		ce426f	`+#else`
		ce426f	`sld 0,6,10`
		ce426f	`srd 8,7,9`
		ce426f	`- or 0,0,8`
		ce426f	`+#endif`
		ce426f	`+ or 0,0,8`
		ce426f	`std 0,0(4)`
		ce426f	`3:`
		ce426f	`rldicr 0,31,0,60`
		ce426f	`@@ -413,5 +470,5 @@`
		ce426f	`ld 31,-8(1)`
		ce426f	`ld 3,-16(1)`
		ce426f	`blr`
		ce426f	`-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)`
		ce426f	`+END_GEN_TB (memcpy,TB_TOCLESS)`
		ce426f	`libc_hidden_builtin_def (memcpy)`
		ce426f	`diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S`
		ce426f	`--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		ce426f	`+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S 2014-05-29 13:05:27.000000000 -0500`
		ce426f	`@@ -1,5 +1,5 @@`
		ce426f	`/* Optimized memcpy implementation for PowerPC64.`
		ce426f	`- Copyright (C) 2003, 2006, 2007, 2011 Free Software Foundation, Inc.`
		ce426f	`+ Copyright (C) 2003-2014 Free Software Foundation, Inc.`
		ce426f	`This file is part of the GNU C Library.`
		ce426f
		ce426f	`The GNU C Library is free software; you can redistribute it and/or`
		ce426f	`@@ -17,52 +17,50 @@`
		ce426f	`<http://www.gnu.org/licenses/>. */`
		ce426f
		ce426f	`#include <sysdep.h>`
		ce426f	`-#include <bp-sym.h>`
		ce426f	`-#include <bp-asm.h>`
		ce426f
		ce426f	`/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);`
		ce426f	`Returns 'dst'.`
		ce426f
		ce426f	`- Memcpy handles short copies (< 32-bytes) using a binary move blocks`
		ce426f	`- (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled`
		ce426f	`- with the appropriate combination of byte and halfword load/stores.`
		ce426f	`- There is minimal effort to optimize the alignment of short moves.`
		ce426f	`+ Memcpy handles short copies (< 32 bytes) using binary move blocks`
		ce426f	`+ (no loops) of lwz/stw. The tail (remaining 1-3 bytes) is handled`
		ce426f	`+ with the appropriate combination of byte and halfword load/stores.`
		ce426f	`+ There is minimal effort to optimize the alignment of short moves.`
		ce426f	`The 64-bit implementations of POWER3 and POWER4 do a reasonable job`
		ce426f	`- of handling unligned load/stores that do not cross 32-byte boundries.`
		ce426f	`+ of handling unaligned load/stores that do not cross 32-byte boundaries.`
		ce426f
		ce426f	`Longer moves (>= 32-bytes) justify the effort to get at least the`
		ce426f	`destination doubleword (8-byte) aligned. Further optimization is`
		ce426f	`- posible when both source and destination are doubleword aligned.`
		ce426f	`- Each case has a optimized unrolled loop.`
		ce426f	`-`
		ce426f	`- For POWER6 unaligned loads will take a 20+ cycle hicup for any`
		ce426f	`+ possible when both source and destination are doubleword aligned.`
		ce426f	`+ Each case has an optimized unrolled loop.`
		ce426f	`+`
		ce426f	`+ For POWER6 unaligned loads will take a 20+ cycle hiccup for any`
		ce426f	`L1 cache miss that crosses a 32- or 128-byte boundary. Store`
		ce426f	`- is more forgiving and does not take a hicup until page or`
		ce426f	`- segment boundaries. So we require doubleword alignment for`
		ce426f	`+ is more forgiving and does not take a hiccup until page or`
		ce426f	`+ segment boundaries. So we require doubleword alignment for`
		ce426f	`the source but may take a risk and only require word alignment`
		ce426f	`for the destination. */`
		ce426f
		ce426f	`.machine "power6"`
		ce426f	`-EALIGN (BP_SYM (memcpy), 7, 0)`
		ce426f	`+EALIGN (memcpy, 7, 0)`
		ce426f	`CALL_MCOUNT 3`
		ce426f
		ce426f	`cmpldi cr1,5,31`
		ce426f	`neg 0,3`
		ce426f	`std 3,-16(1)`
		ce426f	`std 31,-8(1)`
		ce426f	`- andi. 11,3,7 /* check alignement of dst. */`
		ce426f	`+ andi. 11,3,7 /* check alignment of dst. */`
		ce426f	`clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */`
		ce426f	`- clrldi 10,4,61 /* check alignement of src. */`
		ce426f	`+ clrldi 10,4,61 /* check alignment of src. */`
		ce426f	`cmpldi cr6,5,8`
		ce426f	`ble- cr1,.L2 /* If move < 32 bytes use short move code. */`
		ce426f	`mtcrf 0x01,0`
		ce426f	`- cmpld cr6,10,11`
		ce426f	`+ cmpld cr6,10,11`
		ce426f	`srdi 9,5,3 /* Number of full double words remaining. */`
		ce426f	`beq .L0`
		ce426f	`-`
		ce426f	`+`
		ce426f	`subf 5,0,5`
		ce426f	`- /* Move 0-7 bytes as needed to get the destination doubleword alligned.`
		ce426f	`- Duplicate some code to maximize fall-throught and minimize agen delays. */`
		ce426f	`+ /* Move 0-7 bytes as needed to get the destination doubleword aligned.`
		ce426f	`+ Duplicate some code to maximize fall-through and minimize agen delays. */`
		ce426f	`1: bf 31,2f`
		ce426f	`lbz 6,0(4)`
		ce426f	`stb 6,0(3)`
		ce426f	`@@ -78,7 +76,7 @@`
		ce426f	`lwz 6,1(4)`
		ce426f	`stw 6,1(3)`
		ce426f	`b 0f`
		ce426f	`-`
		ce426f	`+`
		ce426f	`2: bf 30,4f`
		ce426f	`lhz 6,0(4)`
		ce426f	`sth 6,0(3)`
		ce426f	`@@ -86,26 +84,26 @@`
		ce426f	`lwz 6,2(4)`
		ce426f	`stw 6,2(3)`
		ce426f	`b 0f`
		ce426f	`-`
		ce426f	`+`
		ce426f	`4: bf 29,0f`
		ce426f	`lwz 6,0(4)`
		ce426f	`stw 6,0(3)`
		ce426f	`-0:`
		ce426f	`+0:`
		ce426f	`/* Add the number of bytes until the 1st doubleword of dst to src and dst. */`
		ce426f	`add 4,4,0`
		ce426f	`add 3,3,0`
		ce426f	`-`
		ce426f	`- clrldi 10,4,61 /* check alignement of src again. */`
		ce426f	`+`
		ce426f	`+ clrldi 10,4,61 /* check alignment of src again. */`
		ce426f	`srdi 9,5,3 /* Number of full double words remaining. */`
		ce426f	`-`
		ce426f	`- /* Copy doublewords from source to destination, assumpting the`
		ce426f	`+`
		ce426f	`+ /* Copy doublewords from source to destination, assuming the`
		ce426f	`destination is aligned on a doubleword boundary.`
		ce426f
		ce426f	`At this point we know there are at least 25 bytes left (32-7) to copy.`
		ce426f	`- The next step is to determine if the source is also doubleword aligned.`
		ce426f	`+ The next step is to determine if the source is also doubleword aligned.`
		ce426f	`If not branch to the unaligned move code at .L6. which uses`
		ce426f	`a load, shift, store strategy.`
		ce426f	`-`
		ce426f	`+`
		ce426f	`Otherwise source and destination are doubleword aligned, and we can`
		ce426f	`the optimized doubleword copy loop. */`
		ce426f	`.align 4`
		ce426f	`@@ -123,14 +121,14 @@`
		ce426f	`the main loop exits there may be a tail of 1-7 bytes. These byte`
		ce426f	`are copied a word/halfword/byte at a time as needed to preserve`
		ce426f	`alignment.`
		ce426f	`-`
		ce426f	`+`
		ce426f	`For POWER6 the L1 is store-through and the L2 is store-in. The`
		ce426f	`L2 is clocked at half CPU clock so we can store 16 bytes every`
		ce426f	`other cycle. POWER6 also has a load/store bypass so we can do`
		ce426f	`- load, load, store, store every 2 cycles.`
		ce426f	`-`
		ce426f	`+ load, load, store, store every 2 cycles.`
		ce426f	`+`
		ce426f	`The following code is sensitive to cache line alignment. Do not`
		ce426f	`- make any change with out first making sure thay don't result in`
		ce426f	`+ make any change without first making sure it doesn't result in`
		ce426f	`splitting ld/std pairs across a cache line. */`
		ce426f
		ce426f	`mtcrf 0x02,5`
		ce426f	`@@ -273,7 +271,7 @@`
		ce426f	`std 8,16+96(10)`
		ce426f	`std 0,24+96(10)`
		ce426f	`ble cr5,L(das_loop_e)`
		ce426f	`-`
		ce426f	`+`
		ce426f	`mtctr 12`
		ce426f	`.align 4`
		ce426f	`L(das_loop2):`
		ce426f	`@@ -326,10 +324,10 @@`
		ce426f	`.align 4`
		ce426f	`L(das_tail):`
		ce426f	`beq cr1,0f`
		ce426f	`-`
		ce426f	`+`
		ce426f	`L(das_tail2):`
		ce426f	`/* At this point we have a tail of 0-7 bytes and we know that the`
		ce426f	`- destiniation is double word aligned. */`
		ce426f	`+ destination is double word aligned. */`
		ce426f	`4: bf 29,2f`
		ce426f	`lwz 6,0(4)`
		ce426f	`stw 6,0(3)`
		ce426f	`@@ -344,7 +342,7 @@`
		ce426f	`lbz 6,4(4)`
		ce426f	`stb 6,4(3)`
		ce426f	`b 0f`
		ce426f	`-`
		ce426f	`+`
		ce426f	`2: bf 30,1f`
		ce426f	`lhz 6,0(4)`
		ce426f	`sth 6,0(3)`
		ce426f	`@@ -352,7 +350,7 @@`
		ce426f	`lbz 6,2(4)`
		ce426f	`stb 6,2(3)`
		ce426f	`b 0f`
		ce426f	`-`
		ce426f	`+`
		ce426f	`1: bf 31,0f`
		ce426f	`lbz 6,0(4)`
		ce426f	`stb 6,0(3)`
		ce426f	`@@ -361,7 +359,7 @@`
		ce426f	`ld 3,-16(1)`
		ce426f	`blr`
		ce426f
		ce426f	`-/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31`
		ce426f	`+/* Copy up to 31 bytes. This is divided into two cases: 0-8 bytes and 9-31`
		ce426f	`bytes. Each case is handled without loops, using binary (1,2,4,8)`
		ce426f	`tests.`
		ce426f
		ce426f	`@@ -402,15 +400,28 @@`
		ce426f	`blt cr6,5f`
		ce426f	`srdi 7,6,16`
		ce426f	`bgt cr6,3f`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ sth 7,0(3)`
		ce426f	`+#else`
		ce426f	`sth 6,0(3)`
		ce426f	`+#endif`
		ce426f	`b 7f`
		ce426f	`.align 4`
		ce426f	`3:`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ rotlwi 6,6,24`
		ce426f	`+ stb 6,0(3)`
		ce426f	`+ sth 7,1(3)`
		ce426f	`+#else`
		ce426f	`stb 7,0(3)`
		ce426f	`sth 6,1(3)`
		ce426f	`+#endif`
		ce426f	`b 7f`
		ce426f	`.align 4`
		ce426f	`5:`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ rotlwi 6,6,8`
		ce426f	`+#endif`
		ce426f	`stb 6,0(3)`
		ce426f	`7:`
		ce426f	`cmpldi cr1,10,16`
		ce426f	`@@ -421,7 +432,7 @@`
		ce426f	`/* At least 6 bytes left and the source is word aligned. This allows`
		ce426f	`some speculative loads up front. */`
		ce426f	`/* We need to special case the fall-through because the biggest delays`
		ce426f	`- are due to address computation not being ready in time for the`
		ce426f	`+ are due to address computation not being ready in time for the`
		ce426f	`AGEN. */`
		ce426f	`lwz 6,0(12)`
		ce426f	`lwz 7,4(12)`
		ce426f	`@@ -452,7 +463,7 @@`
		ce426f	`ld 3,-16(1)`
		ce426f	`blr`
		ce426f	`.align 4`
		ce426f	`-L(dus_tail16p8): /* less then 8 bytes left. */`
		ce426f	`+L(dus_tail16p8): /* less than 8 bytes left. */`
		ce426f	`beq cr1,L(dus_tailX) /* exactly 16 bytes, early exit. */`
		ce426f	`cmpldi cr1,10,20`
		ce426f	`bf 29,L(dus_tail16p2)`
		ce426f	`@@ -466,7 +477,7 @@`
		ce426f	`ld 3,-16(1)`
		ce426f	`blr`
		ce426f	`.align 4`
		ce426f	`-L(dus_tail16p4): /* less then 4 bytes left. */`
		ce426f	`+L(dus_tail16p4): /* less than 4 bytes left. */`
		ce426f	`addi 12,12,24`
		ce426f	`addi 3,3,24`
		ce426f	`bgt cr0,L(dus_tail2)`
		ce426f	`@@ -474,7 +485,7 @@`
		ce426f	`ld 3,-16(1)`
		ce426f	`blr`
		ce426f	`.align 4`
		ce426f	`-L(dus_tail16p2): /* 16 bytes moved, less then 4 bytes left. */`
		ce426f	`+L(dus_tail16p2): /* 16 bytes moved, less than 4 bytes left. */`
		ce426f	`addi 12,12,16`
		ce426f	`addi 3,3,16`
		ce426f	`b L(dus_tail2)`
		ce426f	`@@ -499,7 +510,7 @@`
		ce426f	`ld 3,-16(1)`
		ce426f	`blr`
		ce426f	`.align 4`
		ce426f	`-L(dus_tail8p4): /* less then 4 bytes left. */`
		ce426f	`+L(dus_tail8p4): /* less than 4 bytes left. */`
		ce426f	`addi 12,12,8`
		ce426f	`addi 3,3,8`
		ce426f	`bgt cr1,L(dus_tail2)`
		ce426f	`@@ -510,14 +521,14 @@`
		ce426f	`.align 4`
		ce426f	`L(dus_tail4): /* Move 4 bytes. */`
		ce426f	`/* r6 already loaded speculatively. If we are here we know there is`
		ce426f	`- more then 4 bytes left. So there is no need to test. */`
		ce426f	`+ more than 4 bytes left. So there is no need to test. */`
		ce426f	`addi 12,12,4`
		ce426f	`stw 6,0(3)`
		ce426f	`addi 3,3,4`
		ce426f	`L(dus_tail2): /* Move 2-3 bytes. */`
		ce426f	`bf 30,L(dus_tail1)`
		ce426f	`lhz 6,0(12)`
		ce426f	`- sth 6,0(3)`
		ce426f	`+ sth 6,0(3)`
		ce426f	`bf 31,L(dus_tailX)`
		ce426f	`lbz 7,2(12)`
		ce426f	`stb 7,2(3)`
		ce426f	`@@ -537,7 +548,7 @@`
		ce426f	`.LE8:`
		ce426f	`mr 12,4`
		ce426f	`bne cr6,L(dus_4)`
		ce426f	`-/* Exactly 8 bytes. We may cross a 32-/128-byte boundry and take a ~20`
		ce426f	`+/* Exactly 8 bytes. We may cross a 32-/128-byte boundary and take a ~20`
		ce426f	`cycle delay. This case should be rare and any attempt to avoid this`
		ce426f	`would take most of 20 cycles any way. */`
		ce426f	`ld 6,0(4)`
		ce426f	`@@ -552,7 +563,7 @@`
		ce426f	`stw 6,0(3)`
		ce426f	`bf 30,L(dus_5)`
		ce426f	`lhz 7,4(4)`
		ce426f	`- sth 7,4(3)`
		ce426f	`+ sth 7,4(3)`
		ce426f	`bf 31,L(dus_0)`
		ce426f	`lbz 8,6(4)`
		ce426f	`stb 8,6(3)`
		ce426f	`@@ -590,20 +601,31 @@`
		ce426f	`bge cr0, L(du4_do)`
		ce426f	`blt cr5, L(du1_do)`
		ce426f	`beq cr5, L(du2_do)`
		ce426f	`- b L(du3_do)`
		ce426f	`-`
		ce426f	`+ b L(du3_do)`
		ce426f	`+`
		ce426f	`.align 4`
		ce426f	`L(du1_do):`
		ce426f	`bf 30,L(du1_1dw)`
		ce426f
		ce426f	`/* there are at least two DWs to copy */`
		ce426f	`+ /* FIXME: can combine last shift and "or" into "rldimi" */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 8`
		ce426f	`+ sldi 8,7, 64-8`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 8`
		ce426f	`srdi 8,7, 64-8`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,16(5)`
		ce426f	`std 0,0(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 8`
		ce426f	`+ sldi 8,6, 64-8`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 8`
		ce426f	`srdi 8,6, 64-8`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,24(5)`
		ce426f	`std 0,8(4)`
		ce426f	`@@ -612,8 +634,13 @@`
		ce426f	`blt cr6,L(du1_fini) /* if total DWs = 3, then bypass loop */`
		ce426f	`bf 31,L(du1_loop)`
		ce426f	`/* there is a third DW to copy */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 8`
		ce426f	`+ sldi 8,7, 64-8`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 8`
		ce426f	`srdi 8,7, 64-8`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`std 0,0(4)`
		ce426f	`mr 6,7`
		ce426f	`@@ -624,8 +651,13 @@`
		ce426f	`b L(du1_loop)`
		ce426f	`.align 4`
		ce426f	`L(du1_1dw):`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 8`
		ce426f	`+ sldi 8,7, 64-8`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 8`
		ce426f	`srdi 8,7, 64-8`
		ce426f	`+#endif`
		ce426f	`addi 5,5,16`
		ce426f	`or 0,0,8`
		ce426f	`bf 31,L(du1_loop)`
		ce426f	`@@ -637,23 +669,43 @@`
		ce426f	`.align 4`
		ce426f	`/* copy 32 bytes at a time */`
		ce426f	`L(du1_loop):`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 8`
		ce426f	`+ sldi 8,7, 64-8`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 8`
		ce426f	`srdi 8,7, 64-8`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,0(5)`
		ce426f	`std 0,0(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 8`
		ce426f	`+ sldi 8,6, 64-8`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 8`
		ce426f	`srdi 8,6, 64-8`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,8(5)`
		ce426f	`std 0,8(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 8`
		ce426f	`+ sldi 8,7, 64-8`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 8`
		ce426f	`srdi 8,7, 64-8`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,16(5)`
		ce426f	`std 0,16(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 8`
		ce426f	`+ sldi 8,6, 64-8`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 8`
		ce426f	`srdi 8,6, 64-8`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,24(5)`
		ce426f	`std 0,24(4)`
		ce426f	`@@ -663,9 +715,14 @@`
		ce426f	`.align 4`
		ce426f	`L(du1_fini):`
		ce426f	`/* calculate and store the final DW */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 8`
		ce426f	`+ sldi 8,7, 64-8`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 8`
		ce426f	`srdi 8,7, 64-8`
		ce426f	`- or 0,0,8`
		ce426f	`+#endif`
		ce426f	`+ or 0,0,8`
		ce426f	`std 0,0(4)`
		ce426f	`b L(du_done)`
		ce426f
		ce426f	`@@ -674,13 +731,23 @@`
		ce426f	`bf 30,L(du2_1dw)`
		ce426f
		ce426f	`/* there are at least two DWs to copy */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 16`
		ce426f	`+ sldi 8,7, 64-16`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 16`
		ce426f	`srdi 8,7, 64-16`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,16(5)`
		ce426f	`std 0,0(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 16`
		ce426f	`+ sldi 8,6, 64-16`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 16`
		ce426f	`srdi 8,6, 64-16`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,24(5)`
		ce426f	`std 0,8(4)`
		ce426f	`@@ -689,8 +756,13 @@`
		ce426f	`blt cr6,L(du2_fini) /* if total DWs = 3, then bypass loop */`
		ce426f	`bf 31,L(du2_loop)`
		ce426f	`/* there is a third DW to copy */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 16`
		ce426f	`+ sldi 8,7, 64-16`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 16`
		ce426f	`srdi 8,7, 64-16`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`std 0,0(4)`
		ce426f	`mr 6,7`
		ce426f	`@@ -701,8 +773,13 @@`
		ce426f	`b L(du2_loop)`
		ce426f	`.align 4`
		ce426f	`L(du2_1dw):`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 16`
		ce426f	`+ sldi 8,7, 64-16`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 16`
		ce426f	`srdi 8,7, 64-16`
		ce426f	`+#endif`
		ce426f	`addi 5,5,16`
		ce426f	`or 0,0,8`
		ce426f	`bf 31,L(du2_loop)`
		ce426f	`@@ -714,23 +791,43 @@`
		ce426f	`.align 4`
		ce426f	`/* copy 32 bytes at a time */`
		ce426f	`L(du2_loop):`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 16`
		ce426f	`+ sldi 8,7, 64-16`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 16`
		ce426f	`srdi 8,7, 64-16`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,0(5)`
		ce426f	`std 0,0(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 16`
		ce426f	`+ sldi 8,6, 64-16`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 16`
		ce426f	`srdi 8,6, 64-16`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,8(5)`
		ce426f	`std 0,8(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 16`
		ce426f	`+ sldi 8,7, 64-16`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 16`
		ce426f	`srdi 8,7, 64-16`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,16(5)`
		ce426f	`std 0,16(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 16`
		ce426f	`+ sldi 8,6, 64-16`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 16`
		ce426f	`srdi 8,6, 64-16`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,24(5)`
		ce426f	`std 0,24(4)`
		ce426f	`@@ -740,9 +837,14 @@`
		ce426f	`.align 4`
		ce426f	`L(du2_fini):`
		ce426f	`/* calculate and store the final DW */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 16`
		ce426f	`+ sldi 8,7, 64-16`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 16`
		ce426f	`srdi 8,7, 64-16`
		ce426f	`- or 0,0,8`
		ce426f	`+#endif`
		ce426f	`+ or 0,0,8`
		ce426f	`std 0,0(4)`
		ce426f	`b L(du_done)`
		ce426f
		ce426f	`@@ -751,13 +853,23 @@`
		ce426f	`bf 30,L(du3_1dw)`
		ce426f
		ce426f	`/* there are at least two DWs to copy */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 24`
		ce426f	`+ sldi 8,7, 64-24`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 24`
		ce426f	`srdi 8,7, 64-24`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,16(5)`
		ce426f	`std 0,0(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 24`
		ce426f	`+ sldi 8,6, 64-24`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 24`
		ce426f	`srdi 8,6, 64-24`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,24(5)`
		ce426f	`std 0,8(4)`
		ce426f	`@@ -766,8 +878,13 @@`
		ce426f	`blt cr6,L(du3_fini) /* if total DWs = 3, then bypass loop */`
		ce426f	`bf 31,L(du3_loop)`
		ce426f	`/* there is a third DW to copy */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 24`
		ce426f	`+ sldi 8,7, 64-24`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 24`
		ce426f	`srdi 8,7, 64-24`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`std 0,0(4)`
		ce426f	`mr 6,7`
		ce426f	`@@ -778,8 +895,13 @@`
		ce426f	`b L(du3_loop)`
		ce426f	`.align 4`
		ce426f	`L(du3_1dw):`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 24`
		ce426f	`+ sldi 8,7, 64-24`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 24`
		ce426f	`srdi 8,7, 64-24`
		ce426f	`+#endif`
		ce426f	`addi 5,5,16`
		ce426f	`or 0,0,8`
		ce426f	`bf 31,L(du3_loop)`
		ce426f	`@@ -791,23 +913,43 @@`
		ce426f	`.align 4`
		ce426f	`/* copy 32 bytes at a time */`
		ce426f	`L(du3_loop):`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 24`
		ce426f	`+ sldi 8,7, 64-24`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 24`
		ce426f	`srdi 8,7, 64-24`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,0(5)`
		ce426f	`std 0,0(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 24`
		ce426f	`+ sldi 8,6, 64-24`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 24`
		ce426f	`srdi 8,6, 64-24`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,8(5)`
		ce426f	`std 0,8(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 24`
		ce426f	`+ sldi 8,7, 64-24`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 24`
		ce426f	`srdi 8,7, 64-24`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,16(5)`
		ce426f	`std 0,16(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 24`
		ce426f	`+ sldi 8,6, 64-24`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 24`
		ce426f	`srdi 8,6, 64-24`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,24(5)`
		ce426f	`std 0,24(4)`
		ce426f	`@@ -817,9 +959,14 @@`
		ce426f	`.align 4`
		ce426f	`L(du3_fini):`
		ce426f	`/* calculate and store the final DW */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 24`
		ce426f	`+ sldi 8,7, 64-24`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 24`
		ce426f	`srdi 8,7, 64-24`
		ce426f	`- or 0,0,8`
		ce426f	`+#endif`
		ce426f	`+ or 0,0,8`
		ce426f	`std 0,0(4)`
		ce426f	`b L(du_done)`
		ce426f
		ce426f	`@@ -834,13 +981,23 @@`
		ce426f	`bf 30,L(du4_1dw)`
		ce426f
		ce426f	`/* there are at least two DWs to copy */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 32`
		ce426f	`+ sldi 8,7, 64-32`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 32`
		ce426f	`srdi 8,7, 64-32`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,16(5)`
		ce426f	`std 0,0(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 32`
		ce426f	`+ sldi 8,6, 64-32`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 32`
		ce426f	`srdi 8,6, 64-32`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,24(5)`
		ce426f	`std 0,8(4)`
		ce426f	`@@ -849,8 +1006,13 @@`
		ce426f	`blt cr6,L(du4_fini) /* if total DWs = 3, then bypass loop */`
		ce426f	`bf 31,L(du4_loop)`
		ce426f	`/* there is a third DW to copy */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 32`
		ce426f	`+ sldi 8,7, 64-32`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 32`
		ce426f	`srdi 8,7, 64-32`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`std 0,0(4)`
		ce426f	`mr 6,7`
		ce426f	`@@ -861,8 +1023,13 @@`
		ce426f	`b L(du4_loop)`
		ce426f	`.align 4`
		ce426f	`L(du4_1dw):`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 32`
		ce426f	`+ sldi 8,7, 64-32`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 32`
		ce426f	`srdi 8,7, 64-32`
		ce426f	`+#endif`
		ce426f	`addi 5,5,16`
		ce426f	`or 0,0,8`
		ce426f	`bf 31,L(du4_loop)`
		ce426f	`@@ -874,23 +1041,43 @@`
		ce426f	`.align 4`
		ce426f	`/* copy 32 bytes at a time */`
		ce426f	`L(du4_loop):`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 32`
		ce426f	`+ sldi 8,7, 64-32`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 32`
		ce426f	`srdi 8,7, 64-32`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,0(5)`
		ce426f	`std 0,0(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 32`
		ce426f	`+ sldi 8,6, 64-32`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 32`
		ce426f	`srdi 8,6, 64-32`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,8(5)`
		ce426f	`std 0,8(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 32`
		ce426f	`+ sldi 8,7, 64-32`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 32`
		ce426f	`srdi 8,7, 64-32`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,16(5)`
		ce426f	`std 0,16(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 32`
		ce426f	`+ sldi 8,6, 64-32`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 32`
		ce426f	`srdi 8,6, 64-32`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,24(5)`
		ce426f	`std 0,24(4)`
		ce426f	`@@ -900,9 +1087,14 @@`
		ce426f	`.align 4`
		ce426f	`L(du4_fini):`
		ce426f	`/* calculate and store the final DW */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 32`
		ce426f	`+ sldi 8,7, 64-32`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 32`
		ce426f	`srdi 8,7, 64-32`
		ce426f	`- or 0,0,8`
		ce426f	`+#endif`
		ce426f	`+ or 0,0,8`
		ce426f	`std 0,0(4)`
		ce426f	`b L(du_done)`
		ce426f
		ce426f	`@@ -911,13 +1103,23 @@`
		ce426f	`bf 30,L(du5_1dw)`
		ce426f
		ce426f	`/* there are at least two DWs to copy */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 40`
		ce426f	`+ sldi 8,7, 64-40`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 40`
		ce426f	`srdi 8,7, 64-40`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,16(5)`
		ce426f	`std 0,0(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 40`
		ce426f	`+ sldi 8,6, 64-40`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 40`
		ce426f	`srdi 8,6, 64-40`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,24(5)`
		ce426f	`std 0,8(4)`
		ce426f	`@@ -926,8 +1128,13 @@`
		ce426f	`blt cr6,L(du5_fini) /* if total DWs = 3, then bypass loop */`
		ce426f	`bf 31,L(du5_loop)`
		ce426f	`/* there is a third DW to copy */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 40`
		ce426f	`+ sldi 8,7, 64-40`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 40`
		ce426f	`srdi 8,7, 64-40`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`std 0,0(4)`
		ce426f	`mr 6,7`
		ce426f	`@@ -938,8 +1145,13 @@`
		ce426f	`b L(du5_loop)`
		ce426f	`.align 4`
		ce426f	`L(du5_1dw):`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 40`
		ce426f	`+ sldi 8,7, 64-40`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 40`
		ce426f	`srdi 8,7, 64-40`
		ce426f	`+#endif`
		ce426f	`addi 5,5,16`
		ce426f	`or 0,0,8`
		ce426f	`bf 31,L(du5_loop)`
		ce426f	`@@ -951,23 +1163,43 @@`
		ce426f	`.align 4`
		ce426f	`/* copy 32 bytes at a time */`
		ce426f	`L(du5_loop):`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 40`
		ce426f	`+ sldi 8,7, 64-40`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 40`
		ce426f	`srdi 8,7, 64-40`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,0(5)`
		ce426f	`std 0,0(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 40`
		ce426f	`+ sldi 8,6, 64-40`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 40`
		ce426f	`srdi 8,6, 64-40`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,8(5)`
		ce426f	`std 0,8(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 40`
		ce426f	`+ sldi 8,7, 64-40`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 40`
		ce426f	`srdi 8,7, 64-40`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,16(5)`
		ce426f	`std 0,16(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 40`
		ce426f	`+ sldi 8,6, 64-40`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 40`
		ce426f	`srdi 8,6, 64-40`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,24(5)`
		ce426f	`std 0,24(4)`
		ce426f	`@@ -977,9 +1209,14 @@`
		ce426f	`.align 4`
		ce426f	`L(du5_fini):`
		ce426f	`/* calculate and store the final DW */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 40`
		ce426f	`+ sldi 8,7, 64-40`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 40`
		ce426f	`srdi 8,7, 64-40`
		ce426f	`- or 0,0,8`
		ce426f	`+#endif`
		ce426f	`+ or 0,0,8`
		ce426f	`std 0,0(4)`
		ce426f	`b L(du_done)`
		ce426f
		ce426f	`@@ -988,13 +1225,23 @@`
		ce426f	`bf 30,L(du6_1dw)`
		ce426f
		ce426f	`/* there are at least two DWs to copy */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 48`
		ce426f	`+ sldi 8,7, 64-48`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 48`
		ce426f	`srdi 8,7, 64-48`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,16(5)`
		ce426f	`std 0,0(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 48`
		ce426f	`+ sldi 8,6, 64-48`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 48`
		ce426f	`srdi 8,6, 64-48`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,24(5)`
		ce426f	`std 0,8(4)`
		ce426f	`@@ -1003,8 +1250,13 @@`
		ce426f	`blt cr6,L(du6_fini) /* if total DWs = 3, then bypass loop */`
		ce426f	`bf 31,L(du6_loop)`
		ce426f	`/* there is a third DW to copy */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 48`
		ce426f	`+ sldi 8,7, 64-48`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 48`
		ce426f	`srdi 8,7, 64-48`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`std 0,0(4)`
		ce426f	`mr 6,7`
		ce426f	`@@ -1015,8 +1267,13 @@`
		ce426f	`b L(du6_loop)`
		ce426f	`.align 4`
		ce426f	`L(du6_1dw):`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 48`
		ce426f	`+ sldi 8,7, 64-48`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 48`
		ce426f	`srdi 8,7, 64-48`
		ce426f	`+#endif`
		ce426f	`addi 5,5,16`
		ce426f	`or 0,0,8`
		ce426f	`bf 31,L(du6_loop)`
		ce426f	`@@ -1028,23 +1285,43 @@`
		ce426f	`.align 4`
		ce426f	`/* copy 32 bytes at a time */`
		ce426f	`L(du6_loop):`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 48`
		ce426f	`+ sldi 8,7, 64-48`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 48`
		ce426f	`srdi 8,7, 64-48`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,0(5)`
		ce426f	`std 0,0(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 48`
		ce426f	`+ sldi 8,6, 64-48`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 48`
		ce426f	`srdi 8,6, 64-48`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,8(5)`
		ce426f	`std 0,8(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 48`
		ce426f	`+ sldi 8,7, 64-48`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 48`
		ce426f	`srdi 8,7, 64-48`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,16(5)`
		ce426f	`std 0,16(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 48`
		ce426f	`+ sldi 8,6, 64-48`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 48`
		ce426f	`srdi 8,6, 64-48`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,24(5)`
		ce426f	`std 0,24(4)`
		ce426f	`@@ -1054,9 +1331,14 @@`
		ce426f	`.align 4`
		ce426f	`L(du6_fini):`
		ce426f	`/* calculate and store the final DW */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 48`
		ce426f	`+ sldi 8,7, 64-48`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 48`
		ce426f	`srdi 8,7, 64-48`
		ce426f	`- or 0,0,8`
		ce426f	`+#endif`
		ce426f	`+ or 0,0,8`
		ce426f	`std 0,0(4)`
		ce426f	`b L(du_done)`
		ce426f
		ce426f	`@@ -1065,13 +1347,23 @@`
		ce426f	`bf 30,L(du7_1dw)`
		ce426f
		ce426f	`/* there are at least two DWs to copy */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 56`
		ce426f	`+ sldi 8,7, 64-56`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 56`
		ce426f	`srdi 8,7, 64-56`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,16(5)`
		ce426f	`std 0,0(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 56`
		ce426f	`+ sldi 8,6, 64-56`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 56`
		ce426f	`srdi 8,6, 64-56`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,24(5)`
		ce426f	`std 0,8(4)`
		ce426f	`@@ -1080,8 +1372,13 @@`
		ce426f	`blt cr6,L(du7_fini) /* if total DWs = 3, then bypass loop */`
		ce426f	`bf 31,L(du7_loop)`
		ce426f	`/* there is a third DW to copy */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 56`
		ce426f	`+ sldi 8,7, 64-56`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 56`
		ce426f	`srdi 8,7, 64-56`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`std 0,0(4)`
		ce426f	`mr 6,7`
		ce426f	`@@ -1092,8 +1389,13 @@`
		ce426f	`b L(du7_loop)`
		ce426f	`.align 4`
		ce426f	`L(du7_1dw):`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 56`
		ce426f	`+ sldi 8,7, 64-56`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 56`
		ce426f	`srdi 8,7, 64-56`
		ce426f	`+#endif`
		ce426f	`addi 5,5,16`
		ce426f	`or 0,0,8`
		ce426f	`bf 31,L(du7_loop)`
		ce426f	`@@ -1105,23 +1407,43 @@`
		ce426f	`.align 4`
		ce426f	`/* copy 32 bytes at a time */`
		ce426f	`L(du7_loop):`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 56`
		ce426f	`+ sldi 8,7, 64-56`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 56`
		ce426f	`srdi 8,7, 64-56`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,0(5)`
		ce426f	`std 0,0(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 56`
		ce426f	`+ sldi 8,6, 64-56`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 56`
		ce426f	`srdi 8,6, 64-56`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,8(5)`
		ce426f	`std 0,8(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 56`
		ce426f	`+ sldi 8,7, 64-56`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 56`
		ce426f	`srdi 8,7, 64-56`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 6,16(5)`
		ce426f	`std 0,16(4)`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,7, 56`
		ce426f	`+ sldi 8,6, 64-56`
		ce426f	`+#else`
		ce426f	`sldi 0,7, 56`
		ce426f	`srdi 8,6, 64-56`
		ce426f	`+#endif`
		ce426f	`or 0,0,8`
		ce426f	`ld 7,24(5)`
		ce426f	`std 0,24(4)`
		ce426f	`@@ -1131,12 +1453,17 @@`
		ce426f	`.align 4`
		ce426f	`L(du7_fini):`
		ce426f	`/* calculate and store the final DW */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ srdi 0,6, 56`
		ce426f	`+ sldi 8,7, 64-56`
		ce426f	`+#else`
		ce426f	`sldi 0,6, 56`
		ce426f	`srdi 8,7, 64-56`
		ce426f	`- or 0,0,8`
		ce426f	`+#endif`
		ce426f	`+ or 0,0,8`
		ce426f	`std 0,0(4)`
		ce426f	`b L(du_done)`
		ce426f	`-`
		ce426f	`+`
		ce426f	`.align 4`
		ce426f	`L(du_done):`
		ce426f	`rldicr 0,31,0,60`
		ce426f	`@@ -1144,9 +1471,9 @@`
		ce426f	`beq cr1,0f /* If the tail is 0 bytes we are done! */`
		ce426f
		ce426f	`add 3,3,0`
		ce426f	`- add 12,12,0`
		ce426f	`+ add 12,12,0`
		ce426f	`/* At this point we have a tail of 0-7 bytes and we know that the`
		ce426f	`- destiniation is double word aligned. */`
		ce426f	`+ destination is double word aligned. */`
		ce426f	`4: bf 29,2f`
		ce426f	`lwz 6,0(12)`
		ce426f	`addi 12,12,4`
		ce426f	`@@ -1165,5 +1492,5 @@`
		ce426f	`ld 31,-8(1)`
		ce426f	`ld 3,-16(1)`
		ce426f	`blr`
		ce426f	`-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)`
		ce426f	`+END_GEN_TB (memcpy,TB_TOCLESS)`
		ce426f	`libc_hidden_builtin_def (memcpy)`
		ce426f	`diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S`
		ce426f	`--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S 2014-05-29 13:04:56.000000000 -0500`
		ce426f	`+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S 2014-05-29 13:05:40.000000000 -0500`
		ce426f	`@@ -1,5 +1,5 @@`
		ce426f	`/* Optimized memcpy implementation for PowerPC64/POWER7.`
		ce426f	`- Copyright (C) 2010, 2011 Free Software Foundation, Inc.`
		ce426f	`+ Copyright (C) 2010-2014 Free Software Foundation, Inc.`
		ce426f	`Contributed by Luis Machado <luisgpm@br.ibm.com>.`
		ce426f	`This file is part of the GNU C Library.`
		ce426f
		ce426f	`@@ -18,425 +18,366 @@`
		ce426f	`<http://www.gnu.org/licenses/>. */`
		ce426f
		ce426f	`#include <sysdep.h>`
		ce426f	`-#include <bp-sym.h>`
		ce426f	`-#include <bp-asm.h>`
		ce426f
		ce426f
		ce426f	`/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);`
		ce426f	`Returns 'dst'. */`
		ce426f
		ce426f	`+#define dst 11 /* Use r11 so r3 kept unchanged. */`
		ce426f	`+#define src 4`
		ce426f	`+#define cnt 5`
		ce426f	`+`
		ce426f	`.machine power7`
		ce426f	`-EALIGN (BP_SYM (memcpy), 5, 0)`
		ce426f	`+EALIGN (memcpy, 5, 0)`
		ce426f	`CALL_MCOUNT 3`
		ce426f
		ce426f	`- cmpldi cr1,5,31`
		ce426f	`+ cmpldi cr1,cnt,31`
		ce426f	`neg 0,3`
		ce426f	`- std 3,-16(1)`
		ce426f	`- std 31,-8(1)`
		ce426f	`- cfi_offset(31,-8)`
		ce426f	`ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move`
		ce426f	`code. */`
		ce426f
		ce426f	`- andi. 11,3,7 /* Check alignment of DST. */`
		ce426f	`-`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+/* In little-endian mode, power7 takes an alignment trap on any lxvd2x`
		ce426f	`+ or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy`
		ce426f	`+ loop is only used for quadword aligned copies. */`
		ce426f	`+ andi. 10,3,15`
		ce426f	`+ clrldi 11,4,60`
		ce426f	`+#else`
		ce426f	`+ andi. 10,3,7 /* Check alignment of DST. */`
		ce426f	`+ clrldi 11,4,61 /* Check alignment of SRC. */`
		ce426f	`+#endif`
		ce426f	`+ cmpld cr6,10,11 /* SRC and DST alignments match? */`
		ce426f
		ce426f	`- clrldi 10,4,61 /* Check alignment of SRC. */`
		ce426f	`- cmpld cr6,10,11 /* SRC and DST alignments match? */`
		ce426f	`- mr 12,4`
		ce426f	`- mr 31,5`
		ce426f	`+ mr dst,3`
		ce426f	`bne cr6,L(copy_GE_32_unaligned)`
		ce426f	`+ beq L(aligned_copy)`
		ce426f
		ce426f	`- srdi 9,5,3 /* Number of full quadwords remaining. */`
		ce426f	`-`
		ce426f	`- beq L(copy_GE_32_aligned_cont)`
		ce426f	`-`
		ce426f	`- clrldi 0,0,61`
		ce426f	`- mtcrf 0x01,0`
		ce426f	`- subf 31,0,5`
		ce426f	`-`
		ce426f	`- /* Get the SRC aligned to 8 bytes. */`
		ce426f	`-`
		ce426f	`-1: bf 31,2f`
		ce426f	`- lbz 6,0(12)`
		ce426f	`- addi 12,12,1`
		ce426f	`- stb 6,0(3)`
		ce426f	`- addi 3,3,1`
		ce426f	`-2: bf 30,4f`
		ce426f	`- lhz 6,0(12)`
		ce426f	`- addi 12,12,2`
		ce426f	`- sth 6,0(3)`
		ce426f	`- addi 3,3,2`
		ce426f	`-4: bf 29,0f`
		ce426f	`- lwz 6,0(12)`
		ce426f	`- addi 12,12,4`
		ce426f	`- stw 6,0(3)`
		ce426f	`- addi 3,3,4`
		ce426f	`-0:`
		ce426f	`- clrldi 10,12,61 /* Check alignment of SRC again. */`
		ce426f	`- srdi 9,31,3 /* Number of full doublewords remaining. */`
		ce426f	`-`
		ce426f	`-L(copy_GE_32_aligned_cont):`
		ce426f	`-`
		ce426f	`- clrldi 11,31,61`
		ce426f	`- mtcrf 0x01,9`
		ce426f	`-`
		ce426f	`- srdi 8,31,5`
		ce426f	`- cmpldi cr1,9,4`
		ce426f	`- cmpldi cr6,11,0`
		ce426f	`- mr 11,12`
		ce426f	`+ mtocrf 0x01,0`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ clrldi 0,0,60`
		ce426f	`+#else`
		ce426f	`+ clrldi 0,0,61`
		ce426f	`+#endif`
		ce426f
		ce426f	`- /* Copy 1~3 doublewords so the main loop starts`
		ce426f	`- at a multiple of 32 bytes. */`
		ce426f	`-`
		ce426f	`- bf 30,1f`
		ce426f	`- ld 6,0(12)`
		ce426f	`- ld 7,8(12)`
		ce426f	`- addi 11,12,16`
		ce426f	`- mtctr 8`
		ce426f	`- std 6,0(3)`
		ce426f	`- std 7,8(3)`
		ce426f	`- addi 10,3,16`
		ce426f	`- bf 31,4f`
		ce426f	`- ld 0,16(12)`
		ce426f	`- std 0,16(3)`
		ce426f	`- blt cr1,3f`
		ce426f	`- addi 11,12,24`
		ce426f	`- addi 10,3,24`
		ce426f	`- b 4f`
		ce426f	`-`
		ce426f	`- .align 4`
		ce426f	`-1: /* Copy 1 doubleword and set the counter. */`
		ce426f	`- mr 10,3`
		ce426f	`- mtctr 8`
		ce426f	`- bf 31,4f`
		ce426f	`- ld 6,0(12)`
		ce426f	`- addi 11,12,8`
		ce426f	`- std 6,0(3)`
		ce426f	`- addi 10,3,8`
		ce426f	`+/* Get the DST and SRC aligned to 8 bytes (16 for little-endian). */`
		ce426f	`+1:`
		ce426f	`+ bf 31,2f`
		ce426f	`+ lbz 6,0(src)`
		ce426f	`+ addi src,src,1`
		ce426f	`+ stb 6,0(dst)`
		ce426f	`+ addi dst,dst,1`
		ce426f	`+2:`
		ce426f	`+ bf 30,4f`
		ce426f	`+ lhz 6,0(src)`
		ce426f	`+ addi src,src,2`
		ce426f	`+ sth 6,0(dst)`
		ce426f	`+ addi dst,dst,2`
		ce426f	`+4:`
		ce426f	`+ bf 29,8f`
		ce426f	`+ lwz 6,0(src)`
		ce426f	`+ addi src,src,4`
		ce426f	`+ stw 6,0(dst)`
		ce426f	`+ addi dst,dst,4`
		ce426f	`+8:`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ bf 28,16f`
		ce426f	`+ ld 6,0(src)`
		ce426f	`+ addi src,src,8`
		ce426f	`+ std 6,0(dst)`
		ce426f	`+ addi dst,dst,8`
		ce426f	`+16:`
		ce426f	`+#endif`
		ce426f	`+ subf cnt,0,cnt`
		ce426f
		ce426f	`+/* Main aligned copy loop. Copies 128 bytes at a time. */`
		ce426f	`L(aligned_copy):`
		ce426f	`- /* Main aligned copy loop. Copies up to 128-bytes at a time. */`
		ce426f	`- .align 4`
		ce426f	`-4:`
		ce426f	`- /* check for any 32-byte or 64-byte lumps that are outside of a`
		ce426f	`- nice 128-byte range. R8 contains the number of 32-byte`
		ce426f	`- lumps, so drop this into the CR, and use the SO/EQ bits to help`
		ce426f	`- handle the 32- or 64- byte lumps. Then handle the rest with an`
		ce426f	`- unrolled 128-bytes-at-a-time copy loop. */`
		ce426f	`- mtocrf 1,8`
		ce426f	`- li 6,16 # 16() index`
		ce426f	`- li 7,32 # 32() index`
		ce426f	`- li 8,48 # 48() index`
		ce426f	`-`
		ce426f	`-L(aligned_32byte):`
		ce426f	`- /* if the SO bit (indicating a 32-byte lump) is not set, move along. */`
		ce426f	`- bns cr7,L(aligned_64byte)`
		ce426f	`- lxvd2x 6,0,11`
		ce426f	`- lxvd2x 7,11,6`
		ce426f	`- addi 11,11,32`
		ce426f	`- stxvd2x 6,0,10`
		ce426f	`- stxvd2x 7,10,6`
		ce426f	`- addi 10,10,32`
		ce426f	`-`
		ce426f	`-L(aligned_64byte):`
		ce426f	`- /* if the EQ bit (indicating a 64-byte lump) is not set, move along. */`
		ce426f	`- bne cr7,L(aligned_128setup)`
		ce426f	`- lxvd2x 6,0,11`
		ce426f	`- lxvd2x 7,11,6`
		ce426f	`- lxvd2x 8,11,7`
		ce426f	`- lxvd2x 9,11,8`
		ce426f	`- addi 11,11,64`
		ce426f	`- stxvd2x 6,0,10`
		ce426f	`- stxvd2x 7,10,6`
		ce426f	`- stxvd2x 8,10,7`
		ce426f	`- stxvd2x 9,10,8`
		ce426f	`- addi 10,10,64`
		ce426f	`-`
		ce426f	`-L(aligned_128setup):`
		ce426f	`- /* Set up for the 128-byte at a time copy loop. */`
		ce426f	`- srdi 8,31,7`
		ce426f	`- cmpdi 8,0 # Any 4x lumps left?`
		ce426f	`- beq 3f # if not, move along.`
		ce426f	`- lxvd2x 6,0,11`
		ce426f	`- lxvd2x 7,11,6`
		ce426f	`- mtctr 8 # otherwise, load the ctr and begin.`
		ce426f	`- li 8,48 # 48() index`
		ce426f	`+ li 6,16`
		ce426f	`+ li 7,32`
		ce426f	`+ li 8,48`
		ce426f	`+ mtocrf 0x02,cnt`
		ce426f	`+ srdi 12,cnt,7`
		ce426f	`+ cmpdi 12,0`
		ce426f	`+ beq L(aligned_tail)`
		ce426f	`+ lxvd2x 6,0,src`
		ce426f	`+ lxvd2x 7,src,6`
		ce426f	`+ mtctr 12`
		ce426f	`b L(aligned_128loop)`
		ce426f
		ce426f	`+ .align 4`
		ce426f	`L(aligned_128head):`
		ce426f	`/* for the 2nd + iteration of this loop. */`
		ce426f	`- lxvd2x 6,0,11`
		ce426f	`- lxvd2x 7,11,6`
		ce426f	`+ lxvd2x 6,0,src`
		ce426f	`+ lxvd2x 7,src,6`
		ce426f	`L(aligned_128loop):`
		ce426f	`- lxvd2x 8,11,7`
		ce426f	`- lxvd2x 9,11,8`
		ce426f	`- stxvd2x 6,0,10`
		ce426f	`- addi 11,11,64`
		ce426f	`- stxvd2x 7,10,6`
		ce426f	`- stxvd2x 8,10,7`
		ce426f	`- stxvd2x 9,10,8`
		ce426f	`- lxvd2x 6,0,11`
		ce426f	`- lxvd2x 7,11,6`
		ce426f	`- addi 10,10,64`
		ce426f	`- lxvd2x 8,11,7`
		ce426f	`- lxvd2x 9,11,8`
		ce426f	`- addi 11,11,64`
		ce426f	`- stxvd2x 6,0,10`
		ce426f	`- stxvd2x 7,10,6`
		ce426f	`- stxvd2x 8,10,7`
		ce426f	`- stxvd2x 9,10,8`
		ce426f	`- addi 10,10,64`
		ce426f	`+ lxvd2x 8,src,7`
		ce426f	`+ lxvd2x 9,src,8`
		ce426f	`+ stxvd2x 6,0,dst`
		ce426f	`+ addi src,src,64`
		ce426f	`+ stxvd2x 7,dst,6`
		ce426f	`+ stxvd2x 8,dst,7`
		ce426f	`+ stxvd2x 9,dst,8`
		ce426f	`+ lxvd2x 6,0,src`
		ce426f	`+ lxvd2x 7,src,6`
		ce426f	`+ addi dst,dst,64`
		ce426f	`+ lxvd2x 8,src,7`
		ce426f	`+ lxvd2x 9,src,8`
		ce426f	`+ addi src,src,64`
		ce426f	`+ stxvd2x 6,0,dst`
		ce426f	`+ stxvd2x 7,dst,6`
		ce426f	`+ stxvd2x 8,dst,7`
		ce426f	`+ stxvd2x 9,dst,8`
		ce426f	`+ addi dst,dst,64`
		ce426f	`bdnz L(aligned_128head)`
		ce426f
		ce426f	`-3:`
		ce426f	`- /* Check for tail bytes. */`
		ce426f	`- rldicr 0,31,0,60`
		ce426f	`- mtcrf 0x01,31`
		ce426f	`- beq cr6,0f`
		ce426f	`-`
		ce426f	`-.L9:`
		ce426f	`- add 3,3,0`
		ce426f	`- add 12,12,0`
		ce426f	`-`
		ce426f	`- /* At this point we have a tail of 0-7 bytes and we know that the`
		ce426f	`- destination is doubleword-aligned. */`
		ce426f	`-4: /* Copy 4 bytes. */`
		ce426f	`- bf 29,2f`
		ce426f	`-`
		ce426f	`- lwz 6,0(12)`
		ce426f	`- addi 12,12,4`
		ce426f	`- stw 6,0(3)`
		ce426f	`- addi 3,3,4`
		ce426f	`-2: /* Copy 2 bytes. */`
		ce426f	`- bf 30,1f`
		ce426f	`-`
		ce426f	`- lhz 6,0(12)`
		ce426f	`- addi 12,12,2`
		ce426f	`- sth 6,0(3)`
		ce426f	`- addi 3,3,2`
		ce426f	`-1: /* Copy 1 byte. */`
		ce426f	`- bf 31,0f`
		ce426f	`-`
		ce426f	`- lbz 6,0(12)`
		ce426f	`- stb 6,0(3)`
		ce426f	`-0: /* Return original DST pointer. */`
		ce426f	`- ld 31,-8(1)`
		ce426f	`- ld 3,-16(1)`
		ce426f	`+L(aligned_tail):`
		ce426f	`+ mtocrf 0x01,cnt`
		ce426f	`+ bf 25,32f`
		ce426f	`+ lxvd2x 6,0,src`
		ce426f	`+ lxvd2x 7,src,6`
		ce426f	`+ lxvd2x 8,src,7`
		ce426f	`+ lxvd2x 9,src,8`
		ce426f	`+ addi src,src,64`
		ce426f	`+ stxvd2x 6,0,dst`
		ce426f	`+ stxvd2x 7,dst,6`
		ce426f	`+ stxvd2x 8,dst,7`
		ce426f	`+ stxvd2x 9,dst,8`
		ce426f	`+ addi dst,dst,64`
		ce426f	`+32:`
		ce426f	`+ bf 26,16f`
		ce426f	`+ lxvd2x 6,0,src`
		ce426f	`+ lxvd2x 7,src,6`
		ce426f	`+ addi src,src,32`
		ce426f	`+ stxvd2x 6,0,dst`
		ce426f	`+ stxvd2x 7,dst,6`
		ce426f	`+ addi dst,dst,32`
		ce426f	`+16:`
		ce426f	`+ bf 27,8f`
		ce426f	`+ lxvd2x 6,0,src`
		ce426f	`+ addi src,src,16`
		ce426f	`+ stxvd2x 6,0,dst`
		ce426f	`+ addi dst,dst,16`
		ce426f	`+8:`
		ce426f	`+ bf 28,4f`
		ce426f	`+ ld 6,0(src)`
		ce426f	`+ addi src,src,8`
		ce426f	`+ std 6,0(dst)`
		ce426f	`+ addi dst,dst,8`
		ce426f	`+4: /* Copies 4~7 bytes. */`
		ce426f	`+ bf 29,L(tail2)`
		ce426f	`+ lwz 6,0(src)`
		ce426f	`+ stw 6,0(dst)`
		ce426f	`+ bf 30,L(tail5)`
		ce426f	`+ lhz 7,4(src)`
		ce426f	`+ sth 7,4(dst)`
		ce426f	`+ bflr 31`
		ce426f	`+ lbz 8,6(src)`
		ce426f	`+ stb 8,6(dst)`
		ce426f	`+ /* Return original DST pointer. */`
		ce426f	`blr`
		ce426f
		ce426f	`- /* Handle copies of 0~31 bytes. */`
		ce426f	`- .align 4`
		ce426f	`+`
		ce426f	`+/* Handle copies of 0~31 bytes. */`
		ce426f	`+ .align 4`
		ce426f	`L(copy_LT_32):`
		ce426f	`- cmpldi cr6,5,8`
		ce426f	`- mr 12,4`
		ce426f	`- mtcrf 0x01,5`
		ce426f	`+ mr dst,3`
		ce426f	`+ cmpldi cr6,cnt,8`
		ce426f	`+ mtocrf 0x01,cnt`
		ce426f	`ble cr6,L(copy_LE_8)`
		ce426f
		ce426f	`/* At least 9 bytes to go. */`
		ce426f	`neg 8,4`
		ce426f	`- clrrdi 11,4,2`
		ce426f	`- andi. 0,8,3`
		ce426f	`- cmpldi cr1,5,16`
		ce426f	`- mr 10,5`
		ce426f	`+ andi. 0,8,3`
		ce426f	`+ cmpldi cr1,cnt,16`
		ce426f	`beq L(copy_LT_32_aligned)`
		ce426f
		ce426f	`- /* Force 4-bytes alignment for SRC. */`
		ce426f	`- mtocrf 0x01,0`
		ce426f	`- subf 10,0,5`
		ce426f	`-2: bf 30,1f`
		ce426f	`-`
		ce426f	`- lhz 6,0(12)`
		ce426f	`- addi 12,12,2`
		ce426f	`- sth 6,0(3)`
		ce426f	`- addi 3,3,2`
		ce426f	`-1: bf 31,L(end_4bytes_alignment)`
		ce426f	`-`
		ce426f	`- lbz 6,0(12)`
		ce426f	`- addi 12,12,1`
		ce426f	`- stb 6,0(3)`
		ce426f	`- addi 3,3,1`
		ce426f	`+ /* Force 4-byte alignment for SRC. */`
		ce426f	`+ mtocrf 0x01,0`
		ce426f	`+ subf cnt,0,cnt`
		ce426f	`+2:`
		ce426f	`+ bf 30,1f`
		ce426f	`+ lhz 6,0(src)`
		ce426f	`+ addi src,src,2`
		ce426f	`+ sth 6,0(dst)`
		ce426f	`+ addi dst,dst,2`
		ce426f	`+1:`
		ce426f	`+ bf 31,L(end_4bytes_alignment)`
		ce426f	`+ lbz 6,0(src)`
		ce426f	`+ addi src,src,1`
		ce426f	`+ stb 6,0(dst)`
		ce426f	`+ addi dst,dst,1`
		ce426f
		ce426f	`- .align 4`
		ce426f	`+ .align 4`
		ce426f	`L(end_4bytes_alignment):`
		ce426f	`- cmpldi cr1,10,16`
		ce426f	`- mtcrf 0x01,10`
		ce426f	`+ cmpldi cr1,cnt,16`
		ce426f	`+ mtocrf 0x01,cnt`
		ce426f
		ce426f	`L(copy_LT_32_aligned):`
		ce426f	`/* At least 6 bytes to go, and SRC is word-aligned. */`
		ce426f	`blt cr1,8f`
		ce426f
		ce426f	`/* Copy 16 bytes. */`
		ce426f	`- lwz 6,0(12)`
		ce426f	`- lwz 7,4(12)`
		ce426f	`- stw 6,0(3)`
		ce426f	`- lwz 8,8(12)`
		ce426f	`- stw 7,4(3)`
		ce426f	`- lwz 6,12(12)`
		ce426f	`- addi 12,12,16`
		ce426f	`- stw 8,8(3)`
		ce426f	`- stw 6,12(3)`
		ce426f	`- addi 3,3,16`
		ce426f	`+ lwz 6,0(src)`
		ce426f	`+ lwz 7,4(src)`
		ce426f	`+ stw 6,0(dst)`
		ce426f	`+ lwz 8,8(src)`
		ce426f	`+ stw 7,4(dst)`
		ce426f	`+ lwz 6,12(src)`
		ce426f	`+ addi src,src,16`
		ce426f	`+ stw 8,8(dst)`
		ce426f	`+ stw 6,12(dst)`
		ce426f	`+ addi dst,dst,16`
		ce426f	`8: /* Copy 8 bytes. */`
		ce426f	`- bf 28,4f`
		ce426f	`+ bf 28,L(tail4)`
		ce426f	`+ lwz 6,0(src)`
		ce426f	`+ lwz 7,4(src)`
		ce426f	`+ addi src,src,8`
		ce426f	`+ stw 6,0(dst)`
		ce426f	`+ stw 7,4(dst)`
		ce426f	`+ addi dst,dst,8`
		ce426f	`+`
		ce426f	`+ .align 4`
		ce426f	`+/* Copies 4~7 bytes. */`
		ce426f	`+L(tail4):`
		ce426f	`+ bf 29,L(tail2)`
		ce426f	`+ lwz 6,0(src)`
		ce426f	`+ stw 6,0(dst)`
		ce426f	`+ bf 30,L(tail5)`
		ce426f	`+ lhz 7,4(src)`
		ce426f	`+ sth 7,4(dst)`
		ce426f	`+ bflr 31`
		ce426f	`+ lbz 8,6(src)`
		ce426f	`+ stb 8,6(dst)`
		ce426f	`+ /* Return original DST pointer. */`
		ce426f	`+ blr`
		ce426f
		ce426f	`- lwz 6,0(12)`
		ce426f	`- lwz 7,4(12)`
		ce426f	`- addi 12,12,8`
		ce426f	`- stw 6,0(3)`
		ce426f	`- stw 7,4(3)`
		ce426f	`- addi 3,3,8`
		ce426f	`-4: /* Copy 4 bytes. */`
		ce426f	`- bf 29,2f`
		ce426f	`-`
		ce426f	`- lwz 6,0(12)`
		ce426f	`- addi 12,12,4`
		ce426f	`- stw 6,0(3)`
		ce426f	`- addi 3,3,4`
		ce426f	`-2: /* Copy 2-3 bytes. */`
		ce426f	`+ .align 4`
		ce426f	`+/* Copies 2~3 bytes. */`
		ce426f	`+L(tail2):`
		ce426f	`bf 30,1f`
		ce426f	`-`
		ce426f	`- lhz 6,0(12)`
		ce426f	`- sth 6,0(3)`
		ce426f	`- bf 31,0f`
		ce426f	`- lbz 7,2(12)`
		ce426f	`- stb 7,2(3)`
		ce426f	`- ld 3,-16(1)`
		ce426f	`+ lhz 6,0(src)`
		ce426f	`+ sth 6,0(dst)`
		ce426f	`+ bflr 31`
		ce426f	`+ lbz 7,2(src)`
		ce426f	`+ stb 7,2(dst)`
		ce426f	`blr`
		ce426f
		ce426f	`- .align 4`
		ce426f	`-1: /* Copy 1 byte. */`
		ce426f	`- bf 31,0f`
		ce426f	`+ .align 4`
		ce426f	`+L(tail5):`
		ce426f	`+ bflr 31`
		ce426f	`+ lbz 6,4(src)`
		ce426f	`+ stb 6,4(dst)`
		ce426f	`+ blr`
		ce426f
		ce426f	`- lbz 6,0(12)`
		ce426f	`- stb 6,0(3)`
		ce426f	`-0: /* Return original DST pointer. */`
		ce426f	`- ld 3,-16(1)`
		ce426f	`+ .align 4`
		ce426f	`+1:`
		ce426f	`+ bflr 31`
		ce426f	`+ lbz 6,0(src)`
		ce426f	`+ stb 6,0(dst)`
		ce426f	`+ /* Return original DST pointer. */`
		ce426f	`blr`
		ce426f
		ce426f	`- /* Handles copies of 0~8 bytes. */`
		ce426f	`- .align 4`
		ce426f	`+`
		ce426f	`+/* Handles copies of 0~8 bytes. */`
		ce426f	`+ .align 4`
		ce426f	`L(copy_LE_8):`
		ce426f	`- bne cr6,4f`
		ce426f	`+ bne cr6,L(tail4)`
		ce426f
		ce426f	`/* Though we could've used ld/std here, they are still`
		ce426f	`slow for unaligned cases. */`
		ce426f
		ce426f	`- lwz 6,0(4)`
		ce426f	`- lwz 7,4(4)`
		ce426f	`- stw 6,0(3)`
		ce426f	`- stw 7,4(3)`
		ce426f	`- ld 3,-16(1) /* Return original DST pointers. */`
		ce426f	`+ lwz 6,0(src)`
		ce426f	`+ lwz 7,4(src)`
		ce426f	`+ stw 6,0(dst)`
		ce426f	`+ stw 7,4(dst)`
		ce426f	`blr`
		ce426f
		ce426f	`- .align 4`
		ce426f	`-4: /* Copies 4~7 bytes. */`
		ce426f	`- bf 29,2b`
		ce426f
		ce426f	`- lwz 6,0(4)`
		ce426f	`- stw 6,0(3)`
		ce426f	`- bf 30,5f`
		ce426f	`- lhz 7,4(4)`
		ce426f	`- sth 7,4(3)`
		ce426f	`- bf 31,0f`
		ce426f	`- lbz 8,6(4)`
		ce426f	`- stb 8,6(3)`
		ce426f	`- ld 3,-16(1)`
		ce426f	`- blr`
		ce426f	`-`
		ce426f	`- .align 4`
		ce426f	`-5: /* Copy 1 byte. */`
		ce426f	`- bf 31,0f`
		ce426f	`-`
		ce426f	`- lbz 6,4(4)`
		ce426f	`- stb 6,4(3)`
		ce426f	`-`
		ce426f	`-0: /* Return original DST pointer. */`
		ce426f	`- ld 3,-16(1)`
		ce426f	`- blr`
		ce426f	`-`
		ce426f	`- /* Handle copies of 32+ bytes where DST is aligned (to quadword) but`
		ce426f	`- SRC is not. Use aligned quadword loads from SRC, shifted to realign`
		ce426f	`- the data, allowing for aligned DST stores. */`
		ce426f	`- .align 4`
		ce426f	`+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but`
		ce426f	`+ SRC is not. Use aligned quadword loads from SRC, shifted to realign`
		ce426f	`+ the data, allowing for aligned DST stores. */`
		ce426f	`+ .align 4`
		ce426f	`L(copy_GE_32_unaligned):`
		ce426f	`- clrldi 0,0,60 /* Number of bytes until the 1st`
		ce426f	`- quadword. */`
		ce426f	`- andi. 11,3,15 /* Check alignment of DST (against`
		ce426f	`- quadwords). */`
		ce426f	`- srdi 9,5,4 /* Number of full quadwords remaining. */`
		ce426f	`+ clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */`
		ce426f	`+#ifndef __LITTLE_ENDIAN__`
		ce426f	`+ andi. 10,3,15 /* Check alignment of DST (against quadwords). */`
		ce426f	`+#endif`
		ce426f	`+ srdi 9,cnt,4 /* Number of full quadwords remaining. */`
		ce426f
		ce426f	`beq L(copy_GE_32_unaligned_cont)`
		ce426f
		ce426f	`- /* SRC is not quadword aligned, get it aligned. */`
		ce426f	`+ /* DST is not quadword aligned, get it aligned. */`
		ce426f
		ce426f	`- mtcrf 0x01,0`
		ce426f	`- subf 31,0,5`
		ce426f	`+ mtocrf 0x01,0`
		ce426f	`+ subf cnt,0,cnt`
		ce426f
		ce426f	`/* Vector instructions work best when proper alignment (16-bytes)`
		ce426f	`is present. Move 0~15 bytes as needed to get DST quadword-aligned. */`
		ce426f	`-1: /* Copy 1 byte. */`
		ce426f	`+1:`
		ce426f	`bf 31,2f`
		ce426f	`-`
		ce426f	`- lbz 6,0(12)`
		ce426f	`- addi 12,12,1`
		ce426f	`- stb 6,0(3)`
		ce426f	`- addi 3,3,1`
		ce426f	`-2: /* Copy 2 bytes. */`
		ce426f	`+ lbz 6,0(src)`
		ce426f	`+ addi src,src,1`
		ce426f	`+ stb 6,0(dst)`
		ce426f	`+ addi dst,dst,1`
		ce426f	`+2:`
		ce426f	`bf 30,4f`
		ce426f	`-`
		ce426f	`- lhz 6,0(12)`
		ce426f	`- addi 12,12,2`
		ce426f	`- sth 6,0(3)`
		ce426f	`- addi 3,3,2`
		ce426f	`-4: /* Copy 4 bytes. */`
		ce426f	`+ lhz 6,0(src)`
		ce426f	`+ addi src,src,2`
		ce426f	`+ sth 6,0(dst)`
		ce426f	`+ addi dst,dst,2`
		ce426f	`+4:`
		ce426f	`bf 29,8f`
		ce426f	`-`
		ce426f	`- lwz 6,0(12)`
		ce426f	`- addi 12,12,4`
		ce426f	`- stw 6,0(3)`
		ce426f	`- addi 3,3,4`
		ce426f	`-8: /* Copy 8 bytes. */`
		ce426f	`+ lwz 6,0(src)`
		ce426f	`+ addi src,src,4`
		ce426f	`+ stw 6,0(dst)`
		ce426f	`+ addi dst,dst,4`
		ce426f	`+8:`
		ce426f	`bf 28,0f`
		ce426f	`-`
		ce426f	`- ld 6,0(12)`
		ce426f	`- addi 12,12,8`
		ce426f	`- std 6,0(3)`
		ce426f	`- addi 3,3,8`
		ce426f	`+ ld 6,0(src)`
		ce426f	`+ addi src,src,8`
		ce426f	`+ std 6,0(dst)`
		ce426f	`+ addi dst,dst,8`
		ce426f	`0:`
		ce426f	`- clrldi 10,12,60 /* Check alignment of SRC. */`
		ce426f	`- srdi 9,31,4 /* Number of full quadwords remaining. */`
		ce426f	`+ srdi 9,cnt,4 /* Number of full quadwords remaining. */`
		ce426f
		ce426f	`/* The proper alignment is present, it is OK to copy the bytes now. */`
		ce426f	`L(copy_GE_32_unaligned_cont):`
		ce426f
		ce426f	`/* Setup two indexes to speed up the indexed vector operations. */`
		ce426f	`- clrldi 11,31,60`
		ce426f	`- li 6,16 /* Index for 16-bytes offsets. */`
		ce426f	`+ clrldi 10,cnt,60`
		ce426f	`+ li 6,16 /* Index for 16-bytes offsets. */`
		ce426f	`li 7,32 /* Index for 32-bytes offsets. */`
		ce426f	`- cmpldi cr1,11,0`
		ce426f	`- srdi 8,31,5 /* Setup the loop counter. */`
		ce426f	`- mr 10,3`
		ce426f	`- mr 11,12`
		ce426f	`- mtcrf 0x01,9`
		ce426f	`- cmpldi cr6,9,1`
		ce426f	`- lvsl 5,0,12`
		ce426f	`- lvx 3,0,12`
		ce426f	`- bf 31,L(setup_unaligned_loop)`
		ce426f	`-`
		ce426f	`- /* Copy another 16 bytes to align to 32-bytes due to the loop . */`
		ce426f	`- lvx 4,12,6`
		ce426f	`- vperm 6,3,4,5`
		ce426f	`- addi 11,12,16`
		ce426f	`- addi 10,3,16`
		ce426f	`- stvx 6,0,3`
		ce426f	`+ cmpldi cr1,10,0`
		ce426f	`+ srdi 8,cnt,5 /* Setup the loop counter. */`
		ce426f	`+ mtocrf 0x01,9`
		ce426f	`+ cmpldi cr6,9,1`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ lvsr 5,0,src`
		ce426f	`+#else`
		ce426f	`+ lvsl 5,0,src`
		ce426f	`+#endif`
		ce426f	`+ lvx 3,0,src`
		ce426f	`+ li 0,0`
		ce426f	`+ bf 31,L(setup_unaligned_loop)`
		ce426f	`+`
		ce426f	`+ /* Copy another 16 bytes to align to 32-bytes due to the loop. */`
		ce426f	`+ lvx 4,src,6`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ vperm 6,4,3,5`
		ce426f	`+#else`
		ce426f	`+ vperm 6,3,4,5`
		ce426f	`+#endif`
		ce426f	`+ addi src,src,16`
		ce426f	`+ stvx 6,0,dst`
		ce426f	`+ addi dst,dst,16`
		ce426f	`vor 3,4,4`
		ce426f	`+ clrrdi 0,src,60`
		ce426f
		ce426f	`L(setup_unaligned_loop):`
		ce426f	`- mtctr 8`
		ce426f	`- ble cr6,L(end_unaligned_loop)`
		ce426f	`+ mtctr 8`
		ce426f	`+ ble cr6,L(end_unaligned_loop)`
		ce426f
		ce426f	`/* Copy 32 bytes at a time using vector instructions. */`
		ce426f	`- .align 4`
		ce426f	`+ .align 4`
		ce426f	`L(unaligned_loop):`
		ce426f
		ce426f	`/* Note: vr6/vr10 may contain data that was already copied,`
		ce426f	`@@ -444,63 +385,56 @@`
		ce426f	`some portions again. This is faster than having unaligned`
		ce426f	`vector instructions though. */`
		ce426f
		ce426f	`- lvx 4,11,6 /* vr4 = r11+16. */`
		ce426f	`- vperm 6,3,4,5 /* Merge the correctly-aligned portions`
		ce426f	`- of vr3/vr4 into vr6. */`
		ce426f	`- lvx 3,11,7 /* vr3 = r11+32. */`
		ce426f	`- vperm 10,4,3,5 /* Merge the correctly-aligned portions`
		ce426f	`- of vr3/vr4 into vr10. */`
		ce426f	`- addi 11,11,32`
		ce426f	`- stvx 6,0,10`
		ce426f	`- stvx 10,10,6`
		ce426f	`- addi 10,10,32`
		ce426f	`-`
		ce426f	`+ lvx 4,src,6`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ vperm 6,4,3,5`
		ce426f	`+#else`
		ce426f	`+ vperm 6,3,4,5`
		ce426f	`+#endif`
		ce426f	`+ lvx 3,src,7`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ vperm 10,3,4,5`
		ce426f	`+#else`
		ce426f	`+ vperm 10,4,3,5`
		ce426f	`+#endif`
		ce426f	`+ addi src,src,32`
		ce426f	`+ stvx 6,0,dst`
		ce426f	`+ stvx 10,dst,6`
		ce426f	`+ addi dst,dst,32`
		ce426f	`bdnz L(unaligned_loop)`
		ce426f
		ce426f	`- .align 4`
		ce426f	`+ clrrdi 0,src,60`
		ce426f	`+`
		ce426f	`+ .align 4`
		ce426f	`L(end_unaligned_loop):`
		ce426f
		ce426f	`/* Check for tail bytes. */`
		ce426f	`- rldicr 0,31,0,59`
		ce426f	`- mtcrf 0x01,31`
		ce426f	`- beq cr1,0f`
		ce426f	`+ mtocrf 0x01,cnt`
		ce426f	`+ beqlr cr1`
		ce426f
		ce426f	`- add 3,3,0`
		ce426f	`- add 12,12,0`
		ce426f	`+ add src,src,0`
		ce426f
		ce426f	`/* We have 1~15 tail bytes to copy, and DST is quadword aligned. */`
		ce426f	`-8: /* Copy 8 bytes. */`
		ce426f	`+ /* Copy 8 bytes. */`
		ce426f	`bf 28,4f`
		ce426f	`-`
		ce426f	`- lwz 6,0(12)`
		ce426f	`- lwz 7,4(12)`
		ce426f	`- addi 12,12,8`
		ce426f	`- stw 6,0(3)`
		ce426f	`- stw 7,4(3)`
		ce426f	`- addi 3,3,8`
		ce426f	`-4: /* Copy 4 bytes. */`
		ce426f	`- bf 29,2f`
		ce426f	`-`
		ce426f	`- lwz 6,0(12)`
		ce426f	`- addi 12,12,4`
		ce426f	`- stw 6,0(3)`
		ce426f	`- addi 3,3,4`
		ce426f	`-2: /* Copy 2~3 bytes. */`
		ce426f	`- bf 30,1f`
		ce426f	`-`
		ce426f	`- lhz 6,0(12)`
		ce426f	`- addi 12,12,2`
		ce426f	`- sth 6,0(3)`
		ce426f	`- addi 3,3,2`
		ce426f	`-1: /* Copy 1 byte. */`
		ce426f	`- bf 31,0f`
		ce426f	`-`
		ce426f	`- lbz 6,0(12)`
		ce426f	`- stb 6,0(3)`
		ce426f	`-0: /* Return original DST pointer. */`
		ce426f	`- ld 31,-8(1)`
		ce426f	`- ld 3,-16(1)`
		ce426f	`+ lwz 6,0(src)`
		ce426f	`+ lwz 7,4(src)`
		ce426f	`+ addi src,src,8`
		ce426f	`+ stw 6,0(dst)`
		ce426f	`+ stw 7,4(dst)`
		ce426f	`+ addi dst,dst,8`
		ce426f	`+4: /* Copy 4~7 bytes. */`
		ce426f	`+ bf 29,L(tail2)`
		ce426f	`+ lwz 6,0(src)`
		ce426f	`+ stw 6,0(dst)`
		ce426f	`+ bf 30,L(tail5)`
		ce426f	`+ lhz 7,4(src)`
		ce426f	`+ sth 7,4(dst)`
		ce426f	`+ bflr 31`
		ce426f	`+ lbz 8,6(src)`
		ce426f	`+ stb 8,6(dst)`
		ce426f	`+ /* Return original DST pointer. */`
		ce426f	`blr`
		ce426f
		ce426f	`-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)`
		ce426f	`+END_GEN_TB (memcpy,TB_TOCLESS)`
		ce426f	`libc_hidden_builtin_def (memcpy)`
		ce426f	`diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S`
		ce426f	`--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500`
		ce426f	`+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500`
		ce426f	`@@ -367,13 +367,21 @@`
		ce426f	`mr 11,12`
		ce426f	`mtcrf 0x01,9`
		ce426f	`cmpldi cr6,9,1`
		ce426f	`- lvsl 5,0,12`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ lvsr 5,0,12`
		ce426f	`+#else`
		ce426f	`+ lvsl 5,0,12`
		ce426f	`+#endif`
		ce426f	`lvx 3,0,12`
		ce426f	`bf 31,L(setup_unaligned_loop)`
		ce426f
		ce426f	`/* Copy another 16 bytes to align to 32-bytes due to the loop . */`
		ce426f	`lvx 4,12,6`
		ce426f	`- vperm 6,3,4,5`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ vperm 6,4,3,5`
		ce426f	`+#else`
		ce426f	`+ vperm 6,3,4,5`
		ce426f	`+#endif`
		ce426f	`addi 11,12,16`
		ce426f	`addi 10,3,16`
		ce426f	`stvx 6,0,3`
		ce426f	`@@ -393,11 +401,17 @@`
		ce426f	`vector instructions though. */`
		ce426f
		ce426f	`lvx 4,11,6 /* vr4 = r11+16. */`
		ce426f	`- vperm 6,3,4,5 /* Merge the correctly-aligned portions`
		ce426f	`- of vr3/vr4 into vr6. */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ vperm 6,4,3,5`
		ce426f	`+#else`
		ce426f	`+ vperm 6,3,4,5`
		ce426f	`+#endif`
		ce426f	`lvx 3,11,7 /* vr3 = r11+32. */`
		ce426f	`- vperm 10,4,3,5 /* Merge the correctly-aligned portions`
		ce426f	`- of vr3/vr4 into vr10. */`
		ce426f	`+#ifdef __LITTLE_ENDIAN__`
		ce426f	`+ vperm 10,3,4,5`
		ce426f	`+#else`
		ce426f	`+ vperm 10,4,3,5`
		ce426f	`+#endif`
		ce426f	`addi 11,11,32`
		ce426f	`stvx 6,0,10`
		ce426f	`stvx 10,10,6`

olga / rpms / glibc

Source Code

Blame SOURCES/glibc-ppc64le-31.patch