The memmove-related fix is dropped from this patch because rhel-7.5
does not have an optimized memmove for POWER7.
commit 63da5cd4a097d089033d980c42254c3356fa723f
Author: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Date: Wed Oct 25 13:13:53 2017 -0200
powerpc: Replace lxvd2x/stxvd2x with lvx/stvx in P7's memcpy/memmove
POWER9 DD2.1 and earlier have an issue where some cache-inhibited
vector loads trap to the kernel, causing a performance degradation. To
handle this in memcpy and memmove, lvx/stvx is used for aligned
addresses instead of lxvd2x/stxvd2x.
Reference: https://patchwork.ozlabs.org/patch/814059/
* sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace
lxvd2x/stxvd2x with lvx/stvx.
* sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise.
Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
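For context, a minimal sketch (not part of the patch) of one aligned 16-byte
copy with each instruction pair; "src" and "dst" stand for the same register
aliases used in the hunks below:

	/* VMX pair used after this change.  lvx/stvx ignore the low four
	   bits of the effective address, so they are only used on the
	   16-byte-aligned path patched here.  */
	lvx	6,0,src
	stvx	6,0,dst

	/* VSX pair being replaced.  Equivalent for aligned addresses, but
	   some cache-inhibited loads of this form trap to the kernel on
	   POWER9 DD2.1 and earlier (see the commit message above).  */
	lxvd2x	6,0,src
	stxvd2x	6,0,dst

Note that operand 6 names a vector register (VR6) for lvx/stvx but a VSX
register (VSR6) for lxvd2x/stxvd2x; the replacement can keep the operand
numbers unchanged because each load is paired with a store through the same
register.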
diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
index 1ccbc2e..a7cdf8b 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -91,63 +91,63 @@ L(aligned_copy):
srdi 12,cnt,7
cmpdi 12,0
beq L(aligned_tail)
- lxvd2x 6,0,src
- lxvd2x 7,src,6
+ lvx 6,0,src
+ lvx 7,src,6
mtctr 12
b L(aligned_128loop)
.align 4
L(aligned_128head):
/* for the 2nd + iteration of this loop. */
- lxvd2x 6,0,src
- lxvd2x 7,src,6
+ lvx 6,0,src
+ lvx 7,src,6
L(aligned_128loop):
- lxvd2x 8,src,7
- lxvd2x 9,src,8
- stxvd2x 6,0,dst
+ lvx 8,src,7
+ lvx 9,src,8
+ stvx 6,0,dst
addi src,src,64
- stxvd2x 7,dst,6
- stxvd2x 8,dst,7
- stxvd2x 9,dst,8
- lxvd2x 6,0,src
- lxvd2x 7,src,6
+ stvx 7,dst,6
+ stvx 8,dst,7
+ stvx 9,dst,8
+ lvx 6,0,src
+ lvx 7,src,6
addi dst,dst,64
- lxvd2x 8,src,7
- lxvd2x 9,src,8
+ lvx 8,src,7
+ lvx 9,src,8
addi src,src,64
- stxvd2x 6,0,dst
- stxvd2x 7,dst,6
- stxvd2x 8,dst,7
- stxvd2x 9,dst,8
+ stvx 6,0,dst
+ stvx 7,dst,6
+ stvx 8,dst,7
+ stvx 9,dst,8
addi dst,dst,64
bdnz L(aligned_128head)
L(aligned_tail):
mtocrf 0x01,cnt
bf 25,32f
- lxvd2x 6,0,src
- lxvd2x 7,src,6
- lxvd2x 8,src,7
- lxvd2x 9,src,8
+ lvx 6,0,src
+ lvx 7,src,6
+ lvx 8,src,7
+ lvx 9,src,8
addi src,src,64
- stxvd2x 6,0,dst
- stxvd2x 7,dst,6
- stxvd2x 8,dst,7
- stxvd2x 9,dst,8
+ stvx 6,0,dst
+ stvx 7,dst,6
+ stvx 8,dst,7
+ stvx 9,dst,8
addi dst,dst,64
32:
bf 26,16f
- lxvd2x 6,0,src
- lxvd2x 7,src,6
+ lvx 6,0,src
+ lvx 7,src,6
addi src,src,32
- stxvd2x 6,0,dst
- stxvd2x 7,dst,6
+ stvx 6,0,dst
+ stvx 7,dst,6
addi dst,dst,32
16:
bf 27,8f
- lxvd2x 6,0,src
+ lvx 6,0,src
addi src,src,16
- stxvd2x 6,0,dst
+ stvx 6,0,dst
addi dst,dst,16
8:
bf 28,4f