ce426f
The memmove-related fix is dropped in this patch because rhel-7.5
ce426f
does not have an optimized memmove for POWER7.
ce426f
ce426f
commit 63da5cd4a097d089033d980c42254c3356fa723f
ce426f
Author: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
ce426f
Date:   Wed Oct 25 13:13:53 2017 -0200
ce426f
ce426f
    powerpc: Replace lxvd2x/stxvd2x with lvx/stvx in P7's memcpy/memmove
ce426f
    
ce426f
    POWER9 DD2.1 and earlier have an issue where some cache-inhibited
ce426f
    vector loads trap to the kernel, causing a performance degradation.  To
ce426f
    handle this in memcpy and memmove, lvx/stvx are used for aligned
ce426f
    addresses instead of lxvd2x/stxvd2x.
ce426f
    
ce426f
    Reference: https://patchwork.ozlabs.org/patch/814059/
ce426f
    
ce426f
            * sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace
ce426f
            lxvd2x/stxvd2x with lvx/stvx.
ce426f
            * sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise.
ce426f
    
ce426f
    Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
ce426f
    Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
ce426f
ce426f
diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
ce426f
index 1ccbc2e..a7cdf8b 100644
ce426f
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
ce426f
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
ce426f
@@ -91,63 +91,63 @@ L(aligned_copy):
ce426f
 	srdi	12,cnt,7
ce426f
 	cmpdi	12,0
ce426f
 	beq	L(aligned_tail)
ce426f
-	lxvd2x	6,0,src
ce426f
-	lxvd2x	7,src,6
ce426f
+	lvx	6,0,src
ce426f
+	lvx	7,src,6
ce426f
 	mtctr	12
ce426f
 	b	L(aligned_128loop)
ce426f
 
ce426f
 	.align  4
ce426f
 L(aligned_128head):
ce426f
 	/* for the 2nd + iteration of this loop. */
ce426f
-	lxvd2x	6,0,src
ce426f
-	lxvd2x	7,src,6
ce426f
+	lvx	6,0,src
ce426f
+	lvx	7,src,6
ce426f
 L(aligned_128loop):
ce426f
-	lxvd2x	8,src,7
ce426f
-	lxvd2x	9,src,8
ce426f
-	stxvd2x	6,0,dst
ce426f
+	lvx	8,src,7
ce426f
+	lvx	9,src,8
ce426f
+	stvx	6,0,dst
ce426f
 	addi	src,src,64
ce426f
-	stxvd2x	7,dst,6
ce426f
-	stxvd2x	8,dst,7
ce426f
-	stxvd2x	9,dst,8
ce426f
-	lxvd2x	6,0,src
ce426f
-	lxvd2x	7,src,6
ce426f
+	stvx	7,dst,6
ce426f
+	stvx	8,dst,7
ce426f
+	stvx	9,dst,8
ce426f
+	lvx	6,0,src
ce426f
+	lvx	7,src,6
ce426f
 	addi	dst,dst,64
ce426f
-	lxvd2x	8,src,7
ce426f
-	lxvd2x	9,src,8
ce426f
+	lvx	8,src,7
ce426f
+	lvx	9,src,8
ce426f
 	addi	src,src,64
ce426f
-	stxvd2x	6,0,dst
ce426f
-	stxvd2x	7,dst,6
ce426f
-	stxvd2x	8,dst,7
ce426f
-	stxvd2x	9,dst,8
ce426f
+	stvx	6,0,dst
ce426f
+	stvx	7,dst,6
ce426f
+	stvx	8,dst,7
ce426f
+	stvx	9,dst,8
ce426f
 	addi	dst,dst,64
ce426f
 	bdnz	L(aligned_128head)
ce426f
 
ce426f
 L(aligned_tail):
ce426f
 	mtocrf	0x01,cnt
ce426f
 	bf	25,32f
ce426f
-	lxvd2x	6,0,src
ce426f
-	lxvd2x	7,src,6
ce426f
-	lxvd2x	8,src,7
ce426f
-	lxvd2x	9,src,8
ce426f
+	lvx	6,0,src
ce426f
+	lvx	7,src,6
ce426f
+	lvx	8,src,7
ce426f
+	lvx	9,src,8
ce426f
 	addi	src,src,64
ce426f
-	stxvd2x	6,0,dst
ce426f
-	stxvd2x	7,dst,6
ce426f
-	stxvd2x	8,dst,7
ce426f
-	stxvd2x	9,dst,8
ce426f
+	stvx	6,0,dst
ce426f
+	stvx	7,dst,6
ce426f
+	stvx	8,dst,7
ce426f
+	stvx	9,dst,8
ce426f
 	addi	dst,dst,64
ce426f
 32:
ce426f
 	bf	26,16f
ce426f
-	lxvd2x	6,0,src
ce426f
-	lxvd2x	7,src,6
ce426f
+	lvx	6,0,src
ce426f
+	lvx	7,src,6
ce426f
 	addi	src,src,32
ce426f
-	stxvd2x	6,0,dst
ce426f
-	stxvd2x	7,dst,6
ce426f
+	stvx	6,0,dst
ce426f
+	stvx	7,dst,6
ce426f
 	addi	dst,dst,32
ce426f
 16:
ce426f
 	bf	27,8f
ce426f
-	lxvd2x	6,0,src
ce426f
+	lvx	6,0,src
ce426f
 	addi	src,src,16
ce426f
-	stxvd2x	6,0,dst
ce426f
+	stvx	6,0,dst
ce426f
 	addi	dst,dst,16
ce426f
 8:
ce426f
 	bf	28,4f