446cf2
commit 0792c8ae1aebf538de45ff9a0e2e401a60525de2
446cf2
Author: Stefan Liebler <stli@linux.ibm.com>
446cf2
Date:   Fri Jun 26 09:45:11 2020 +0200
446cf2
446cf2
    S390: Optimize __memcpy_z196.
446cf2
    
446cf2
    This patch introduces an extra loop without pfd instructions
446cf2
    as it turned out that the pfd instructions are useful
446cf2
    for copies >=64KB but are counterproductive for smaller copies.
446cf2
446cf2
diff --git a/sysdeps/s390/memcpy-z900.S b/sysdeps/s390/memcpy-z900.S
446cf2
index f2e9aaeb2d..dc2f491ec3 100644
446cf2
--- a/sysdeps/s390/memcpy-z900.S
446cf2
+++ b/sysdeps/s390/memcpy-z900.S
446cf2
@@ -184,25 +184,34 @@ ENTRY(MEMCPY_Z196)
446cf2
 	je      .L_Z196_4
446cf2
 .L_Z196_start2:
446cf2
 	aghi    %r4,-1
446cf2
-	srlg    %r5,%r4,8
446cf2
-	ltgr    %r5,%r5
446cf2
+	risbg	%r5,%r4,8,128+63,56 # r5 = r4 / 256
446cf2
 	jne     .L_Z196_5
446cf2
 .L_Z196_3:
446cf2
 	exrl    %r4,.L_Z196_14
446cf2
 .L_Z196_4:
446cf2
 	br      %r14
446cf2
 .L_Z196_5:
446cf2
-	cgfi    %r5,262144      # Switch to mvcle for copies >64MB
446cf2
-	jh      __memcpy_mvcle
446cf2
+	cgfi	%r5,255		# Switch to loop with pfd for copies >=64kB
446cf2
+	jh	.L_Z196_6
446cf2
 .L_Z196_2:
446cf2
-	pfd     1,768(%r3)
446cf2
-	pfd     2,768(%r1)
446cf2
 	mvc     0(256,%r1),0(%r3)
446cf2
 	aghi    %r5,-1
446cf2
 	la      %r1,256(%r1)
446cf2
 	la      %r3,256(%r3)
446cf2
 	jne     .L_Z196_2
446cf2
 	j       .L_Z196_3
446cf2
+.L_Z196_6:
446cf2
+	cgfi    %r5,262144      # Switch to mvcle for copies >64MB
446cf2
+	jh      __memcpy_mvcle
446cf2
+.L_Z196_7:
446cf2
+	pfd     1,1024(%r3)
446cf2
+	pfd     2,1024(%r1)
446cf2
+	mvc     0(256,%r1),0(%r3)
446cf2
+	aghi    %r5,-1
446cf2
+	la      %r1,256(%r1)
446cf2
+	la      %r3,256(%r3)
446cf2
+	jne     .L_Z196_7
446cf2
+	j       .L_Z196_3
446cf2
 .L_Z196_14:
446cf2
 	mvc     0(1,%r1),0(%r3)
446cf2
 END(MEMCPY_Z196)