Blob Blame History Raw
commit 0792c8ae1aebf538de45ff9a0e2e401a60525de2
Author: Stefan Liebler <stli@linux.ibm.com>
Date:   Fri Jun 26 09:45:11 2020 +0200

    S390: Optimize __memcpy_z196.
    
    This patch introduces an extra loop without pfd instructions
    as it turned out that the pfd instructions are usefull
    for copies >=64KB but are counterproductive for smaller copies.

diff --git a/sysdeps/s390/memcpy-z900.S b/sysdeps/s390/memcpy-z900.S
index f2e9aaeb2d..dc2f491ec3 100644
--- a/sysdeps/s390/memcpy-z900.S
+++ b/sysdeps/s390/memcpy-z900.S
@@ -184,25 +184,34 @@ ENTRY(MEMCPY_Z196)
 	je      .L_Z196_4
 .L_Z196_start2:
 	aghi    %r4,-1
-	srlg    %r5,%r4,8
-	ltgr    %r5,%r5
+	risbg	%r5,%r4,8,128+63,56 # r0 = r5 / 256
 	jne     .L_Z196_5
 .L_Z196_3:
 	exrl    %r4,.L_Z196_14
 .L_Z196_4:
 	br      %r14
 .L_Z196_5:
-	cgfi    %r5,262144      # Switch to mvcle for copies >64MB
-	jh      __memcpy_mvcle
+	cgfi	%r5,255		# Switch to loop with pfd for copies >=64kB
+	jh	.L_Z196_6
 .L_Z196_2:
-	pfd     1,768(%r3)
-	pfd     2,768(%r1)
 	mvc     0(256,%r1),0(%r3)
 	aghi    %r5,-1
 	la      %r1,256(%r1)
 	la      %r3,256(%r3)
 	jne     .L_Z196_2
 	j       .L_Z196_3
+.L_Z196_6:
+	cgfi    %r5,262144      # Switch to mvcle for copies >64MB
+	jh      __memcpy_mvcle
+.L_Z196_7:
+	pfd     1,1024(%r3)
+	pfd     2,1024(%r1)
+	mvc     0(256,%r1),0(%r3)
+	aghi    %r5,-1
+	la      %r1,256(%r1)
+	la      %r3,256(%r3)
+	jne     .L_Z196_7
+	j       .L_Z196_3
 .L_Z196_14:
 	mvc     0(1,%r1),0(%r3)
 END(MEMCPY_Z196)