786673
commit 0792c8ae1aebf538de45ff9a0e2e401a60525de2
786673
Author: Stefan Liebler <stli@linux.ibm.com>
786673
Date:   Fri Jun 26 09:45:11 2020 +0200
786673
786673
    S390: Optimize __memcpy_z196.
786673
    
786673
    This patch introduces an extra loop without pfd instructions
786673
    as it turned out that the pfd instructions are useful
786673
    for copies >=64KB but are counterproductive for smaller copies.
786673
786673
diff --git a/sysdeps/s390/memcpy-z900.S b/sysdeps/s390/memcpy-z900.S
786673
index f2e9aaeb2d..dc2f491ec3 100644
786673
--- a/sysdeps/s390/memcpy-z900.S
786673
+++ b/sysdeps/s390/memcpy-z900.S
786673
@@ -184,25 +184,34 @@ ENTRY(MEMCPY_Z196)
786673
 	je      .L_Z196_4
786673
 .L_Z196_start2:
786673
 	aghi    %r4,-1
786673
-	srlg    %r5,%r4,8
786673
-	ltgr    %r5,%r5
786673
+	risbg	%r5,%r4,8,128+63,56 # r5 = r4 / 256
786673
 	jne     .L_Z196_5
786673
 .L_Z196_3:
786673
 	exrl    %r4,.L_Z196_14
786673
 .L_Z196_4:
786673
 	br      %r14
786673
 .L_Z196_5:
786673
-	cgfi    %r5,262144      # Switch to mvcle for copies >64MB
786673
-	jh      __memcpy_mvcle
786673
+	cgfi	%r5,255		# Switch to loop with pfd for copies >64kB
786673
+	jh	.L_Z196_6
786673
 .L_Z196_2:
786673
-	pfd     1,768(%r3)
786673
-	pfd     2,768(%r1)
786673
 	mvc     0(256,%r1),0(%r3)
786673
 	aghi    %r5,-1
786673
 	la      %r1,256(%r1)
786673
 	la      %r3,256(%r3)
786673
 	jne     .L_Z196_2
786673
 	j       .L_Z196_3
786673
+.L_Z196_6:
786673
+	cgfi    %r5,262144      # Switch to mvcle for copies >64MB
786673
+	jh      __memcpy_mvcle
786673
+.L_Z196_7:
786673
+	pfd     1,1024(%r3)
786673
+	pfd     2,1024(%r1)
786673
+	mvc     0(256,%r1),0(%r3)
786673
+	aghi    %r5,-1
786673
+	la      %r1,256(%r1)
786673
+	la      %r3,256(%r3)
786673
+	jne     .L_Z196_7
786673
+	j       .L_Z196_3
786673
 .L_Z196_14:
786673
 	mvc     0(1,%r1),0(%r3)
786673
 END(MEMCPY_Z196)