e354a5
commit 1d21fb1061cbeb50414a8f371abb36548d90f150
e354a5
Author: Stefan Liebler <stli@linux.ibm.com>
e354a5
Date:   Fri Jun 26 09:45:11 2020 +0200
e354a5
e354a5
    S390: Optimize __memset_z196.
e354a5
    
e354a5
    It turned out that a 256b-mvc instruction which depends on the
e354a5
    result of a previous 256b-mvc instruction is counterproductive.
e354a5
    Therefore this patch adjusts the 256b-loop by storing the
e354a5
    first byte with stc and setting the remaining 255b with mvc.
e354a5
    Now the 255b-mvc instruction depends on the stc instruction.
e354a5
e354a5
diff --git a/sysdeps/s390/memset-z900.S b/sysdeps/s390/memset-z900.S
e354a5
index ca3eac0522..1e0c334156 100644
e354a5
--- a/sysdeps/s390/memset-z900.S
e354a5
+++ b/sysdeps/s390/memset-z900.S
e354a5
@@ -157,28 +157,27 @@ ENTRY(MEMSET_Z196)
e354a5
 # if !defined __s390x__
e354a5
 	llgfr	%r4,%r4
e354a5
 # endif /* !defined __s390x__  */
e354a5
-	ltgr    %r4,%r4
e354a5
-	je      .L_Z196_4
e354a5
+	clgfi	%r4,1
e354a5
+	jl	.L_Z196_4	    # n == 0
e354a5
 	stc     %r3,0(%r2)
e354a5
+	je      .L_Z196_4	    # n == 1
e354a5
+	aghi	%r4,-2
e354a5
 	lgr     %r1,%r2
e354a5
-	cghi    %r4,1
e354a5
-	je      .L_Z196_4
e354a5
-	aghi    %r4,-2
e354a5
-	srlg    %r5,%r4,8
e354a5
-	ltgr    %r5,%r5
e354a5
-	jne     .L_Z196_1
e354a5
+	risbg	%r5,%r4,8,128+63,56 # r5 = n / 256
e354a5
+	jne     .L_Z196_1	    # Jump away if r5 != 0
e354a5
 .L_Z196_3:
e354a5
 	exrl    %r4,.L_Z196_17
e354a5
 .L_Z196_4:
e354a5
 	br      %r14
e354a5
 .L_Z196_1:
e354a5
 	cgfi	%r5,1048576
e354a5
-	jh	__memset_mvcle	   # Switch to mvcle for >256MB
e354a5
+	jh	__memset_mvcle	    # Switch to mvcle for >256MB
e354a5
 .L_Z196_2:
e354a5
 	pfd     2,1024(%r1)
e354a5
-	mvc     1(256,%r1),0(%r1)
e354a5
+	mvc     1(255,%r1),0(%r1)
e354a5
 	aghi    %r5,-1
e354a5
 	la      %r1,256(%r1)
e354a5
+	stc     %r3,0(%r1)
e354a5
 	jne     .L_Z196_2
e354a5
 	j       .L_Z196_3
e354a5
 .L_Z196_17: