08c3a6
commit 16245986fb9bfe396113fc7dfd1929f69a9e748e
08c3a6
Author: H.J. Lu <hjl.tools@gmail.com>
08c3a6
Date:   Fri Aug 20 06:42:24 2021 -0700
08c3a6
08c3a6
    x86-64: Optimize load of all bits set into ZMM register [BZ #28252]
08c3a6
    
08c3a6
    Optimize loads of all bits set into ZMM registers in the AVX512 SVML code
08c3a6
    by replacing
08c3a6
    
08c3a6
            vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
08c3a6
    
08c3a6
    and
08c3a6
    
08c3a6
            vmovups   .L_2il0floatpacket.13(%rip), %zmmX
08c3a6
    
08c3a6
    with
08c3a6
            vpternlogd $0xff, %zmmX, %zmmX, %zmmX
08c3a6
    
08c3a6
    This fixes BZ #28252.
08c3a6
    
08c3a6
    (cherry picked from commit 78c9ec9000f873abe7a15a91b87080a2e4308260)
08c3a6
08c3a6
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
08c3a6
index e68fcdbb16a79f36..58e588a3d42a8bc9 100644
08c3a6
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
08c3a6
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
08c3a6
@@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
08c3a6
         vmovaps   %zmm0, %zmm8
08c3a6
 
08c3a6
 /* Check for large arguments path */
08c3a6
-        vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
08c3a6
+        vpternlogd $0xff, %zmm2, %zmm2, %zmm2
08c3a6
 
08c3a6
 /*
08c3a6
   ARGUMENT RANGE REDUCTION:
08c3a6
@@ -456,8 +456,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
08c3a6
         jmp       .LBL_2_7
08c3a6
 #endif
08c3a6
 END (_ZGVeN8v_cos_skx)
08c3a6
-
08c3a6
-	.section .rodata, "a"
08c3a6
-.L_2il0floatpacket.16:
08c3a6
-	.long	0xffffffff,0xffffffff
08c3a6
-	.type	.L_2il0floatpacket.16,@object
08c3a6
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
08c3a6
index dfa2acafc486b56b..f5f117d474f66176 100644
08c3a6
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
08c3a6
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
08c3a6
@@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
08c3a6
 
08c3a6
 /* preserve mantissa, set input exponent to 2^(-10) */
08c3a6
         vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
08c3a6
-        vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
08c3a6
+        vpternlogd $0xff, %zmm1, %zmm1, %zmm1
08c3a6
         vpsrlq    $32, %zmm4, %zmm6
08c3a6
 
08c3a6
 /* reciprocal approximation good to at least 11 bits */
08c3a6
@@ -461,8 +461,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
08c3a6
         jmp       .LBL_2_7
08c3a6
 #endif
08c3a6
 END (_ZGVeN8v_log_skx)
08c3a6
-
08c3a6
-	.section .rodata, "a"
08c3a6
-.L_2il0floatpacket.12:
08c3a6
-	.long	0xffffffff,0xffffffff
08c3a6
-	.type	.L_2il0floatpacket.12,@object
08c3a6
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
08c3a6
index be8ab7c6e0e33819..48d251db16ccab9d 100644
08c3a6
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
08c3a6
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
08c3a6
@@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
08c3a6
         andq      $-64, %rsp
08c3a6
         subq      $1280, %rsp
08c3a6
         movq      __svml_d_trig_data@GOTPCREL(%rip), %rax
08c3a6
-        vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
08c3a6
+        vpternlogd $0xff, %zmm1, %zmm1, %zmm14
08c3a6
         vmovups __dAbsMask(%rax), %zmm7
08c3a6
         vmovups __dInvPI(%rax), %zmm2
08c3a6
         vmovups __dRShifter(%rax), %zmm1
08c3a6
@@ -458,8 +458,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
08c3a6
         jmp       .LBL_2_7
08c3a6
 #endif
08c3a6
 END (_ZGVeN8v_sin_skx)
08c3a6
-
08c3a6
-	.section .rodata, "a"
08c3a6
-.L_2il0floatpacket.14:
08c3a6
-	.long	0xffffffff,0xffffffff
08c3a6
-	.type	.L_2il0floatpacket.14,@object
08c3a6
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
08c3a6
index 611887082a545854..a4944a4feef6aa98 100644
08c3a6
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
08c3a6
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
08c3a6
@@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
08c3a6
 
08c3a6
 /* SinPoly = SinR*SinPoly */
08c3a6
         vfmadd213pd %zmm5, %zmm5, %zmm4
08c3a6
-        vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
08c3a6
+        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
08c3a6
 
08c3a6
 /* Update Cos result's sign */
08c3a6
         vxorpd    %zmm2, %zmm1, %zmm1
08c3a6
@@ -741,8 +741,3 @@ END (_ZGVeN8vvv_sincos_knl)
08c3a6
 ENTRY (_ZGVeN8vvv_sincos_skx)
08c3a6
 WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
08c3a6
 END (_ZGVeN8vvv_sincos_skx)
08c3a6
-
08c3a6
-	.section .rodata, "a"
08c3a6
-.L_2il0floatpacket.15:
08c3a6
-	.long	0xffffffff,0xffffffff
08c3a6
-	.type	.L_2il0floatpacket.15,@object
08c3a6
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
08c3a6
index f671d60d5dab5a0e..fe8474fed943e8ad 100644
08c3a6
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
08c3a6
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
08c3a6
@@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
08c3a6
   X = X - Y*PI1 - Y*PI2 - Y*PI3
08c3a6
  */
08c3a6
         vmovaps   %zmm0, %zmm6
08c3a6
-        vmovups   .L_2il0floatpacket.13(%rip), %zmm12
08c3a6
+        vpternlogd $0xff, %zmm12, %zmm12, %zmm12
08c3a6
         vmovups __sRShifter(%rax), %zmm3
08c3a6
         vmovups __sPI1_FMA(%rax), %zmm5
08c3a6
         vmovups __sA9_FMA(%rax), %zmm9
08c3a6
@@ -453,8 +453,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
08c3a6
         jmp       .LBL_2_7
08c3a6
 #endif
08c3a6
 END (_ZGVeN16v_cosf_skx)
08c3a6
-
08c3a6
-	.section .rodata, "a"
08c3a6
-.L_2il0floatpacket.13:
08c3a6
-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
08c3a6
-	.type	.L_2il0floatpacket.13,@object
08c3a6
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
08c3a6
index 637bfe3c06ab9ad4..229b7828cde04db2 100644
08c3a6
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
08c3a6
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
08c3a6
@@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
08c3a6
         vmovaps   %zmm0, %zmm7
08c3a6
 
08c3a6
 /* compare against threshold */
08c3a6
-        vmovups   .L_2il0floatpacket.13(%rip), %zmm3
08c3a6
+        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
08c3a6
         vmovups __sInvLn2(%rax), %zmm4
08c3a6
         vmovups __sShifter(%rax), %zmm1
08c3a6
         vmovups __sLn2hi(%rax), %zmm6
08c3a6
@@ -440,8 +440,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
08c3a6
 
08c3a6
 #endif
08c3a6
 END (_ZGVeN16v_expf_skx)
08c3a6
-
08c3a6
-	.section .rodata, "a"
08c3a6
-.L_2il0floatpacket.13:
08c3a6
-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
08c3a6
-	.type	.L_2il0floatpacket.13,@object
08c3a6
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
08c3a6
index 9d790fbf0ad6c8ec..fa2aae986f543582 100644
08c3a6
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
08c3a6
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
08c3a6
@@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
08c3a6
         andq      $-64, %rsp
08c3a6
         subq      $1280, %rsp
08c3a6
         movq      __svml_slog_data@GOTPCREL(%rip), %rax
08c3a6
-        vmovups   .L_2il0floatpacket.7(%rip), %zmm6
08c3a6
+        vpternlogd $0xff, %zmm6, %zmm6, %zmm6
08c3a6
         vmovups _iBrkValue(%rax), %zmm4
08c3a6
         vmovups _sPoly_7(%rax), %zmm8
08c3a6
 
08c3a6
@@ -409,8 +409,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
08c3a6
 
08c3a6
 #endif
08c3a6
 END (_ZGVeN16v_logf_skx)
08c3a6
-
08c3a6
-	.section .rodata, "a"
08c3a6
-.L_2il0floatpacket.7:
08c3a6
-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
08c3a6
-	.type	.L_2il0floatpacket.7,@object
08c3a6
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
08c3a6
index c5c43c46ff7af5a3..6aea2a4f11d1f85f 100644
08c3a6
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
08c3a6
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
08c3a6
@@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
08c3a6
         vpsrlq    $32, %zmm3, %zmm2
08c3a6
         vpmovqd   %zmm2, %ymm11
08c3a6
         vcvtps2pd %ymm14, %zmm13
08c3a6
-        vmovups   .L_2il0floatpacket.23(%rip), %zmm14
08c3a6
+        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
08c3a6
         vmovaps   %zmm14, %zmm26
08c3a6
         vpandd _ABSMASK(%rax), %zmm1, %zmm8
08c3a6
         vpcmpd    $1, _INF(%rax), %zmm8, %k2
08c3a6
@@ -427,7 +427,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
08c3a6
         vpmovqd   %zmm11, %ymm5
08c3a6
         vpxord    %zmm10, %zmm10, %zmm10
08c3a6
         vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
08c3a6
-        vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
08c3a6
+        vpternlogd $0xff, %zmm4, %zmm4, %zmm4
08c3a6
         vpxord    %zmm11, %zmm11, %zmm11
08c3a6
         vcvtdq2pd %ymm7, %zmm7
08c3a6
         vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
08c3a6
@@ -643,11 +643,3 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
08c3a6
         jmp       .LBL_2_7
08c3a6
 #endif
08c3a6
 END (_ZGVeN16vv_powf_skx)
08c3a6
-
08c3a6
-	.section .rodata, "a"
08c3a6
-.L_2il0floatpacket.23:
08c3a6
-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
08c3a6
-	.type	.L_2il0floatpacket.23,@object
08c3a6
-.L_2il0floatpacket.24:
08c3a6
-	.long	0xffffffff,0xffffffff
08c3a6
-	.type	.L_2il0floatpacket.24,@object
08c3a6
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
08c3a6
index 9cf359c86ff9bd70..a446c504f63c9399 100644
08c3a6
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
08c3a6
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
08c3a6
@@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
08c3a6
 
08c3a6
 /* Result sign calculations */
08c3a6
         vpternlogd $150, %zmm0, %zmm14, %zmm1
08c3a6
-        vmovups   .L_2il0floatpacket.13(%rip), %zmm14
08c3a6
+        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
08c3a6
 
08c3a6
 /* Add correction term 0.5 for cos() part */
08c3a6
         vaddps    %zmm8, %zmm5, %zmm15
08c3a6
@@ -748,8 +748,3 @@ END (_ZGVeN16vvv_sincosf_knl)
08c3a6
 ENTRY (_ZGVeN16vvv_sincosf_skx)
08c3a6
 WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
08c3a6
 END (_ZGVeN16vvv_sincosf_skx)
08c3a6
-
08c3a6
-	.section .rodata, "a"
08c3a6
-.L_2il0floatpacket.13:
08c3a6
-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
08c3a6
-	.type	.L_2il0floatpacket.13,@object
08c3a6
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
08c3a6
index bd05109a62181f22..c1b352d0ad1992cd 100644
08c3a6
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
08c3a6
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
08c3a6
@@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
08c3a6
         movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
08c3a6
 
08c3a6
 /* Check for large and special values */
08c3a6
-        vmovups   .L_2il0floatpacket.11(%rip), %zmm14
08c3a6
+        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
08c3a6
         vmovups __sAbsMask(%rax), %zmm5
08c3a6
         vmovups __sInvPI(%rax), %zmm1
08c3a6
         vmovups __sRShifter(%rax), %zmm2
08c3a6
@@ -472,8 +472,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
08c3a6
         jmp       .LBL_2_7
08c3a6
 #endif
08c3a6
 END (_ZGVeN16v_sinf_skx)
08c3a6
-
08c3a6
-	.section .rodata, "a"
08c3a6
-.L_2il0floatpacket.11:
08c3a6
-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
08c3a6
-	.type	.L_2il0floatpacket.11,@object