Tree - rpms/glibc - CentOS Git server

rpms / glibc

Blame SOURCES/ia-opt-less_vec-memcmp-evex-movb.patch

Blob History Raw

		190885	`From 6c6a4eae9d8131531c9231f9f177d2db4130df01 Mon Sep 17 00:00:00 2001`
		190885	`From: Noah Goldstein <goldstein.w.n@gmail.com>`
		190885	`Date: Fri, 24 Dec 2021 18:54:41 -0600`
		190885	`Subject: [PATCH] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S`
		190885
		190885	`No bug.`
		190885	`Optimizations are twofold.`
		190885
		190885	`1) Replace page cross and 0/1 checks with masked load instructions in`
		190885	`L(less_vec). In applications this reduces branch-misses in the`
		190885	`hot [0, 32] case.`
		190885	`2) Change controlflow so that L(less_vec) case gets the fall through.`
		190885
		190885	`Change 2) helps calls in the [0, 32] size range but comes at the cost`
		190885	`of calls in the [33, 64] size range. From profiles of GCC and`
		190885	`Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this`
		190885	`appears to be the right tradeoff.`
		190885
		190885	`Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>`
		190885	`Reviewed-by: H.J. Lu <hjl.tools@gmail.com>`
		190885	`(cherry picked from commit abddd61de090ae84e380aff68a98bd94ef704667)`
		190885	`---`
		190885	`sysdeps/x86_64/multiarch/memcmp-evex-movbe.S \| 249 +++++--------------`
		190885	`1 file changed, 56 insertions(+), 193 deletions(-)`
		190885
		190885	`diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S`
		190885	`index 640f6757..d2899e7c 100644`
		190885	`--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S`
		190885	`+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S`
		190885	`@@ -62,15 +62,18 @@ Latency:`
		190885	`# define VMOVU vmovdqu64`
		190885
		190885	`# ifdef USE_AS_WMEMCMP`
		190885	`+# define VMOVU_MASK vmovdqu32`
		190885	`# define CHAR_SIZE 4`
		190885	`# define VPCMP vpcmpd`
		190885	`# define VPTEST vptestmd`
		190885	`# else`
		190885	`+# define VMOVU_MASK vmovdqu8`
		190885	`# define CHAR_SIZE 1`
		190885	`# define VPCMP vpcmpub`
		190885	`# define VPTEST vptestmb`
		190885	`# endif`
		190885
		190885	`+`
		190885	`# define VEC_SIZE 32`
		190885	`# define PAGE_SIZE 4096`
		190885	`# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)`
		190885	`@@ -102,12 +105,48 @@ ENTRY_P2ALIGN (MEMCMP, 6)`
		190885	`movl %edx, %edx`
		190885	`# endif`
		190885	`cmp $CHAR_PER_VEC, %RDX_LP`
		190885	`- jb L(less_vec)`
		190885	`+ /* Fall through for [0, VEC_SIZE] as its the hottest. */`
		190885	`+ ja L(more_1x_vec)`
		190885	`+`
		190885	`+ /* Create mask for CHAR's we want to compare. This allows us to`
		190885	`+ avoid having to include page cross logic. */`
		190885	`+ movl $-1, %ecx`
		190885	`+ bzhil %edx, %ecx, %ecx`
		190885	`+ kmovd %ecx, %k2`
		190885	`+`
		190885	`+ /* Safe to load full ymm with mask. */`
		190885	`+ VMOVU_MASK (%rsi), %YMM2{%k2}`
		190885	`+ VPCMP $4,(%rdi), %YMM2, %k1{%k2}`
		190885	`+ kmovd %k1, %eax`
		190885	`+ testl %eax, %eax`
		190885	`+ jnz L(return_vec_0)`
		190885	`+ ret`
		190885
		190885	`+ .p2align 4`
		190885	`+L(return_vec_0):`
		190885	`+ tzcntl %eax, %eax`
		190885	`+# ifdef USE_AS_WMEMCMP`
		190885	`+ movl (%rdi, %rax, CHAR_SIZE), %ecx`
		190885	`+ xorl %edx, %edx`
		190885	`+ cmpl (%rsi, %rax, CHAR_SIZE), %ecx`
		190885	`+ /* NB: no partial register stall here because xorl zero idiom`
		190885	`+ above. */`
		190885	`+ setg %dl`
		190885	`+ leal -1(%rdx, %rdx), %eax`
		190885	`+# else`
		190885	`+ movzbl (%rsi, %rax), %ecx`
		190885	`+ movzbl (%rdi, %rax), %eax`
		190885	`+ subl %ecx, %eax`
		190885	`+# endif`
		190885	`+ ret`
		190885	`+`
		190885	`+`
		190885	`+ .p2align 4`
		190885	`+L(more_1x_vec):`
		190885	`/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */`
		190885	`VMOVU (%rsi), %YMM1`
		190885	`/* Use compare not equals to directly check for mismatch. */`
		190885	`- VPCMP $4, (%rdi), %YMM1, %k1`
		190885	`+ VPCMP $4,(%rdi), %YMM1, %k1`
		190885	`kmovd %k1, %eax`
		190885	`/* NB: eax must be destination register if going to`
		190885	`L(return_vec_[0,2]). For L(return_vec_3) destination register`
		190885	`@@ -131,13 +170,13 @@ ENTRY_P2ALIGN (MEMCMP, 6)`
		190885
		190885	`/* Check third and fourth VEC no matter what. */`
		190885	`VMOVU (VEC_SIZE * 2)(%rsi), %YMM3`
		190885	`- VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1`
		190885	`+ VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1`
		190885	`kmovd %k1, %eax`
		190885	`testl %eax, %eax`
		190885	`jnz L(return_vec_2)`
		190885
		190885	`VMOVU (VEC_SIZE * 3)(%rsi), %YMM4`
		190885	`- VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1`
		190885	`+ VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1`
		190885	`kmovd %k1, %ecx`
		190885	`testl %ecx, %ecx`
		190885	`jnz L(return_vec_3)`
		190885	`@@ -169,7 +208,7 @@ ENTRY_P2ALIGN (MEMCMP, 6)`
		190885	`VMOVU (VEC_SIZE * 3)(%rsi), %YMM4`
		190885	`/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while`
		190885	`oring with YMM1. Result is stored in YMM4. */`
		190885	`- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4`
		190885	`+ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4`
		190885
		190885	`/* Or together YMM2, YMM3, and YMM4 into YMM4. */`
		190885	`vpternlogd $0xfe, %YMM2, %YMM3, %YMM4`
		190885	`@@ -184,7 +223,8 @@ ENTRY_P2ALIGN (MEMCMP, 6)`
		190885	`/* NB: eax must be zero to reach here. */`
		190885	`ret`
		190885
		190885	`- .p2align 4`
		190885	`+`
		190885	`+ .p2align 4,, 8`
		190885	`L(8x_end_return_vec_0_1_2_3):`
		190885	`movq %rdx, %rdi`
		190885	`L(8x_return_vec_0_1_2_3):`
		190885	`@@ -222,23 +262,6 @@ L(return_vec_3):`
		190885	`# endif`
		190885	`ret`
		190885
		190885	`- .p2align 4`
		190885	`-L(return_vec_0):`
		190885	`- tzcntl %eax, %eax`
		190885	`-# ifdef USE_AS_WMEMCMP`
		190885	`- movl (%rdi, %rax, CHAR_SIZE), %ecx`
		190885	`- xorl %edx, %edx`
		190885	`- cmpl (%rsi, %rax, CHAR_SIZE), %ecx`
		190885	`- /* NB: no partial register stall here because xorl zero idiom`
		190885	`- above. */`
		190885	`- setg %dl`
		190885	`- leal -1(%rdx, %rdx), %eax`
		190885	`-# else`
		190885	`- movzbl (%rsi, %rax), %ecx`
		190885	`- movzbl (%rdi, %rax), %eax`
		190885	`- subl %ecx, %eax`
		190885	`-# endif`
		190885	`- ret`
		190885
		190885	`.p2align 4`
		190885	`L(return_vec_1):`
		190885	`@@ -297,7 +320,7 @@ L(loop_4x_vec):`
		190885	`VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3`
		190885	`vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3`
		190885	`VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4`
		190885	`- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4`
		190885	`+ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4`
		190885	`vpternlogd $0xfe, %YMM2, %YMM3, %YMM4`
		190885	`VPTEST %YMM4, %YMM4, %k1`
		190885	`kmovd %k1, %ecx`
		190885	`@@ -324,7 +347,7 @@ L(loop_4x_vec):`
		190885	`VMOVU VEC_SIZE(%rsi, %rdx), %YMM2`
		190885	`vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2`
		190885	`VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4`
		190885	`- vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4`
		190885	`+ vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4`
		190885	`vpternlogd $0xfe, %YMM2, %YMM3, %YMM4`
		190885	`VPTEST %YMM4, %YMM4, %k1`
		190885	`kmovd %k1, %ecx`
		190885	`@@ -336,14 +359,14 @@ L(loop_4x_vec):`
		190885	`/* Only entry is from L(more_8x_vec). */`
		190885	`.p2align 4,, 10`
		190885	`L(8x_last_2x_vec):`
		190885	`- VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1`
		190885	`+ VPCMP $4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1`
		190885	`kmovd %k1, %eax`
		190885	`testl %eax, %eax`
		190885	`jnz L(8x_return_vec_2)`
		190885	`/* Naturally aligned to 16 bytes. */`
		190885	`L(8x_last_1x_vec):`
		190885	`VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1`
		190885	`- VPCMP $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1`
		190885	`+ VPCMP $4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1`
		190885	`kmovd %k1, %eax`
		190885	`testl %eax, %eax`
		190885	`jnz L(8x_return_vec_3)`
		190885	`@@ -392,7 +415,9 @@ L(last_1x_vec):`
		190885	`jnz L(return_vec_0_end)`
		190885	`ret`
		190885
		190885	`- .p2align 4,, 10`
		190885	`+`
		190885	`+ /* Don't align. Takes 2-fetch blocks either way and aligning`
		190885	`+ will cause code to spill into another cacheline. */`
		190885	`L(return_vec_1_end):`
		190885	`/* Use bsf to save code size. This is necessary to have`
		190885	`L(one_or_less) fit in aligning bytes between. */`
		190885	`@@ -411,31 +436,8 @@ L(return_vec_1_end):`
		190885	`# endif`
		190885	`ret`
		190885
		190885	`- /* NB: L(one_or_less) fits in alignment padding between`
		190885	`- L(return_vec_1_end) and L(return_vec_0_end). */`
		190885	`-# ifdef USE_AS_WMEMCMP`
		190885	`-L(one_or_less):`
		190885	`- jb L(zero)`
		190885	`- movl (%rdi), %ecx`
		190885	`- xorl %edx, %edx`
		190885	`- cmpl (%rsi), %ecx`
		190885	`- je L(zero)`
		190885	`- setg %dl`
		190885	`- leal -1(%rdx, %rdx), %eax`
		190885	`- ret`
		190885	`-# else`
		190885	`-L(one_or_less):`
		190885	`- jb L(zero)`
		190885	`- movzbl (%rsi), %ecx`
		190885	`- movzbl (%rdi), %eax`
		190885	`- subl %ecx, %eax`
		190885	`- ret`
		190885	`-# endif`
		190885	`-L(zero):`
		190885	`- xorl %eax, %eax`
		190885	`- ret`
		190885	`-`
		190885	`- .p2align 4`
		190885	`+ /* Don't align. Takes 2-fetch blocks either way and aligning`
		190885	`+ will cause code to spill into another cacheline. */`
		190885	`L(return_vec_0_end):`
		190885	`tzcntl %eax, %eax`
		190885	`addl %edx, %eax`
		190885	`@@ -451,146 +453,7 @@ L(return_vec_0_end):`
		190885	`subl %ecx, %eax`
		190885	`# endif`
		190885	`ret`
		190885	`+ /* 1-byte until next cache line. */`
		190885
		190885	`- .p2align 4`
		190885	`-L(less_vec):`
		190885	`- /* Check if one or less CHAR. This is necessary for size == 0`
		190885	`- but is also faster for size == CHAR_SIZE. */`
		190885	`- cmpl $1, %edx`
		190885	`- jbe L(one_or_less)`
		190885	`-`
		190885	`- /* Check if loading one VEC from either s1 or s2 could cause a`
		190885	`- page cross. This can have false positives but is by far the`
		190885	`- fastest method. */`
		190885	`- movl %edi, %eax`
		190885	`- orl %esi, %eax`
		190885	`- andl $(PAGE_SIZE - 1), %eax`
		190885	`- cmpl $(PAGE_SIZE - VEC_SIZE), %eax`
		190885	`- jg L(page_cross_less_vec)`
		190885	`-`
		190885	`- /* No page cross possible. */`
		190885	`- VMOVU (%rsi), %YMM2`
		190885	`- VPCMP $4, (%rdi), %YMM2, %k1`
		190885	`- kmovd %k1, %eax`
		190885	`- /* Check if any matches where in bounds. Intentionally not`
		190885	`- storing result in eax to limit dependency chain if it goes to`
		190885	`- L(return_vec_0_lv). */`
		190885	`- bzhil %edx, %eax, %edx`
		190885	`- jnz L(return_vec_0_lv)`
		190885	`- xorl %eax, %eax`
		190885	`- ret`
		190885	`-`
		190885	`- /* Essentially duplicate of L(return_vec_0). Ends up not costing`
		190885	`- any code as shrinks L(less_vec) by allowing 2-byte encoding of`
		190885	`- the jump and ends up fitting in aligning bytes. As well fits on`
		190885	`- same cache line as L(less_vec) so also saves a line from having`
		190885	`- to be fetched on cold calls to memcmp. */`
		190885	`- .p2align 4,, 4`
		190885	`-L(return_vec_0_lv):`
		190885	`- tzcntl %eax, %eax`
		190885	`-# ifdef USE_AS_WMEMCMP`
		190885	`- movl (%rdi, %rax, CHAR_SIZE), %ecx`
		190885	`- xorl %edx, %edx`
		190885	`- cmpl (%rsi, %rax, CHAR_SIZE), %ecx`
		190885	`- /* NB: no partial register stall here because xorl zero idiom`
		190885	`- above. */`
		190885	`- setg %dl`
		190885	`- leal -1(%rdx, %rdx), %eax`
		190885	`-# else`
		190885	`- movzbl (%rsi, %rax), %ecx`
		190885	`- movzbl (%rdi, %rax), %eax`
		190885	`- subl %ecx, %eax`
		190885	`-# endif`
		190885	`- ret`
		190885	`-`
		190885	`- .p2align 4`
		190885	`-L(page_cross_less_vec):`
		190885	`- /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28`
		190885	`- bytes. */`
		190885	`- cmpl $(16 / CHAR_SIZE), %edx`
		190885	`- jae L(between_16_31)`
		190885	`-# ifndef USE_AS_WMEMCMP`
		190885	`- cmpl $8, %edx`
		190885	`- jae L(between_8_15)`
		190885	`- cmpl $4, %edx`
		190885	`- jb L(between_2_3)`
		190885	`-`
		190885	`- /* Load as big endian with overlapping movbe to avoid branches.`
		190885	`- */`
		190885	`- movbe (%rdi), %eax`
		190885	`- movbe (%rsi), %ecx`
		190885	`- shlq $32, %rax`
		190885	`- shlq $32, %rcx`
		190885	`- movbe -4(%rdi, %rdx), %edi`
		190885	`- movbe -4(%rsi, %rdx), %esi`
		190885	`- orq %rdi, %rax`
		190885	`- orq %rsi, %rcx`
		190885	`- subq %rcx, %rax`
		190885	`- /* edx is guranteed to be positive int32 in range [4, 7]. */`
		190885	`- cmovne %edx, %eax`
		190885	`- /* ecx is -1 if rcx > rax. Otherwise 0. */`
		190885	`- sbbl %ecx, %ecx`
		190885	`- /* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==`
		190885	`- rax then eax and ecx are zero. If rax < rax then ecx is -1 so`
		190885	`- eax doesn't matter. */`
		190885	`- orl %ecx, %eax`
		190885	`- ret`
		190885	`-`
		190885	`- .p2align 4,, 8`
		190885	`-L(between_8_15):`
		190885	`-# endif`
		190885	`- /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */`
		190885	`- vmovq (%rdi), %xmm1`
		190885	`- vmovq (%rsi), %xmm2`
		190885	`- VPCMP $4, %xmm1, %xmm2, %k1`
		190885	`- kmovd %k1, %eax`
		190885	`- testl %eax, %eax`
		190885	`- jnz L(return_vec_0_lv)`
		190885	`- /* Use overlapping loads to avoid branches. */`
		190885	`- vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1`
		190885	`- vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2`
		190885	`- VPCMP $4, %xmm1, %xmm2, %k1`
		190885	`- addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx`
		190885	`- kmovd %k1, %eax`
		190885	`- testl %eax, %eax`
		190885	`- jnz L(return_vec_0_end)`
		190885	`- ret`
		190885	`-`
		190885	`- .p2align 4,, 8`
		190885	`-L(between_16_31):`
		190885	`- /* From 16 to 31 bytes. No branch when size == 16. */`
		190885	`-`
		190885	`- /* Use movups to save code size. */`
		190885	`- vmovdqu (%rsi), %xmm2`
		190885	`- VPCMP $4, (%rdi), %xmm2, %k1`
		190885	`- kmovd %k1, %eax`
		190885	`- testl %eax, %eax`
		190885	`- jnz L(return_vec_0_lv)`
		190885	`- /* Use overlapping loads to avoid branches. */`
		190885	`- vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2`
		190885	`- VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1`
		190885	`- addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx`
		190885	`- kmovd %k1, %eax`
		190885	`- testl %eax, %eax`
		190885	`- jnz L(return_vec_0_end)`
		190885	`- ret`
		190885	`-`
		190885	`-# ifndef USE_AS_WMEMCMP`
		190885	`-L(between_2_3):`
		190885	`- /* Load as big endian to avoid branches. */`
		190885	`- movzwl (%rdi), %eax`
		190885	`- movzwl (%rsi), %ecx`
		190885	`- shll $8, %eax`
		190885	`- shll $8, %ecx`
		190885	`- bswap %eax`
		190885	`- bswap %ecx`
		190885	`- movzbl -1(%rdi, %rdx), %edi`
		190885	`- movzbl -1(%rsi, %rdx), %esi`
		190885	`- orl %edi, %eax`
		190885	`- orl %esi, %ecx`
		190885	`- /* Subtraction is okay because the upper 8 bits are zero. */`
		190885	`- subl %ecx, %eax`
		190885	`- ret`
		190885	`-# endif`
		190885	`END (MEMCMP)`
		190885	`#endif`
		190885	`--`
		190885	`GitLab`
		190885

rpms / glibc

Source Code

Blame SOURCES/ia-opt-less_vec-memcmp-evex-movb.patch