SOURCES/ia-opt-memcmp-evex-movbe-2.patch

From 851ab0499680a3369da724d3d6d2ba71652d530d Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue, 21 Sep 2021 18:45:03 -0500
Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S for frontend behavior and
 size

No bug.

The frontend optimizations are to:
1. Reorganize logically connected basic blocks so they are either in
   the same cache line or in adjacent cache lines.
2. Avoid cases where basic blocks unnecessarily cross cache lines.
3. Try to 32-byte align any basic blocks possible without sacrificing
   code size. Smaller / less hot basic blocks are used for this.

Overall code size shrunk by 168 bytes. This should make up for any
extra costs due to aligning to 64 bytes.

In general, performance previously varied a great deal depending on
whether entry alignment % 64 was 0, 16, 32, or 48. These changes
essentially make the current implementation at least equal to the
best alignment of the original for any arguments.

The only additional optimization is in the page cross case: the branch
on the equals case was removed from the size == [4, 7] case. As well,
the [4, 7] and [2, 3] cases were swapped, as [4, 7] is likely a hotter
argument size.

test-memcmp and test-wmemcmp are both passing.

(cherry picked from commit 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa)
---
 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 434 +++++++++++--------
 1 file changed, 242 insertions(+), 192 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 654dc7ac..2761b54f 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -34,7 +34,24 @@
       area.
    7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
    8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
-   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
+   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
+
+When possible the implementation tries to optimize for frontend in the
+following ways:
+Throughput:
+    1. All code sections that fit are able to run optimally out of the
+       LSD.
+    2. All code sections that fit are able to run optimally out of the
+       DSB
+    3. Basic blocks are contained in minimum number of fetch blocks
+       necessary.
+
+Latency:
+    1. Logically connected basic blocks are put in the same
+       cache-line.
+    2. Logically connected basic blocks that do not fit in the same
+       cache-line are put in adjacent lines. This can get beneficial
+       L2 spatial prefetching and L1 next-line prefetching.  */
 
 # include <sysdep.h>
 
@@ -47,9 +64,11 @@
 # ifdef USE_AS_WMEMCMP
 #  define CHAR_SIZE	4
 #  define VPCMP	vpcmpd
+#  define VPTEST	vptestmd
 # else
 #  define CHAR_SIZE	1
 #  define VPCMP	vpcmpub
+#  define VPTEST	vptestmb
 # endif
 
 # define VEC_SIZE	32
@@ -75,7 +94,9 @@
 */
 
 	.section .text.evex,"ax",@progbits
-ENTRY (MEMCMP)
+/* Cache align memcmp entry. This allows for much more thorough
+   frontend optimization.  */
+ENTRY_P2ALIGN (MEMCMP, 6)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
@@ -89,7 +110,7 @@ ENTRY (MEMCMP)
 	VPCMP	$4, (%rdi), %YMM1, %k1
 	kmovd	%k1, %eax
 	/* NB: eax must be destination register if going to
-	   L(return_vec_[0,2]). For L(return_vec_3 destination register
+	   L(return_vec_[0,2]). For L(return_vec_3) destination register
 	   must be ecx.  */
 	testl	%eax, %eax
 	jnz	L(return_vec_0)
@@ -121,10 +142,6 @@ ENTRY (MEMCMP)
 	testl	%ecx, %ecx
 	jnz	L(return_vec_3)
 
-	/* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
-	   compare with zero to get a mask is needed.  */
-	vpxorq	%XMM0, %XMM0, %XMM0
-
 	/* Go to 4x VEC loop.  */
 	cmpq	$(CHAR_PER_VEC * 8), %rdx
 	ja	L(more_8x_vec)
@@ -148,47 +165,61 @@ ENTRY (MEMCMP)
 
 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
-	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 
 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
 	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
-	   oring with YMM3. Result is stored in YMM4.  */
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
-	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
-	VPCMP	$4, %YMM4, %YMM0, %k1
+	   oring with YMM1. Result is stored in YMM4.  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+
+	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+
+	/* Test YMM4 against itself. Store any CHAR mismatches in k1.
+	 */
+	VPTEST	%YMM4, %YMM4, %k1
+	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
 	jnz	L(return_vec_0_1_2_3)
 	/* NB: eax must be zero to reach here.  */
 	ret
 
-	/* NB: aligning 32 here allows for the rest of the jump targets
-	   to be tuned for 32 byte alignment. Most important this ensures
-	   the L(more_8x_vec) loop is 32 byte aligned.  */
-	.p2align 5
-L(less_vec):
-	/* Check if one or less CHAR. This is necessary for size = 0 but
-	   is also faster for size = CHAR_SIZE.  */
-	cmpl	$1, %edx
-	jbe	L(one_or_less)
+	.p2align 4
+L(8x_end_return_vec_0_1_2_3):
+	movq	%rdx, %rdi
+L(8x_return_vec_0_1_2_3):
+	addq	%rdi, %rsi
+L(return_vec_0_1_2_3):
+	VPTEST	%YMM1, %YMM1, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
 
-	/* Check if loading one VEC from either s1 or s2 could cause a
-	   page cross. This can have false positives but is by far the
-	   fastest method.  */
-	movl	%edi, %eax
-	orl	%esi, %eax
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jg	L(page_cross_less_vec)
+	VPTEST	%YMM2, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
 
-	/* No page cross possible.  */
-	VMOVU	(%rsi), %YMM2
-	VPCMP	$4, (%rdi), %YMM2, %k1
-	kmovd	%k1, %eax
-	/* Create mask in ecx for potentially in bound matches.  */
-	bzhil	%edx, %eax, %eax
-	jnz	L(return_vec_0)
+	VPTEST	%YMM3, %YMM3, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)
+L(return_vec_3):
+	/* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one
+	   fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
+	   line.  */
+	bsfl	%ecx, %ecx
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	subl	%ecx, %eax
+# endif
 	ret
 
 	.p2align 4
@@ -209,10 +240,11 @@ L(return_vec_0):
 # endif
 	ret
 
-	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
-	   which is good enough for a target not in a loop.  */
+	.p2align 4
 L(return_vec_1):
-	tzcntl	%eax, %eax
+	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
+	   fetch block.  */
+	bsfl	%eax, %eax
 # ifdef USE_AS_WMEMCMP
 	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -226,10 +258,11 @@ L(return_vec_1):
 # endif
 	ret
 
-	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
-	   which is good enough for a target not in a loop.  */
+	.p2align 4,, 10
 L(return_vec_2):
-	tzcntl	%eax, %eax
+	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
+	   fetch block.  */
+	bsfl	%eax, %eax
 # ifdef USE_AS_WMEMCMP
 	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -243,40 +276,6 @@ L(return_vec_2):
 # endif
 	ret
 
-	.p2align 4
-L(8x_return_vec_0_1_2_3):
-	/* Returning from L(more_8x_vec) requires restoring rsi.  */
-	addq	%rdi, %rsi
-L(return_vec_0_1_2_3):
-	VPCMP	$4, %YMM1, %YMM0, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_0)
-
-	VPCMP	$4, %YMM2, %YMM0, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_1)
-
-	VPCMP	$4, %YMM3, %YMM0, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_2)
-L(return_vec_3):
-	tzcntl	%ecx, %ecx
-# ifdef USE_AS_WMEMCMP
-	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
-	xorl	%edx, %edx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
-	subl	%ecx, %eax
-# endif
-	ret
-
 	.p2align 4
 L(more_8x_vec):
 	/* Set end of s1 in rdx.  */
@@ -288,21 +287,19 @@ L(more_8x_vec):
 	andq	$-VEC_SIZE, %rdi
 	/* Adjust because first 4x vec where check already.  */
 	subq	$-(VEC_SIZE * 4), %rdi
+
 	.p2align 4
 L(loop_4x_vec):
 	VMOVU	(%rsi, %rdi), %YMM1
 	vpxorq	(%rdi), %YMM1, %YMM1
-
 	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
 	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
-
 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
-
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
-	VPCMP	$4, %YMM4, %YMM0, %k1
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
 	jnz	L(8x_return_vec_0_1_2_3)
@@ -319,28 +316,25 @@ L(loop_4x_vec):
 	cmpl	$(VEC_SIZE * 2), %edi
 	jae	L(8x_last_2x_vec)
 
+	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+
 	VMOVU	(%rsi, %rdx), %YMM1
 	vpxorq	(%rdx), %YMM1, %YMM1
 
 	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
 	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
-
-	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
-
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
-	VPCMP	$4, %YMM4, %YMM0, %k1
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
-	/* Restore s1 pointer to rdi.  */
-	movq	%rdx, %rdi
 	testl	%ecx, %ecx
-	jnz	L(8x_return_vec_0_1_2_3)
+	jnz	L(8x_end_return_vec_0_1_2_3)
 	/* NB: eax must be zero to reach here.  */
 	ret
 
 	/* Only entry is from L(more_8x_vec).  */
-	.p2align 4
+	.p2align 4,, 10
 L(8x_last_2x_vec):
 	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
 	kmovd	%k1, %eax
@@ -355,7 +349,31 @@ L(8x_last_1x_vec):
 	jnz	L(8x_return_vec_3)
 	ret
 
-	.p2align 4
+	/* Not ideally aligned (at offset +9 bytes in fetch block) but
+	   not aligning keeps it in the same cache line as
+	   L(8x_last_1x/2x_vec) so likely worth it. As well, saves code
+	   size.  */
+	.p2align 4,, 4
+L(8x_return_vec_2):
+	subq	$VEC_SIZE, %rdx
+L(8x_return_vec_3):
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+	movl	(VEC_SIZE * 3)(%rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	addq	%rdx, %rax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+	.p2align 4,, 10
 L(last_2x_vec):
 	/* Check second to last VEC.  */
 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
@@ -374,26 +392,49 @@ L(last_1x_vec):
 	jnz	L(return_vec_0_end)
 	ret
 
-	.p2align 4
-L(8x_return_vec_2):
-	subq	$VEC_SIZE, %rdx
-L(8x_return_vec_3):
-	tzcntl	%eax, %eax
+	.p2align 4,, 10
+L(return_vec_1_end):
+	/* Use bsf to save code size. This is necessary to have
+	   L(one_or_less) fit in aligning bytes between.  */
+	bsfl	%eax, %eax
+	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
-	leaq	(%rdx, %rax, CHAR_SIZE), %rax
-	movl	(VEC_SIZE * 3)(%rax), %ecx
+	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
 	setg	%dl
 	leal	-1(%rdx, %rdx), %eax
 # else
-	addq	%rdx, %rax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
-	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
 	subl	%ecx, %eax
 # endif
 	ret
 
+	/* NB: L(one_or_less) fits in alignment padding between
+	   L(return_vec_1_end) and L(return_vec_0_end).  */
+# ifdef USE_AS_WMEMCMP
+L(one_or_less):
+	jb	L(zero)
+	movl	(%rdi), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi), %ecx
+	je	L(zero)
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+	ret
+# else
+L(one_or_less):
+	jb	L(zero)
+	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
+	subl	%ecx, %eax
+	ret
+# endif
+L(zero):
+	xorl	%eax, %eax
+	ret
+
 	.p2align 4
 L(return_vec_0_end):
 	tzcntl	%eax, %eax
@@ -412,23 +453,56 @@ L(return_vec_0_end):
 	ret
 
 	.p2align 4
-L(return_vec_1_end):
+L(less_vec):
+	/* Check if one or less CHAR. This is necessary for size == 0
+	   but is also faster for size == CHAR_SIZE.  */
+	cmpl	$1, %edx
+	jbe	L(one_or_less)
+
+	/* Check if loading one VEC from either s1 or s2 could cause a
+	   page cross. This can have false positives but is by far the
+	   fastest method.  */
+	movl	%edi, %eax
+	orl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(page_cross_less_vec)
+
+	/* No page cross possible.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMP	$4, (%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Check if any matches where in bounds. Intentionally not
+	   storing result in eax to limit dependency chain if it goes to
+	   L(return_vec_0_lv).  */
+	bzhil	%edx, %eax, %edx
+	jnz	L(return_vec_0_lv)
+	xorl	%eax, %eax
+	ret
+
+	/* Essentially duplicate of L(return_vec_0). Ends up not costing
+	   any code as shrinks L(less_vec) by allowing 2-byte encoding of
+	   the jump and ends up fitting in aligning bytes. As well fits on
+	   same cache line as L(less_vec) so also saves a line from having
+	   to be fetched on cold calls to memcmp.  */
+	.p2align 4,, 4
+L(return_vec_0_lv):
 	tzcntl	%eax, %eax
-	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
-	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	movl	(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
-	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
 	setg	%dl
 	leal	-1(%rdx, %rdx), %eax
 # else
-	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
-	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
 	subl	%ecx, %eax
 # endif
 	ret
 
-
 	.p2align 4
 L(page_cross_less_vec):
 	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
@@ -439,108 +513,84 @@ L(page_cross_less_vec):
 	cmpl	$8, %edx
 	jae	L(between_8_15)
 	cmpl	$4, %edx
-	jae	L(between_4_7)
-L(between_2_3):
-	/* Load as big endian to avoid branches.  */
-	movzwl	(%rdi), %eax
-	movzwl	(%rsi), %ecx
-	shll	$8, %eax
-	shll	$8, %ecx
-	bswap	%eax
-	bswap	%ecx
-	movzbl	-1(%rdi, %rdx), %edi
-	movzbl	-1(%rsi, %rdx), %esi
-	orl	%edi, %eax
-	orl	%esi, %ecx
-	/* Subtraction is okay because the upper 8 bits are zero.  */
-	subl	%ecx, %eax
-	ret
-	.p2align 4
-L(one_or_less):
-	jb	L(zero)
-	movzbl	(%rsi), %ecx
-	movzbl	(%rdi), %eax
-	subl	%ecx, %eax
+	jb	L(between_2_3)
+
+	/* Load as big endian with overlapping movbe to avoid branches.
+	 */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	/* edx is guranteed to be positive int32 in range [4, 7].  */
+	cmovne	%edx, %eax
+	/* ecx is -1 if rcx > rax. Otherwise 0.  */
+	sbbl	%ecx, %ecx
+	/* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
+	   rax then eax and ecx are zero. If rax < rax then ecx is -1 so
+	   eax doesn't matter.  */
+	orl	%ecx, %eax
 	ret
 
-	.p2align 4
+	.p2align 4,, 8
 L(between_8_15):
 # endif
 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
-	vmovq	(%rdi), %XMM1
-	vmovq	(%rsi), %XMM2
-	VPCMP	$4, %XMM1, %XMM2, %k1
+	vmovq	(%rdi), %xmm1
+	vmovq	(%rsi), %xmm2
+	VPCMP	$4, %xmm1, %xmm2, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
+	jnz	L(return_vec_0_lv)
 	/* Use overlapping loads to avoid branches.  */
-	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
-	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
-	vmovq	(%rdi), %XMM1
-	vmovq	(%rsi), %XMM2
-	VPCMP	$4, %XMM1, %XMM2, %k1
+	vmovq	-8(%rdi, %rdx, CHAR_SIZE), %xmm1
+	vmovq	-8(%rsi, %rdx, CHAR_SIZE), %xmm2
+	VPCMP	$4, %xmm1, %xmm2, %k1
+	addl	$(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
-	ret
-
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
+	jnz	L(return_vec_0_end)
 	ret
 
-	.p2align 4
+	.p2align 4,, 8
 L(between_16_31):
 	/* From 16 to 31 bytes.  No branch when size == 16.  */
-	VMOVU	(%rsi), %XMM2
-	VPCMP	$4, (%rdi), %XMM2, %k1
+
+	/* Use movups to save code size.  */
+	movups	(%rsi), %xmm2
+	VPCMP	$4, (%rdi), %xmm2, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
-
+	jnz	L(return_vec_0_lv)
 	/* Use overlapping loads to avoid branches.  */
-
-	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
-	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
-	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
-	VPCMP	$4, (%rdi), %XMM2, %k1
+	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
-	ret
-
-# ifdef USE_AS_WMEMCMP
-	.p2align 4
-L(one_or_less):
-	jb	L(zero)
-	movl	(%rdi), %ecx
-	xorl	%edx, %edx
-	cmpl	(%rsi), %ecx
-	je	L(zero)
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
+	jnz	L(return_vec_0_end)
 	ret
-# else
 
-	.p2align 4
-L(between_4_7):
-	/* Load as big endian with overlapping movbe to avoid branches.
-	 */
-	movbe	(%rdi), %eax
-	movbe	(%rsi), %ecx
-	shlq	$32, %rax
-	shlq	$32, %rcx
-	movbe	-4(%rdi, %rdx), %edi
-	movbe	-4(%rsi, %rdx), %esi
-	orq	%rdi, %rax
-	orq	%rsi, %rcx
-	subq	%rcx, %rax
-	jz	L(zero_4_7)
-	sbbl	%eax, %eax
-	orl	$1, %eax
-L(zero_4_7):
+# ifndef USE_AS_WMEMCMP
+L(between_2_3):
+	/* Load as big endian to avoid branches.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	shll	$8, %eax
+	shll	$8, %ecx
+	bswap	%eax
+	bswap	%ecx
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+	/* Subtraction is okay because the upper 8 bits are zero.  */
+	subl	%ecx, %eax
 	ret
 # endif
-
 END (MEMCMP)
 #endif
-- 
GitLab
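
For illustration only (not part of the patch above): the reordered 4x VEC reduction in L(loop_4x_vec) -- three vpxorq results folded by two vpternlogd instructions and checked with vptestm -- corresponds roughly to the following AVX-512VL/BW intrinsics sketch. The helper name and the flat s1/s2 addressing are illustrative, not taken from glibc.

/* Rough sketch of the 4x VEC (4 * 32 byte) mismatch check.
   Build with e.g. gcc -mavx512vl -mavx512bw.  */
#include <immintrin.h>

static __mmask32
any_mismatch_4x_vec (const char *s1, const char *s2)
{
  __m256i x0 = _mm256_xor_si256 (_mm256_loadu_si256 ((const __m256i *) s1),
                                 _mm256_loadu_si256 ((const __m256i *) s2));
  __m256i x1 = _mm256_xor_si256 (_mm256_loadu_si256 ((const __m256i *) (s1 + 32)),
                                 _mm256_loadu_si256 ((const __m256i *) (s2 + 32)));
  __m256i x2 = _mm256_xor_si256 (_mm256_loadu_si256 ((const __m256i *) (s1 + 64)),
                                 _mm256_loadu_si256 ((const __m256i *) (s2 + 64)));
  /* vpternlogd $0xde computes (a ^ c) | b, folding the last xor and the
     first or into one instruction.  */
  __m256i r = _mm256_ternarylogic_epi32 (_mm256_loadu_si256 ((const __m256i *) (s2 + 96)),
                                         x0,
                                         _mm256_loadu_si256 ((const __m256i *) (s1 + 96)),
                                         0xde);
  /* vpternlogd $0xfe computes a | b | c, i.e. r |= x1 | x2.  */
  r = _mm256_ternarylogic_epi32 (r, x1, x2, 0xfe);
  /* vptestmb: one mask bit per non-zero byte, i.e. per mismatching byte
     anywhere in the 128-byte block.  */
  return _mm256_test_epi8_mask (r, r);
}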
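
Likewise, the branchless page-cross compare for size == [4, 7] built from overlapping movbe loads can be sketched in C. This mirrors only the big-endian ordering idea; the patch derives the return value with cmovne/sbbl/orl rather than an explicit three-way compare, and the names below are illustrative.

/* Rough C sketch of the overlapping big-endian compare for 4 <= n <= 7.  */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t
load32_be (const unsigned char *p)
{
  uint32_t v;
  memcpy (&v, p, 4);
  /* movbe performs the load and the byte swap in a single instruction.  */
  return __builtin_bswap32 (v);
}

static int
cmp_4_to_7 (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  /* Head bytes in the high half, (overlapping) tail bytes in the low half:
     the first differing byte decides the ordering with no length branch.  */
  uint64_t a = ((uint64_t) load32_be (s1) << 32) | load32_be (s1 + n - 4);
  uint64_t b = ((uint64_t) load32_be (s2) << 32) | load32_be (s2 + n - 4);
  return (a > b) - (a < b);
}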