commit 5ec3416853c4150c4d13312e05f93a053586d528
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Tue Sep 21 18:45:03 2021 -0500

    x86: Optimize memcmp-evex-movbe.S for frontend behavior and size

    No bug.

    The frontend optimizations are to:
    1. Reorganize logically connected basic blocks so they are either in
       the same cache line or in adjacent cache lines.
    2. Avoid cases where basic blocks unnecessarily cross cache lines.
    3. Try to 32-byte align any basic blocks possible without sacrificing
       code size. Smaller / less hot basic blocks are used for this (see
       the sketch after this list).
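
    As an illustration of point 3, a minimal sketch of the alignment
    directives this patch leans on (the labels below are made up; only
    the ENTRY_P2ALIGN / .p2align usage mirrors the patch):

	.text
	/* Align the function entry to a 64-byte cache line; in the
	   patch this is done with ENTRY_P2ALIGN (MEMCMP, 6).  */
	.p2align 6
	example_entry:
		xorl	%eax, %eax	/* Placeholder for the hot path.  */
		ret

	/* Align a colder block to 16 bytes only if that costs at most
	   10 bytes of padding; otherwise leave it packed so it stays
	   in the same cache line as its neighbors.  */
	.p2align 4,, 10
	example_cold_block:
		movl	$1, %eax	/* Placeholder.  */
		ret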

    Overall code size shrank by 168 bytes. This should make up for any
    extra costs due to aligning to 64 bytes.

    In general, performance before this patch deviated a great deal
    depending on whether entry alignment % 64 was 0, 16, 32, or 48.
    These changes essentially make it so that the current implementation
    is at least equal to the best alignment of the original for any
    arguments.

    The only additional optimization is in the page cross case: the
    branch on the equals case was removed from the size == [4, 7] case.
    As well, the [4, 7] and [2, 3] cases were swapped, as [4, 7] is
    likely the hotter argument size (see the sketch after this
    paragraph).
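
    Roughly, the new [4, 7] byte path looks like the sketch below (a
    condensed rendition of the code added further down in this patch;
    rdi = s1, rsi = s2, rdx = len as in memcmp, and the label name is
    made up):

	example_between_4_7:
		/* Build a big-endian 64-bit value for each buffer from
		   two overlapping 4-byte movbe loads: bytes [0, 3] in the
		   high half, bytes [len - 4, len - 1] in the low half.  */
		movbe	(%rdi), %eax
		movbe	(%rsi), %ecx
		shlq	$32, %rax
		shlq	$32, %rcx
		movbe	-4(%rdi, %rdx), %edi
		movbe	-4(%rsi, %rdx), %esi
		orq	%rdi, %rax
		orq	%rsi, %rcx
		/* rax - rcx is zero iff the buffers match; CF is set iff
		   rcx > rax (i.e. s1 < s2).  */
		subq	%rcx, %rax
		/* On a mismatch overwrite eax with a known-positive value
		   (edx is in [4, 7]); no branch on the equals case.  */
		cmovne	%edx, %eax
		/* ecx becomes -1 if s1 < s2, else 0.  */
		sbbl	%ecx, %ecx
		/* Fold in the sign: the result is negative, zero, or
		   positive as memcmp requires.  */
		orl	%ecx, %eax
		ret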

    test-memcmp and test-wmemcmp are both passing.

    (cherry picked from commit 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa)

diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 654dc7ac8ccb9445..2761b54f2e7dea9f 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -34,7 +34,24 @@
       area.
    7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
    8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
-   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
+   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
+
+When possible the implementation tries to optimize for frontend in the
+following ways:
+Throughput:
+    1. All code sections that fit are able to run optimally out of the
+       LSD.
+    2. All code sections that fit are able to run optimally out of the
+       DSB
+    3. Basic blocks are contained in minimum number of fetch blocks
+       necessary.
+
+Latency:
+    1. Logically connected basic blocks are put in the same
+       cache-line.
+    2. Logically connected basic blocks that do not fit in the same
+       cache-line are put in adjacent lines. This can get beneficial
+       L2 spatial prefetching and L1 next-line prefetching.  */
 
 # include <sysdep.h>
 
@@ -47,9 +64,11 @@
 # ifdef USE_AS_WMEMCMP
 #  define CHAR_SIZE	4
 #  define VPCMP	vpcmpd
+#  define VPTEST	vptestmd
 # else
 #  define CHAR_SIZE	1
 #  define VPCMP	vpcmpub
+#  define VPTEST	vptestmb
 # endif
 
 # define VEC_SIZE	32
@@ -75,7 +94,9 @@
 */
 	.section .text.evex,"ax",@progbits
-ENTRY (MEMCMP)
+/* Cache align memcmp entry. This allows for much more thorough
+   frontend optimization.  */
+ENTRY_P2ALIGN (MEMCMP, 6)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
@@ -89,7 +110,7 @@ ENTRY (MEMCMP)
 	VPCMP	$4, (%rdi), %YMM1, %k1
 	kmovd	%k1, %eax
 	/* NB: eax must be destination register if going to
-	   L(return_vec_[0,2]). For L(return_vec_3 destination register
+	   L(return_vec_[0,2]). For L(return_vec_3) destination register
 	   must be ecx.  */
 	testl	%eax, %eax
 	jnz	L(return_vec_0)
@@ -121,10 +142,6 @@ ENTRY (MEMCMP)
 	testl	%ecx, %ecx
 	jnz	L(return_vec_3)
 
-	/* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
-	   compare with zero to get a mask is needed.  */
-	vpxorq	%XMM0, %XMM0, %XMM0
-
 	/* Go to 4x VEC loop.  */
 	cmpq	$(CHAR_PER_VEC * 8), %rdx
 	ja	L(more_8x_vec)
@@ -148,47 +165,61 @@ ENTRY (MEMCMP)
 
 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
-	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 
 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
 	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
-	   oring with YMM3. Result is stored in YMM4.  */
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
-	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
-	VPCMP	$4, %YMM4, %YMM0, %k1
+	   oring with YMM1. Result is stored in YMM4.  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+
+	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+
+	/* Test YMM4 against itself. Store any CHAR mismatches in k1.
+	 */
+	VPTEST	%YMM4, %YMM4, %k1
+	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
 	jnz	L(return_vec_0_1_2_3)
 	/* NB: eax must be zero to reach here.  */
 	ret
 
-	/* NB: aligning 32 here allows for the rest of the jump targets
-	   to be tuned for 32 byte alignment. Most important this ensures
-	   the L(more_8x_vec) loop is 32 byte aligned.  */
-	.p2align 5
-L(less_vec):
-	/* Check if one or less CHAR. This is necessary for size = 0 but
-	   is also faster for size = CHAR_SIZE.  */
-	cmpl	$1, %edx
-	jbe	L(one_or_less)
+	.p2align 4
+L(8x_end_return_vec_0_1_2_3):
+	movq	%rdx, %rdi
+L(8x_return_vec_0_1_2_3):
+	addq	%rdi, %rsi
+L(return_vec_0_1_2_3):
+	VPTEST	%YMM1, %YMM1, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
 
-	/* Check if loading one VEC from either s1 or s2 could cause a
-	   page cross. This can have false positives but is by far the
-	   fastest method.  */
-	movl	%edi, %eax
-	orl	%esi, %eax
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jg	L(page_cross_less_vec)
+	VPTEST	%YMM2, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
 
-	/* No page cross possible.  */
-	VMOVU	(%rsi), %YMM2
-	VPCMP	$4, (%rdi), %YMM2, %k1
-	kmovd	%k1, %eax
-	/* Create mask in ecx for potentially in bound matches.  */
-	bzhil	%edx, %eax, %eax
-	jnz	L(return_vec_0)
+	VPTEST	%YMM3, %YMM3, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)
+L(return_vec_3):
+	/* bsf saves 1 byte from tzcnt. This keeps L(return_vec_3) in one
+	   fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
+	   line.  */
+	bsfl	%ecx, %ecx
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	subl	%ecx, %eax
+# endif
 	ret
 
 	.p2align 4
@@ -209,10 +240,11 @@ L(return_vec_0):
 # endif
 	ret
 
-	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
-	   which is good enough for a target not in a loop.  */
+	.p2align 4
 L(return_vec_1):
-	tzcntl	%eax, %eax
+	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
+	   fetch block.  */
+	bsfl	%eax, %eax
 # ifdef USE_AS_WMEMCMP
 	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -226,10 +258,11 @@ L(return_vec_1):
 # endif
 	ret
 
-	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
-	   which is good enough for a target not in a loop.  */
+	.p2align 4,, 10
 L(return_vec_2):
-	tzcntl	%eax, %eax
+	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
+	   fetch block.  */
+	bsfl	%eax, %eax
 # ifdef USE_AS_WMEMCMP
 	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -243,40 +276,6 @@ L(return_vec_2):
 # endif
 	ret
 
-	.p2align 4
-L(8x_return_vec_0_1_2_3):
-	/* Returning from L(more_8x_vec) requires restoring rsi.  */
-	addq	%rdi, %rsi
-L(return_vec_0_1_2_3):
-	VPCMP	$4, %YMM1, %YMM0, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_0)
-
-	VPCMP	$4, %YMM2, %YMM0, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_1)
-
-	VPCMP	$4, %YMM3, %YMM0, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_2)
-L(return_vec_3):
-	tzcntl	%ecx, %ecx
-# ifdef USE_AS_WMEMCMP
-	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
-	xorl	%edx, %edx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
-	subl	%ecx, %eax
-# endif
-	ret
-
 	.p2align 4
 L(more_8x_vec):
 	/* Set end of s1 in rdx.  */
@@ -288,21 +287,19 @@ L(more_8x_vec):
 	andq	$-VEC_SIZE, %rdi
 	/* Adjust because first 4x vec where check already.  */
 	subq	$-(VEC_SIZE * 4), %rdi
+
 	.p2align 4
L(loop_4x_vec):
 	VMOVU	(%rsi, %rdi), %YMM1
 	vpxorq	(%rdi), %YMM1, %YMM1
-
 	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
 	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
-
 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
-
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
-	VPCMP	$4, %YMM4, %YMM0, %k1
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
 	jnz	L(8x_return_vec_0_1_2_3)
@@ -319,28 +316,25 @@ L(loop_4x_vec):
 	cmpl	$(VEC_SIZE * 2), %edi
 	jae	L(8x_last_2x_vec)
 
+	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+
 	VMOVU	(%rsi, %rdx), %YMM1
 	vpxorq	(%rdx), %YMM1, %YMM1
 
 	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
 	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
-
-	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
-
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
-	VPCMP	$4, %YMM4, %YMM0, %k1
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
-	/* Restore s1 pointer to rdi.  */
-	movq	%rdx, %rdi
 	testl	%ecx, %ecx
-	jnz	L(8x_return_vec_0_1_2_3)
+	jnz	L(8x_end_return_vec_0_1_2_3)
 	/* NB: eax must be zero to reach here.  */
 	ret
 
 	/* Only entry is from L(more_8x_vec).  */
-	.p2align 4
+	.p2align 4,, 10
 L(8x_last_2x_vec):
 	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
 	kmovd	%k1, %eax
@@ -355,7 +349,31 @@ L(8x_last_1x_vec):
 	jnz	L(8x_return_vec_3)
 	ret
 
-	.p2align 4
+	/* Not ideally aligned (at offset +9 bytes in fetch block) but
+	   not aligning keeps it in the same cache line as
+	   L(8x_last_1x/2x_vec) so likely worth it. As well, saves code
+	   size.  */
+	.p2align 4,, 4
+L(8x_return_vec_2):
+	subq	$VEC_SIZE, %rdx
+L(8x_return_vec_3):
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+	movl	(VEC_SIZE * 3)(%rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	addq	%rdx, %rax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+	.p2align 4,, 10
 L(last_2x_vec):
 	/* Check second to last VEC.  */
 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
@@ -374,26 +392,49 @@ L(last_1x_vec):
 	jnz	L(return_vec_0_end)
 	ret
 
-	.p2align 4
-L(8x_return_vec_2):
-	subq	$VEC_SIZE, %rdx
-L(8x_return_vec_3):
-	tzcntl	%eax, %eax
+	.p2align 4,, 10
+L(return_vec_1_end):
+	/* Use bsf to save code size. This is necessary to have
+	   L(one_or_less) fit in aligning bytes between.  */
+	bsfl	%eax, %eax
+	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
-	leaq	(%rdx, %rax, CHAR_SIZE), %rax
-	movl	(VEC_SIZE * 3)(%rax), %ecx
+	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
 	setg	%dl
 	leal	-1(%rdx, %rdx), %eax
 # else
-	addq	%rdx, %rax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
-	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
 	subl	%ecx, %eax
 # endif
 	ret
 
+	/* NB: L(one_or_less) fits in alignment padding between
+	   L(return_vec_1_end) and L(return_vec_0_end).  */
+# ifdef USE_AS_WMEMCMP
+L(one_or_less):
+	jb	L(zero)
+	movl	(%rdi), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi), %ecx
+	je	L(zero)
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+	ret
+# else
+L(one_or_less):
+	jb	L(zero)
+	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
+	subl	%ecx, %eax
+	ret
+# endif
+L(zero):
+	xorl	%eax, %eax
+	ret
+
 	.p2align 4
 L(return_vec_0_end):
 	tzcntl	%eax, %eax
@@ -412,23 +453,56 @@ L(return_vec_0_end):
 	ret
 
 	.p2align 4
-L(return_vec_1_end):
+L(less_vec):
+	/* Check if one or less CHAR. This is necessary for size == 0
+	   but is also faster for size == CHAR_SIZE.  */
+	cmpl	$1, %edx
+	jbe	L(one_or_less)
+
+	/* Check if loading one VEC from either s1 or s2 could cause a
+	   page cross. This can have false positives but is by far the
+	   fastest method.  */
+	movl	%edi, %eax
+	orl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(page_cross_less_vec)
+
+	/* No page cross possible.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMP	$4, (%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Check if any matches were in bounds. Intentionally not
+	   storing result in eax to limit dependency chain if it goes to
+	   L(return_vec_0_lv).  */
+	bzhil	%edx, %eax, %edx
+	jnz	L(return_vec_0_lv)
+	xorl	%eax, %eax
+	ret
+
+	/* Essentially duplicate of L(return_vec_0). Ends up not costing
+	   any code as shrinks L(less_vec) by allowing 2-byte encoding of
+	   the jump and ends up fitting in aligning bytes. As well fits on
+	   same cache line as L(less_vec) so also saves a line from having
+	   to be fetched on cold calls to memcmp.  */
+	.p2align 4,, 4
+L(return_vec_0_lv):
 	tzcntl	%eax, %eax
-	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
-	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	movl	(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
-	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
 	setg	%dl
 	leal	-1(%rdx, %rdx), %eax
 # else
-	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
-	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
 	subl	%ecx, %eax
 # endif
 	ret
 
-
 	.p2align 4
L(page_cross_less_vec):
 	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
@@ -439,108 +513,84 @@ L(page_cross_less_vec):
 	cmpl	$8, %edx
 	jae	L(between_8_15)
 	cmpl	$4, %edx
-	jae	L(between_4_7)
-L(between_2_3):
-	/* Load as big endian to avoid branches.  */
-	movzwl	(%rdi), %eax
-	movzwl	(%rsi), %ecx
-	shll	$8, %eax
-	shll	$8, %ecx
-	bswap	%eax
-	bswap	%ecx
-	movzbl	-1(%rdi, %rdx), %edi
-	movzbl	-1(%rsi, %rdx), %esi
-	orl	%edi, %eax
-	orl	%esi, %ecx
-	/* Subtraction is okay because the upper 8 bits are zero.  */
-	subl	%ecx, %eax
-	ret
-	.p2align 4
-L(one_or_less):
-	jb	L(zero)
-	movzbl	(%rsi), %ecx
-	movzbl	(%rdi), %eax
-	subl	%ecx, %eax
+	jb	L(between_2_3)
+
+	/* Load as big endian with overlapping movbe to avoid branches.
+	 */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	/* edx is guaranteed to be positive int32 in range [4, 7].  */
+	cmovne	%edx, %eax
+	/* ecx is -1 if rcx > rax. Otherwise 0.  */
+	sbbl	%ecx, %ecx
+	/* If rax > rcx, then ecx is 0 and eax is positive. If rcx ==
+	   rax then eax and ecx are zero. If rax < rcx then ecx is -1 so
+	   eax doesn't matter.  */
+	orl	%ecx, %eax
 	ret
 
-	.p2align 4
+	.p2align 4,, 8
L(between_8_15):
 # endif
 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
-	vmovq	(%rdi), %XMM1
-	vmovq	(%rsi), %XMM2
-	VPCMP	$4, %XMM1, %XMM2, %k1
+	vmovq	(%rdi), %xmm1
+	vmovq	(%rsi), %xmm2
+	VPCMP	$4, %xmm1, %xmm2, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
+	jnz	L(return_vec_0_lv)
 	/* Use overlapping loads to avoid branches.  */
-	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
-	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
-	vmovq	(%rdi), %XMM1
-	vmovq	(%rsi), %XMM2
-	VPCMP	$4, %XMM1, %XMM2, %k1
+	vmovq	-8(%rdi, %rdx, CHAR_SIZE), %xmm1
+	vmovq	-8(%rsi, %rdx, CHAR_SIZE), %xmm2
+	VPCMP	$4, %xmm1, %xmm2, %k1
+	addl	$(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
-	ret
-
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
+	jnz	L(return_vec_0_end)
 	ret
 
-	.p2align 4
+	.p2align 4,, 8
L(between_16_31):
 	/* From 16 to 31 bytes.  No branch when size == 16.  */
-	VMOVU	(%rsi), %XMM2
-	VPCMP	$4, (%rdi), %XMM2, %k1
+
+	/* Use movups to save code size.  */
+	movups	(%rsi), %xmm2
+	VPCMP	$4, (%rdi), %xmm2, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
-
+	jnz	L(return_vec_0_lv)
 	/* Use overlapping loads to avoid branches.  */
-
-	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
-	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
-	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
-	VPCMP	$4, (%rdi), %XMM2, %k1
+	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
-	ret
-
-# ifdef USE_AS_WMEMCMP
-	.p2align 4
-L(one_or_less):
-	jb	L(zero)
-	movl	(%rdi), %ecx
-	xorl	%edx, %edx
-	cmpl	(%rsi), %ecx
-	je	L(zero)
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
+	jnz	L(return_vec_0_end)
 	ret
-# else
 
-	.p2align 4
-L(between_4_7):
-	/* Load as big endian with overlapping movbe to avoid branches.
-	 */
-	movbe	(%rdi), %eax
-	movbe	(%rsi), %ecx
-	shlq	$32, %rax
-	shlq	$32, %rcx
-	movbe	-4(%rdi, %rdx), %edi
-	movbe	-4(%rsi, %rdx), %esi
-	orq	%rdi, %rax
-	orq	%rsi, %rcx
-	subq	%rcx, %rax
-	jz	L(zero_4_7)
-	sbbl	%eax, %eax
-	orl	$1, %eax
-L(zero_4_7):
+# ifndef USE_AS_WMEMCMP
+L(between_2_3):
+	/* Load as big endian to avoid branches.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	shll	$8, %eax
+	shll	$8, %ecx
+	bswap	%eax
+	bswap	%ecx
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+	/* Subtraction is okay because the upper 8 bits are zero.  */
+	subl	%ecx, %eax
 	ret
 # endif
-
 END (MEMCMP)
 #endif