commit 5ec3416853c4150c4d13312e05f93a053586d528
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Tue Sep 21 18:45:03 2021 -0500

    x86: Optimize memcmp-evex-movbe.S for frontend behavior and size
    
    No bug.
    
    The frontend optimizations are to:
    1. Reorganize logically connected basic blocks so they are either in
       the same cache line or in adjacent cache lines.
    2. Avoid cases where basic blocks unnecessarily cross cache lines
       (illustrated by the sketch after this list).
    3. Try to 32-byte align any basic blocks possible without sacrificing
       code size. Smaller / less hot basic blocks are used for this.
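    As a point of reference (illustrative only, not part of the patch),
    whether a basic block starting at address addr with size bytes
    crosses a 64-byte cache line comes down to simple arithmetic; a
    minimal C sketch, using the hypothetical helper name
    block_crosses_cache_line:

        #include <stdbool.h>
        #include <stddef.h>
        #include <stdint.h>

        #define CACHE_LINE_SIZE 64

        /* True if [addr, addr + size) spans more than one 64-byte
           cache line and therefore needs an extra fetch.  */
        static bool
        block_crosses_cache_line (uintptr_t addr, size_t size)
        {
          return (addr % CACHE_LINE_SIZE) + size > CACHE_LINE_SIZE;
        }

    For example, a 20-byte block at offset 48 in its line crosses
    (48 + 20 > 64), while the same block at offset 32 does not.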
    
    Overall code size shrunk by 168 bytes. This should make up for any
    extra costs due to aligning to 64 bytes.
    
    In general, performance before deviated a great deal depending on
    whether entry alignment % 64 was 0, 16, 32, or 48. These changes
    essentially make it so that the current implementation is at least
    equal to the best alignment of the original for any arguments.
    
    The only additional optimization is in the page cross case. The
    branch on the equals case was removed from the size == [4, 7] case.
    As well, the [4, 7] and [2, 3] cases were swapped, as [4, 7] is
    likely a hotter argument size.
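    For illustration (not part of the patch), the [4, 7] case is handled
    with two overlapping 4-byte loads per input, converted to big endian
    so that a single unsigned comparison orders the buffers the same way
    a byte-wise memcmp would; a minimal C sketch for a little-endian
    host using GCC/Clang builtins, with the hypothetical helper name
    memcmp_len_4_7:

        #include <stdint.h>
        #include <string.h>

        /* n must be in [4, 7].  The overlapping loads cover all n
           bytes without branching on n.  */
        static int
        memcmp_len_4_7 (const unsigned char *s1, const unsigned char *s2,
                        size_t n)
        {
          uint32_t a_hi, a_lo, b_hi, b_lo;
          memcpy (&a_hi, s1, 4);
          memcpy (&b_hi, s2, 4);
          memcpy (&a_lo, s1 + n - 4, 4);
          memcpy (&b_lo, s2 + n - 4, 4);

          /* movbe loads big endian directly; model it with bswap.  */
          uint64_t a = ((uint64_t) __builtin_bswap32 (a_hi) << 32)
                       | __builtin_bswap32 (a_lo);
          uint64_t b = ((uint64_t) __builtin_bswap32 (b_hi) << 32)
                       | __builtin_bswap32 (b_lo);

          /* The assembly computes this result with sub/cmovne/sbb/or,
             i.e. without a branch on the equal case.  */
          return (a > b) - (a < b);
        }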
    
    test-memcmp and test-wmemcmp are both passing.
    
    (cherry picked from commit 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa)
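    For reference (illustrative only, not part of the patch), the
    reordered 4x VEC reduction in the diff below (vpxorq plus two
    vpternlogd plus vptestm, replacing the old compare against a zeroed
    YMM0) corresponds roughly to the following AVX-512 (VL + BW)
    intrinsics sketch, using the hypothetical helper name
    chunk_4x32_differs:

        #include <immintrin.h>

        /* Check 4 * 32 = 128 bytes for any byte mismatch.  Compile
           with -mavx512vl -mavx512bw.  */
        static int
        chunk_4x32_differs (const unsigned char *s1,
                            const unsigned char *s2)
        {
          __m256i v1 = _mm256_xor_si256 (
              _mm256_loadu_si256 ((const __m256i *) (s2 + 0)),
              _mm256_loadu_si256 ((const __m256i *) (s1 + 0)));
          __m256i v2 = _mm256_xor_si256 (
              _mm256_loadu_si256 ((const __m256i *) (s2 + 32)),
              _mm256_loadu_si256 ((const __m256i *) (s1 + 32)));
          __m256i v3 = _mm256_xor_si256 (
              _mm256_loadu_si256 ((const __m256i *) (s2 + 64)),
              _mm256_loadu_si256 ((const __m256i *) (s1 + 64)));
          __m256i v4 = _mm256_loadu_si256 ((const __m256i *) (s2 + 96));

          /* vpternlogd $0xde: v4 = (v4 ^ s1[96..127]) | v1.  */
          v4 = _mm256_ternarylogic_epi32 (
              v4, v1, _mm256_loadu_si256 ((const __m256i *) (s1 + 96)),
              0xde);
          /* vpternlogd $0xfe: v4 = v4 | v2 | v3.  */
          v4 = _mm256_ternarylogic_epi32 (v4, v2, v3, 0xfe);

          /* vptestmb: one mask bit per nonzero byte.  */
          return _mm256_test_epi8_mask (v4, v4) != 0;
        }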
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 654dc7ac8ccb9445..2761b54f2e7dea9f 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -34,7 +34,24 @@
       area.
    7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
    8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
-   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
+   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
+
+When possible the implementation tries to optimize for frontend in the
+following ways:
+Throughput:
+    1. All code sections that fit are able to run optimally out of the
+       LSD.
+    2. All code sections that fit are able to run optimally out of the
+       DSB
+    3. Basic blocks are contained in minimum number of fetch blocks
+       necessary.
+
+Latency:
+    1. Logically connected basic blocks are put in the same
+       cache-line.
+    2. Logically connected basic blocks that do not fit in the same
+       cache-line are put in adjacent lines. This can get beneficial
+       L2 spatial prefetching and L1 next-line prefetching.  */
 
 # include <sysdep.h>
 
@@ -47,9 +64,11 @@
 # ifdef USE_AS_WMEMCMP
 #  define CHAR_SIZE	4
 #  define VPCMP	vpcmpd
+#  define VPTEST	vptestmd
 # else
 #  define CHAR_SIZE	1
 #  define VPCMP	vpcmpub
+#  define VPTEST	vptestmb
 # endif
 
 # define VEC_SIZE	32
@@ -75,7 +94,9 @@
 */
 
 	.section .text.evex,"ax",@progbits
-ENTRY (MEMCMP)
+/* Cache align memcmp entry. This allows for much more thorough
+   frontend optimization.  */
+ENTRY_P2ALIGN (MEMCMP, 6)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
@@ -89,7 +110,7 @@ ENTRY (MEMCMP)
 	VPCMP	$4, (%rdi), %YMM1, %k1
 	kmovd	%k1, %eax
 	/* NB: eax must be destination register if going to
-	   L(return_vec_[0,2]). For L(return_vec_3 destination register
+	   L(return_vec_[0,2]). For L(return_vec_3) destination register
 	   must be ecx.  */
 	testl	%eax, %eax
 	jnz	L(return_vec_0)
@@ -121,10 +142,6 @@ ENTRY (MEMCMP)
 	testl	%ecx, %ecx
 	jnz	L(return_vec_3)
 
-	/* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
-	   compare with zero to get a mask is needed.  */
-	vpxorq	%XMM0, %XMM0, %XMM0
-
 	/* Go to 4x VEC loop.  */
 	cmpq	$(CHAR_PER_VEC * 8), %rdx
 	ja	L(more_8x_vec)
@@ -148,47 +165,61 @@ ENTRY (MEMCMP)
 
 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
-	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 
 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
 	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
-	   oring with YMM3. Result is stored in YMM4.  */
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
-	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
-	VPCMP	$4, %YMM4, %YMM0, %k1
+	   oring with YMM1. Result is stored in YMM4.  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+
+	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+
+	/* Test YMM4 against itself. Store any CHAR mismatches in k1.
+	 */
+	VPTEST	%YMM4, %YMM4, %k1
+	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
 	jnz	L(return_vec_0_1_2_3)
 	/* NB: eax must be zero to reach here.  */
 	ret
 
-	/* NB: aligning 32 here allows for the rest of the jump targets
-	   to be tuned for 32 byte alignment. Most important this ensures
-	   the L(more_8x_vec) loop is 32 byte aligned.  */
-	.p2align 5
-L(less_vec):
-	/* Check if one or less CHAR. This is necessary for size = 0 but
-	   is also faster for size = CHAR_SIZE.  */
-	cmpl	$1, %edx
-	jbe	L(one_or_less)
+	.p2align 4
+L(8x_end_return_vec_0_1_2_3):
+	movq	%rdx, %rdi
+L(8x_return_vec_0_1_2_3):
+	addq	%rdi, %rsi
+L(return_vec_0_1_2_3):
+	VPTEST	%YMM1, %YMM1, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
 
-	/* Check if loading one VEC from either s1 or s2 could cause a
-	   page cross. This can have false positives but is by far the
-	   fastest method.  */
-	movl	%edi, %eax
-	orl	%esi, %eax
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jg	L(page_cross_less_vec)
+	VPTEST	%YMM2, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
 
-	/* No page cross possible.  */
-	VMOVU	(%rsi), %YMM2
-	VPCMP	$4, (%rdi), %YMM2, %k1
-	kmovd	%k1, %eax
-	/* Create mask in ecx for potentially in bound matches.  */
-	bzhil	%edx, %eax, %eax
-	jnz	L(return_vec_0)
+	VPTEST	%YMM3, %YMM3, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)
+L(return_vec_3):
+	/* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one
+	   fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
+	   line.  */
+	bsfl	%ecx, %ecx
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	subl	%ecx, %eax
+# endif
 	ret
 
 	.p2align 4
@@ -209,10 +240,11 @@ L(return_vec_0):
 # endif
 	ret
 
-	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
-	   which is good enough for a target not in a loop.  */
+	.p2align 4
 L(return_vec_1):
-	tzcntl	%eax, %eax
+	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
+	   fetch block.  */
+	bsfl	%eax, %eax
 # ifdef USE_AS_WMEMCMP
 	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -226,10 +258,11 @@ L(return_vec_1):
 # endif
 	ret
 
-	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
-	   which is good enough for a target not in a loop.  */
+	.p2align 4,, 10
 L(return_vec_2):
-	tzcntl	%eax, %eax
+	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
+	   fetch block.  */
+	bsfl	%eax, %eax
 # ifdef USE_AS_WMEMCMP
 	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -243,40 +276,6 @@ L(return_vec_2):
 # endif
 	ret
 
-	.p2align 4
-L(8x_return_vec_0_1_2_3):
-	/* Returning from L(more_8x_vec) requires restoring rsi.  */
-	addq	%rdi, %rsi
-L(return_vec_0_1_2_3):
-	VPCMP	$4, %YMM1, %YMM0, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_0)
-
-	VPCMP	$4, %YMM2, %YMM0, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_1)
-
-	VPCMP	$4, %YMM3, %YMM0, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_2)
-L(return_vec_3):
-	tzcntl	%ecx, %ecx
-# ifdef USE_AS_WMEMCMP
-	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
-	xorl	%edx, %edx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
-	subl	%ecx, %eax
-# endif
-	ret
-
 	.p2align 4
 L(more_8x_vec):
 	/* Set end of s1 in rdx.  */
@@ -288,21 +287,19 @@ L(more_8x_vec):
 	andq	$-VEC_SIZE, %rdi
 	/* Adjust because first 4x vec where check already.  */
 	subq	$-(VEC_SIZE * 4), %rdi
+
 	.p2align 4
 L(loop_4x_vec):
 	VMOVU	(%rsi, %rdi), %YMM1
 	vpxorq	(%rdi), %YMM1, %YMM1
-
 	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
 	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
-
 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
-
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
-	VPCMP	$4, %YMM4, %YMM0, %k1
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
 	jnz	L(8x_return_vec_0_1_2_3)
@@ -319,28 +316,25 @@ L(loop_4x_vec):
 	cmpl	$(VEC_SIZE * 2), %edi
 	jae	L(8x_last_2x_vec)
 
+	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+
 	VMOVU	(%rsi, %rdx), %YMM1
 	vpxorq	(%rdx), %YMM1, %YMM1
 
 	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
 	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
-
-	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
-
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
-	VPCMP	$4, %YMM4, %YMM0, %k1
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
-	/* Restore s1 pointer to rdi.  */
-	movq	%rdx, %rdi
 	testl	%ecx, %ecx
-	jnz	L(8x_return_vec_0_1_2_3)
+	jnz	L(8x_end_return_vec_0_1_2_3)
 	/* NB: eax must be zero to reach here.  */
 	ret
 
 	/* Only entry is from L(more_8x_vec).  */
-	.p2align 4
+	.p2align 4,, 10
 L(8x_last_2x_vec):
 	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
 	kmovd	%k1, %eax
@@ -355,7 +349,31 @@ L(8x_last_1x_vec):
 	jnz	L(8x_return_vec_3)
 	ret
 
-	.p2align 4
+	/* Not ideally aligned (at offset +9 bytes in fetch block) but
+	   not aligning keeps it in the same cache line as
+	   L(8x_last_1x/2x_vec) so likely worth it. As well, saves code
+	   size.  */
+	.p2align 4,, 4
+L(8x_return_vec_2):
+	subq	$VEC_SIZE, %rdx
+L(8x_return_vec_3):
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+	movl	(VEC_SIZE * 3)(%rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	addq	%rdx, %rax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+	.p2align 4,, 10
 L(last_2x_vec):
 	/* Check second to last VEC.  */
 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
@@ -374,26 +392,49 @@ L(last_1x_vec):
 	jnz	L(return_vec_0_end)
 	ret
 
-	.p2align 4
-L(8x_return_vec_2):
-	subq	$VEC_SIZE, %rdx
-L(8x_return_vec_3):
-	tzcntl	%eax, %eax
+	.p2align 4,, 10
+L(return_vec_1_end):
+	/* Use bsf to save code size. This is necessary to have
+	   L(one_or_less) fit in aligning bytes between.  */
+	bsfl	%eax, %eax
+	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
-	leaq	(%rdx, %rax, CHAR_SIZE), %rax
-	movl	(VEC_SIZE * 3)(%rax), %ecx
+	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
 	setg	%dl
 	leal	-1(%rdx, %rdx), %eax
 # else
-	addq	%rdx, %rax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
-	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
 	subl	%ecx, %eax
 # endif
 	ret
 
+	/* NB: L(one_or_less) fits in alignment padding between
+	   L(return_vec_1_end) and L(return_vec_0_end).  */
+# ifdef USE_AS_WMEMCMP
+L(one_or_less):
+	jb	L(zero)
+	movl	(%rdi), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi), %ecx
+	je	L(zero)
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+	ret
+# else
+L(one_or_less):
+	jb	L(zero)
+	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
+	subl	%ecx, %eax
+	ret
+# endif
+L(zero):
+	xorl	%eax, %eax
+	ret
+
 	.p2align 4
 L(return_vec_0_end):
 	tzcntl	%eax, %eax
@@ -412,23 +453,56 @@ L(return_vec_0_end):
 	ret
 
 	.p2align 4
-L(return_vec_1_end):
+L(less_vec):
+	/* Check if one or less CHAR. This is necessary for size == 0
+	   but is also faster for size == CHAR_SIZE.  */
+	cmpl	$1, %edx
+	jbe	L(one_or_less)
+
+	/* Check if loading one VEC from either s1 or s2 could cause a
+	   page cross. This can have false positives but is by far the
+	   fastest method.  */
+	movl	%edi, %eax
+	orl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(page_cross_less_vec)
+
+	/* No page cross possible.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMP	$4, (%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Check if any matches where in bounds. Intentionally not
+	   storing result in eax to limit dependency chain if it goes to
+	   L(return_vec_0_lv).  */
+	bzhil	%edx, %eax, %edx
+	jnz	L(return_vec_0_lv)
+	xorl	%eax, %eax
+	ret
+
+	/* Essentially duplicate of L(return_vec_0). Ends up not costing
+	   any code as shrinks L(less_vec) by allowing 2-byte encoding of
+	   the jump and ends up fitting in aligning bytes. As well fits on
+	   same cache line as L(less_vec) so also saves a line from having
+	   to be fetched on cold calls to memcmp.  */
+	.p2align 4,, 4
+L(return_vec_0_lv):
 	tzcntl	%eax, %eax
-	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
-	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	movl	(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
-	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
 	setg	%dl
 	leal	-1(%rdx, %rdx), %eax
 # else
-	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
-	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
 	subl	%ecx, %eax
 # endif
 	ret
 
-
 	.p2align 4
 L(page_cross_less_vec):
 	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
@@ -439,108 +513,84 @@ L(page_cross_less_vec):
 	cmpl	$8, %edx
 	jae	L(between_8_15)
 	cmpl	$4, %edx
-	jae	L(between_4_7)
-L(between_2_3):
-	/* Load as big endian to avoid branches.  */
-	movzwl	(%rdi), %eax
-	movzwl	(%rsi), %ecx
-	shll	$8, %eax
-	shll	$8, %ecx
-	bswap	%eax
-	bswap	%ecx
-	movzbl	-1(%rdi, %rdx), %edi
-	movzbl	-1(%rsi, %rdx), %esi
-	orl	%edi, %eax
-	orl	%esi, %ecx
-	/* Subtraction is okay because the upper 8 bits are zero.  */
-	subl	%ecx, %eax
-	ret
-	.p2align 4
-L(one_or_less):
-	jb	L(zero)
-	movzbl	(%rsi), %ecx
-	movzbl	(%rdi), %eax
-	subl	%ecx, %eax
+	jb	L(between_2_3)
+
+	/* Load as big endian with overlapping movbe to avoid branches.
+	 */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	/* edx is guranteed to be positive int32 in range [4, 7].  */
+	cmovne	%edx, %eax
+	/* ecx is -1 if rcx > rax. Otherwise 0.  */
+	sbbl	%ecx, %ecx
+	/* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
+	   rax then eax and ecx are zero. If rax < rax then ecx is -1 so
+	   eax doesn't matter.  */
+	orl	%ecx, %eax
 	ret
 
-	.p2align 4
+	.p2align 4,, 8
 L(between_8_15):
 # endif
 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
-	vmovq	(%rdi), %XMM1
-	vmovq	(%rsi), %XMM2
-	VPCMP	$4, %XMM1, %XMM2, %k1
+	vmovq	(%rdi), %xmm1
+	vmovq	(%rsi), %xmm2
+	VPCMP	$4, %xmm1, %xmm2, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
+	jnz	L(return_vec_0_lv)
 	/* Use overlapping loads to avoid branches.  */
-	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
-	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
-	vmovq	(%rdi), %XMM1
-	vmovq	(%rsi), %XMM2
-	VPCMP	$4, %XMM1, %XMM2, %k1
+	vmovq	-8(%rdi, %rdx, CHAR_SIZE), %xmm1
+	vmovq	-8(%rsi, %rdx, CHAR_SIZE), %xmm2
+	VPCMP	$4, %xmm1, %xmm2, %k1
+	addl	$(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
-	ret
-
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
+	jnz	L(return_vec_0_end)
 	ret
 
-	.p2align 4
+	.p2align 4,, 8
 L(between_16_31):
 	/* From 16 to 31 bytes.  No branch when size == 16.  */
-	VMOVU	(%rsi), %XMM2
-	VPCMP	$4, (%rdi), %XMM2, %k1
+
+	/* Use movups to save code size.  */
+	movups	(%rsi), %xmm2
+	VPCMP	$4, (%rdi), %xmm2, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
-
+	jnz	L(return_vec_0_lv)
 	/* Use overlapping loads to avoid branches.  */
-
-	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
-	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
-	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
-	VPCMP	$4, (%rdi), %XMM2, %k1
+	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
-	ret
-
-# ifdef USE_AS_WMEMCMP
-	.p2align 4
-L(one_or_less):
-	jb	L(zero)
-	movl	(%rdi), %ecx
-	xorl	%edx, %edx
-	cmpl	(%rsi), %ecx
-	je	L(zero)
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
+	jnz	L(return_vec_0_end)
 	ret
-# else
 
-	.p2align 4
-L(between_4_7):
-	/* Load as big endian with overlapping movbe to avoid branches.
-	 */
-	movbe	(%rdi), %eax
-	movbe	(%rsi), %ecx
-	shlq	$32, %rax
-	shlq	$32, %rcx
-	movbe	-4(%rdi, %rdx), %edi
-	movbe	-4(%rsi, %rdx), %esi
-	orq	%rdi, %rax
-	orq	%rsi, %rcx
-	subq	%rcx, %rax
-	jz	L(zero_4_7)
-	sbbl	%eax, %eax
-	orl	$1, %eax
-L(zero_4_7):
+# ifndef USE_AS_WMEMCMP
+L(between_2_3):
+	/* Load as big endian to avoid branches.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	shll	$8, %eax
+	shll	$8, %ecx
+	bswap	%eax
+	bswap	%ecx
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+	/* Subtraction is okay because the upper 8 bits are zero.  */
+	subl	%ecx, %eax
 	ret
 # endif
-
 END (MEMCMP)
 #endif