Tree - rpms/glibc - CentOS Git server

rpms / glibc

Blame SOURCES/ia-opt-memmove-vec-unaligned-erms.patch

Blob History Raw

		190885	`From b27eed69c1aa2e0fcdcda8b34249ee5b50b913d6 Mon Sep 17 00:00:00 2001`
		190885	`From: Noah Goldstein <goldstein.w.n@gmail.com>`
		190885	`Date: Mon, 1 Nov 2021 00:49:51 -0500`
		190885	`Subject: [PATCH] x86: Optimize memmove-vec-unaligned-erms.S`
		190885
		190885	`No bug.`
		190885
		190885	`The optimizations are as follows:`
		190885
		190885	`1) Always align entry to 64 bytes. This makes behavior more`
		190885	`predictable and makes other frontend optimizations easier.`
		190885
		190885	`2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have`
		190885	`significant benefits in the case that:`
		190885	`0 < (dst - src) < [256, 512]`
		190885
		190885	3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
		190885	`improvement and for FSRM [-10%, 25%].`
		190885
		190885	`In addition to these primary changes there is general cleanup`
		190885	`throughout to optimize the aligning routines and control flow logic.`
		190885
		190885	`Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>`
		190885	`Reviewed-by: H.J. Lu <hjl.tools@gmail.com>`
		190885	`(cherry picked from commit a6b7502ec0c2da89a7437f43171f160d713e39c6)`
		190885	`---`
		190885	`sysdeps/x86_64/memmove.S | 2 +-`
		190885	`.../memmove-avx-unaligned-erms-rtm.S | 2 +-`
		190885	`.../multiarch/memmove-avx-unaligned-erms.S | 2 +-`
		190885	`.../multiarch/memmove-avx512-unaligned-erms.S | 2 +-`
		190885	`.../multiarch/memmove-evex-unaligned-erms.S | 2 +-`
		190885	`.../multiarch/memmove-vec-unaligned-erms.S | 595 +++++++++++-------`
		190885	`6 files changed, 381 insertions(+), 224 deletions(-)`
		190885
		190885	`diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S`
		190885	`index 9cc92ff9..990fa6c5 100644`
		190885	`--- a/sysdeps/x86_64/memmove.S`
		190885	`+++ b/sysdeps/x86_64/memmove.S`
		190885	`@@ -25,7 +25,7 @@`
		190885	`/* Use movups and movaps for smaller code sizes. */`
		190885	`#define VMOVU movups`
		190885	`#define VMOVA movaps`
		190885	`-`
		190885	`+#define MOV_SIZE 3`
		190885	`#define SECTION(p) p`
		190885
		190885	`#ifdef USE_MULTIARCH`
		190885	`diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S`
		190885	`index 1ec1962e..67a55f0c 100644`
		190885	`--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S`
		190885	`+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S`
		190885	`@@ -4,7 +4,7 @@`
		190885	`# define VMOVNT vmovntdq`
		190885	`# define VMOVU vmovdqu`
		190885	`# define VMOVA vmovdqa`
		190885	`-`
		190885	`+# define MOV_SIZE 4`
		190885	`# define ZERO_UPPER_VEC_REGISTERS_RETURN \`
		190885	`ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST`
		190885
		190885	`diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S`
		190885	`index e195e93f..975ae6c0 100644`
		190885	`--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S`
		190885	`+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S`
		190885	`@@ -4,7 +4,7 @@`
		190885	`# define VMOVNT vmovntdq`
		190885	`# define VMOVU vmovdqu`
		190885	`# define VMOVA vmovdqa`
		190885	`-`
		190885	`+# define MOV_SIZE 4`
		190885	`# define SECTION(p) p##.avx`
		190885	`# define MEMMOVE_SYMBOL(p,s) p##_avx_##s`
		190885
		190885	`diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S`
		190885	`index 848848ab..0fa71268 100644`
		190885	`--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S`
		190885	`+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S`
		190885	`@@ -25,7 +25,7 @@`
		190885	`# define VMOVU vmovdqu64`
		190885	`# define VMOVA vmovdqa64`
		190885	`# define VZEROUPPER`
		190885	`-`
		190885	`+# define MOV_SIZE 6`
		190885	`# define SECTION(p) p##.evex512`
		190885	`# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s`
		190885
		190885	`diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S`
		190885	`index 0cbce8f9..88715441 100644`
		190885	`--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S`
		190885	`+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S`
		190885	`@@ -25,7 +25,7 @@`
		190885	`# define VMOVU vmovdqu64`
		190885	`# define VMOVA vmovdqa64`
		190885	`# define VZEROUPPER`
		190885	`-`
		190885	`+# define MOV_SIZE 6`
		190885	`# define SECTION(p) p##.evex`
		190885	`# define MEMMOVE_SYMBOL(p,s) p##_evex_##s`
		190885
		190885	`diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S`
		190885	`index c0809b1b..e5495286 100644`
		190885	`--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S`
		190885	`+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S`
		190885	`@@ -76,6 +76,25 @@`
		190885	`# endif`
		190885	`#endif`
		190885
		190885	`+/* Whether to align before movsb. Ultimately we want 64 byte`
		190885	`+ align and not worth it to load 4x VEC for VEC_SIZE == 16. */`
		190885	`+#define ALIGN_MOVSB (VEC_SIZE > 16)`
		190885	`+/* Number of bytes to align movsb to. */`
		190885	`+#define MOVSB_ALIGN_TO 64`
		190885	`+`
		190885	`+#define SMALL_MOV_SIZE (MOV_SIZE <= 4)`
		190885	`+#define LARGE_MOV_SIZE (MOV_SIZE > 4)`
		190885	`+`
		190885	`+#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1`
		190885	`+# error MOV_SIZE Unknown`
		190885	`+#endif`
		190885	`+`
		190885	`+#if LARGE_MOV_SIZE`
		190885	`+# define SMALL_SIZE_OFFSET (4)`
		190885	`+#else`
		190885	`+# define SMALL_SIZE_OFFSET (0)`
		190885	`+#endif`
		190885	`+`
		190885	`#ifndef PAGE_SIZE`
		190885	`# define PAGE_SIZE 4096`
		190885	`#endif`
		190885	`@@ -199,25 +218,21 @@ L(start):`
		190885	`# endif`
		190885	`cmp $VEC_SIZE, %RDX_LP`
		190885	`jb L(less_vec)`
		190885	`+ /* Load regardless. */`
		190885	`+ VMOVU (%rsi), %VEC(0)`
		190885	`cmp $(VEC_SIZE * 2), %RDX_LP`
		190885	`ja L(more_2x_vec)`
		190885	`-#if !defined USE_MULTIARCH || !IS_IN (libc)`
		190885	`-L(last_2x_vec):`
		190885	`-#endif`
		190885	`/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */`
		190885	`- VMOVU (%rsi), %VEC(0)`
		190885	`VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)`
		190885	`VMOVU %VEC(0), (%rdi)`
		190885	`VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)`
		190885	`-#if !defined USE_MULTIARCH || !IS_IN (libc)`
		190885	`-L(nop):`
		190885	`- ret`
		190885	`+#if !(defined USE_MULTIARCH && IS_IN (libc))`
		190885	`+ ZERO_UPPER_VEC_REGISTERS_RETURN`
		190885	`#else`
		190885	`VZEROUPPER_RETURN`
		190885	`#endif`
		190885	`#if defined USE_MULTIARCH && IS_IN (libc)`
		190885	`END (MEMMOVE_SYMBOL (__memmove, unaligned))`
		190885	`-`
		190885	`# if VEC_SIZE == 16`
		190885	`ENTRY (__mempcpy_chk_erms)`
		190885	`cmp %RDX_LP, %RCX_LP`
		190885	`@@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))`
		190885	`END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))`
		190885	`# endif`
		190885
		190885	`-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))`
		190885	`+ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)`
		190885	`movq %rdi, %rax`
		190885	`L(start_erms):`
		190885	`# ifdef __ILP32__`
		190885	`@@ -298,310 +313,448 @@ L(start_erms):`
		190885	`# endif`
		190885	`cmp $VEC_SIZE, %RDX_LP`
		190885	`jb L(less_vec)`
		190885	`+ /* Load regardless. */`
		190885	`+ VMOVU (%rsi), %VEC(0)`
		190885	`cmp $(VEC_SIZE * 2), %RDX_LP`
		190885	`ja L(movsb_more_2x_vec)`
		190885	`-L(last_2x_vec):`
		190885	`- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */`
		190885	`- VMOVU (%rsi), %VEC(0)`
		190885	`- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)`
		190885	`+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.`
		190885	`+ */`
		190885	`+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1)`
		190885	`VMOVU %VEC(0), (%rdi)`
		190885	`- VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)`
		190885	`+ VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx)`
		190885	`L(return):`
		190885	`-#if VEC_SIZE > 16`
		190885	`+# if VEC_SIZE > 16`
		190885	`ZERO_UPPER_VEC_REGISTERS_RETURN`
		190885	`-#else`
		190885	`+# else`
		190885	`ret`
		190885	`+# endif`
		190885	`#endif`
		190885
		190885	`-L(movsb):`
		190885	`- cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP`
		190885	`- jae L(more_8x_vec)`
		190885	`- cmpq %rsi, %rdi`
		190885	`- jb 1f`
		190885	`- /* Source == destination is less common. */`
		190885	`- je L(nop)`
		190885	`- leaq (%rsi,%rdx), %r9`
		190885	`- cmpq %r9, %rdi`
		190885	`- /* Avoid slow backward REP MOVSB. */`
		190885	`- jb L(more_8x_vec_backward)`
		190885	`-# if AVOID_SHORT_DISTANCE_REP_MOVSB`
		190885	`- testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)`
		190885	`- jz 3f`
		190885	`- movq %rdi, %rcx`
		190885	`- subq %rsi, %rcx`
		190885	`- jmp 2f`
		190885	`-# endif`
		190885	`-1:`
		190885	`-# if AVOID_SHORT_DISTANCE_REP_MOVSB`
		190885	`- testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)`
		190885	`- jz 3f`
		190885	`- movq %rsi, %rcx`
		190885	`- subq %rdi, %rcx`
		190885	`-2:`
		190885	`-/* Avoid "rep movsb" if RCX, the distance between source and destination,`
		190885	`- is N*4GB + [1..63] with N >= 0. */`
		190885	`- cmpl $63, %ecx`
		190885	`- jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */`
		190885	`-3:`
		190885	`-# endif`
		190885	`- mov %RDX_LP, %RCX_LP`
		190885	`- rep movsb`
		190885	`-L(nop):`
		190885	`+#if LARGE_MOV_SIZE`
		190885	`+ /* If LARGE_MOV_SIZE this fits in the aligning bytes between the`
		190885	`+ ENTRY block and L(less_vec). */`
		190885	`+ .p2align 4,, 8`
		190885	`+L(between_4_7):`
		190885	`+ /* From 4 to 7. No branch when size == 4. */`
		190885	`+ movl (%rsi), %ecx`
		190885	`+ movl (%rsi, %rdx), %esi`
		190885	`+ movl %ecx, (%rdi)`
		190885	`+ movl %esi, (%rdi, %rdx)`
		190885	`ret`
		190885	`#endif`
		190885
		190885	`+ .p2align 4`
		190885	`L(less_vec):`
		190885	`/* Less than 1 VEC. */`
		190885	`#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64`
		190885	`# error Unsupported VEC_SIZE!`
		190885	`#endif`
		190885	`#if VEC_SIZE > 32`
		190885	`- cmpb $32, %dl`
		190885	`+ cmpl $32, %edx`
		190885	`jae L(between_32_63)`
		190885	`#endif`
		190885	`#if VEC_SIZE > 16`
		190885	`- cmpb $16, %dl`
		190885	`+ cmpl $16, %edx`
		190885	`jae L(between_16_31)`
		190885	`#endif`
		190885	`- cmpb $8, %dl`
		190885	`+ cmpl $8, %edx`
		190885	`jae L(between_8_15)`
		190885	`- cmpb $4, %dl`
		190885	`+#if SMALL_MOV_SIZE`
		190885	`+ cmpl $4, %edx`
		190885	`+#else`
		190885	`+ subq $4, %rdx`
		190885	`+#endif`
		190885	`jae L(between_4_7)`
		190885	`- cmpb $1, %dl`
		190885	`- ja L(between_2_3)`
		190885	`- jb 1f`
		190885	`- movzbl (%rsi), %ecx`
		190885	`+ cmpl $(1 - SMALL_SIZE_OFFSET), %edx`
		190885	`+ jl L(copy_0)`
		190885	`+ movb (%rsi), %cl`
		190885	`+ je L(copy_1)`
		190885	`+ movzwl (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi`
		190885	`+ movw %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)`
		190885	`+L(copy_1):`
		190885	`movb %cl, (%rdi)`
		190885	`-1:`
		190885	`+L(copy_0):`
		190885	`ret`
		190885	`+`
		190885	`+#if SMALL_MOV_SIZE`
		190885	`+ .p2align 4,, 8`
		190885	`+L(between_4_7):`
		190885	`+ /* From 4 to 7. No branch when size == 4. */`
		190885	`+ movl -4(%rsi, %rdx), %ecx`
		190885	`+ movl (%rsi), %esi`
		190885	`+ movl %ecx, -4(%rdi, %rdx)`
		190885	`+ movl %esi, (%rdi)`
		190885	`+ ret`
		190885	`+#endif`
		190885	`+`
		190885	`+#if VEC_SIZE > 16`
		190885	`+ /* From 16 to 31. No branch when size == 16. */`
		190885	`+ .p2align 4,, 8`
		190885	`+L(between_16_31):`
		190885	`+ vmovdqu (%rsi), %xmm0`
		190885	`+ vmovdqu -16(%rsi, %rdx), %xmm1`
		190885	`+ vmovdqu %xmm0, (%rdi)`
		190885	`+ vmovdqu %xmm1, -16(%rdi, %rdx)`
		190885	`+ /* No ymm registers have been touched. */`
		190885	`+ ret`
		190885	`+#endif`
		190885	`+`
		190885	`#if VEC_SIZE > 32`
		190885	`+ .p2align 4,, 10`
		190885	`L(between_32_63):`
		190885	`/* From 32 to 63. No branch when size == 32. */`
		190885	`VMOVU (%rsi), %YMM0`
		190885	`- VMOVU -32(%rsi,%rdx), %YMM1`
		190885	`+ VMOVU -32(%rsi, %rdx), %YMM1`
		190885	`VMOVU %YMM0, (%rdi)`
		190885	`- VMOVU %YMM1, -32(%rdi,%rdx)`
		190885	`- VZEROUPPER_RETURN`
		190885	`-#endif`
		190885	`-#if VEC_SIZE > 16`
		190885	`- /* From 16 to 31. No branch when size == 16. */`
		190885	`-L(between_16_31):`
		190885	`- VMOVU (%rsi), %XMM0`
		190885	`- VMOVU -16(%rsi,%rdx), %XMM1`
		190885	`- VMOVU %XMM0, (%rdi)`
		190885	`- VMOVU %XMM1, -16(%rdi,%rdx)`
		190885	`+ VMOVU %YMM1, -32(%rdi, %rdx)`
		190885	`VZEROUPPER_RETURN`
		190885	`#endif`
		190885	`+`
		190885	`+ .p2align 4,, 10`
		190885	`L(between_8_15):`
		190885	`/* From 8 to 15. No branch when size == 8. */`
		190885	`- movq -8(%rsi,%rdx), %rcx`
		190885	`+ movq -8(%rsi, %rdx), %rcx`
		190885	`movq (%rsi), %rsi`
		190885	`- movq %rcx, -8(%rdi,%rdx)`
		190885	`movq %rsi, (%rdi)`
		190885	`+ movq %rcx, -8(%rdi, %rdx)`
		190885	`ret`
		190885	`-L(between_4_7):`
		190885	`- /* From 4 to 7. No branch when size == 4. */`
		190885	`- movl -4(%rsi,%rdx), %ecx`
		190885	`- movl (%rsi), %esi`
		190885	`- movl %ecx, -4(%rdi,%rdx)`
		190885	`- movl %esi, (%rdi)`
		190885	`- ret`
		190885	`-L(between_2_3):`
		190885	`- /* From 2 to 3. No branch when size == 2. */`
		190885	`- movzwl -2(%rsi,%rdx), %ecx`
		190885	`- movzwl (%rsi), %esi`
		190885	`- movw %cx, -2(%rdi,%rdx)`
		190885	`- movw %si, (%rdi)`
		190885	`- ret`
		190885
		190885	`+ .p2align 4,, 10`
		190885	`+L(last_4x_vec):`
		190885	`+ /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */`
		190885	`+`
		190885	`+ /* VEC(0) and VEC(1) have already been loaded. */`
		190885	`+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(2)`
		190885	`+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)`
		190885	`+ VMOVU %VEC(0), (%rdi)`
		190885	`+ VMOVU %VEC(1), VEC_SIZE(%rdi)`
		190885	`+ VMOVU %VEC(2), -VEC_SIZE(%rdi, %rdx)`
		190885	`+ VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)`
		190885	`+ VZEROUPPER_RETURN`
		190885	`+`
		190885	`+ .p2align 4`
		190885	`#if defined USE_MULTIARCH && IS_IN (libc)`
		190885	`L(movsb_more_2x_vec):`
		190885	`cmp __x86_rep_movsb_threshold(%rip), %RDX_LP`
		190885	`ja L(movsb)`
		190885	`#endif`
		190885	`L(more_2x_vec):`
		190885	`- /* More than 2 * VEC and there may be overlap between destination`
		190885	`- and source. */`
		190885	`+ /* More than 2 * VEC and there may be overlap between`
		190885	`+ destination and source. */`
		190885	`cmpq $(VEC_SIZE * 8), %rdx`
		190885	`ja L(more_8x_vec)`
		190885	`+ /* Load VEC(1) regardless. VEC(0) has already been loaded. */`
		190885	`+ VMOVU VEC_SIZE(%rsi), %VEC(1)`
		190885	`cmpq $(VEC_SIZE * 4), %rdx`
		190885	`jbe L(last_4x_vec)`
		190885	`- /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */`
		190885	`- VMOVU (%rsi), %VEC(0)`
		190885	`- VMOVU VEC_SIZE(%rsi), %VEC(1)`
		190885	`+ /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */`
		190885	`VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)`
		190885	`VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)`
		190885	`- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)`
		190885	`- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)`
		190885	`- VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)`
		190885	`- VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)`
		190885	`+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(4)`
		190885	`+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)`
		190885	`+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)`
		190885	`+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)`
		190885	`VMOVU %VEC(0), (%rdi)`
		190885	`VMOVU %VEC(1), VEC_SIZE(%rdi)`
		190885	`VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)`
		190885	`VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)`
		190885	`- VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)`
		190885	`- VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)`
		190885	`- VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)`
		190885	`- VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)`
		190885	`- VZEROUPPER_RETURN`
		190885	`-L(last_4x_vec):`
		190885	`- /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */`
		190885	`- VMOVU (%rsi), %VEC(0)`
		190885	`- VMOVU VEC_SIZE(%rsi), %VEC(1)`
		190885	`- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)`
		190885	`- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)`
		190885	`- VMOVU %VEC(0), (%rdi)`
		190885	`- VMOVU %VEC(1), VEC_SIZE(%rdi)`
		190885	`- VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)`
		190885	`- VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)`
		190885	`+ VMOVU %VEC(4), -VEC_SIZE(%rdi, %rdx)`
		190885	`+ VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)`
		190885	`+ VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)`
		190885	`+ VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)`
		190885	`VZEROUPPER_RETURN`
		190885
		190885	`+ .p2align 4,, 4`
		190885	`L(more_8x_vec):`
		190885	`+ movq %rdi, %rcx`
		190885	`+ subq %rsi, %rcx`
		190885	`+ /* Go to backwards temporal copy if overlap no matter what as`
		190885	`+ backward REP MOVSB is slow and we don't want to use NT stores if`
		190885	`+ there is overlap. */`
		190885	`+ cmpq %rdx, %rcx`
		190885	`+ /* L(more_8x_vec_backward_check_nop) checks for src == dst. */`
		190885	`+ jb L(more_8x_vec_backward_check_nop)`
		190885	`/* Check if non-temporal move candidate. */`
		190885	`#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)`
		190885	`/* Check non-temporal store threshold. */`
		190885	`- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP`
		190885	`+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP`
		190885	`ja L(large_memcpy_2x)`
		190885	`#endif`
		190885	`- /* Entry if rdx is greater than non-temporal threshold but there`
		190885	`- is overlap. */`
		190885	`+ /* To reach this point there cannot be overlap and dst > src. So`
		190885	`+ check for overlap and src > dst in which case correctness`
		190885	`+ requires forward copy. Otherwise decide between backward/forward`
		190885	`+ copy depending on address aliasing. */`
		190885	`+`
		190885	`+ /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold`
		190885	`+ but less than __x86_shared_non_temporal_threshold. */`
		190885	`L(more_8x_vec_check):`
		190885	`- cmpq %rsi, %rdi`
		190885	`- ja L(more_8x_vec_backward)`
		190885	`- /* Source == destination is less common. */`
		190885	`- je L(nop)`
		190885	`- /* Load the first VEC and last 4 * VEC to support overlapping`
		190885	`- addresses. */`
		190885	`- VMOVU (%rsi), %VEC(4)`
		190885	`+ /* rcx contains dst - src. Add back length (rdx). */`
		190885	`+ leaq (%rcx, %rdx), %r8`
		190885	`+ /* If r8 has different sign than rcx then there is overlap so we`
		190885	`+ must do forward copy. */`
		190885	`+ xorq %rcx, %r8`
		190885	`+ /* Isolate just sign bit of r8. */`
		190885	`+ shrq $63, %r8`
		190885	`+ /* Get 4k difference dst - src. */`
		190885	`+ andl $(PAGE_SIZE - 256), %ecx`
		190885	`+ /* If r8 is non-zero must do forward for correctness. Otherwise`
		190885	`+ if ecx is non-zero there is 4k False Aliasing so do backward`
		190885	`+ copy. */`
		190885	`+ addl %r8d, %ecx`
		190885	`+ jz L(more_8x_vec_backward)`
		190885	`+`
		190885	`+ /* if rdx is greater than __x86_shared_non_temporal_threshold`
		190885	`+ but there is overlap, or from short distance movsb. */`
		190885	`+L(more_8x_vec_forward):`
		190885	`+ /* Load first and last 4 * VEC to support overlapping addresses.`
		190885	`+ */`
		190885	`+`
		190885	`+ /* First vec was already loaded into VEC(0). */`
		190885	`VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)`
		190885	`VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)`
		190885	`+ /* Save beginning of dst. */`
		190885	`+ movq %rdi, %rcx`
		190885	`+ /* Align dst to VEC_SIZE - 1. */`
		190885	`+ orq $(VEC_SIZE - 1), %rdi`
		190885	`VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)`
		190885	`VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)`
		190885	`- /* Save start and stop of the destination buffer. */`
		190885	`- movq %rdi, %r11`
		190885	`- leaq -VEC_SIZE(%rdi, %rdx), %rcx`
		190885	`- /* Align destination for aligned stores in the loop. Compute`
		190885	`- how much destination is misaligned. */`
		190885	`- movq %rdi, %r8`
		190885	`- andq $(VEC_SIZE - 1), %r8`
		190885	`- /* Get the negative of offset for alignment. */`
		190885	`- subq $VEC_SIZE, %r8`
		190885	`- /* Adjust source. */`
		190885	`- subq %r8, %rsi`
		190885	`- /* Adjust destination which should be aligned now. */`
		190885	`- subq %r8, %rdi`
		190885	`- /* Adjust length. */`
		190885	`- addq %r8, %rdx`
		190885
		190885	`- .p2align 4`
		190885	`+ /* Subtract dst from src. Add back after dst aligned. */`
		190885	`+ subq %rcx, %rsi`
		190885	`+ /* Finish aligning dst. */`
		190885	`+ incq %rdi`
		190885	`+ /* Restore src adjusted with new value for aligned dst. */`
		190885	`+ addq %rdi, %rsi`
		190885	`+ /* Store end of buffer minus tail in rdx. */`
		190885	`+ leaq (VEC_SIZE * -4)(%rcx, %rdx), %rdx`
		190885	`+`
		190885	`+ /* Don't use multi-byte nop to align. */`
		190885	`+ .p2align 4,, 11`
		190885	`L(loop_4x_vec_forward):`
		190885	`/* Copy 4 * VEC a time forward. */`
		190885	`- VMOVU (%rsi), %VEC(0)`
		190885	`- VMOVU VEC_SIZE(%rsi), %VEC(1)`
		190885	`- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)`
		190885	`- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)`
		190885	`+ VMOVU (%rsi), %VEC(1)`
		190885	`+ VMOVU VEC_SIZE(%rsi), %VEC(2)`
		190885	`+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(3)`
		190885	`+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(4)`
		190885	`subq $-(VEC_SIZE * 4), %rsi`
		190885	`- addq $-(VEC_SIZE * 4), %rdx`
		190885	`- VMOVA %VEC(0), (%rdi)`
		190885	`- VMOVA %VEC(1), VEC_SIZE(%rdi)`
		190885	`- VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)`
		190885	`- VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)`
		190885	`+ VMOVA %VEC(1), (%rdi)`
		190885	`+ VMOVA %VEC(2), VEC_SIZE(%rdi)`
		190885	`+ VMOVA %VEC(3), (VEC_SIZE * 2)(%rdi)`
		190885	`+ VMOVA %VEC(4), (VEC_SIZE * 3)(%rdi)`
		190885	`subq $-(VEC_SIZE * 4), %rdi`
		190885	`- cmpq $(VEC_SIZE * 4), %rdx`
		190885	`+ cmpq %rdi, %rdx`
		190885	`ja L(loop_4x_vec_forward)`
		190885	`/* Store the last 4 * VEC. */`
		190885	`- VMOVU %VEC(5), (%rcx)`
		190885	`- VMOVU %VEC(6), -VEC_SIZE(%rcx)`
		190885	`- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)`
		190885	`- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)`
		190885	`+ VMOVU %VEC(5), (VEC_SIZE * 3)(%rdx)`
		190885	`+ VMOVU %VEC(6), (VEC_SIZE * 2)(%rdx)`
		190885	`+ VMOVU %VEC(7), VEC_SIZE(%rdx)`
		190885	`+ VMOVU %VEC(8), (%rdx)`
		190885	`/* Store the first VEC. */`
		190885	`- VMOVU %VEC(4), (%r11)`
		190885	`+ VMOVU %VEC(0), (%rcx)`
		190885	`+ /* Keep L(nop_backward) target close to jmp for 2-byte encoding.`
		190885	`+ */`
		190885	`+L(nop_backward):`
		190885	`VZEROUPPER_RETURN`
		190885
		190885	`+ .p2align 4,, 8`
		190885	`+L(more_8x_vec_backward_check_nop):`
		190885	`+ /* rcx contains dst - src. Test for dst == src to skip all of`
		190885	`+ memmove. */`
		190885	`+ testq %rcx, %rcx`
		190885	`+ jz L(nop_backward)`
		190885	`L(more_8x_vec_backward):`
		190885	`/* Load the first 4 * VEC and last VEC to support overlapping`
		190885	`addresses. */`
		190885	`- VMOVU (%rsi), %VEC(4)`
		190885	`+`
		190885	`+ /* First vec was also loaded into VEC(0). */`
		190885	`VMOVU VEC_SIZE(%rsi), %VEC(5)`
		190885	`VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)`
		190885	`+ /* Beginning of region for 4x backward copy stored in rcx. */`
		190885	`+ leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx`
		190885	`VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)`
		190885	`- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)`
		190885	`- /* Save stop of the destination buffer. */`
		190885	`- leaq -VEC_SIZE(%rdi, %rdx), %r11`
		190885	`- /* Align destination end for aligned stores in the loop. Compute`
		190885	`- how much destination end is misaligned. */`
		190885	`- leaq -VEC_SIZE(%rsi, %rdx), %rcx`
		190885	`- movq %r11, %r9`
		190885	`- movq %r11, %r8`
		190885	`- andq $(VEC_SIZE - 1), %r8`
		190885	`- /* Adjust source. */`
		190885	`- subq %r8, %rcx`
		190885	`- /* Adjust the end of destination which should be aligned now. */`
		190885	`- subq %r8, %r9`
		190885	`- /* Adjust length. */`
		190885	`- subq %r8, %rdx`
		190885	`-`
		190885	`- .p2align 4`
		190885	`+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(8)`
		190885	`+ /* Subtract dst from src. Add back after dst aligned. */`
		190885	`+ subq %rdi, %rsi`
		190885	`+ /* Align dst. */`
		190885	`+ andq $-(VEC_SIZE), %rcx`
		190885	`+ /* Restore src. */`
		190885	`+ addq %rcx, %rsi`
		190885	`+`
		190885	`+ /* Don't use multi-byte nop to align. */`
		190885	`+ .p2align 4,, 11`
		190885	`L(loop_4x_vec_backward):`
		190885	`/* Copy 4 * VEC a time backward. */`
		190885	`- VMOVU (%rcx), %VEC(0)`
		190885	`- VMOVU -VEC_SIZE(%rcx), %VEC(1)`
		190885	`- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)`
		190885	`- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)`
		190885	`- addq $-(VEC_SIZE * 4), %rcx`
		190885	`- addq $-(VEC_SIZE * 4), %rdx`
		190885	`- VMOVA %VEC(0), (%r9)`
		190885	`- VMOVA %VEC(1), -VEC_SIZE(%r9)`
		190885	`- VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)`
		190885	`- VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)`
		190885	`- addq $-(VEC_SIZE * 4), %r9`
		190885	`- cmpq $(VEC_SIZE * 4), %rdx`
		190885	`- ja L(loop_4x_vec_backward)`
		190885	`+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)`
		190885	`+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)`
		190885	`+ VMOVU (VEC_SIZE * 1)(%rsi), %VEC(3)`
		190885	`+ VMOVU (VEC_SIZE * 0)(%rsi), %VEC(4)`
		190885	`+ addq $(VEC_SIZE * -4), %rsi`
		190885	`+ VMOVA %VEC(1), (VEC_SIZE * 3)(%rcx)`
		190885	`+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx)`
		190885	`+ VMOVA %VEC(3), (VEC_SIZE * 1)(%rcx)`
		190885	`+ VMOVA %VEC(4), (VEC_SIZE * 0)(%rcx)`
		190885	`+ addq $(VEC_SIZE * -4), %rcx`
		190885	`+ cmpq %rcx, %rdi`
		190885	`+ jb L(loop_4x_vec_backward)`
		190885	`/* Store the first 4 * VEC. */`
		190885	`- VMOVU %VEC(4), (%rdi)`
		190885	`+ VMOVU %VEC(0), (%rdi)`
		190885	`VMOVU %VEC(5), VEC_SIZE(%rdi)`
		190885	`VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)`
		190885	`VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)`
		190885	`/* Store the last VEC. */`
		190885	`- VMOVU %VEC(8), (%r11)`
		190885	`+ VMOVU %VEC(8), -VEC_SIZE(%rdx, %rdi)`
		190885	`+ VZEROUPPER_RETURN`
		190885	`+`
		190885	`+#if defined USE_MULTIARCH && IS_IN (libc)`
		190885	`+ /* L(skip_short_movsb_check) is only used with ERMS. Not for`
		190885	`+ FSRM. */`
		190885	`+ .p2align 5,, 16`
		190885	`+# if ALIGN_MOVSB`
		190885	`+L(skip_short_movsb_check):`
		190885	`+# if MOVSB_ALIGN_TO > VEC_SIZE`
		190885	`+ VMOVU VEC_SIZE(%rsi), %VEC(1)`
		190885	`+# endif`
		190885	`+# if MOVSB_ALIGN_TO > (VEC_SIZE * 2)`
		190885	`+# error Unsupported MOVSB_ALIGN_TO`
		190885	`+# endif`
		190885	`+ /* If CPU does not have FSRM two options for aligning. Align src`
		190885	`+ if dst and src 4k alias. Otherwise align dst. */`
		190885	`+ testl $(PAGE_SIZE - 512), %ecx`
		190885	`+ jnz L(movsb_align_dst)`
		190885	`+ /* Fall through. dst and src 4k alias. It's better to align src`
		190885	`+ here because the bottleneck will be loads due to the false`
		190885	`+ dependency on dst. */`
		190885	`+`
		190885	`+ /* rcx already has dst - src. */`
		190885	`+ movq %rcx, %r9`
		190885	`+ /* Add src to len. Subtract back after src aligned. -1 because`
		190885	`+ src is initially aligned to MOVSB_ALIGN_TO - 1. */`
		190885	`+ leaq -1(%rsi, %rdx), %rcx`
		190885	`+ /* Inclusively align src to MOVSB_ALIGN_TO - 1. */`
		190885	`+ orq $(MOVSB_ALIGN_TO - 1), %rsi`
		190885	`+ /* Restore dst and len adjusted with new values for aligned dst.`
		190885	`+ */`
		190885	`+ leaq 1(%rsi, %r9), %rdi`
		190885	`+ subq %rsi, %rcx`
		190885	`+ /* Finish aligning src. */`
		190885	`+ incq %rsi`
		190885	`+`
		190885	`+ rep movsb`
		190885	`+`
		190885	`+ VMOVU %VEC(0), (%r8)`
		190885	`+# if MOVSB_ALIGN_TO > VEC_SIZE`
		190885	`+ VMOVU %VEC(1), VEC_SIZE(%r8)`
		190885	`+# endif`
		190885	`VZEROUPPER_RETURN`
		190885	`+# endif`
		190885	`+`
		190885	`+ .p2align 4,, 12`
		190885	`+L(movsb):`
		190885	`+ movq %rdi, %rcx`
		190885	`+ subq %rsi, %rcx`
		190885	`+ /* Go to backwards temporal copy if overlap no matter what as`
		190885	`+ backward REP MOVSB is slow and we don't want to use NT stores if`
		190885	`+ there is overlap. */`
		190885	`+ cmpq %rdx, %rcx`
		190885	`+ /* L(more_8x_vec_backward_check_nop) checks for src == dst. */`
		190885	`+ jb L(more_8x_vec_backward_check_nop)`
		190885	`+# if ALIGN_MOVSB`
		190885	`+ /* Save dest for storing aligning VECs later. */`
		190885	`+ movq %rdi, %r8`
		190885	`+# endif`
		190885	`+ /* If above __x86_rep_movsb_stop_threshold most likely is`
		190885	`+ candidate for NT moves as well. */`
		190885	`+ cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP`
		190885	`+ jae L(large_memcpy_2x_check)`
		190885	`+# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB`
		190885	`+ /* Only avoid short movsb if CPU has FSRM. */`
		190885	`+ testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)`
		190885	`+ jz L(skip_short_movsb_check)`
		190885	`+# if AVOID_SHORT_DISTANCE_REP_MOVSB`
		190885	`+ /* Avoid "rep movsb" if RCX, the distance between source and`
		190885	`+ destination, is N*4GB + [1..63] with N >= 0. */`
		190885	`+`
		190885	`+ /* ecx contains dst - src. Early check for backward copy`
		190885	`+ conditions means only case of slow movsb with src = dst + [0,`
		190885	`+ 63] is ecx in [-63, 0]. Use unsigned comparison with -64 check`
		190885	`+ for that case. */`
		190885	`+ cmpl $-64, %ecx`
		190885	`+ ja L(more_8x_vec_forward)`
		190885	`+# endif`
		190885	`+# endif`
		190885	`+# if ALIGN_MOVSB`
		190885	`+# if MOVSB_ALIGN_TO > VEC_SIZE`
		190885	`+ VMOVU VEC_SIZE(%rsi), %VEC(1)`
		190885	`+# endif`
		190885	`+# if MOVSB_ALIGN_TO > (VEC_SIZE * 2)`
		190885	`+# error Unsupported MOVSB_ALIGN_TO`
		190885	`+# endif`
		190885	`+ /* Fall through means cpu has FSRM. In that case exclusively`
		190885	`+ align destination. */`
		190885	`+L(movsb_align_dst):`
		190885	`+ /* Subtract dst from src. Add back after dst aligned. */`
		190885	`+ subq %rdi, %rsi`
		190885	`+ /* Exclusively align dst to MOVSB_ALIGN_TO (64). */`
		190885	`+ addq $(MOVSB_ALIGN_TO - 1), %rdi`
		190885	`+ /* Add dst to len. Subtract back after dst aligned. */`
		190885	`+ leaq (%r8, %rdx), %rcx`
		190885	`+ /* Finish aligning dst. */`
		190885	`+ andq $-(MOVSB_ALIGN_TO), %rdi`
		190885	`+ /* Restore src and len adjusted with new values for aligned dst.`
		190885	`+ */`
		190885	`+ addq %rdi, %rsi`
		190885	`+ subq %rdi, %rcx`
		190885	`+`
		190885	`+ rep movsb`
		190885	`+`
		190885	`+ /* Store VECs loaded for aligning. */`
		190885	`+ VMOVU %VEC(0), (%r8)`
		190885	`+# if MOVSB_ALIGN_TO > VEC_SIZE`
		190885	`+ VMOVU %VEC(1), VEC_SIZE(%r8)`
		190885	`+# endif`
		190885	`+ VZEROUPPER_RETURN`
		190885	`+# else /* !ALIGN_MOVSB. */`
		190885	`+L(skip_short_movsb_check):`
		190885	`+ mov %RDX_LP, %RCX_LP`
		190885	`+ rep movsb`
		190885	`+ ret`
		190885	`+# endif`
		190885	`+#endif`
		190885
		190885	`+ .p2align 4,, 10`
		190885	`#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)`
		190885	`- .p2align 4`
		190885	`+L(large_memcpy_2x_check):`
		190885	`+ cmp __x86_rep_movsb_threshold(%rip), %RDX_LP`
		190885	`+ jb L(more_8x_vec_check)`
		190885	`L(large_memcpy_2x):`
		190885	`- /* Compute absolute value of difference between source and`
		190885	`- destination. */`
		190885	`- movq %rdi, %r9`
		190885	`- subq %rsi, %r9`
		190885	`- movq %r9, %r8`
		190885	`- leaq -1(%r9), %rcx`
		190885	`- sarq $63, %r8`
		190885	`- xorq %r8, %r9`
		190885	`- subq %r8, %r9`
		190885	`- /* Don't use non-temporal store if there is overlap between`
		190885	`- destination and source since destination may be in cache when`
		190885	`- source is loaded. */`
		190885	`- cmpq %r9, %rdx`
		190885	`- ja L(more_8x_vec_check)`
		190885	`+ /* To reach this point it is impossible for dst > src and`
		190885	`+ overlap. Remaining to check is src > dst and overlap. rcx`
		190885	`+ already contains dst - src. Negate rcx to get src - dst. If`
		190885	`+ length > rcx then there is overlap and forward copy is best. */`
		190885	`+ negq %rcx`
		190885	`+ cmpq %rcx, %rdx`
		190885	`+ ja L(more_8x_vec_forward)`
		190885
		190885	`/* Cache align destination. First store the first 64 bytes then`
		190885	`adjust alignments. */`
		190885	`- VMOVU (%rsi), %VEC(8)`
		190885	`-#if VEC_SIZE < 64`
		190885	`- VMOVU VEC_SIZE(%rsi), %VEC(9)`
		190885	`-#if VEC_SIZE < 32`
		190885	`- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10)`
		190885	`- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11)`
		190885	`-#endif`
		190885	`-#endif`
		190885	`- VMOVU %VEC(8), (%rdi)`
		190885	`-#if VEC_SIZE < 64`
		190885	`- VMOVU %VEC(9), VEC_SIZE(%rdi)`
		190885	`-#if VEC_SIZE < 32`
		190885	`- VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi)`
		190885	`- VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi)`
		190885	`-#endif`
		190885	`-#endif`
		190885	`+`
		190885	`+ /* First vec was also loaded into VEC(0). */`
		190885	`+# if VEC_SIZE < 64`
		190885	`+ VMOVU VEC_SIZE(%rsi), %VEC(1)`
		190885	`+# if VEC_SIZE < 32`
		190885	`+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)`
		190885	`+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)`
		190885	`+# endif`
		190885	`+# endif`
		190885	`+ VMOVU %VEC(0), (%rdi)`
		190885	`+# if VEC_SIZE < 64`
		190885	`+ VMOVU %VEC(1), VEC_SIZE(%rdi)`
		190885	`+# if VEC_SIZE < 32`
		190885	`+ VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)`
		190885	`+ VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)`
		190885	`+# endif`
		190885	`+# endif`
		190885	`+`
		190885	`/* Adjust source, destination, and size. */`
		190885	`movq %rdi, %r8`
		190885	`andq $63, %r8`
		190885	`@@ -614,9 +767,13 @@ L(large_memcpy_2x):`
		190885	`/* Adjust length. */`
		190885	`addq %r8, %rdx`
		190885
		190885	`- /* Test if source and destination addresses will alias. If they do`
		190885	`- the larger pipeline in large_memcpy_4x alleviated the`
		190885	`+ /* Test if source and destination addresses will alias. If they`
		190885	`+ do the larger pipeline in large_memcpy_4x alleviated the`
		190885	`performance drop. */`
		190885	`+`
		190885	`+ /* ecx contains -(dst - src). not ecx will return dst - src - 1`
		190885	`+ which works for testing aliasing. */`
		190885	`+ notl %ecx`
		190885	`testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx`
		190885	`jz L(large_memcpy_4x)`
		190885
		190885	`@@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):`
		190885	`/* ecx stores inner loop counter. */`
		190885	`movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx`
		190885	`L(loop_large_memcpy_4x_inner):`
		190885	`- /* Only one prefetch set per page as doing 4 pages give more time`
		190885	`- for prefetcher to keep up. */`
		190885	`+ /* Only one prefetch set per page as doing 4 pages give more`
		190885	`+ time for prefetcher to keep up. */`
		190885	`PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)`
		190885	`PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)`
		190885	`PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)`
		190885	`--`
		190885	`GitLab`
		190885

rpms / glibc

Source Code

Blame SOURCES/ia-opt-memmove-vec-unaligned-erms.patch