076f82
commit a7392db2ff2b9dd906500941ac6361dbe2211b0d
076f82
Author: Noah Goldstein <goldstein.w.n@gmail.com>
076f82
Date:   Mon Nov 1 00:49:51 2021 -0500
076f82
076f82
    x86: Optimize memmove-vec-unaligned-erms.S
076f82
    
076f82
    No bug.
076f82
    
076f82
    The optimizations are as follows:
076f82
    
076f82
    1) Always align entry to 64 bytes. This makes behavior more
076f82
       predictable and makes other frontend optimizations easier.
076f82
    
076f82
    2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
076f82
       significant benefits in the case that:
076f82
            0 < (dst - src) < [256, 512]
076f82
    
076f82
    3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
076f82
       improvement and for FSRM [-10%, 25%].
076f82
    
076f82
    In addition to these primary changes there is general cleanup
076f82
    throughout to optimize the aligning routines and control flow logic.
076f82
    
076f82
    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
076f82
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
076f82
    (cherry picked from commit a6b7502ec0c2da89a7437f43171f160d713e39c6)
076f82
076f82
diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
076f82
index db106a7a1f23f268..b2b318084823dceb 100644
076f82
--- a/sysdeps/x86_64/memmove.S
076f82
+++ b/sysdeps/x86_64/memmove.S
076f82
@@ -25,7 +25,7 @@
076f82
 /* Use movups and movaps for smaller code sizes.  */
076f82
 #define VMOVU		movups
076f82
 #define VMOVA		movaps
076f82
-
076f82
+#define MOV_SIZE	3
076f82
 #define SECTION(p)		p
076f82
 
076f82
 #ifdef USE_MULTIARCH
076f82
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
076f82
index 1ec1962e861dbf63..67a55f0c85af841c 100644
076f82
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
076f82
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
076f82
@@ -4,7 +4,7 @@
076f82
 # define VMOVNT		vmovntdq
076f82
 # define VMOVU		vmovdqu
076f82
 # define VMOVA		vmovdqa
076f82
-
076f82
+# define MOV_SIZE	4
076f82
 # define ZERO_UPPER_VEC_REGISTERS_RETURN \
076f82
   ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
076f82
 
076f82
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
076f82
index e195e93f153c9512..975ae6c0515b83cb 100644
076f82
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
076f82
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
076f82
@@ -4,7 +4,7 @@
076f82
 # define VMOVNT		vmovntdq
076f82
 # define VMOVU		vmovdqu
076f82
 # define VMOVA		vmovdqa
076f82
-
076f82
+# define MOV_SIZE	4
076f82
 # define SECTION(p)		p##.avx
076f82
 # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
076f82
 
076f82
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
076f82
index 848848ab39ff9326..0fa7126830af7acb 100644
076f82
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
076f82
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
076f82
@@ -25,7 +25,7 @@
076f82
 # define VMOVU		vmovdqu64
076f82
 # define VMOVA		vmovdqa64
076f82
 # define VZEROUPPER
076f82
-
076f82
+# define MOV_SIZE	6
076f82
 # define SECTION(p)		p##.evex512
076f82
 # define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
076f82
 
076f82
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
076f82
index 0cbce8f944da51a0..88715441feaaccf5 100644
076f82
--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
076f82
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
076f82
@@ -25,7 +25,7 @@
076f82
 # define VMOVU		vmovdqu64
076f82
 # define VMOVA		vmovdqa64
076f82
 # define VZEROUPPER
076f82
-
076f82
+# define MOV_SIZE	6
076f82
 # define SECTION(p)		p##.evex
076f82
 # define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
076f82
 
076f82
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
076f82
index abde8438d41f2320..7b27cbdda5fb99f7 100644
076f82
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
076f82
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
076f82
@@ -76,6 +76,25 @@
076f82
 # endif
076f82
 #endif
076f82
 
076f82
+/* Whether to align before movsb. Ultimately we want 64 byte
076f82
+   align and not worth it to load 4x VEC for VEC_SIZE == 16.  */
076f82
+#define ALIGN_MOVSB	(VEC_SIZE > 16)
076f82
+/* Number of bytes to align movsb to.  */
076f82
+#define MOVSB_ALIGN_TO	64
076f82
+
076f82
+#define SMALL_MOV_SIZE	(MOV_SIZE <= 4)
076f82
+#define LARGE_MOV_SIZE	(MOV_SIZE > 4)
076f82
+
076f82
+#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
076f82
+# error MOV_SIZE Unknown
076f82
+#endif
076f82
+
076f82
+#if LARGE_MOV_SIZE
076f82
+# define SMALL_SIZE_OFFSET	(4)
076f82
+#else
076f82
+# define SMALL_SIZE_OFFSET	(0)
076f82
+#endif
076f82
+
076f82
 #ifndef PAGE_SIZE
076f82
 # define PAGE_SIZE 4096
076f82
 #endif
076f82
@@ -199,25 +218,21 @@ L(start):
076f82
 # endif
076f82
 	cmp	$VEC_SIZE, %RDX_LP
076f82
 	jb	L(less_vec)
076f82
+	/* Load regardless.  */
076f82
+	VMOVU	(%rsi), %VEC(0)
076f82
 	cmp	$(VEC_SIZE * 2), %RDX_LP
076f82
 	ja	L(more_2x_vec)
076f82
-#if !defined USE_MULTIARCH || !IS_IN (libc)
076f82
-L(last_2x_vec):
076f82
-#endif
076f82
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
076f82
-	VMOVU	(%rsi), %VEC(0)
076f82
 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
076f82
 	VMOVU	%VEC(0), (%rdi)
076f82
 	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
076f82
-#if !defined USE_MULTIARCH || !IS_IN (libc)
076f82
-L(nop):
076f82
-	ret
076f82
+#if !(defined USE_MULTIARCH && IS_IN (libc))
076f82
+	ZERO_UPPER_VEC_REGISTERS_RETURN
076f82
 #else
076f82
 	VZEROUPPER_RETURN
076f82
 #endif
076f82
 #if defined USE_MULTIARCH && IS_IN (libc)
076f82
 END (MEMMOVE_SYMBOL (__memmove, unaligned))
076f82
-
076f82
 # if VEC_SIZE == 16
076f82
 ENTRY (__mempcpy_chk_erms)
076f82
 	cmp	%RDX_LP, %RCX_LP
076f82
@@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
076f82
 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
076f82
 # endif
076f82
 
076f82
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
076f82
+ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
076f82
 	movq	%rdi, %rax
076f82
 L(start_erms):
076f82
 # ifdef __ILP32__
076f82
@@ -298,310 +313,448 @@ L(start_erms):
076f82
 # endif
076f82
 	cmp	$VEC_SIZE, %RDX_LP
076f82
 	jb	L(less_vec)
076f82
+	/* Load regardless.  */
076f82
+	VMOVU	(%rsi), %VEC(0)
076f82
 	cmp	$(VEC_SIZE * 2), %RDX_LP
076f82
 	ja	L(movsb_more_2x_vec)
076f82
-L(last_2x_vec):
076f82
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
076f82
-	VMOVU	(%rsi), %VEC(0)
076f82
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
076f82
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
076f82
+	 */
076f82
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
076f82
 	VMOVU	%VEC(0), (%rdi)
076f82
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
076f82
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
076f82
 L(return):
076f82
-#if VEC_SIZE > 16
076f82
+# if VEC_SIZE > 16
076f82
 	ZERO_UPPER_VEC_REGISTERS_RETURN
076f82
-#else
076f82
+# else
076f82
 	ret
076f82
+# endif
076f82
 #endif
076f82
 
076f82
-L(movsb):
076f82
-	cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
076f82
-	jae	L(more_8x_vec)
076f82
-	cmpq	%rsi, %rdi
076f82
-	jb	1f
076f82
-	/* Source == destination is less common.  */
076f82
-	je	L(nop)
076f82
-	leaq	(%rsi,%rdx), %r9
076f82
-	cmpq	%r9, %rdi
076f82
-	/* Avoid slow backward REP MOVSB.  */
076f82
-	jb	L(more_8x_vec_backward)
076f82
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
076f82
-	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
076f82
-	jz	3f
076f82
-	movq	%rdi, %rcx
076f82
-	subq	%rsi, %rcx
076f82
-	jmp	2f
076f82
-# endif
076f82
-1:
076f82
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
076f82
-	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
076f82
-	jz	3f
076f82
-	movq	%rsi, %rcx
076f82
-	subq	%rdi, %rcx
076f82
-2:
076f82
-/* Avoid "rep movsb" if RCX, the distance between source and destination,
076f82
-   is N*4GB + [1..63] with N >= 0.  */
076f82
-	cmpl	$63, %ecx
076f82
-	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
076f82
-3:
076f82
-# endif
076f82
-	mov	%RDX_LP, %RCX_LP
076f82
-	rep movsb
076f82
-L(nop):
076f82
+#if LARGE_MOV_SIZE
076f82
+	/* If LARGE_MOV_SIZE this fits in the aligning bytes between the
076f82
+	   ENTRY block and L(less_vec).  */
076f82
+	.p2align 4,, 8
076f82
+L(between_4_7):
076f82
+	/* From 4 to 7.  No branch when size == 4.  */
076f82
+	movl	(%rsi), %ecx
076f82
+	movl	(%rsi, %rdx), %esi
076f82
+	movl	%ecx, (%rdi)
076f82
+	movl	%esi, (%rdi, %rdx)
076f82
 	ret
076f82
 #endif
076f82
 
076f82
+	.p2align 4
076f82
 L(less_vec):
076f82
 	/* Less than 1 VEC.  */
076f82
 #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
076f82
 # error Unsupported VEC_SIZE!
076f82
 #endif
076f82
 #if VEC_SIZE > 32
076f82
-	cmpb	$32, %dl
076f82
+	cmpl	$32, %edx
076f82
 	jae	L(between_32_63)
076f82
 #endif
076f82
 #if VEC_SIZE > 16
076f82
-	cmpb	$16, %dl
076f82
+	cmpl	$16, %edx
076f82
 	jae	L(between_16_31)
076f82
 #endif
076f82
-	cmpb	$8, %dl
076f82
+	cmpl	$8, %edx
076f82
 	jae	L(between_8_15)
076f82
-	cmpb	$4, %dl
076f82
+#if SMALL_MOV_SIZE
076f82
+	cmpl	$4, %edx
076f82
+#else
076f82
+	subq	$4, %rdx
076f82
+#endif
076f82
 	jae	L(between_4_7)
076f82
-	cmpb	$1, %dl
076f82
-	ja	L(between_2_3)
076f82
-	jb	1f
076f82
-	movzbl	(%rsi), %ecx
076f82
+	cmpl	$(1 - SMALL_SIZE_OFFSET), %edx
076f82
+	jl	L(copy_0)
076f82
+	movb	(%rsi), %cl
076f82
+	je	L(copy_1)
076f82
+	movzwl	(-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
076f82
+	movw	%si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
076f82
+L(copy_1):
076f82
 	movb	%cl, (%rdi)
076f82
-1:
076f82
+L(copy_0):
076f82
 	ret
076f82
+
076f82
+#if SMALL_MOV_SIZE
076f82
+	.p2align 4,, 8
076f82
+L(between_4_7):
076f82
+	/* From 4 to 7.  No branch when size == 4.  */
076f82
+	movl	-4(%rsi, %rdx), %ecx
076f82
+	movl	(%rsi), %esi
076f82
+	movl	%ecx, -4(%rdi, %rdx)
076f82
+	movl	%esi, (%rdi)
076f82
+	ret
076f82
+#endif
076f82
+
076f82
+#if VEC_SIZE > 16
076f82
+	/* From 16 to 31.  No branch when size == 16.  */
076f82
+	.p2align 4,, 8
076f82
+L(between_16_31):
076f82
+	vmovdqu	(%rsi), %xmm0
076f82
+	vmovdqu	-16(%rsi, %rdx), %xmm1
076f82
+	vmovdqu	%xmm0, (%rdi)
076f82
+	vmovdqu	%xmm1, -16(%rdi, %rdx)
076f82
+	/* No ymm registers have been touched.  */
076f82
+	ret
076f82
+#endif
076f82
+
076f82
 #if VEC_SIZE > 32
076f82
+	.p2align 4,, 10
076f82
 L(between_32_63):
076f82
 	/* From 32 to 63.  No branch when size == 32.  */
076f82
 	VMOVU	(%rsi), %YMM0
076f82
-	VMOVU	-32(%rsi,%rdx), %YMM1
076f82
+	VMOVU	-32(%rsi, %rdx), %YMM1
076f82
 	VMOVU	%YMM0, (%rdi)
076f82
-	VMOVU	%YMM1, -32(%rdi,%rdx)
076f82
-	VZEROUPPER_RETURN
076f82
-#endif
076f82
-#if VEC_SIZE > 16
076f82
-	/* From 16 to 31.  No branch when size == 16.  */
076f82
-L(between_16_31):
076f82
-	VMOVU	(%rsi), %XMM0
076f82
-	VMOVU	-16(%rsi,%rdx), %XMM1
076f82
-	VMOVU	%XMM0, (%rdi)
076f82
-	VMOVU	%XMM1, -16(%rdi,%rdx)
076f82
+	VMOVU	%YMM1, -32(%rdi, %rdx)
076f82
 	VZEROUPPER_RETURN
076f82
 #endif
076f82
+
076f82
+	.p2align 4,, 10
076f82
 L(between_8_15):
076f82
 	/* From 8 to 15.  No branch when size == 8.  */
076f82
-	movq	-8(%rsi,%rdx), %rcx
076f82
+	movq	-8(%rsi, %rdx), %rcx
076f82
 	movq	(%rsi), %rsi
076f82
-	movq	%rcx, -8(%rdi,%rdx)
076f82
 	movq	%rsi, (%rdi)
076f82
+	movq	%rcx, -8(%rdi, %rdx)
076f82
 	ret
076f82
-L(between_4_7):
076f82
-	/* From 4 to 7.  No branch when size == 4.  */
076f82
-	movl	-4(%rsi,%rdx), %ecx
076f82
-	movl	(%rsi), %esi
076f82
-	movl	%ecx, -4(%rdi,%rdx)
076f82
-	movl	%esi, (%rdi)
076f82
-	ret
076f82
-L(between_2_3):
076f82
-	/* From 2 to 3.  No branch when size == 2.  */
076f82
-	movzwl	-2(%rsi,%rdx), %ecx
076f82
-	movzwl	(%rsi), %esi
076f82
-	movw	%cx, -2(%rdi,%rdx)
076f82
-	movw	%si, (%rdi)
076f82
-	ret
076f82
 
076f82
+	.p2align 4,, 10
076f82
+L(last_4x_vec):
076f82
+	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
076f82
+
076f82
+	/* VEC(0) and VEC(1) have already been loaded.  */
076f82
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
076f82
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
076f82
+	VMOVU	%VEC(0), (%rdi)
076f82
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
076f82
+	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
076f82
+	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
076f82
+	VZEROUPPER_RETURN
076f82
+
076f82
+	.p2align 4
076f82
 #if defined USE_MULTIARCH && IS_IN (libc)
076f82
 L(movsb_more_2x_vec):
076f82
 	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
076f82
 	ja	L(movsb)
076f82
 #endif
076f82
 L(more_2x_vec):
076f82
-	/* More than 2 * VEC and there may be overlap between destination
076f82
-	   and source.  */
076f82
+	/* More than 2 * VEC and there may be overlap between
076f82
+	   destination and source.  */
076f82
 	cmpq	$(VEC_SIZE * 8), %rdx
076f82
 	ja	L(more_8x_vec)
076f82
+	/* Load VEC(1) regardless. VEC(0) has already been loaded.  */
076f82
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
076f82
 	cmpq	$(VEC_SIZE * 4), %rdx
076f82
 	jbe	L(last_4x_vec)
076f82
-	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
076f82
-	VMOVU	(%rsi), %VEC(0)
076f82
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
076f82
+	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
076f82
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
076f82
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
076f82
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
076f82
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
076f82
-	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
076f82
-	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
076f82
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
076f82
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
076f82
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
076f82
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
076f82
 	VMOVU	%VEC(0), (%rdi)
076f82
 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
076f82
 	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
076f82
 	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
076f82
-	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
076f82
-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
076f82
-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
076f82
-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
076f82
-	VZEROUPPER_RETURN
076f82
-L(last_4x_vec):
076f82
-	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
076f82
-	VMOVU	(%rsi), %VEC(0)
076f82
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
076f82
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
076f82
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
076f82
-	VMOVU	%VEC(0), (%rdi)
076f82
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
076f82
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
076f82
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
076f82
+	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
076f82
+	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
076f82
+	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
076f82
+	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
076f82
 	VZEROUPPER_RETURN
076f82
 
076f82
+	.p2align 4,, 4
076f82
 L(more_8x_vec):
076f82
+	movq	%rdi, %rcx
076f82
+	subq	%rsi, %rcx
076f82
+	/* Go to backwards temporal copy if overlap no matter what as
076f82
+	   backward REP MOVSB is slow and we don't want to use NT stores if
076f82
+	   there is overlap.  */
076f82
+	cmpq	%rdx, %rcx
076f82
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
076f82
+	jb	L(more_8x_vec_backward_check_nop)
076f82
 	/* Check if non-temporal move candidate.  */
076f82
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
076f82
 	/* Check non-temporal store threshold.  */
076f82
-	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
076f82
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
076f82
 	ja	L(large_memcpy_2x)
076f82
 #endif
076f82
-	/* Entry if rdx is greater than non-temporal threshold but there
076f82
-       is overlap.  */
076f82
+	/* To reach this point there cannot be overlap and dst > src. So
076f82
+	   check for overlap and src > dst in which case correctness
076f82
+	   requires forward copy. Otherwise decide between backward/forward
076f82
+	   copy depending on address aliasing.  */
076f82
+
076f82
+	/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
076f82
+	   but less than __x86_shared_non_temporal_threshold.  */
076f82
 L(more_8x_vec_check):
076f82
-	cmpq	%rsi, %rdi
076f82
-	ja	L(more_8x_vec_backward)
076f82
-	/* Source == destination is less common.  */
076f82
-	je	L(nop)
076f82
-	/* Load the first VEC and last 4 * VEC to support overlapping
076f82
-	   addresses.  */
076f82
-	VMOVU	(%rsi), %VEC(4)
076f82
+	/* rcx contains dst - src. Add back length (rdx).  */
076f82
+	leaq	(%rcx, %rdx), %r8
076f82
+	/* If r8 has different sign than rcx then there is overlap so we
076f82
+	   must do forward copy.  */
076f82
+	xorq	%rcx, %r8
076f82
+	/* Isolate just sign bit of r8.  */
076f82
+	shrq	$63, %r8
076f82
+	/* Get 4k difference dst - src.  */
076f82
+	andl	$(PAGE_SIZE - 256), %ecx
076f82
+	/* If r8 is non-zero must do forward for correctness. Otherwise
076f82
+	   if ecx is zero there is 4k False Aliasing so do backward
076f82
+	   copy.  */
076f82
+	addl	%r8d, %ecx
076f82
+	jz	L(more_8x_vec_backward)
076f82
+
076f82
+	/* if rdx is greater than __x86_shared_non_temporal_threshold
076f82
+	   but there is overlap, or from short distance movsb.  */
076f82
+L(more_8x_vec_forward):
076f82
+	/* Load first and last 4 * VEC to support overlapping addresses.
076f82
+	 */
076f82
+
076f82
+	/* First vec was already loaded into VEC(0).  */
076f82
 	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
076f82
 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
076f82
+	/* Save beginning of dst.  */
076f82
+	movq	%rdi, %rcx
076f82
+	/* Align dst to VEC_SIZE - 1.  */
076f82
+	orq	$(VEC_SIZE - 1), %rdi
076f82
 	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
076f82
 	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
076f82
-	/* Save start and stop of the destination buffer.  */
076f82
-	movq	%rdi, %r11
076f82
-	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
076f82
-	/* Align destination for aligned stores in the loop.  Compute
076f82
-	   how much destination is misaligned.  */
076f82
-	movq	%rdi, %r8
076f82
-	andq	$(VEC_SIZE - 1), %r8
076f82
-	/* Get the negative of offset for alignment.  */
076f82
-	subq	$VEC_SIZE, %r8
076f82
-	/* Adjust source.  */
076f82
-	subq	%r8, %rsi
076f82
-	/* Adjust destination which should be aligned now.  */
076f82
-	subq	%r8, %rdi
076f82
-	/* Adjust length.  */
076f82
-	addq	%r8, %rdx
076f82
 
076f82
-	.p2align 4
076f82
+	/* Subtract dst from src. Add back after dst aligned.  */
076f82
+	subq	%rcx, %rsi
076f82
+	/* Finish aligning dst.  */
076f82
+	incq	%rdi
076f82
+	/* Restore src adjusted with new value for aligned dst.  */
076f82
+	addq	%rdi, %rsi
076f82
+	/* Store end of buffer minus tail in rdx.  */
076f82
+	leaq	(VEC_SIZE * -4)(%rcx, %rdx), %rdx
076f82
+
076f82
+	/* Don't use multi-byte nop to align.  */
076f82
+	.p2align 4,, 11
076f82
 L(loop_4x_vec_forward):
076f82
 	/* Copy 4 * VEC a time forward.  */
076f82
-	VMOVU	(%rsi), %VEC(0)
076f82
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
076f82
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
076f82
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
076f82
+	VMOVU	(%rsi), %VEC(1)
076f82
+	VMOVU	VEC_SIZE(%rsi), %VEC(2)
076f82
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
076f82
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
076f82
 	subq	$-(VEC_SIZE * 4), %rsi
076f82
-	addq	$-(VEC_SIZE * 4), %rdx
076f82
-	VMOVA	%VEC(0), (%rdi)
076f82
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
076f82
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
076f82
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
076f82
+	VMOVA	%VEC(1), (%rdi)
076f82
+	VMOVA	%VEC(2), VEC_SIZE(%rdi)
076f82
+	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
076f82
+	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
076f82
 	subq	$-(VEC_SIZE * 4), %rdi
076f82
-	cmpq	$(VEC_SIZE * 4), %rdx
076f82
+	cmpq	%rdi, %rdx
076f82
 	ja	L(loop_4x_vec_forward)
076f82
 	/* Store the last 4 * VEC.  */
076f82
-	VMOVU	%VEC(5), (%rcx)
076f82
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
076f82
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
076f82
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
076f82
+	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
076f82
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
076f82
+	VMOVU	%VEC(7), VEC_SIZE(%rdx)
076f82
+	VMOVU	%VEC(8), (%rdx)
076f82
 	/* Store the first VEC.  */
076f82
-	VMOVU	%VEC(4), (%r11)
076f82
+	VMOVU	%VEC(0), (%rcx)
076f82
+	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
076f82
+	 */
076f82
+L(nop_backward):
076f82
 	VZEROUPPER_RETURN
076f82
 
076f82
+	.p2align 4,, 8
076f82
+L(more_8x_vec_backward_check_nop):
076f82
+	/* rcx contains dst - src. Test for dst == src to skip all of
076f82
+	   memmove.  */
076f82
+	testq	%rcx, %rcx
076f82
+	jz	L(nop_backward)
076f82
 L(more_8x_vec_backward):
076f82
 	/* Load the first 4 * VEC and last VEC to support overlapping
076f82
 	   addresses.  */
076f82
-	VMOVU	(%rsi), %VEC(4)
076f82
+
076f82
+	/* First vec was also loaded into VEC(0).  */
076f82
 	VMOVU	VEC_SIZE(%rsi), %VEC(5)
076f82
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
076f82
+	/* Beginning of region for 4x backward copy stored in rcx.  */
076f82
+	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
076f82
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
076f82
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
076f82
-	/* Save stop of the destination buffer.  */
076f82
-	leaq	-VEC_SIZE(%rdi, %rdx), %r11
076f82
-	/* Align destination end for aligned stores in the loop.  Compute
076f82
-	   how much destination end is misaligned.  */
076f82
-	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
076f82
-	movq	%r11, %r9
076f82
-	movq	%r11, %r8
076f82
-	andq	$(VEC_SIZE - 1), %r8
076f82
-	/* Adjust source.  */
076f82
-	subq	%r8, %rcx
076f82
-	/* Adjust the end of destination which should be aligned now.  */
076f82
-	subq	%r8, %r9
076f82
-	/* Adjust length.  */
076f82
-	subq	%r8, %rdx
076f82
-
076f82
-	.p2align 4
076f82
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
076f82
+	/* Subtract dst from src. Add back after dst aligned.  */
076f82
+	subq	%rdi, %rsi
076f82
+	/* Align dst.  */
076f82
+	andq	$-(VEC_SIZE), %rcx
076f82
+	/* Restore src.  */
076f82
+	addq	%rcx, %rsi
076f82
+
076f82
+	/* Don't use multi-byte nop to align.  */
076f82
+	.p2align 4,, 11
076f82
 L(loop_4x_vec_backward):
076f82
 	/* Copy 4 * VEC a time backward.  */
076f82
-	VMOVU	(%rcx), %VEC(0)
076f82
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
076f82
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
076f82
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
076f82
-	addq	$-(VEC_SIZE * 4), %rcx
076f82
-	addq	$-(VEC_SIZE * 4), %rdx
076f82
-	VMOVA	%VEC(0), (%r9)
076f82
-	VMOVA	%VEC(1), -VEC_SIZE(%r9)
076f82
-	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
076f82
-	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
076f82
-	addq	$-(VEC_SIZE * 4), %r9
076f82
-	cmpq	$(VEC_SIZE * 4), %rdx
076f82
-	ja	L(loop_4x_vec_backward)
076f82
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
076f82
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
076f82
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
076f82
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
076f82
+	addq	$(VEC_SIZE * -4), %rsi
076f82
+	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
076f82
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
076f82
+	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
076f82
+	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
076f82
+	addq	$(VEC_SIZE * -4), %rcx
076f82
+	cmpq	%rcx, %rdi
076f82
+	jb	L(loop_4x_vec_backward)
076f82
 	/* Store the first 4 * VEC.  */
076f82
-	VMOVU	%VEC(4), (%rdi)
076f82
+	VMOVU	%VEC(0), (%rdi)
076f82
 	VMOVU	%VEC(5), VEC_SIZE(%rdi)
076f82
 	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
076f82
 	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
076f82
 	/* Store the last VEC.  */
076f82
-	VMOVU	%VEC(8), (%r11)
076f82
+	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
076f82
+	VZEROUPPER_RETURN
076f82
+
076f82
+#if defined USE_MULTIARCH && IS_IN (libc)
076f82
+	/* L(skip_short_movsb_check) is only used with ERMS. Not for
076f82
+	   FSRM.  */
076f82
+	.p2align 5,, 16
076f82
+# if ALIGN_MOVSB
076f82
+L(skip_short_movsb_check):
076f82
+#  if MOVSB_ALIGN_TO > VEC_SIZE
076f82
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
076f82
+#  endif
076f82
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
076f82
+#   error Unsupported MOVSB_ALIGN_TO
076f82
+#  endif
076f82
+	/* If CPU does not have FSRM two options for aligning. Align src
076f82
+	   if dst and src 4k alias. Otherwise align dst.  */
076f82
+	testl	$(PAGE_SIZE - 512), %ecx
076f82
+	jnz	L(movsb_align_dst)
076f82
+	/* Fall through. dst and src 4k alias. It's better to align src
076f82
+	   here because the bottleneck will be loads due to the false
076f82
+	   dependency on dst.  */
076f82
+
076f82
+	/* rcx already has dst - src.  */
076f82
+	movq	%rcx, %r9
076f82
+	/* Add src to len. Subtract back after src aligned. -1 because
076f82
+	   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
076f82
+	leaq	-1(%rsi, %rdx), %rcx
076f82
+	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
076f82
+	orq	$(MOVSB_ALIGN_TO - 1), %rsi
076f82
+	/* Restore dst and len adjusted with new values for aligned dst.
076f82
+	 */
076f82
+	leaq	1(%rsi, %r9), %rdi
076f82
+	subq	%rsi, %rcx
076f82
+	/* Finish aligning src.  */
076f82
+	incq	%rsi
076f82
+
076f82
+	rep	movsb
076f82
+
076f82
+	VMOVU	%VEC(0), (%r8)
076f82
+#  if MOVSB_ALIGN_TO > VEC_SIZE
076f82
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
076f82
+#  endif
076f82
 	VZEROUPPER_RETURN
076f82
+# endif
076f82
+
076f82
+	.p2align 4,, 12
076f82
+L(movsb):
076f82
+	movq	%rdi, %rcx
076f82
+	subq	%rsi, %rcx
076f82
+	/* Go to backwards temporal copy if overlap no matter what as
076f82
+	   backward REP MOVSB is slow and we don't want to use NT stores if
076f82
+	   there is overlap.  */
076f82
+	cmpq	%rdx, %rcx
076f82
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
076f82
+	jb	L(more_8x_vec_backward_check_nop)
076f82
+# if ALIGN_MOVSB
076f82
+	/* Save dest for storing aligning VECs later.  */
076f82
+	movq	%rdi, %r8
076f82
+# endif
076f82
+	/* If above __x86_rep_movsb_stop_threshold most likely is
076f82
+	   candidate for NT moves as well.  */
076f82
+	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
076f82
+	jae	L(large_memcpy_2x_check)
076f82
+# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
076f82
+	/* Only avoid short movsb if CPU has FSRM.  */
076f82
+	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
076f82
+	jz	L(skip_short_movsb_check)
076f82
+#  if AVOID_SHORT_DISTANCE_REP_MOVSB
076f82
+	/* Avoid "rep movsb" if RCX, the distance between source and
076f82
+	   destination, is N*4GB + [1..63] with N >= 0.  */
076f82
+
076f82
+	/* ecx contains dst - src. Early check for backward copy
076f82
+	   conditions means only case of slow movsb with src = dst + [0,
076f82
+	   63] is ecx in [-63, 0]. Use unsigned comparison with -64 check
076f82
+	   for that case.  */
076f82
+	cmpl	$-64, %ecx
076f82
+	ja	L(more_8x_vec_forward)
076f82
+#  endif
076f82
+# endif
076f82
+# if ALIGN_MOVSB
076f82
+#  if MOVSB_ALIGN_TO > VEC_SIZE
076f82
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
076f82
+#  endif
076f82
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
076f82
+#   error Unsupported MOVSB_ALIGN_TO
076f82
+#  endif
076f82
+	/* Fall through means cpu has FSRM. In that case exclusively
076f82
+	   align destination.  */
076f82
+L(movsb_align_dst):
076f82
+	/* Subtract dst from src. Add back after dst aligned.  */
076f82
+	subq	%rdi, %rsi
076f82
+	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
076f82
+	addq	$(MOVSB_ALIGN_TO - 1), %rdi
076f82
+	/* Add dst to len. Subtract back after dst aligned.  */
076f82
+	leaq	(%r8, %rdx), %rcx
076f82
+	/* Finish aligning dst.  */
076f82
+	andq	$-(MOVSB_ALIGN_TO), %rdi
076f82
+	/* Restore src and len adjusted with new values for aligned dst.
076f82
+	 */
076f82
+	addq	%rdi, %rsi
076f82
+	subq	%rdi, %rcx
076f82
+
076f82
+	rep	movsb
076f82
+
076f82
+	/* Store VECs loaded for aligning.  */
076f82
+	VMOVU	%VEC(0), (%r8)
076f82
+#  if MOVSB_ALIGN_TO > VEC_SIZE
076f82
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
076f82
+#  endif
076f82
+	VZEROUPPER_RETURN
076f82
+# else	/* !ALIGN_MOVSB.  */
076f82
+L(skip_short_movsb_check):
076f82
+	mov	%RDX_LP, %RCX_LP
076f82
+	rep	movsb
076f82
+	ret
076f82
+# endif
076f82
+#endif
076f82
 
076f82
+	.p2align 4,, 10
076f82
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
076f82
-	.p2align 4
076f82
+L(large_memcpy_2x_check):
076f82
+	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
076f82
+	jb	L(more_8x_vec_check)
076f82
 L(large_memcpy_2x):
076f82
-	/* Compute absolute value of difference between source and
076f82
-	   destination.  */
076f82
-	movq	%rdi, %r9
076f82
-	subq	%rsi, %r9
076f82
-	movq	%r9, %r8
076f82
-	leaq	-1(%r9), %rcx
076f82
-	sarq	$63, %r8
076f82
-	xorq	%r8, %r9
076f82
-	subq	%r8, %r9
076f82
-	/* Don't use non-temporal store if there is overlap between
076f82
-	   destination and source since destination may be in cache when
076f82
-	   source is loaded.  */
076f82
-	cmpq	%r9, %rdx
076f82
-	ja	L(more_8x_vec_check)
076f82
+	/* To reach this point it is impossible for dst > src and
076f82
+	   overlap. Remaining to check is src > dst and overlap. rcx
076f82
+	   already contains dst - src. Negate rcx to get src - dst. If
076f82
+	   length > rcx then there is overlap and forward copy is best.  */
076f82
+	negq	%rcx
076f82
+	cmpq	%rcx, %rdx
076f82
+	ja	L(more_8x_vec_forward)
076f82
 
076f82
 	/* Cache align destination. First store the first 64 bytes then
076f82
 	   adjust alignments.  */
076f82
-	VMOVU	(%rsi), %VEC(8)
076f82
-#if VEC_SIZE < 64
076f82
-	VMOVU	VEC_SIZE(%rsi), %VEC(9)
076f82
-#if VEC_SIZE < 32
076f82
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
076f82
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
076f82
-#endif
076f82
-#endif
076f82
-	VMOVU	%VEC(8), (%rdi)
076f82
-#if VEC_SIZE < 64
076f82
-	VMOVU	%VEC(9), VEC_SIZE(%rdi)
076f82
-#if VEC_SIZE < 32
076f82
-	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
076f82
-	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
076f82
-#endif
076f82
-#endif
076f82
+
076f82
+	/* First vec was also loaded into VEC(0).  */
076f82
+# if VEC_SIZE < 64
076f82
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
076f82
+#  if VEC_SIZE < 32
076f82
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
076f82
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
076f82
+#  endif
076f82
+# endif
076f82
+	VMOVU	%VEC(0), (%rdi)
076f82
+# if VEC_SIZE < 64
076f82
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
076f82
+#  if VEC_SIZE < 32
076f82
+	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
076f82
+	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
076f82
+#  endif
076f82
+# endif
076f82
+
076f82
 	/* Adjust source, destination, and size.  */
076f82
 	movq	%rdi, %r8
076f82
 	andq	$63, %r8
076f82
@@ -614,9 +767,13 @@ L(large_memcpy_2x):
076f82
 	/* Adjust length.  */
076f82
 	addq	%r8, %rdx
076f82
 
076f82
-	/* Test if source and destination addresses will alias. If they do
076f82
-	   the larger pipeline in large_memcpy_4x alleviated the
076f82
+	/* Test if source and destination addresses will alias. If they
076f82
+	   do the larger pipeline in large_memcpy_4x alleviates the
076f82
 	   performance drop.  */
076f82
+
076f82
+	/* ecx contains -(dst - src). not ecx will return dst - src - 1
076f82
+	   which works for testing aliasing.  */
076f82
+	notl	%ecx
076f82
 	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
076f82
 	jz	L(large_memcpy_4x)
076f82
 
076f82
@@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
076f82
 	/* ecx stores inner loop counter.  */
076f82
 	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
076f82
 L(loop_large_memcpy_4x_inner):
076f82
-	/* Only one prefetch set per page as doing 4 pages give more time
076f82
-	   for prefetcher to keep up.  */
076f82
+	/* Only one prefetch set per page as doing 4 pages gives more
076f82
+	   time for prefetcher to keep up.  */
076f82
 	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
076f82
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
076f82
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)