bca718
From 143ce75a4203a78d79549b00e570a5bb429c44cf Mon Sep 17 00:00:00 2001
bca718
From: Ondrej Bilka <neleai@seznam.cz>
bca718
Date: Mon, 20 May 2013 08:26:00 +0200
bca718
Subject: [PATCH] Faster memset on x64
bca718
bca718
This implementation speeds up memset in several ways. The first is
bca718
avoiding an expensive computed jump. The second is using the fact that arguments
bca718
of memset are most of the time aligned to 8 bytes.
bca718
bca718
Benchmark results on:
bca718
bca718
kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_result27_04_13.tar.bz2
bca718
bca718
(cherry picked from commit b2b671b677d92429a3d41bf451668f476aa267ed)
bca718
---
bca718
 sysdeps/x86_64/memset.S | 1406 +++--------------------------------------------
bca718
 1 file changed, 91 insertions(+), 1315 deletions(-)
bca718
bca718
Index: glibc-2.17-c758a686/sysdeps/x86_64/memset.S
bca718
===================================================================
bca718
--- glibc-2.17-c758a686.orig/sysdeps/x86_64/memset.S
bca718
+++ glibc-2.17-c758a686/sysdeps/x86_64/memset.S
bca718
@@ -19,17 +19,31 @@
bca718
 
bca718
 #include <sysdep.h>
bca718
 
bca718
-#define __STOS_LOWER_BOUNDARY	$8192
bca718
-#define __STOS_UPPER_BOUNDARY	$65536
bca718
+#ifndef ALIGN
bca718
+# define ALIGN(n) .p2align n
bca718
+#endif
bca718
 
bca718
 	.text
bca718
 #if IS_IN (libc) && !defined USE_MULTIARCH
bca718
 ENTRY(__bzero)
bca718
-	mov	%rsi,%rdx	/* Adjust parameter.  */
bca718
-	xorl	%esi,%esi	/* Fill with 0s.  */
bca718
-	jmp	L(memset_entry)
bca718
+	movq	%rdi, %rax /* Set return value.  */
bca718
+	movq	%rsi, %rdx /* Set n.  */
bca718
+	pxor	%xmm8, %xmm8
bca718
+	jmp	L(entry_from_bzero)
bca718
 END(__bzero)
bca718
 weak_alias (__bzero, bzero)
bca718
+
bca718
+/* Like memset but takes additional parameter with return value.  */
bca718
+ENTRY(__memset_tail)
bca718
+	movq	%rcx, %rax /* Set return value.  */
bca718
+
bca718
+	movd	%esi, %xmm8
bca718
+	punpcklbw	%xmm8, %xmm8
bca718
+	punpcklwd	%xmm8, %xmm8
bca718
+	pshufd	$0, %xmm8, %xmm8
bca718
+
bca718
+	jmp	L(entry_from_bzero)
bca718
+END(__memset_tail)
bca718
 #endif
bca718
 
bca718
 #if defined PIC && IS_IN (libc)
bca718
@@ -38,1318 +52,80 @@ ENTRY_CHK (__memset_chk)
bca718
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
bca718
 END_CHK (__memset_chk)
bca718
 #endif
bca718
-ENTRY (memset)
bca718
-L(memset_entry):
bca718
-	cmp    $0x1,%rdx
bca718
-	mov    %rdi,%rax	/* memset returns the dest address.  */
bca718
-	jne    L(ck2)
bca718
-	mov    %sil,(%rdi)
bca718
-	retq
bca718
-L(ck2):
bca718
-	mov    $0x101010101010101,%r9
bca718
-	mov    %rdx,%r8
bca718
-	movzbq %sil,%rdx
bca718
-	imul   %r9,%rdx
bca718
-L(now_dw_aligned):
bca718
-	cmp    $0x90,%r8
bca718
-	ja     L(ck_mem_ops_method)
bca718
-L(now_dw_aligned_small):
bca718
-	add    %r8,%rdi
bca718
-#ifndef PIC
bca718
-	lea    L(setPxQx)(%rip),%r11
bca718
-	jmpq   *(%r11,%r8,8)
bca718
-#else
bca718
-	lea    L(Got0)(%rip),%r11
bca718
-	lea    L(setPxQx)(%rip),%rcx
bca718
-	movswq (%rcx,%r8,2),%rcx
bca718
-	lea    (%rcx,%r11,1),%r11
bca718
-	jmpq   *%r11
bca718
-#endif
bca718
-
bca718
-L(Got0):
bca718
-	retq
bca718
-
bca718
-	.pushsection .rodata
bca718
-	.balign     16
bca718
-#ifndef PIC
bca718
-L(setPxQx):
bca718
-	.quad       L(Got0), L(P1Q0), L(P2Q0), L(P3Q0)
bca718
-	.quad       L(P4Q0), L(P5Q0), L(P6Q0), L(P7Q0)
bca718
-	.quad       L(P0Q1), L(P1Q1), L(P2Q1), L(P3Q1)
bca718
-	.quad       L(P4Q1), L(P5Q1), L(P6Q1), L(P7Q1)
bca718
-	.quad       L(P0Q2), L(P1Q2), L(P2Q2), L(P3Q2)
bca718
-	.quad       L(P4Q2), L(P5Q2), L(P6Q2), L(P7Q2)
bca718
-	.quad       L(P0Q3), L(P1Q3), L(P2Q3), L(P3Q3)
bca718
-	.quad       L(P4Q3), L(P5Q3), L(P6Q3), L(P7Q3)
bca718
-	.quad       L(P0Q4), L(P1Q4), L(P2Q4), L(P3Q4)
bca718
-	.quad       L(P4Q4), L(P5Q4), L(P6Q4), L(P7Q4)
bca718
-	.quad       L(P0Q5), L(P1Q5), L(P2Q5), L(P3Q5)
bca718
-	.quad       L(P4Q5), L(P5Q5), L(P6Q5), L(P7Q5)
bca718
-	.quad       L(P0Q6), L(P1Q6), L(P2Q6), L(P3Q6)
bca718
-	.quad       L(P4Q6), L(P5Q6), L(P6Q6), L(P7Q6)
bca718
-	.quad       L(P0Q7), L(P1Q7), L(P2Q7), L(P3Q7)
bca718
-	.quad       L(P4Q7), L(P5Q7), L(P6Q7), L(P7Q7)
bca718
-	.quad       L(P0Q8), L(P1Q8), L(P2Q8), L(P3Q8)
bca718
-	.quad       L(P4Q8), L(P5Q8), L(P6Q8), L(P7Q8)
bca718
-	.quad       L(P0Q9), L(P1Q9), L(P2Q9), L(P3Q9)
bca718
-	.quad       L(P4Q9), L(P5Q9), L(P6Q9), L(P7Q9)
bca718
-	.quad       L(P0QA), L(P1QA), L(P2QA), L(P3QA)
bca718
-	.quad       L(P4QA), L(P5QA), L(P6QA), L(P7QA)
bca718
-	.quad       L(P0QB), L(P1QB), L(P2QB), L(P3QB)
bca718
-	.quad       L(P4QB), L(P5QB), L(P6QB), L(P7QB)
bca718
-	.quad       L(P0QC), L(P1QC), L(P2QC), L(P3QC)
bca718
-	.quad       L(P4QC), L(P5QC), L(P6QC), L(P7QC)
bca718
-	.quad       L(P0QD), L(P1QD), L(P2QD), L(P3QD)
bca718
-	.quad       L(P4QD), L(P5QD), L(P6QD), L(P7QD)
bca718
-	.quad       L(P0QE), L(P1QE), L(P2QE), L(P3QE)
bca718
-	.quad       L(P4QE), L(P5QE), L(P6QE), L(P7QE)
bca718
-	.quad       L(P0QF), L(P1QF), L(P2QF), L(P3QF)
bca718
-	.quad       L(P4QF), L(P5QF), L(P6QF), L(P7QF)
bca718
-	.quad       L(P0QG), L(P1QG), L(P2QG), L(P3QG)
bca718
-	.quad       L(P4QG), L(P5QG), L(P6QG), L(P7QG)
bca718
-	.quad       L(P0QH), L(P1QH), L(P2QH), L(P3QH)
bca718
-	.quad       L(P4QH), L(P5QH), L(P6QH), L(P7QH)
bca718
-	.quad       L(P0QI)
bca718
-# ifdef USE_EXTRA_TABLE
bca718
-	.quad       L(P1QI), L(P2QI), L(P3QI), L(P4QI)
bca718
-	.quad       L(P5QI), L(P6QI), L(P7QI)
bca718
-# endif
bca718
-#else
bca718
-L(setPxQx):
bca718
-	.short     L(Got0)-L(Got0)
bca718
-	.short     L(P1Q0)-L(Got0)
bca718
-	.short     L(P2Q0)-L(Got0)
bca718
-	.short     L(P3Q0)-L(Got0)
bca718
-	.short     L(P4Q0)-L(Got0)
bca718
-	.short     L(P5Q0)-L(Got0)
bca718
-	.short     L(P6Q0)-L(Got0)
bca718
-	.short     L(P7Q0)-L(Got0)
bca718
-
bca718
-	.short     L(P0Q1)-L(Got0)
bca718
-	.short     L(P1Q1)-L(Got0)
bca718
-	.short     L(P2Q1)-L(Got0)
bca718
-	.short     L(P3Q1)-L(Got0)
bca718
-	.short     L(P4Q1)-L(Got0)
bca718
-	.short     L(P5Q1)-L(Got0)
bca718
-	.short     L(P6Q1)-L(Got0)
bca718
-	.short     L(P7Q1)-L(Got0)
bca718
-
bca718
-	.short     L(P0Q2)-L(Got0)
bca718
-	.short     L(P1Q2)-L(Got0)
bca718
-	.short     L(P2Q2)-L(Got0)
bca718
-	.short     L(P3Q2)-L(Got0)
bca718
-	.short     L(P4Q2)-L(Got0)
bca718
-	.short     L(P5Q2)-L(Got0)
bca718
-	.short     L(P6Q2)-L(Got0)
bca718
-	.short     L(P7Q2)-L(Got0)
bca718
-
bca718
-	.short     L(P0Q3)-L(Got0)
bca718
-	.short     L(P1Q3)-L(Got0)
bca718
-	.short     L(P2Q3)-L(Got0)
bca718
-	.short     L(P3Q3)-L(Got0)
bca718
-	.short     L(P4Q3)-L(Got0)
bca718
-	.short     L(P5Q3)-L(Got0)
bca718
-	.short     L(P6Q3)-L(Got0)
bca718
-	.short     L(P7Q3)-L(Got0)
bca718
-
bca718
-	.short     L(P0Q4)-L(Got0)
bca718
-	.short     L(P1Q4)-L(Got0)
bca718
-	.short     L(P2Q4)-L(Got0)
bca718
-	.short     L(P3Q4)-L(Got0)
bca718
-	.short     L(P4Q4)-L(Got0)
bca718
-	.short     L(P5Q4)-L(Got0)
bca718
-	.short     L(P6Q4)-L(Got0)
bca718
-	.short     L(P7Q4)-L(Got0)
bca718
-
bca718
-	.short     L(P0Q5)-L(Got0)
bca718
-	.short     L(P1Q5)-L(Got0)
bca718
-	.short     L(P2Q5)-L(Got0)
bca718
-	.short     L(P3Q5)-L(Got0)
bca718
-	.short     L(P4Q5)-L(Got0)
bca718
-	.short     L(P5Q5)-L(Got0)
bca718
-	.short     L(P6Q5)-L(Got0)
bca718
-	.short     L(P7Q5)-L(Got0)
bca718
-
bca718
-	.short     L(P0Q6)-L(Got0)
bca718
-	.short     L(P1Q6)-L(Got0)
bca718
-	.short     L(P2Q6)-L(Got0)
bca718
-	.short     L(P3Q6)-L(Got0)
bca718
-	.short     L(P4Q6)-L(Got0)
bca718
-	.short     L(P5Q6)-L(Got0)
bca718
-	.short     L(P6Q6)-L(Got0)
bca718
-	.short     L(P7Q6)-L(Got0)
bca718
-
bca718
-	.short     L(P0Q7)-L(Got0)
bca718
-	.short     L(P1Q7)-L(Got0)
bca718
-	.short     L(P2Q7)-L(Got0)
bca718
-	.short     L(P3Q7)-L(Got0)
bca718
-	.short     L(P4Q7)-L(Got0)
bca718
-	.short     L(P5Q7)-L(Got0)
bca718
-	.short     L(P6Q7)-L(Got0)
bca718
-	.short     L(P7Q7)-L(Got0)
bca718
-
bca718
-	.short     L(P0Q8)-L(Got0)
bca718
-	.short     L(P1Q8)-L(Got0)
bca718
-	.short     L(P2Q8)-L(Got0)
bca718
-	.short     L(P3Q8)-L(Got0)
bca718
-	.short     L(P4Q8)-L(Got0)
bca718
-	.short     L(P5Q8)-L(Got0)
bca718
-	.short     L(P6Q8)-L(Got0)
bca718
-	.short     L(P7Q8)-L(Got0)
bca718
-
bca718
-	.short     L(P0Q9)-L(Got0)
bca718
-	.short     L(P1Q9)-L(Got0)
bca718
-	.short     L(P2Q9)-L(Got0)
bca718
-	.short     L(P3Q9)-L(Got0)
bca718
-	.short     L(P4Q9)-L(Got0)
bca718
-	.short     L(P5Q9)-L(Got0)
bca718
-	.short     L(P6Q9)-L(Got0)
bca718
-	.short     L(P7Q9)-L(Got0)
bca718
-
bca718
-	.short     L(P0QA)-L(Got0)
bca718
-	.short     L(P1QA)-L(Got0)
bca718
-	.short     L(P2QA)-L(Got0)
bca718
-	.short     L(P3QA)-L(Got0)
bca718
-	.short     L(P4QA)-L(Got0)
bca718
-	.short     L(P5QA)-L(Got0)
bca718
-	.short     L(P6QA)-L(Got0)
bca718
-	.short     L(P7QA)-L(Got0)
bca718
-
bca718
-	.short     L(P0QB)-L(Got0)
bca718
-	.short     L(P1QB)-L(Got0)
bca718
-	.short     L(P2QB)-L(Got0)
bca718
-	.short     L(P3QB)-L(Got0)
bca718
-	.short     L(P4QB)-L(Got0)
bca718
-	.short     L(P5QB)-L(Got0)
bca718
-	.short     L(P6QB)-L(Got0)
bca718
-	.short     L(P7QB)-L(Got0)
bca718
-
bca718
-	.short     L(P0QC)-L(Got0)
bca718
-	.short     L(P1QC)-L(Got0)
bca718
-	.short     L(P2QC)-L(Got0)
bca718
-	.short     L(P3QC)-L(Got0)
bca718
-	.short     L(P4QC)-L(Got0)
bca718
-	.short     L(P5QC)-L(Got0)
bca718
-	.short     L(P6QC)-L(Got0)
bca718
-	.short     L(P7QC)-L(Got0)
bca718
-
bca718
-	.short     L(P0QD)-L(Got0)
bca718
-	.short     L(P1QD)-L(Got0)
bca718
-	.short     L(P2QD)-L(Got0)
bca718
-	.short     L(P3QD)-L(Got0)
bca718
-	.short     L(P4QD)-L(Got0)
bca718
-	.short     L(P5QD)-L(Got0)
bca718
-	.short     L(P6QD)-L(Got0)
bca718
-	.short     L(P7QD)-L(Got0)
bca718
-
bca718
-	.short     L(P0QE)-L(Got0)
bca718
-	.short     L(P1QE)-L(Got0)
bca718
-	.short     L(P2QE)-L(Got0)
bca718
-	.short     L(P3QE)-L(Got0)
bca718
-	.short     L(P4QE)-L(Got0)
bca718
-	.short     L(P5QE)-L(Got0)
bca718
-	.short     L(P6QE)-L(Got0)
bca718
-	.short     L(P7QE)-L(Got0)
bca718
-
bca718
-	.short     L(P0QF)-L(Got0)
bca718
-	.short     L(P1QF)-L(Got0)
bca718
-	.short     L(P2QF)-L(Got0)
bca718
-	.short     L(P3QF)-L(Got0)
bca718
-	.short     L(P4QF)-L(Got0)
bca718
-	.short     L(P5QF)-L(Got0)
bca718
-	.short     L(P6QF)-L(Got0)
bca718
-	.short     L(P7QF)-L(Got0)
bca718
-
bca718
-	.short     L(P0QG)-L(Got0)
bca718
-	.short     L(P1QG)-L(Got0)
bca718
-	.short     L(P2QG)-L(Got0)
bca718
-	.short     L(P3QG)-L(Got0)
bca718
-	.short     L(P4QG)-L(Got0)
bca718
-	.short     L(P5QG)-L(Got0)
bca718
-	.short     L(P6QG)-L(Got0)
bca718
-	.short     L(P7QG)-L(Got0)
bca718
-
bca718
-	.short     L(P0QH)-L(Got0)
bca718
-	.short     L(P1QH)-L(Got0)
bca718
-	.short     L(P2QH)-L(Got0)
bca718
-	.short     L(P3QH)-L(Got0)
bca718
-	.short     L(P4QH)-L(Got0)
bca718
-	.short     L(P5QH)-L(Got0)
bca718
-	.short     L(P6QH)-L(Got0)
bca718
-	.short     L(P7QH)-L(Got0)
bca718
-
bca718
-	.short     L(P0QI)-L(Got0)
bca718
-# ifdef USE_EXTRA_TABLE
bca718
-	.short     L(P1QI)-L(Got0)
bca718
-	.short     L(P2QI)-L(Got0)
bca718
-	.short     L(P3QI)-L(Got0)
bca718
-	.short     L(P4QI)-L(Got0)
bca718
-	.short     L(P5QI)-L(Got0)
bca718
-	.short     L(P6QI)-L(Got0)
bca718
-	.short     L(P7QI)-L(Got0)
bca718
-# endif
bca718
-#endif
bca718
-	.popsection
bca718
-
bca718
-	.balign     16
bca718
-#ifdef USE_EXTRA_TABLE
bca718
-L(P1QI): mov    %rdx,-0x91(%rdi)
bca718
-#endif
bca718
-L(P1QH): mov    %rdx,-0x89(%rdi)
bca718
-L(P1QG): mov    %rdx,-0x81(%rdi)
bca718
-#		   .balign     16
bca718
-L(P1QF): mov    %rdx,-0x79(%rdi)
bca718
-L(P1QE): mov    %rdx,-0x71(%rdi)
bca718
-L(P1QD): mov    %rdx,-0x69(%rdi)
bca718
-L(P1QC): mov    %rdx,-0x61(%rdi)
bca718
-L(P1QB): mov    %rdx,-0x59(%rdi)
bca718
-L(P1QA): mov    %rdx,-0x51(%rdi)
bca718
-L(P1Q9): mov    %rdx,-0x49(%rdi)
bca718
-L(P1Q8): mov    %rdx,-0x41(%rdi)
bca718
-L(P1Q7): mov    %rdx,-0x39(%rdi)
bca718
-L(P1Q6): mov    %rdx,-0x31(%rdi)
bca718
-L(P1Q5): mov    %rdx,-0x29(%rdi)
bca718
-L(P1Q4): mov    %rdx,-0x21(%rdi)
bca718
-L(P1Q3): mov    %rdx,-0x19(%rdi)
bca718
-L(P1Q2): mov    %rdx,-0x11(%rdi)
bca718
-L(P1Q1): mov    %rdx,-0x9(%rdi)
bca718
-L(P1Q0): mov    %dl,-0x1(%rdi)
bca718
-		retq
bca718
-
bca718
-	.balign     16
bca718
-L(P0QI): mov    %rdx,-0x90(%rdi)
bca718
-L(P0QH): mov    %rdx,-0x88(%rdi)
bca718
-#		   .balign     16
bca718
-L(P0QG): mov    %rdx,-0x80(%rdi)
bca718
-L(P0QF): mov    %rdx,-0x78(%rdi)
bca718
-L(P0QE): mov    %rdx,-0x70(%rdi)
bca718
-L(P0QD): mov    %rdx,-0x68(%rdi)
bca718
-L(P0QC): mov    %rdx,-0x60(%rdi)
bca718
-L(P0QB): mov    %rdx,-0x58(%rdi)
bca718
-L(P0QA): mov    %rdx,-0x50(%rdi)
bca718
-L(P0Q9): mov    %rdx,-0x48(%rdi)
bca718
-L(P0Q8): mov    %rdx,-0x40(%rdi)
bca718
-L(P0Q7): mov    %rdx,-0x38(%rdi)
bca718
-L(P0Q6): mov    %rdx,-0x30(%rdi)
bca718
-L(P0Q5): mov    %rdx,-0x28(%rdi)
bca718
-L(P0Q4): mov    %rdx,-0x20(%rdi)
bca718
-L(P0Q3): mov    %rdx,-0x18(%rdi)
bca718
-L(P0Q2): mov    %rdx,-0x10(%rdi)
bca718
-L(P0Q1): mov    %rdx,-0x8(%rdi)
bca718
-L(P0Q0): retq
bca718
-
bca718
-
bca718
-	.balign     16
bca718
-#ifdef USE_EXTRA_TABLE
bca718
-L(P2QI): mov    %rdx,-0x92(%rdi)
bca718
-#endif
bca718
-L(P2QH): mov    %rdx,-0x8a(%rdi)
bca718
-L(P2QG): mov    %rdx,-0x82(%rdi)
bca718
-#		   .balign     16
bca718
-L(P2QF): mov    %rdx,-0x7a(%rdi)
bca718
-L(P2QE): mov    %rdx,-0x72(%rdi)
bca718
-L(P2QD): mov    %rdx,-0x6a(%rdi)
bca718
-L(P2QC): mov    %rdx,-0x62(%rdi)
bca718
-L(P2QB): mov    %rdx,-0x5a(%rdi)
bca718
-L(P2QA): mov    %rdx,-0x52(%rdi)
bca718
-L(P2Q9): mov    %rdx,-0x4a(%rdi)
bca718
-L(P2Q8): mov    %rdx,-0x42(%rdi)
bca718
-L(P2Q7): mov    %rdx,-0x3a(%rdi)
bca718
-L(P2Q6): mov    %rdx,-0x32(%rdi)
bca718
-L(P2Q5): mov    %rdx,-0x2a(%rdi)
bca718
-L(P2Q4): mov    %rdx,-0x22(%rdi)
bca718
-L(P2Q3): mov    %rdx,-0x1a(%rdi)
bca718
-L(P2Q2): mov    %rdx,-0x12(%rdi)
bca718
-L(P2Q1): mov    %rdx,-0xa(%rdi)
bca718
-L(P2Q0): mov    %dx,-0x2(%rdi)
bca718
-		retq
bca718
-
bca718
-	.balign     16
bca718
-#ifdef USE_EXTRA_TABLE
bca718
-L(P3QI): mov    %rdx,-0x93(%rdi)
bca718
-#endif
bca718
-L(P3QH): mov    %rdx,-0x8b(%rdi)
bca718
-L(P3QG): mov    %rdx,-0x83(%rdi)
bca718
-#		   .balign     16
bca718
-L(P3QF): mov    %rdx,-0x7b(%rdi)
bca718
-L(P3QE): mov    %rdx,-0x73(%rdi)
bca718
-L(P3QD): mov    %rdx,-0x6b(%rdi)
bca718
-L(P3QC): mov    %rdx,-0x63(%rdi)
bca718
-L(P3QB): mov    %rdx,-0x5b(%rdi)
bca718
-L(P3QA): mov    %rdx,-0x53(%rdi)
bca718
-L(P3Q9): mov    %rdx,-0x4b(%rdi)
bca718
-L(P3Q8): mov    %rdx,-0x43(%rdi)
bca718
-L(P3Q7): mov    %rdx,-0x3b(%rdi)
bca718
-L(P3Q6): mov    %rdx,-0x33(%rdi)
bca718
-L(P3Q5): mov    %rdx,-0x2b(%rdi)
bca718
-L(P3Q4): mov    %rdx,-0x23(%rdi)
bca718
-L(P3Q3): mov    %rdx,-0x1b(%rdi)
bca718
-L(P3Q2): mov    %rdx,-0x13(%rdi)
bca718
-L(P3Q1): mov    %rdx,-0xb(%rdi)
bca718
-L(P3Q0): mov    %dx,-0x3(%rdi)
bca718
-		mov    %dl,-0x1(%rdi)
bca718
-		retq
bca718
-
bca718
-	.balign     16
bca718
-#ifdef USE_EXTRA_TABLE
bca718
-L(P4QI): mov    %rdx,-0x94(%rdi)
bca718
-#endif
bca718
-L(P4QH): mov    %rdx,-0x8c(%rdi)
bca718
-L(P4QG): mov    %rdx,-0x84(%rdi)
bca718
-#		   .balign     16
bca718
-L(P4QF): mov    %rdx,-0x7c(%rdi)
bca718
-L(P4QE): mov    %rdx,-0x74(%rdi)
bca718
-L(P4QD): mov    %rdx,-0x6c(%rdi)
bca718
-L(P4QC): mov    %rdx,-0x64(%rdi)
bca718
-L(P4QB): mov    %rdx,-0x5c(%rdi)
bca718
-L(P4QA): mov    %rdx,-0x54(%rdi)
bca718
-L(P4Q9): mov    %rdx,-0x4c(%rdi)
bca718
-L(P4Q8): mov    %rdx,-0x44(%rdi)
bca718
-L(P4Q7): mov    %rdx,-0x3c(%rdi)
bca718
-L(P4Q6): mov    %rdx,-0x34(%rdi)
bca718
-L(P4Q5): mov    %rdx,-0x2c(%rdi)
bca718
-L(P4Q4): mov    %rdx,-0x24(%rdi)
bca718
-L(P4Q3): mov    %rdx,-0x1c(%rdi)
bca718
-L(P4Q2): mov    %rdx,-0x14(%rdi)
bca718
-L(P4Q1): mov    %rdx,-0xc(%rdi)
bca718
-L(P4Q0): mov    %edx,-0x4(%rdi)
bca718
-		retq
bca718
-
bca718
-	.balign     16
bca718
-#ifdef USE_EXTRA_TABLE
bca718
-L(P5QI): mov    %rdx,-0x95(%rdi)
bca718
-#endif
bca718
-L(P5QH): mov    %rdx,-0x8d(%rdi)
bca718
-L(P5QG): mov    %rdx,-0x85(%rdi)
bca718
-#		   .balign     16
bca718
-L(P5QF): mov    %rdx,-0x7d(%rdi)
bca718
-L(P5QE): mov    %rdx,-0x75(%rdi)
bca718
-L(P5QD): mov    %rdx,-0x6d(%rdi)
bca718
-L(P5QC): mov    %rdx,-0x65(%rdi)
bca718
-L(P5QB): mov    %rdx,-0x5d(%rdi)
bca718
-L(P5QA): mov    %rdx,-0x55(%rdi)
bca718
-L(P5Q9): mov    %rdx,-0x4d(%rdi)
bca718
-L(P5Q8): mov    %rdx,-0x45(%rdi)
bca718
-L(P5Q7): mov    %rdx,-0x3d(%rdi)
bca718
-L(P5Q6): mov    %rdx,-0x35(%rdi)
bca718
-L(P5Q5): mov    %rdx,-0x2d(%rdi)
bca718
-L(P5Q4): mov    %rdx,-0x25(%rdi)
bca718
-L(P5Q3): mov    %rdx,-0x1d(%rdi)
bca718
-L(P5Q2): mov    %rdx,-0x15(%rdi)
bca718
-L(P5Q1): mov    %rdx,-0xd(%rdi)
bca718
-L(P5Q0): mov    %edx,-0x5(%rdi)
bca718
-		mov    %dl,-0x1(%rdi)
bca718
-		retq
bca718
-
bca718
-	.balign     16
bca718
-#ifdef USE_EXTRA_TABLE
bca718
-L(P6QI): mov    %rdx,-0x96(%rdi)
bca718
-#endif
bca718
-L(P6QH): mov    %rdx,-0x8e(%rdi)
bca718
-L(P6QG): mov    %rdx,-0x86(%rdi)
bca718
-#		   .balign     16
bca718
-L(P6QF): mov    %rdx,-0x7e(%rdi)
bca718
-L(P6QE): mov    %rdx,-0x76(%rdi)
bca718
-L(P6QD): mov    %rdx,-0x6e(%rdi)
bca718
-L(P6QC): mov    %rdx,-0x66(%rdi)
bca718
-L(P6QB): mov    %rdx,-0x5e(%rdi)
bca718
-L(P6QA): mov    %rdx,-0x56(%rdi)
bca718
-L(P6Q9): mov    %rdx,-0x4e(%rdi)
bca718
-L(P6Q8): mov    %rdx,-0x46(%rdi)
bca718
-L(P6Q7): mov    %rdx,-0x3e(%rdi)
bca718
-L(P6Q6): mov    %rdx,-0x36(%rdi)
bca718
-L(P6Q5): mov    %rdx,-0x2e(%rdi)
bca718
-L(P6Q4): mov    %rdx,-0x26(%rdi)
bca718
-L(P6Q3): mov    %rdx,-0x1e(%rdi)
bca718
-L(P6Q2): mov    %rdx,-0x16(%rdi)
bca718
-L(P6Q1): mov    %rdx,-0xe(%rdi)
bca718
-L(P6Q0): mov    %edx,-0x6(%rdi)
bca718
-		mov    %dx,-0x2(%rdi)
bca718
-		retq
bca718
-
bca718
-	.balign     16
bca718
-#ifdef USE_EXTRA_TABLE
bca718
-L(P7QI): mov    %rdx,-0x97(%rdi)
bca718
-#endif
bca718
-L(P7QH): mov    %rdx,-0x8f(%rdi)
bca718
-L(P7QG): mov    %rdx,-0x87(%rdi)
bca718
-#		   .balign     16
bca718
-L(P7QF): mov    %rdx,-0x7f(%rdi)
bca718
-L(P7QE): mov    %rdx,-0x77(%rdi)
bca718
-L(P7QD): mov    %rdx,-0x6f(%rdi)
bca718
-L(P7QC): mov    %rdx,-0x67(%rdi)
bca718
-L(P7QB): mov    %rdx,-0x5f(%rdi)
bca718
-L(P7QA): mov    %rdx,-0x57(%rdi)
bca718
-L(P7Q9): mov    %rdx,-0x4f(%rdi)
bca718
-L(P7Q8): mov    %rdx,-0x47(%rdi)
bca718
-L(P7Q7): mov    %rdx,-0x3f(%rdi)
bca718
-L(P7Q6): mov    %rdx,-0x37(%rdi)
bca718
-L(P7Q5): mov    %rdx,-0x2f(%rdi)
bca718
-L(P7Q4): mov    %rdx,-0x27(%rdi)
bca718
-L(P7Q3): mov    %rdx,-0x1f(%rdi)
bca718
-L(P7Q2): mov    %rdx,-0x17(%rdi)
bca718
-L(P7Q1): mov    %rdx,-0xf(%rdi)
bca718
-L(P7Q0): mov    %edx,-0x7(%rdi)
bca718
-		mov    %dx,-0x3(%rdi)
bca718
-		mov    %dl,-0x1(%rdi)
bca718
-		retq
bca718
-
bca718
-	.balign     16
bca718
-L(ck_mem_ops_method):
bca718
-
bca718
-# align to 16 byte boundary first
bca718
-	#test $0xf,%rdi
bca718
-	#jz L(aligned_now)
bca718
-	mov    $0x10,%r10
bca718
-	mov    %rdi,%r9
bca718
-	and    $0xf,%r9
bca718
-	sub    %r9,%r10
bca718
-	and    $0xf,%r10
bca718
-	add    %r10,%rdi
bca718
-	sub    %r10,%r8
bca718
-#ifndef PIC
bca718
-	lea    L(AliPxQx)(%rip),%r11
bca718
-	jmpq   *(%r11,%r10,8)
bca718
-#else
bca718
-	lea    L(aligned_now)(%rip), %r11
bca718
-	lea    L(AliPxQx)(%rip),%rcx
bca718
-	movswq (%rcx,%r10,2),%rcx
bca718
-	lea    (%rcx,%r11,1),%r11
bca718
-	jmpq   *%r11
bca718
-#endif
bca718
-
bca718
-	.pushsection .rodata
bca718
-	.balign     16
bca718
-#ifndef PIC
bca718
-L(AliPxQx):
bca718
-	.quad       L(aligned_now), L(A1Q0), L(A2Q0), L(A3Q0)
bca718
-	.quad	    L(A4Q0), L(A5Q0), L(A6Q0), L(A7Q0)
bca718
-	.quad       L(A0Q1), L(A1Q1), L(A2Q1), L(A3Q1)
bca718
-	.quad       L(A4Q1), L(A5Q1), L(A6Q1), L(A7Q1)
bca718
-#else
bca718
-L(AliPxQx):
bca718
-	.short     L(aligned_now)-L(aligned_now)
bca718
-	.short     L(A1Q0)-L(aligned_now)
bca718
-	.short     L(A2Q0)-L(aligned_now)
bca718
-	.short     L(A3Q0)-L(aligned_now)
bca718
-	.short     L(A4Q0)-L(aligned_now)
bca718
-	.short     L(A5Q0)-L(aligned_now)
bca718
-	.short     L(A6Q0)-L(aligned_now)
bca718
-	.short     L(A7Q0)-L(aligned_now)
bca718
-
bca718
-	.short     L(A0Q1)-L(aligned_now)
bca718
-	.short     L(A1Q1)-L(aligned_now)
bca718
-	.short     L(A2Q1)-L(aligned_now)
bca718
-	.short     L(A3Q1)-L(aligned_now)
bca718
-	.short     L(A4Q1)-L(aligned_now)
bca718
-	.short     L(A5Q1)-L(aligned_now)
bca718
-	.short     L(A6Q1)-L(aligned_now)
bca718
-	.short     L(A7Q1)-L(aligned_now)
bca718
-#endif
bca718
-	.popsection
bca718
-
bca718
-	.balign     16
bca718
-L(A5Q1):    mov    %dl,-0xd(%rdi)
bca718
-L(A4Q1):    mov    %edx,-0xc(%rdi)
bca718
-L(A0Q1):    mov    %rdx,-0x8(%rdi)
bca718
-L(A0Q0):    jmp     L(aligned_now)
bca718
-
bca718
-	.balign     16
bca718
-L(A1Q1):   mov    %dl,-0x9(%rdi)
bca718
-	mov    %rdx,-0x8(%rdi)
bca718
-	jmp    L(aligned_now)
bca718
-
bca718
-	.balign     16
bca718
-L(A1Q0):   mov    %dl,-0x1(%rdi)
bca718
-	jmp    L(aligned_now)
bca718
-
bca718
-	.balign     16
bca718
-L(A3Q1):    mov    %dl,-0xb(%rdi)
bca718
-L(A2Q1):    mov    %dx,-0xa(%rdi)
bca718
-	mov    %rdx,-0x8(%rdi)
bca718
-	jmp    L(aligned_now)
bca718
-
bca718
-	.balign     16
bca718
-L(A3Q0):    mov    %dl,-0x3(%rdi)
bca718
-L(A2Q0):    mov    %dx,-0x2(%rdi)
bca718
-	jmp    L(aligned_now)
bca718
-
bca718
-	.balign     16
bca718
-L(A5Q0):    mov    %dl,-0x5(%rdi)
bca718
-L(A4Q0):    mov    %edx,-0x4(%rdi)
bca718
-	jmp    L(aligned_now)
bca718
-
bca718
-	.balign     16
bca718
-L(A7Q1):    mov    %dl,-0xf(%rdi)
bca718
-L(A6Q1):    mov    %dx,-0xe(%rdi)
bca718
-	mov    %edx,-0xc(%rdi)
bca718
-	mov    %rdx,-0x8(%rdi)
bca718
-	jmp    L(aligned_now)
bca718
-
bca718
-	.balign     16
bca718
-L(A7Q0):    mov    %dl,-0x7(%rdi)
bca718
-L(A6Q0):    mov    %dx,-0x6(%rdi)
bca718
-	mov    %edx,-0x4(%rdi)
bca718
-
bca718
-#ifndef USE_MULTIARCH
bca718
-	jmp    L(aligned_now)
bca718
-
bca718
-L(SSE_pre):
bca718
-#else
bca718
-L(aligned_now):
bca718
-#endif
bca718
-#if !defined USE_MULTIARCH || defined USE_SSE2
bca718
-	 # fill RegXMM0 with the pattern
bca718
-	 movd   %rdx,%xmm0
bca718
-	 punpcklqdq %xmm0,%xmm0
bca718
-
bca718
-	 cmp    $0xb0,%r8 # 176
bca718
-	 jae    L(byte32sse2_pre)
bca718
-
bca718
-	 add    %r8,%rdi
bca718
-# ifndef PIC
bca718
-	 lea    L(SSExDx)(%rip),%r9
bca718
-	 jmpq   *(%r9,%r8,8)
bca718
-# else
bca718
-	 lea    L(SSE0Q0)(%rip),%r9
bca718
-	 lea    L(SSExDx)(%rip),%rcx
bca718
-	 movswq (%rcx,%r8,2),%rcx
bca718
-	 lea    (%rcx,%r9,1),%r9
bca718
-	 jmpq   *%r9
bca718
-# endif
bca718
-
bca718
-L(SSE0QB):  movdqa %xmm0,-0xb0(%rdi)
bca718
-L(SSE0QA):  movdqa %xmm0,-0xa0(%rdi)
bca718
-L(SSE0Q9):  movdqa %xmm0,-0x90(%rdi)
bca718
-L(SSE0Q8):  movdqa %xmm0,-0x80(%rdi)
bca718
-L(SSE0Q7):  movdqa %xmm0,-0x70(%rdi)
bca718
-L(SSE0Q6):  movdqa %xmm0,-0x60(%rdi)
bca718
-L(SSE0Q5):  movdqa %xmm0,-0x50(%rdi)
bca718
-L(SSE0Q4):  movdqa %xmm0,-0x40(%rdi)
bca718
-L(SSE0Q3):  movdqa %xmm0,-0x30(%rdi)
bca718
-L(SSE0Q2):  movdqa %xmm0,-0x20(%rdi)
bca718
-L(SSE0Q1):  movdqa %xmm0,-0x10(%rdi)
bca718
-L(SSE0Q0):  retq
bca718
-
bca718
-L(SSE1QB):  movdqa %xmm0,-0xb1(%rdi)
bca718
-L(SSE1QA):  movdqa %xmm0,-0xa1(%rdi)
bca718
-L(SSE1Q9):  movdqa %xmm0,-0x91(%rdi)
bca718
-L(SSE1Q8):  movdqa %xmm0,-0x81(%rdi)
bca718
-L(SSE1Q7):  movdqa %xmm0,-0x71(%rdi)
bca718
-L(SSE1Q6):  movdqa %xmm0,-0x61(%rdi)
bca718
-L(SSE1Q5):  movdqa %xmm0,-0x51(%rdi)
bca718
-L(SSE1Q4):  movdqa %xmm0,-0x41(%rdi)
bca718
-L(SSE1Q3):  movdqa %xmm0,-0x31(%rdi)
bca718
-L(SSE1Q2):  movdqa %xmm0,-0x21(%rdi)
bca718
-L(SSE1Q1):  movdqa %xmm0,-0x11(%rdi)
bca718
-L(SSE1Q0):  mov    %dl,-0x1(%rdi)
bca718
-	retq
bca718
-
bca718
-L(SSE2QB):  movdqa %xmm0,-0xb2(%rdi)
bca718
-L(SSE2QA):  movdqa %xmm0,-0xa2(%rdi)
bca718
-L(SSE2Q9):  movdqa %xmm0,-0x92(%rdi)
bca718
-L(SSE2Q8):  movdqa %xmm0,-0x82(%rdi)
bca718
-L(SSE2Q7):  movdqa %xmm0,-0x72(%rdi)
bca718
-L(SSE2Q6):  movdqa %xmm0,-0x62(%rdi)
bca718
-L(SSE2Q5):  movdqa %xmm0,-0x52(%rdi)
bca718
-L(SSE2Q4):  movdqa %xmm0,-0x42(%rdi)
bca718
-L(SSE2Q3):  movdqa %xmm0,-0x32(%rdi)
bca718
-L(SSE2Q2):  movdqa %xmm0,-0x22(%rdi)
bca718
-L(SSE2Q1):  movdqa %xmm0,-0x12(%rdi)
bca718
-L(SSE2Q0):  mov    %dx,-0x2(%rdi)
bca718
-	retq
bca718
-
bca718
-L(SSE3QB):  movdqa %xmm0,-0xb3(%rdi)
bca718
-L(SSE3QA):  movdqa %xmm0,-0xa3(%rdi)
bca718
-L(SSE3Q9):  movdqa %xmm0,-0x93(%rdi)
bca718
-L(SSE3Q8):  movdqa %xmm0,-0x83(%rdi)
bca718
-L(SSE3Q7):  movdqa %xmm0,-0x73(%rdi)
bca718
-L(SSE3Q6):  movdqa %xmm0,-0x63(%rdi)
bca718
-L(SSE3Q5):  movdqa %xmm0,-0x53(%rdi)
bca718
-L(SSE3Q4):  movdqa %xmm0,-0x43(%rdi)
bca718
-L(SSE3Q3):  movdqa %xmm0,-0x33(%rdi)
bca718
-L(SSE3Q2):  movdqa %xmm0,-0x23(%rdi)
bca718
-L(SSE3Q1):  movdqa %xmm0,-0x13(%rdi)
bca718
-L(SSE3Q0):  mov    %dx,-0x3(%rdi)
bca718
-	mov    %dl,-0x1(%rdi)
bca718
-	retq
bca718
-
bca718
-L(SSE4QB):  movdqa %xmm0,-0xb4(%rdi)
bca718
-L(SSE4QA):  movdqa %xmm0,-0xa4(%rdi)
bca718
-L(SSE4Q9):  movdqa %xmm0,-0x94(%rdi)
bca718
-L(SSE4Q8):  movdqa %xmm0,-0x84(%rdi)
bca718
-L(SSE4Q7):  movdqa %xmm0,-0x74(%rdi)
bca718
-L(SSE4Q6):  movdqa %xmm0,-0x64(%rdi)
bca718
-L(SSE4Q5):  movdqa %xmm0,-0x54(%rdi)
bca718
-L(SSE4Q4):  movdqa %xmm0,-0x44(%rdi)
bca718
-L(SSE4Q3):  movdqa %xmm0,-0x34(%rdi)
bca718
-L(SSE4Q2):  movdqa %xmm0,-0x24(%rdi)
bca718
-L(SSE4Q1):  movdqa %xmm0,-0x14(%rdi)
bca718
-L(SSE4Q0):  mov    %edx,-0x4(%rdi)
bca718
-	retq
bca718
-
bca718
-L(SSE5QB):  movdqa %xmm0,-0xb5(%rdi)
bca718
-L(SSE5QA):  movdqa %xmm0,-0xa5(%rdi)
bca718
-L(SSE5Q9):  movdqa %xmm0,-0x95(%rdi)
bca718
-L(SSE5Q8):  movdqa %xmm0,-0x85(%rdi)
bca718
-L(SSE5Q7):  movdqa %xmm0,-0x75(%rdi)
bca718
-L(SSE5Q6):  movdqa %xmm0,-0x65(%rdi)
bca718
-L(SSE5Q5):  movdqa %xmm0,-0x55(%rdi)
bca718
-L(SSE5Q4):  movdqa %xmm0,-0x45(%rdi)
bca718
-L(SSE5Q3):  movdqa %xmm0,-0x35(%rdi)
bca718
-L(SSE5Q2):  movdqa %xmm0,-0x25(%rdi)
bca718
-L(SSE5Q1):  movdqa %xmm0,-0x15(%rdi)
bca718
-L(SSE5Q0):  mov    %edx,-0x5(%rdi)
bca718
-	mov    %dl,-0x1(%rdi)
bca718
-	retq
bca718
-
bca718
-
bca718
-L(SSE6QB):  movdqa %xmm0,-0xb6(%rdi)
bca718
-L(SSE6QA):  movdqa %xmm0,-0xa6(%rdi)
bca718
-L(SSE6Q9):  movdqa %xmm0,-0x96(%rdi)
bca718
-L(SSE6Q8):  movdqa %xmm0,-0x86(%rdi)
bca718
-L(SSE6Q7):  movdqa %xmm0,-0x76(%rdi)
bca718
-L(SSE6Q6):  movdqa %xmm0,-0x66(%rdi)
bca718
-L(SSE6Q5):  movdqa %xmm0,-0x56(%rdi)
bca718
-L(SSE6Q4):  movdqa %xmm0,-0x46(%rdi)
bca718
-L(SSE6Q3):  movdqa %xmm0,-0x36(%rdi)
bca718
-L(SSE6Q2):  movdqa %xmm0,-0x26(%rdi)
bca718
-L(SSE6Q1):  movdqa %xmm0,-0x16(%rdi)
bca718
-L(SSE6Q0):  mov    %edx,-0x6(%rdi)
bca718
-	mov    %dx,-0x2(%rdi)
bca718
-	retq
bca718
-
bca718
-L(SSE7QB):  movdqa %xmm0,-0xb7(%rdi)
bca718
-L(SSE7QA):  movdqa %xmm0,-0xa7(%rdi)
bca718
-L(SSE7Q9):  movdqa %xmm0,-0x97(%rdi)
bca718
-L(SSE7Q8):  movdqa %xmm0,-0x87(%rdi)
bca718
-L(SSE7Q7):  movdqa %xmm0,-0x77(%rdi)
bca718
-L(SSE7Q6):  movdqa %xmm0,-0x67(%rdi)
bca718
-L(SSE7Q5):  movdqa %xmm0,-0x57(%rdi)
bca718
-L(SSE7Q4):  movdqa %xmm0,-0x47(%rdi)
bca718
-L(SSE7Q3):  movdqa %xmm0,-0x37(%rdi)
bca718
-L(SSE7Q2):  movdqa %xmm0,-0x27(%rdi)
bca718
-L(SSE7Q1):  movdqa %xmm0,-0x17(%rdi)
bca718
-L(SSE7Q0):  mov    %edx,-0x7(%rdi)
bca718
-	mov    %dx,-0x3(%rdi)
bca718
-	mov    %dl,-0x1(%rdi)
bca718
-	retq
bca718
-
bca718
-L(SSE8QB):  movdqa %xmm0,-0xb8(%rdi)
bca718
-L(SSE8QA):  movdqa %xmm0,-0xa8(%rdi)
bca718
-L(SSE8Q9):  movdqa %xmm0,-0x98(%rdi)
bca718
-L(SSE8Q8):  movdqa %xmm0,-0x88(%rdi)
bca718
-L(SSE8Q7):  movdqa %xmm0,-0x78(%rdi)
bca718
-L(SSE8Q6):  movdqa %xmm0,-0x68(%rdi)
bca718
-L(SSE8Q5):  movdqa %xmm0,-0x58(%rdi)
bca718
-L(SSE8Q4):  movdqa %xmm0,-0x48(%rdi)
bca718
-L(SSE8Q3):  movdqa %xmm0,-0x38(%rdi)
bca718
-L(SSE8Q2):  movdqa %xmm0,-0x28(%rdi)
bca718
-L(SSE8Q1):  movdqa %xmm0,-0x18(%rdi)
bca718
-L(SSE8Q0):  mov    %rdx,-0x8(%rdi)
bca718
-	retq
bca718
-
bca718
-L(SSE9QB):  movdqa %xmm0,-0xb9(%rdi)
bca718
-L(SSE9QA):  movdqa %xmm0,-0xa9(%rdi)
bca718
-L(SSE9Q9):  movdqa %xmm0,-0x99(%rdi)
bca718
-L(SSE9Q8):  movdqa %xmm0,-0x89(%rdi)
bca718
-L(SSE9Q7):  movdqa %xmm0,-0x79(%rdi)
bca718
-L(SSE9Q6):  movdqa %xmm0,-0x69(%rdi)
bca718
-L(SSE9Q5):  movdqa %xmm0,-0x59(%rdi)
bca718
-L(SSE9Q4):  movdqa %xmm0,-0x49(%rdi)
bca718
-L(SSE9Q3):  movdqa %xmm0,-0x39(%rdi)
bca718
-L(SSE9Q2):  movdqa %xmm0,-0x29(%rdi)
bca718
-L(SSE9Q1):  movdqa %xmm0,-0x19(%rdi)
bca718
-L(SSE9Q0):  mov    %rdx,-0x9(%rdi)
bca718
-	mov    %dl,-0x1(%rdi)
bca718
-	retq
bca718
-
bca718
-L(SSE10QB): movdqa %xmm0,-0xba(%rdi)
bca718
-L(SSE10QA): movdqa %xmm0,-0xaa(%rdi)
bca718
-L(SSE10Q9): movdqa %xmm0,-0x9a(%rdi)
bca718
-L(SSE10Q8): movdqa %xmm0,-0x8a(%rdi)
bca718
-L(SSE10Q7): movdqa %xmm0,-0x7a(%rdi)
bca718
-L(SSE10Q6): movdqa %xmm0,-0x6a(%rdi)
bca718
-L(SSE10Q5): movdqa %xmm0,-0x5a(%rdi)
bca718
-L(SSE10Q4): movdqa %xmm0,-0x4a(%rdi)
bca718
-L(SSE10Q3): movdqa %xmm0,-0x3a(%rdi)
bca718
-L(SSE10Q2): movdqa %xmm0,-0x2a(%rdi)
bca718
-L(SSE10Q1): movdqa %xmm0,-0x1a(%rdi)
bca718
-L(SSE10Q0): mov    %rdx,-0xa(%rdi)
bca718
-	mov    %dx,-0x2(%rdi)
bca718
-	retq
bca718
-
bca718
-L(SSE11QB): movdqa %xmm0,-0xbb(%rdi)
bca718
-L(SSE11QA): movdqa %xmm0,-0xab(%rdi)
bca718
-L(SSE11Q9): movdqa %xmm0,-0x9b(%rdi)
bca718
-L(SSE11Q8): movdqa %xmm0,-0x8b(%rdi)
bca718
-L(SSE11Q7): movdqa %xmm0,-0x7b(%rdi)
bca718
-L(SSE11Q6): movdqa %xmm0,-0x6b(%rdi)
bca718
-L(SSE11Q5): movdqa %xmm0,-0x5b(%rdi)
bca718
-L(SSE11Q4): movdqa %xmm0,-0x4b(%rdi)
bca718
-L(SSE11Q3): movdqa %xmm0,-0x3b(%rdi)
bca718
-L(SSE11Q2): movdqa %xmm0,-0x2b(%rdi)
bca718
-L(SSE11Q1): movdqa %xmm0,-0x1b(%rdi)
bca718
-L(SSE11Q0): mov    %rdx,-0xb(%rdi)
bca718
-	mov    %dx,-0x3(%rdi)
bca718
-	mov    %dl,-0x1(%rdi)
bca718
-	retq
bca718
-
bca718
-L(SSE12QB): movdqa %xmm0,-0xbc(%rdi)
bca718
-L(SSE12QA): movdqa %xmm0,-0xac(%rdi)
bca718
-L(SSE12Q9): movdqa %xmm0,-0x9c(%rdi)
bca718
-L(SSE12Q8): movdqa %xmm0,-0x8c(%rdi)
bca718
-L(SSE12Q7): movdqa %xmm0,-0x7c(%rdi)
bca718
-L(SSE12Q6): movdqa %xmm0,-0x6c(%rdi)
bca718
-L(SSE12Q5): movdqa %xmm0,-0x5c(%rdi)
bca718
-L(SSE12Q4): movdqa %xmm0,-0x4c(%rdi)
bca718
-L(SSE12Q3): movdqa %xmm0,-0x3c(%rdi)
bca718
-L(SSE12Q2): movdqa %xmm0,-0x2c(%rdi)
bca718
-L(SSE12Q1): movdqa %xmm0,-0x1c(%rdi)
bca718
-L(SSE12Q0): mov    %rdx,-0xc(%rdi)
bca718
-	mov    %edx,-0x4(%rdi)
bca718
-	retq
bca718
-
bca718
-L(SSE13QB): movdqa %xmm0,-0xbd(%rdi)
bca718
-L(SSE13QA): movdqa %xmm0,-0xad(%rdi)
bca718
-L(SSE13Q9): movdqa %xmm0,-0x9d(%rdi)
bca718
-L(SSE13Q8): movdqa %xmm0,-0x8d(%rdi)
bca718
-L(SSE13Q7): movdqa %xmm0,-0x7d(%rdi)
bca718
-L(SSE13Q6): movdqa %xmm0,-0x6d(%rdi)
bca718
-L(SSE13Q5): movdqa %xmm0,-0x5d(%rdi)
bca718
-L(SSE13Q4): movdqa %xmm0,-0x4d(%rdi)
bca718
-L(SSE13Q3): movdqa %xmm0,-0x3d(%rdi)
bca718
-L(SSE13Q2): movdqa %xmm0,-0x2d(%rdi)
bca718
-L(SSE13Q1): movdqa %xmm0,-0x1d(%rdi)
bca718
-L(SSE13Q0): mov    %rdx,-0xd(%rdi)
bca718
-	mov    %edx,-0x5(%rdi)
bca718
-	mov    %dl,-0x1(%rdi)
bca718
-	retq
bca718
-
bca718
-L(SSE14QB): movdqa %xmm0,-0xbe(%rdi)
bca718
-L(SSE14QA): movdqa %xmm0,-0xae(%rdi)
bca718
-L(SSE14Q9): movdqa %xmm0,-0x9e(%rdi)
bca718
-L(SSE14Q8): movdqa %xmm0,-0x8e(%rdi)
bca718
-L(SSE14Q7): movdqa %xmm0,-0x7e(%rdi)
bca718
-L(SSE14Q6): movdqa %xmm0,-0x6e(%rdi)
bca718
-L(SSE14Q5): movdqa %xmm0,-0x5e(%rdi)
bca718
-L(SSE14Q4): movdqa %xmm0,-0x4e(%rdi)
bca718
-L(SSE14Q3): movdqa %xmm0,-0x3e(%rdi)
bca718
-L(SSE14Q2): movdqa %xmm0,-0x2e(%rdi)
bca718
-L(SSE14Q1): movdqa %xmm0,-0x1e(%rdi)
bca718
-L(SSE14Q0): mov    %rdx,-0xe(%rdi)
bca718
-	mov    %edx,-0x6(%rdi)
bca718
-	mov    %dx,-0x2(%rdi)
bca718
-	retq
bca718
-
bca718
-L(SSE15QB): movdqa %xmm0,-0xbf(%rdi)
bca718
-L(SSE15QA): movdqa %xmm0,-0xaf(%rdi)
bca718
-L(SSE15Q9): movdqa %xmm0,-0x9f(%rdi)
bca718
-L(SSE15Q8): movdqa %xmm0,-0x8f(%rdi)
bca718
-L(SSE15Q7): movdqa %xmm0,-0x7f(%rdi)
bca718
-L(SSE15Q6): movdqa %xmm0,-0x6f(%rdi)
bca718
-L(SSE15Q5): movdqa %xmm0,-0x5f(%rdi)
bca718
-L(SSE15Q4): movdqa %xmm0,-0x4f(%rdi)
bca718
-L(SSE15Q3): movdqa %xmm0,-0x3f(%rdi)
bca718
-L(SSE15Q2): movdqa %xmm0,-0x2f(%rdi)
bca718
-L(SSE15Q1): movdqa %xmm0,-0x1f(%rdi)
bca718
-L(SSE15Q0): mov    %rdx,-0xf(%rdi)
bca718
-	mov    %edx,-0x7(%rdi)
bca718
-	mov    %dx,-0x3(%rdi)
bca718
-	mov    %dl,-0x1(%rdi)
bca718
-	retq
bca718
-
bca718
-	.balign     16
bca718
-L(byte32sse2_pre):
bca718
-
bca718
-	mov    __x86_64_shared_cache_size(%rip),%r9d  # The largest cache size
bca718
-	cmp    %r9,%r8
bca718
-	ja     L(sse2_nt_move_pre)
bca718
-	#jmp    L(byte32sse2)
bca718
-	.balign     16
bca718
-L(byte32sse2):
bca718
-	lea    -0x80(%r8),%r8 # 128
bca718
-	cmp    $0x80,%r8   # 128
bca718
-	movdqa %xmm0,(%rdi)
bca718
-	movdqa %xmm0,0x10(%rdi)
bca718
-	movdqa %xmm0,0x20(%rdi)
bca718
-	movdqa %xmm0,0x30(%rdi)
bca718
-	movdqa %xmm0,0x40(%rdi)
bca718
-	movdqa %xmm0,0x50(%rdi)
bca718
-	movdqa %xmm0,0x60(%rdi)
bca718
-	movdqa %xmm0,0x70(%rdi)
bca718
-
bca718
-	lea    0x80(%rdi),%rdi
bca718
-	jae    L(byte32sse2)
bca718
-	add    %r8,%rdi
bca718
-# ifndef PIC
bca718
-	lea    L(SSExDx)(%rip),%r11
bca718
-	jmpq   *(%r11,%r8,8)
bca718
-# else
bca718
-	lea    L(SSE0Q0)(%rip),%r11
bca718
-	lea    L(SSExDx)(%rip),%rcx
bca718
-	movswq (%rcx,%r8,2),%rcx
bca718
-	lea    (%rcx,%r11,1),%r11
bca718
-	jmpq   *%r11
bca718
-# endif
bca718
-
bca718
-	.balign     16
bca718
-L(sse2_nt_move_pre):
bca718
-	cmp    $0x0,%r9
bca718
-	je     L(byte32sse2)
bca718
-	jmp    L(sse2_nt_move)
bca718
-
bca718
-	.balign     16
bca718
-L(sse2_nt_move):
bca718
-	lea    -0x80(%r8),%r8
bca718
-	cmp    $0x80,%r8
bca718
-
bca718
-	movntdq %xmm0,(%rdi)
bca718
-	movntdq %xmm0,0x10(%rdi)
bca718
-	movntdq %xmm0,0x20(%rdi)
bca718
-	movntdq %xmm0,0x30(%rdi)
bca718
-	movntdq %xmm0,0x40(%rdi)
bca718
-	movntdq %xmm0,0x50(%rdi)
bca718
-	movntdq %xmm0,0x60(%rdi)
bca718
-	movntdq %xmm0,0x70(%rdi)
bca718
-
bca718
-	lea    0x80(%rdi),%rdi
bca718
-	jae    L(sse2_nt_move)
bca718
-	sfence
bca718
-	add    %r8,%rdi
bca718
-# ifndef PIC
bca718
-	lea    L(SSExDx)(%rip),%r11
bca718
-	jmpq   *(%r11,%r8,8)
bca718
-# else
bca718
-	lea    L(SSE0Q0)(%rip),%r11
bca718
-	lea    L(SSExDx)(%rip),%rcx
bca718
-	movswq (%rcx,%r8,2),%rcx
bca718
-	lea   (%rcx,%r11,1),%r11
bca718
-	jmpq   *%r11
bca718
-# endif
bca718
-
bca718
-	.pushsection .rodata
bca718
-	.balign     16
bca718
-# ifndef PIC
bca718
-L(SSExDx):
bca718
-	.quad       L(SSE0Q0), L(SSE1Q0), L(SSE2Q0), L(SSE3Q0)
bca718
-	.quad       L(SSE4Q0), L(SSE5Q0), L(SSE6Q0), L(SSE7Q0)
bca718
-	.quad       L(SSE8Q0), L(SSE9Q0), L(SSE10Q0), L(SSE11Q0)
bca718
-	.quad       L(SSE12Q0), L(SSE13Q0), L(SSE14Q0), L(SSE15Q0)
bca718
-	.quad       L(SSE0Q1), L(SSE1Q1), L(SSE2Q1), L(SSE3Q1)
bca718
-	.quad       L(SSE4Q1), L(SSE5Q1), L(SSE6Q1), L(SSE7Q1)
bca718
-	.quad       L(SSE8Q1), L(SSE9Q1), L(SSE10Q1), L(SSE11Q1)
bca718
-	.quad       L(SSE12Q1), L(SSE13Q1), L(SSE14Q1), L(SSE15Q1)
bca718
-	.quad       L(SSE0Q2), L(SSE1Q2), L(SSE2Q2), L(SSE3Q2)
bca718
-	.quad       L(SSE4Q2), L(SSE5Q2), L(SSE6Q2), L(SSE7Q2)
bca718
-	.quad       L(SSE8Q2), L(SSE9Q2), L(SSE10Q2), L(SSE11Q2)
bca718
-	.quad       L(SSE12Q2), L(SSE13Q2), L(SSE14Q2), L(SSE15Q2)
bca718
-	.quad       L(SSE0Q3), L(SSE1Q3), L(SSE2Q3), L(SSE3Q3)
bca718
-	.quad       L(SSE4Q3), L(SSE5Q3), L(SSE6Q3), L(SSE7Q3)
bca718
-	.quad       L(SSE8Q3), L(SSE9Q3), L(SSE10Q3), L(SSE11Q3)
bca718
-	.quad       L(SSE12Q3), L(SSE13Q3), L(SSE14Q3), L(SSE15Q3)
bca718
-	.quad       L(SSE0Q4), L(SSE1Q4), L(SSE2Q4), L(SSE3Q4)
bca718
-	.quad       L(SSE4Q4), L(SSE5Q4), L(SSE6Q4), L(SSE7Q4)
bca718
-	.quad       L(SSE8Q4), L(SSE9Q4), L(SSE10Q4), L(SSE11Q4)
bca718
-	.quad       L(SSE12Q4), L(SSE13Q4), L(SSE14Q4), L(SSE15Q4)
bca718
-	.quad       L(SSE0Q5), L(SSE1Q5), L(SSE2Q5), L(SSE3Q5)
bca718
-	.quad       L(SSE4Q5), L(SSE5Q5), L(SSE6Q5), L(SSE7Q5)
bca718
-	.quad       L(SSE8Q5), L(SSE9Q5), L(SSE10Q5), L(SSE11Q5)
bca718
-	.quad       L(SSE12Q5), L(SSE13Q5), L(SSE14Q5), L(SSE15Q5)
bca718
-	.quad       L(SSE0Q6), L(SSE1Q6), L(SSE2Q6), L(SSE3Q6)
bca718
-	.quad       L(SSE4Q6), L(SSE5Q6), L(SSE6Q6), L(SSE7Q6)
bca718
-	.quad       L(SSE8Q6), L(SSE9Q6), L(SSE10Q6), L(SSE11Q6)
bca718
-	.quad       L(SSE12Q6), L(SSE13Q6), L(SSE14Q6), L(SSE15Q6)
bca718
-	.quad       L(SSE0Q7), L(SSE1Q7), L(SSE2Q7), L(SSE3Q7)
bca718
-	.quad       L(SSE4Q7), L(SSE5Q7), L(SSE6Q7), L(SSE7Q7)
bca718
-	.quad       L(SSE8Q7), L(SSE9Q7), L(SSE10Q7), L(SSE11Q7)
bca718
-	.quad       L(SSE12Q7), L(SSE13Q7), L(SSE14Q7), L(SSE15Q7)
bca718
-	.quad       L(SSE0Q8), L(SSE1Q8), L(SSE2Q8), L(SSE3Q8)
bca718
-	.quad       L(SSE4Q8), L(SSE5Q8), L(SSE6Q8), L(SSE7Q8)
bca718
-	.quad       L(SSE8Q8), L(SSE9Q8), L(SSE10Q8), L(SSE11Q8)
bca718
-	.quad       L(SSE12Q8), L(SSE13Q8), L(SSE14Q8), L(SSE15Q8)
bca718
-	.quad       L(SSE0Q9), L(SSE1Q9), L(SSE2Q9), L(SSE3Q9)
bca718
-	.quad       L(SSE4Q9), L(SSE5Q9), L(SSE6Q9), L(SSE7Q9)
bca718
-	.quad       L(SSE8Q9), L(SSE9Q9), L(SSE10Q9), L(SSE11Q9)
bca718
-	.quad       L(SSE12Q9), L(SSE13Q9), L(SSE14Q9), L(SSE15Q9)
bca718
-	.quad       L(SSE0QA), L(SSE1QA), L(SSE2QA), L(SSE3QA)
bca718
-	.quad       L(SSE4QA), L(SSE5QA), L(SSE6QA), L(SSE7QA)
bca718
-	.quad       L(SSE8QA), L(SSE9QA), L(SSE10QA), L(SSE11QA)
bca718
-	.quad       L(SSE12QA), L(SSE13QA), L(SSE14QA), L(SSE15QA)
bca718
-	.quad       L(SSE0QB), L(SSE1QB), L(SSE2QB), L(SSE3QB)
bca718
-	.quad       L(SSE4QB), L(SSE5QB), L(SSE6QB), L(SSE7QB)
bca718
-	.quad       L(SSE8QB), L(SSE9QB), L(SSE10QB), L(SSE11QB)
bca718
-	.quad       L(SSE12QB), L(SSE13QB), L(SSE14QB), L(SSE15QB)
bca718
-# else
bca718
-L(SSExDx):
bca718
-	.short     L(SSE0Q0) -L(SSE0Q0)
bca718
-	.short     L(SSE1Q0) -L(SSE0Q0)
bca718
-	.short     L(SSE2Q0) -L(SSE0Q0)
bca718
-	.short     L(SSE3Q0) -L(SSE0Q0)
bca718
-	.short     L(SSE4Q0) -L(SSE0Q0)
bca718
-	.short     L(SSE5Q0) -L(SSE0Q0)
bca718
-	.short     L(SSE6Q0) -L(SSE0Q0)
bca718
-	.short     L(SSE7Q0) -L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE8Q0) -L(SSE0Q0)
bca718
-	.short     L(SSE9Q0) -L(SSE0Q0)
bca718
-	.short     L(SSE10Q0)-L(SSE0Q0)
bca718
-	.short     L(SSE11Q0)-L(SSE0Q0)
bca718
-	.short     L(SSE12Q0)-L(SSE0Q0)
bca718
-	.short     L(SSE13Q0)-L(SSE0Q0)
bca718
-	.short     L(SSE14Q0)-L(SSE0Q0)
bca718
-	.short     L(SSE15Q0)-L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE0Q1) -L(SSE0Q0)
bca718
-	.short     L(SSE1Q1) -L(SSE0Q0)
bca718
-	.short     L(SSE2Q1) -L(SSE0Q0)
bca718
-	.short     L(SSE3Q1) -L(SSE0Q0)
bca718
-	.short     L(SSE4Q1) -L(SSE0Q0)
bca718
-	.short     L(SSE5Q1) -L(SSE0Q0)
bca718
-	.short     L(SSE6Q1) -L(SSE0Q0)
bca718
-	.short     L(SSE7Q1) -L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE8Q1) -L(SSE0Q0)
bca718
-	.short     L(SSE9Q1) -L(SSE0Q0)
bca718
-	.short     L(SSE10Q1)-L(SSE0Q0)
bca718
-	.short     L(SSE11Q1)-L(SSE0Q0)
bca718
-	.short     L(SSE12Q1)-L(SSE0Q0)
bca718
-	.short     L(SSE13Q1)-L(SSE0Q0)
bca718
-	.short     L(SSE14Q1)-L(SSE0Q0)
bca718
-	.short     L(SSE15Q1)-L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE0Q2) -L(SSE0Q0)
bca718
-	.short     L(SSE1Q2) -L(SSE0Q0)
bca718
-	.short     L(SSE2Q2) -L(SSE0Q0)
bca718
-	.short     L(SSE3Q2) -L(SSE0Q0)
bca718
-	.short     L(SSE4Q2) -L(SSE0Q0)
bca718
-	.short     L(SSE5Q2) -L(SSE0Q0)
bca718
-	.short     L(SSE6Q2) -L(SSE0Q0)
bca718
-	.short     L(SSE7Q2) -L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE8Q2) -L(SSE0Q0)
bca718
-	.short     L(SSE9Q2) -L(SSE0Q0)
bca718
-	.short     L(SSE10Q2)-L(SSE0Q0)
bca718
-	.short     L(SSE11Q2)-L(SSE0Q0)
bca718
-	.short     L(SSE12Q2)-L(SSE0Q0)
bca718
-	.short     L(SSE13Q2)-L(SSE0Q0)
bca718
-	.short     L(SSE14Q2)-L(SSE0Q0)
bca718
-	.short     L(SSE15Q2)-L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE0Q3) -L(SSE0Q0)
bca718
-	.short     L(SSE1Q3) -L(SSE0Q0)
bca718
-	.short     L(SSE2Q3) -L(SSE0Q0)
bca718
-	.short     L(SSE3Q3) -L(SSE0Q0)
bca718
-	.short     L(SSE4Q3) -L(SSE0Q0)
bca718
-	.short     L(SSE5Q3) -L(SSE0Q0)
bca718
-	.short     L(SSE6Q3) -L(SSE0Q0)
bca718
-	.short     L(SSE7Q3) -L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE8Q3) -L(SSE0Q0)
bca718
-	.short     L(SSE9Q3) -L(SSE0Q0)
bca718
-	.short     L(SSE10Q3)-L(SSE0Q0)
bca718
-	.short     L(SSE11Q3)-L(SSE0Q0)
bca718
-	.short     L(SSE12Q3)-L(SSE0Q0)
bca718
-	.short     L(SSE13Q3)-L(SSE0Q0)
bca718
-	.short     L(SSE14Q3)-L(SSE0Q0)
bca718
-	.short     L(SSE15Q3)-L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE0Q4) -L(SSE0Q0)
bca718
-	.short     L(SSE1Q4) -L(SSE0Q0)
bca718
-	.short     L(SSE2Q4) -L(SSE0Q0)
bca718
-	.short     L(SSE3Q4) -L(SSE0Q0)
bca718
-	.short     L(SSE4Q4) -L(SSE0Q0)
bca718
-	.short     L(SSE5Q4) -L(SSE0Q0)
bca718
-	.short     L(SSE6Q4) -L(SSE0Q0)
bca718
-	.short     L(SSE7Q4) -L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE8Q4) -L(SSE0Q0)
bca718
-	.short     L(SSE9Q4) -L(SSE0Q0)
bca718
-	.short     L(SSE10Q4)-L(SSE0Q0)
bca718
-	.short     L(SSE11Q4)-L(SSE0Q0)
bca718
-	.short     L(SSE12Q4)-L(SSE0Q0)
bca718
-	.short     L(SSE13Q4)-L(SSE0Q0)
bca718
-	.short     L(SSE14Q4)-L(SSE0Q0)
bca718
-	.short     L(SSE15Q4)-L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE0Q5) -L(SSE0Q0)
bca718
-	.short     L(SSE1Q5) -L(SSE0Q0)
bca718
-	.short     L(SSE2Q5) -L(SSE0Q0)
bca718
-	.short     L(SSE3Q5) -L(SSE0Q0)
bca718
-	.short     L(SSE4Q5) -L(SSE0Q0)
bca718
-	.short     L(SSE5Q5) -L(SSE0Q0)
bca718
-	.short     L(SSE6Q5) -L(SSE0Q0)
bca718
-	.short     L(SSE7Q5) -L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE8Q5) -L(SSE0Q0)
bca718
-	.short     L(SSE9Q5) -L(SSE0Q0)
bca718
-	.short     L(SSE10Q5)-L(SSE0Q0)
bca718
-	.short     L(SSE11Q5)-L(SSE0Q0)
bca718
-	.short     L(SSE12Q5)-L(SSE0Q0)
bca718
-	.short     L(SSE13Q5)-L(SSE0Q0)
bca718
-	.short     L(SSE14Q5)-L(SSE0Q0)
bca718
-	.short     L(SSE15Q5)-L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE0Q6) -L(SSE0Q0)
bca718
-	.short     L(SSE1Q6) -L(SSE0Q0)
bca718
-	.short     L(SSE2Q6) -L(SSE0Q0)
bca718
-	.short     L(SSE3Q6) -L(SSE0Q0)
bca718
-	.short     L(SSE4Q6) -L(SSE0Q0)
bca718
-	.short     L(SSE5Q6) -L(SSE0Q0)
bca718
-	.short     L(SSE6Q6) -L(SSE0Q0)
bca718
-	.short     L(SSE7Q6) -L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE8Q6) -L(SSE0Q0)
bca718
-	.short     L(SSE9Q6) -L(SSE0Q0)
bca718
-	.short     L(SSE10Q6)-L(SSE0Q0)
bca718
-	.short     L(SSE11Q6)-L(SSE0Q0)
bca718
-	.short     L(SSE12Q6)-L(SSE0Q0)
bca718
-	.short     L(SSE13Q6)-L(SSE0Q0)
bca718
-	.short     L(SSE14Q6)-L(SSE0Q0)
bca718
-	.short     L(SSE15Q6)-L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE0Q7) -L(SSE0Q0)
bca718
-	.short     L(SSE1Q7) -L(SSE0Q0)
bca718
-	.short     L(SSE2Q7) -L(SSE0Q0)
bca718
-	.short     L(SSE3Q7) -L(SSE0Q0)
bca718
-	.short     L(SSE4Q7) -L(SSE0Q0)
bca718
-	.short     L(SSE5Q7) -L(SSE0Q0)
bca718
-	.short     L(SSE6Q7) -L(SSE0Q0)
bca718
-	.short     L(SSE7Q7) -L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE8Q7) -L(SSE0Q0)
bca718
-	.short     L(SSE9Q7) -L(SSE0Q0)
bca718
-	.short     L(SSE10Q7)-L(SSE0Q0)
bca718
-	.short     L(SSE11Q7)-L(SSE0Q0)
bca718
-	.short     L(SSE12Q7)-L(SSE0Q0)
bca718
-	.short     L(SSE13Q7)-L(SSE0Q0)
bca718
-	.short     L(SSE14Q7)-L(SSE0Q0)
bca718
-	.short     L(SSE15Q7)-L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE0Q8) -L(SSE0Q0)
bca718
-	.short     L(SSE1Q8) -L(SSE0Q0)
bca718
-	.short     L(SSE2Q8) -L(SSE0Q0)
bca718
-	.short     L(SSE3Q8) -L(SSE0Q0)
bca718
-	.short     L(SSE4Q8) -L(SSE0Q0)
bca718
-	.short     L(SSE5Q8) -L(SSE0Q0)
bca718
-	.short     L(SSE6Q8) -L(SSE0Q0)
bca718
-	.short     L(SSE7Q8) -L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE8Q8) -L(SSE0Q0)
bca718
-	.short     L(SSE9Q8) -L(SSE0Q0)
bca718
-	.short     L(SSE10Q8)-L(SSE0Q0)
bca718
-	.short     L(SSE11Q8)-L(SSE0Q0)
bca718
-	.short     L(SSE12Q8)-L(SSE0Q0)
bca718
-	.short     L(SSE13Q8)-L(SSE0Q0)
bca718
-	.short     L(SSE14Q8)-L(SSE0Q0)
bca718
-	.short     L(SSE15Q8)-L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE0Q9) -L(SSE0Q0)
bca718
-	.short     L(SSE1Q9) -L(SSE0Q0)
bca718
-	.short     L(SSE2Q9) -L(SSE0Q0)
bca718
-	.short     L(SSE3Q9) -L(SSE0Q0)
bca718
-	.short     L(SSE4Q9) -L(SSE0Q0)
bca718
-	.short     L(SSE5Q9) -L(SSE0Q0)
bca718
-	.short     L(SSE6Q9) -L(SSE0Q0)
bca718
-	.short     L(SSE7Q9) -L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE8Q9) -L(SSE0Q0)
bca718
-	.short     L(SSE9Q9) -L(SSE0Q0)
bca718
-	.short     L(SSE10Q9)-L(SSE0Q0)
bca718
-	.short     L(SSE11Q9)-L(SSE0Q0)
bca718
-	.short     L(SSE12Q9)-L(SSE0Q0)
bca718
-	.short     L(SSE13Q9)-L(SSE0Q0)
bca718
-	.short     L(SSE14Q9)-L(SSE0Q0)
bca718
-	.short     L(SSE15Q9)-L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE0QA) -L(SSE0Q0)
bca718
-	.short     L(SSE1QA) -L(SSE0Q0)
bca718
-	.short     L(SSE2QA) -L(SSE0Q0)
bca718
-	.short     L(SSE3QA) -L(SSE0Q0)
bca718
-	.short     L(SSE4QA) -L(SSE0Q0)
bca718
-	.short     L(SSE5QA) -L(SSE0Q0)
bca718
-	.short     L(SSE6QA) -L(SSE0Q0)
bca718
-	.short     L(SSE7QA) -L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE8QA) -L(SSE0Q0)
bca718
-	.short     L(SSE9QA) -L(SSE0Q0)
bca718
-	.short     L(SSE10QA)-L(SSE0Q0)
bca718
-	.short     L(SSE11QA)-L(SSE0Q0)
bca718
-	.short     L(SSE12QA)-L(SSE0Q0)
bca718
-	.short     L(SSE13QA)-L(SSE0Q0)
bca718
-	.short     L(SSE14QA)-L(SSE0Q0)
bca718
-	.short     L(SSE15QA)-L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE0QB) -L(SSE0Q0)
bca718
-	.short     L(SSE1QB) -L(SSE0Q0)
bca718
-	.short     L(SSE2QB) -L(SSE0Q0)
bca718
-	.short     L(SSE3QB) -L(SSE0Q0)
bca718
-	.short     L(SSE4QB) -L(SSE0Q0)
bca718
-	.short     L(SSE5QB) -L(SSE0Q0)
bca718
-	.short     L(SSE6QB) -L(SSE0Q0)
bca718
-	.short     L(SSE7QB) -L(SSE0Q0)
bca718
-
bca718
-	.short     L(SSE8QB) -L(SSE0Q0)
bca718
-	.short     L(SSE9QB) -L(SSE0Q0)
bca718
-	.short     L(SSE10QB)-L(SSE0Q0)
bca718
-	.short     L(SSE11QB)-L(SSE0Q0)
bca718
-	.short     L(SSE12QB)-L(SSE0Q0)
bca718
-	.short     L(SSE13QB)-L(SSE0Q0)
bca718
-	.short     L(SSE14QB)-L(SSE0Q0)
bca718
-	.short     L(SSE15QB)-L(SSE0Q0)
bca718
-# endif
bca718
-	.popsection
bca718
-#endif /* !defined USE_MULTIARCH || defined USE_SSE2  */
bca718
-
bca718
-	.balign     16
bca718
-#ifndef USE_MULTIARCH
bca718
-L(aligned_now):
bca718
-
bca718
-	 cmpl   $0x1,__x86_64_preferred_memory_instruction(%rip)
bca718
-	 jg     L(SSE_pre)
bca718
-#endif /* USE_MULTIARCH */
bca718
-
bca718
-L(8byte_move_try):
bca718
-	cmpq	__STOS_LOWER_BOUNDARY,%r8
bca718
-	jae	L(8byte_stos_try)
bca718
-
bca718
-	.balign     16
bca718
-L(8byte_move):
bca718
-	movq	%r8,%rcx
bca718
-	shrq	$7,%rcx
bca718
-	jz	L(8byte_move_skip)
bca718
-
bca718
-	.p2align 4
bca718
-
bca718
-L(8byte_move_loop):
bca718
-	decq	%rcx
bca718
-
bca718
-	movq	%rdx,    (%rdi)
bca718
-	movq	%rdx,  8 (%rdi)
bca718
-	movq	%rdx, 16 (%rdi)
bca718
-	movq	%rdx, 24 (%rdi)
bca718
-	movq	%rdx, 32 (%rdi)
bca718
-	movq	%rdx, 40 (%rdi)
bca718
-	movq	%rdx, 48 (%rdi)
bca718
-	movq	%rdx, 56 (%rdi)
bca718
-	movq	%rdx, 64 (%rdi)
bca718
-	movq	%rdx, 72 (%rdi)
bca718
-	movq	%rdx, 80 (%rdi)
bca718
-	movq	%rdx, 88 (%rdi)
bca718
-	movq	%rdx, 96 (%rdi)
bca718
-	movq	%rdx, 104 (%rdi)
bca718
-	movq	%rdx, 112 (%rdi)
bca718
-	movq	%rdx, 120 (%rdi)
bca718
-
bca718
-	leaq	128 (%rdi),%rdi
bca718
-
bca718
-	jnz     L(8byte_move_loop)
bca718
-
bca718
-L(8byte_move_skip):
bca718
-	andl	$127,%r8d
bca718
-	lea	(%rdi,%r8,1),%rdi
bca718
-
bca718
-#ifndef PIC
bca718
-	lea	L(setPxQx)(%rip),%r11
bca718
-	jmpq	*(%r11,%r8,8) # old scheme remained for nonPIC
bca718
-#else
bca718
-	lea	L(Got0)(%rip),%r11
bca718
-	lea	L(setPxQx)(%rip),%rcx
bca718
-	movswq	(%rcx,%r8,2),%rcx
bca718
-	lea	(%rcx,%r11,1),%r11
bca718
-	jmpq	*%r11
bca718
-#endif
bca718
-
bca718
-	.balign     16
bca718
-L(8byte_stos_try):
bca718
-	mov    __x86_64_shared_cache_size(%rip),%r9d // ck largest cache size
bca718
-	cmpq	%r8,%r9		// calculate the lesser of remaining
bca718
-	cmovaq	%r8,%r9		// bytes and largest cache size
bca718
-	jbe	L(8byte_stos)
bca718
-
bca718
-L(8byte_move_reuse_try):
bca718
-	cmp	__STOS_UPPER_BOUNDARY,%r8
bca718
-	jae	L(8byte_move)
bca718
-
bca718
-	.balign     16
bca718
-L(8byte_stos):
bca718
-	movq	%r9,%rcx
bca718
-	andq	$-8,%r9
bca718
-
bca718
-	shrq	$3,%rcx
bca718
-	jz	L(8byte_stos_skip)
bca718
-
bca718
-	xchgq	%rax,%rdx
bca718
 
bca718
+ENTRY (memset)	/* memset: rdi = dst, esi = fill value, rdx = n; returns dst in rax.  */
bca718
+	movd	%esi, %xmm8	/* Low dword of xmm8 = fill value.  */
bca718
+	movq	%rdi, %rax	/* Set return value.  */
bca718
+	punpcklbw	%xmm8, %xmm8	/* Fill byte now occupies the low word.  */
bca718
+	punpcklwd	%xmm8, %xmm8	/* ... and now the low dword.  */
bca718
+	pshufd	$0, %xmm8, %xmm8	/* Broadcast: all 16 bytes hold the fill byte.  */
bca718
+L(entry_from_bzero):	/* Expects rax, rdx = n and xmm8 = pattern already set.  */
bca718
+	cmpq	$64, %rdx
bca718
+	ja	L(loop_start)	/* n > 64.  */
bca718
+	cmpq	$16, %rdx
bca718
+	jbe	L(less_16_bytes)	/* n <= 16.  */
bca718
+	cmpq	$32, %rdx	/* Flags are consumed by the ja below.  */
bca718
+	movdqu	%xmm8, (%rdi)	/* First 16 bytes.  */
bca718
+	movdqu	%xmm8, -16(%rdi,%rdx)	/* Last 16 bytes; may overlap the first store.  */
bca718
+	ja	L(between_32_64_bytes)	/* n > 32 needs two more stores.  */
bca718
+L(return):	/* Shared exit; rax already holds the return value.  */
bca718
 	rep
bca718
-	stosq
bca718
-
bca718
-	xchgq	%rax,%rdx
bca718
-
bca718
-L(8byte_stos_skip):
bca718
-	subq	%r9,%r8
bca718
-	ja	L(8byte_nt_move)
bca718
-
bca718
-	andl	$7,%r8d
bca718
-	lea	(%rdi,%r8,1),%rdi
bca718
-#ifndef PIC
bca718
-	lea	L(setPxQx)(%rip),%r11
bca718
-	jmpq	*(%r11,%r8,8) # old scheme remained for nonPIC
bca718
-#else
bca718
-	lea	L(Got0)(%rip),%r11
bca718
-	lea     L(setPxQx)(%rip),%rcx
bca718
-	movswq	(%rcx,%r8,2),%rcx
bca718
-	lea	(%rcx,%r11,1),%r11
bca718
-	jmpq	*%r11
bca718
-#endif
bca718
-
bca718
-	.balign     16
bca718
-L(8byte_nt_move):
bca718
-	movq	%r8,%rcx
bca718
-	shrq	$7,%rcx
bca718
-	jz      L(8byte_nt_move_skip)
bca718
-
bca718
-	.balign     16
bca718
-L(8byte_nt_move_loop):
bca718
-	decq	%rcx
bca718
+	ret	/* Pairs with the "rep" line kept above to form "rep ret".  */
bca718
 
bca718
-	movntiq	%rdx,     (%rdi)
bca718
-	movntiq	%rdx,   8 (%rdi)
bca718
-	movntiq	%rdx,  16 (%rdi)
bca718
-	movntiq	%rdx,  24 (%rdi)
bca718
-	movntiq	%rdx,  32 (%rdi)
bca718
-	movntiq	%rdx,  40 (%rdi)
bca718
-	movntiq	%rdx,  48 (%rdi)
bca718
-	movntiq	%rdx,  56 (%rdi)
bca718
-	movntiq	%rdx,  64 (%rdi)
bca718
-	movntiq	%rdx,  72 (%rdi)
bca718
-	movntiq	%rdx,  80 (%rdi)
bca718
-	movntiq	%rdx,  88 (%rdi)
bca718
-	movntiq	%rdx,  96 (%rdi)
bca718
-	movntiq	%rdx, 104 (%rdi)
bca718
-	movntiq	%rdx, 112 (%rdi)
bca718
-	movntiq	%rdx, 120 (%rdi)
bca718
-
bca718
-	leaq	128 (%rdi),%rdi
bca718
-
bca718
-	jnz     L(8byte_nt_move_loop)
bca718
-
bca718
-	sfence
bca718
-
bca718
-L(8byte_nt_move_skip):
bca718
-	andl	$127,%r8d
bca718
-
bca718
-	lea	(%rdi,%r8,1),%rdi
bca718
-#ifndef PIC
bca718
-	lea	L(setPxQx)(%rip),%r11
bca718
-	jmpq	*(%r11,%r8,8) # old scheme remained for nonPIC
bca718
-#else
bca718
-	lea	L(Got0)(%rip),%r11
bca718
-	lea     L(setPxQx)(%rip),%rcx
bca718
-	movswq	(%rcx,%r8,2),%rcx
bca718
-	lea	(%rcx,%r11,1),%r11
bca718
-	jmpq	*%r11
bca718
-#endif
bca718
+	ALIGN (4)	/* .p2align 4: 16-byte alignment.  */
bca718
+L(between_32_64_bytes):	/* 32 < n <= 64; bytes [0,16) and [n-16,n) are already stored.  */
bca718
+	movdqu	%xmm8, 16(%rdi)	/* Bytes [16,32).  */
bca718
+	movdqu	%xmm8, -32(%rdi,%rdx)	/* Bytes [n-32,n-16).  */
bca718
+	ret
bca718
+	ALIGN (4)
bca718
+L(loop_start):	/* n > 64.  */
bca718
+	leaq	64(%rdi), %rcx	/* rcx = dst + 64, rounded down below.  */
bca718
+	movdqu	%xmm8, (%rdi)	/* Unaligned stores cover the first 64 ...  */
bca718
+	andq	$-64, %rcx	/* rcx = first 64-byte boundary above dst.  */
bca718
+	movdqu	%xmm8, -16(%rdi,%rdx)	/* ... and the last 64 bytes of the buffer.  */
bca718
+	movdqu	%xmm8, 16(%rdi)
bca718
+	movdqu	%xmm8, -32(%rdi,%rdx)
bca718
+	movdqu	%xmm8, 32(%rdi)
bca718
+	movdqu	%xmm8, -48(%rdi,%rdx)
bca718
+	movdqu	%xmm8, 48(%rdi)
bca718
+	movdqu	%xmm8, -64(%rdi,%rdx)
bca718
+	addq	%rdi, %rdx	/* rdx = dst + n (end of buffer).  */
bca718
+	andq	$-64, %rdx	/* Round the end down to a 64-byte boundary.  */
bca718
+	cmpq	%rdx, %rcx
bca718
+	je	L(return)	/* No aligned 64-byte chunk left in between.  */
bca718
+	ALIGN (4)
bca718
+L(loop):	/* Fill [rcx, rdx) in 64-byte-aligned chunks.  */
bca718
+	movdqa	%xmm8, (%rcx)	/* Aligned stores: rcx is a multiple of 64.  */
bca718
+	movdqa	%xmm8, 16(%rcx)
bca718
+	movdqa	%xmm8, 32(%rcx)
bca718
+	movdqa	%xmm8, 48(%rcx)
bca718
+	addq	$64, %rcx
bca718
+	cmpq	%rcx, %rdx
bca718
+	jne	L(loop)	/* Until rcx reaches the rounded-down end.  */
bca718
+	rep
bca718
+	ret
bca718
+L(less_16_bytes):	/* n <= 16: dispatch on the bits of n.  */
bca718
+	movq %xmm8, %rcx	/* rcx = eight copies of the fill byte.  */
bca718
+	testb	$24, %dl	/* Bit 3 or 4 set: 8 <= n <= 16.  */
bca718
+	jne	L(between8_16bytes)
bca718
+	testb	$4, %dl	/* Bit 2 set: 4 <= n <= 7.  */
bca718
+	jne	L(between4_7bytes)
bca718
+	testb	$1, %dl	/* Here n <= 3; skip the byte store when n is even.  */
bca718
+	je	L(odd_byte)
bca718
+	movb	%cl, (%rdi)	/* n is 1 or 3: store the first byte.  */
bca718
+L(odd_byte):
bca718
+	testb	$2, %dl	/* Bit 1 clear: n is 0 or 1, nothing more to store.  */
bca718
+	je	L(return)
bca718
+	movw	%cx, -2(%rdi,%rdx)	/* Last two bytes; use dst (rdi), not the return-value register.  */
bca718
+	ret
bca718
+L(between4_7bytes):
bca718
+	movl	%ecx, (%rdi)	/* First four bytes.  */
bca718
+	movl	%ecx, -4(%rdi,%rdx)	/* Last four bytes; overlaps when n < 8.  */
bca718
+	ret
bca718
+L(between8_16bytes):
bca718
+	movq	%rcx, (%rdi)	/* First eight bytes.  */
bca718
+	movq	%rcx, -8(%rdi,%rdx)	/* Last eight bytes; overlaps when n < 16.  */
bca718
+	ret
bca718
 
bca718
 END (memset)
bca718
 libc_hidden_builtin_def (memset)