ce426f
From 143ce75a4203a78d79549b00e570a5bb429c44cf Mon Sep 17 00:00:00 2001
ce426f
From: Ondrej Bilka <neleai@seznam.cz>
ce426f
Date: Mon, 20 May 2013 08:26:00 +0200
ce426f
Subject: [PATCH] Faster memset on x64
ce426f
ce426f
This implementation speeds up memset in several ways. The first is
ce426f
avoiding an expensive computed jump. The second is using the fact that
ce426f
arguments of memset are most of the time aligned to 8 bytes.
ce426f
ce426f
Benchmark results are available at:
ce426f
ce426f
kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_result27_04_13.tar.bz2
ce426f
ce426f
(cherry picked from commit b2b671b677d92429a3d41bf451668f476aa267ed)
ce426f
---
ce426f
 sysdeps/x86_64/memset.S | 1406 +++--------------------------------------------
ce426f
 1 file changed, 91 insertions(+), 1315 deletions(-)
ce426f
ce426f
Index: glibc-2.17-c758a686/sysdeps/x86_64/memset.S
ce426f
===================================================================
ce426f
--- glibc-2.17-c758a686.orig/sysdeps/x86_64/memset.S
ce426f
+++ glibc-2.17-c758a686/sysdeps/x86_64/memset.S
ce426f
@@ -19,17 +19,31 @@
ce426f
 
ce426f
 #include <sysdep.h>
ce426f
 
ce426f
-#define __STOS_LOWER_BOUNDARY	$8192
ce426f
-#define __STOS_UPPER_BOUNDARY	$65536
ce426f
+#ifndef ALIGN
ce426f
+# define ALIGN(n) .p2align n
ce426f
+#endif
ce426f
 
ce426f
 	.text
ce426f
 #if IS_IN (libc) && !defined USE_MULTIARCH
ce426f
ENTRY(__bzero)
ce426f
-	mov	%rsi,%rdx	/* Adjust parameter.  */
ce426f
-	xorl	%esi,%esi	/* Fill with 0s.  */
ce426f
-	jmp	L(memset_entry)
ce426f
+	movq	%rdi, %rax /* Set return value: rax = destination pointer.  */
ce426f
+	movq	%rsi, %rdx /* Set n: move the length (2nd argument) into rdx.  */
ce426f
+	pxor	%xmm8, %xmm8 /* xmm8 = 0, i.e. sixteen zero fill bytes.  */
ce426f
+	jmp	L(entry_from_bzero) /* Code shared with __memset_tail.  */
ce426f
END(__bzero)
ce426f
 weak_alias (__bzero, bzero)
ce426f
+
ce426f
+/* Like memset, but takes an additional parameter (in %rcx) holding the value to return.  */
ce426f
+ENTRY(__memset_tail)
ce426f
+	movq	%rcx, %rax /* Set return value from the extra parameter.  */
ce426f
+
ce426f
+	movd	%esi, %xmm8 /* Low dword of xmm8 = esi.  */
ce426f
+	punpcklbw	%xmm8, %xmm8 /* Duplicate byte 0 into bytes 0-1.  */
ce426f
+	punpcklwd	%xmm8, %xmm8 /* Duplicate word 0 into bytes 0-3.  */
ce426f
+	pshufd	$0, %xmm8, %xmm8 /* Broadcast dword 0 to all 16 bytes.  */
ce426f
+
ce426f
+	jmp	L(entry_from_bzero) /* Same entry point that __bzero jumps to.  */
ce426f
+END(__memset_tail)
ce426f
 #endif
ce426f
 
ce426f
 #if defined PIC && IS_IN (libc)
ce426f
@@ -38,1318 +52,80 @@ ENTRY_CHK (__memset_chk)
ce426f
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
ce426f
 END_CHK (__memset_chk)
ce426f
 #endif
ce426f
-ENTRY (memset)
ce426f
-L(memset_entry):
ce426f
-	cmp    $0x1,%rdx
ce426f
-	mov    %rdi,%rax	/* memset returns the dest address.  */
ce426f
-	jne    L(ck2)
ce426f
-	mov    %sil,(%rdi)
ce426f
-	retq
ce426f
-L(ck2):
ce426f
-	mov    $0x101010101010101,%r9
ce426f
-	mov    %rdx,%r8
ce426f
-	movzbq %sil,%rdx
ce426f
-	imul   %r9,%rdx
ce426f
-L(now_dw_aligned):
ce426f
-	cmp    $0x90,%r8
ce426f
-	ja     L(ck_mem_ops_method)
ce426f
-L(now_dw_aligned_small):
ce426f
-	add    %r8,%rdi
ce426f
-#ifndef PIC
ce426f
-	lea    L(setPxQx)(%rip),%r11
ce426f
-	jmpq   *(%r11,%r8,8)
ce426f
-#else
ce426f
-	lea    L(Got0)(%rip),%r11
ce426f
-	lea    L(setPxQx)(%rip),%rcx
ce426f
-	movswq (%rcx,%r8,2),%rcx
ce426f
-	lea    (%rcx,%r11,1),%r11
ce426f
-	jmpq   *%r11
ce426f
-#endif
ce426f
-
ce426f
-L(Got0):
ce426f
-	retq
ce426f
-
ce426f
-	.pushsection .rodata
ce426f
-	.balign     16
ce426f
-#ifndef PIC
ce426f
-L(setPxQx):
ce426f
-	.quad       L(Got0), L(P1Q0), L(P2Q0), L(P3Q0)
ce426f
-	.quad       L(P4Q0), L(P5Q0), L(P6Q0), L(P7Q0)
ce426f
-	.quad       L(P0Q1), L(P1Q1), L(P2Q1), L(P3Q1)
ce426f
-	.quad       L(P4Q1), L(P5Q1), L(P6Q1), L(P7Q1)
ce426f
-	.quad       L(P0Q2), L(P1Q2), L(P2Q2), L(P3Q2)
ce426f
-	.quad       L(P4Q2), L(P5Q2), L(P6Q2), L(P7Q2)
ce426f
-	.quad       L(P0Q3), L(P1Q3), L(P2Q3), L(P3Q3)
ce426f
-	.quad       L(P4Q3), L(P5Q3), L(P6Q3), L(P7Q3)
ce426f
-	.quad       L(P0Q4), L(P1Q4), L(P2Q4), L(P3Q4)
ce426f
-	.quad       L(P4Q4), L(P5Q4), L(P6Q4), L(P7Q4)
ce426f
-	.quad       L(P0Q5), L(P1Q5), L(P2Q5), L(P3Q5)
ce426f
-	.quad       L(P4Q5), L(P5Q5), L(P6Q5), L(P7Q5)
ce426f
-	.quad       L(P0Q6), L(P1Q6), L(P2Q6), L(P3Q6)
ce426f
-	.quad       L(P4Q6), L(P5Q6), L(P6Q6), L(P7Q6)
ce426f
-	.quad       L(P0Q7), L(P1Q7), L(P2Q7), L(P3Q7)
ce426f
-	.quad       L(P4Q7), L(P5Q7), L(P6Q7), L(P7Q7)
ce426f
-	.quad       L(P0Q8), L(P1Q8), L(P2Q8), L(P3Q8)
ce426f
-	.quad       L(P4Q8), L(P5Q8), L(P6Q8), L(P7Q8)
ce426f
-	.quad       L(P0Q9), L(P1Q9), L(P2Q9), L(P3Q9)
ce426f
-	.quad       L(P4Q9), L(P5Q9), L(P6Q9), L(P7Q9)
ce426f
-	.quad       L(P0QA), L(P1QA), L(P2QA), L(P3QA)
ce426f
-	.quad       L(P4QA), L(P5QA), L(P6QA), L(P7QA)
ce426f
-	.quad       L(P0QB), L(P1QB), L(P2QB), L(P3QB)
ce426f
-	.quad       L(P4QB), L(P5QB), L(P6QB), L(P7QB)
ce426f
-	.quad       L(P0QC), L(P1QC), L(P2QC), L(P3QC)
ce426f
-	.quad       L(P4QC), L(P5QC), L(P6QC), L(P7QC)
ce426f
-	.quad       L(P0QD), L(P1QD), L(P2QD), L(P3QD)
ce426f
-	.quad       L(P4QD), L(P5QD), L(P6QD), L(P7QD)
ce426f
-	.quad       L(P0QE), L(P1QE), L(P2QE), L(P3QE)
ce426f
-	.quad       L(P4QE), L(P5QE), L(P6QE), L(P7QE)
ce426f
-	.quad       L(P0QF), L(P1QF), L(P2QF), L(P3QF)
ce426f
-	.quad       L(P4QF), L(P5QF), L(P6QF), L(P7QF)
ce426f
-	.quad       L(P0QG), L(P1QG), L(P2QG), L(P3QG)
ce426f
-	.quad       L(P4QG), L(P5QG), L(P6QG), L(P7QG)
ce426f
-	.quad       L(P0QH), L(P1QH), L(P2QH), L(P3QH)
ce426f
-	.quad       L(P4QH), L(P5QH), L(P6QH), L(P7QH)
ce426f
-	.quad       L(P0QI)
ce426f
-# ifdef USE_EXTRA_TABLE
ce426f
-	.quad       L(P1QI), L(P2QI), L(P3QI), L(P4QI)
ce426f
-	.quad       L(P5QI), L(P6QI), L(P7QI)
ce426f
-# endif
ce426f
-#else
ce426f
-L(setPxQx):
ce426f
-	.short     L(Got0)-L(Got0)
ce426f
-	.short     L(P1Q0)-L(Got0)
ce426f
-	.short     L(P2Q0)-L(Got0)
ce426f
-	.short     L(P3Q0)-L(Got0)
ce426f
-	.short     L(P4Q0)-L(Got0)
ce426f
-	.short     L(P5Q0)-L(Got0)
ce426f
-	.short     L(P6Q0)-L(Got0)
ce426f
-	.short     L(P7Q0)-L(Got0)
ce426f
-
ce426f
-	.short     L(P0Q1)-L(Got0)
ce426f
-	.short     L(P1Q1)-L(Got0)
ce426f
-	.short     L(P2Q1)-L(Got0)
ce426f
-	.short     L(P3Q1)-L(Got0)
ce426f
-	.short     L(P4Q1)-L(Got0)
ce426f
-	.short     L(P5Q1)-L(Got0)
ce426f
-	.short     L(P6Q1)-L(Got0)
ce426f
-	.short     L(P7Q1)-L(Got0)
ce426f
-
ce426f
-	.short     L(P0Q2)-L(Got0)
ce426f
-	.short     L(P1Q2)-L(Got0)
ce426f
-	.short     L(P2Q2)-L(Got0)
ce426f
-	.short     L(P3Q2)-L(Got0)
ce426f
-	.short     L(P4Q2)-L(Got0)
ce426f
-	.short     L(P5Q2)-L(Got0)
ce426f
-	.short     L(P6Q2)-L(Got0)
ce426f
-	.short     L(P7Q2)-L(Got0)
ce426f
-
ce426f
-	.short     L(P0Q3)-L(Got0)
ce426f
-	.short     L(P1Q3)-L(Got0)
ce426f
-	.short     L(P2Q3)-L(Got0)
ce426f
-	.short     L(P3Q3)-L(Got0)
ce426f
-	.short     L(P4Q3)-L(Got0)
ce426f
-	.short     L(P5Q3)-L(Got0)
ce426f
-	.short     L(P6Q3)-L(Got0)
ce426f
-	.short     L(P7Q3)-L(Got0)
ce426f
-
ce426f
-	.short     L(P0Q4)-L(Got0)
ce426f
-	.short     L(P1Q4)-L(Got0)
ce426f
-	.short     L(P2Q4)-L(Got0)
ce426f
-	.short     L(P3Q4)-L(Got0)
ce426f
-	.short     L(P4Q4)-L(Got0)
ce426f
-	.short     L(P5Q4)-L(Got0)
ce426f
-	.short     L(P6Q4)-L(Got0)
ce426f
-	.short     L(P7Q4)-L(Got0)
ce426f
-
ce426f
-	.short     L(P0Q5)-L(Got0)
ce426f
-	.short     L(P1Q5)-L(Got0)
ce426f
-	.short     L(P2Q5)-L(Got0)
ce426f
-	.short     L(P3Q5)-L(Got0)
ce426f
-	.short     L(P4Q5)-L(Got0)
ce426f
-	.short     L(P5Q5)-L(Got0)
ce426f
-	.short     L(P6Q5)-L(Got0)
ce426f
-	.short     L(P7Q5)-L(Got0)
ce426f
-
ce426f
-	.short     L(P0Q6)-L(Got0)
ce426f
-	.short     L(P1Q6)-L(Got0)
ce426f
-	.short     L(P2Q6)-L(Got0)
ce426f
-	.short     L(P3Q6)-L(Got0)
ce426f
-	.short     L(P4Q6)-L(Got0)
ce426f
-	.short     L(P5Q6)-L(Got0)
ce426f
-	.short     L(P6Q6)-L(Got0)
ce426f
-	.short     L(P7Q6)-L(Got0)
ce426f
-
ce426f
-	.short     L(P0Q7)-L(Got0)
ce426f
-	.short     L(P1Q7)-L(Got0)
ce426f
-	.short     L(P2Q7)-L(Got0)
ce426f
-	.short     L(P3Q7)-L(Got0)
ce426f
-	.short     L(P4Q7)-L(Got0)
ce426f
-	.short     L(P5Q7)-L(Got0)
ce426f
-	.short     L(P6Q7)-L(Got0)
ce426f
-	.short     L(P7Q7)-L(Got0)
ce426f
-
ce426f
-	.short     L(P0Q8)-L(Got0)
ce426f
-	.short     L(P1Q8)-L(Got0)
ce426f
-	.short     L(P2Q8)-L(Got0)
ce426f
-	.short     L(P3Q8)-L(Got0)
ce426f
-	.short     L(P4Q8)-L(Got0)
ce426f
-	.short     L(P5Q8)-L(Got0)
ce426f
-	.short     L(P6Q8)-L(Got0)
ce426f
-	.short     L(P7Q8)-L(Got0)
ce426f
-
ce426f
-	.short     L(P0Q9)-L(Got0)
ce426f
-	.short     L(P1Q9)-L(Got0)
ce426f
-	.short     L(P2Q9)-L(Got0)
ce426f
-	.short     L(P3Q9)-L(Got0)
ce426f
-	.short     L(P4Q9)-L(Got0)
ce426f
-	.short     L(P5Q9)-L(Got0)
ce426f
-	.short     L(P6Q9)-L(Got0)
ce426f
-	.short     L(P7Q9)-L(Got0)
ce426f
-
ce426f
-	.short     L(P0QA)-L(Got0)
ce426f
-	.short     L(P1QA)-L(Got0)
ce426f
-	.short     L(P2QA)-L(Got0)
ce426f
-	.short     L(P3QA)-L(Got0)
ce426f
-	.short     L(P4QA)-L(Got0)
ce426f
-	.short     L(P5QA)-L(Got0)
ce426f
-	.short     L(P6QA)-L(Got0)
ce426f
-	.short     L(P7QA)-L(Got0)
ce426f
-
ce426f
-	.short     L(P0QB)-L(Got0)
ce426f
-	.short     L(P1QB)-L(Got0)
ce426f
-	.short     L(P2QB)-L(Got0)
ce426f
-	.short     L(P3QB)-L(Got0)
ce426f
-	.short     L(P4QB)-L(Got0)
ce426f
-	.short     L(P5QB)-L(Got0)
ce426f
-	.short     L(P6QB)-L(Got0)
ce426f
-	.short     L(P7QB)-L(Got0)
ce426f
-
ce426f
-	.short     L(P0QC)-L(Got0)
ce426f
-	.short     L(P1QC)-L(Got0)
ce426f
-	.short     L(P2QC)-L(Got0)
ce426f
-	.short     L(P3QC)-L(Got0)
ce426f
-	.short     L(P4QC)-L(Got0)
ce426f
-	.short     L(P5QC)-L(Got0)
ce426f
-	.short     L(P6QC)-L(Got0)
ce426f
-	.short     L(P7QC)-L(Got0)
ce426f
-
ce426f
-	.short     L(P0QD)-L(Got0)
ce426f
-	.short     L(P1QD)-L(Got0)
ce426f
-	.short     L(P2QD)-L(Got0)
ce426f
-	.short     L(P3QD)-L(Got0)
ce426f
-	.short     L(P4QD)-L(Got0)
ce426f
-	.short     L(P5QD)-L(Got0)
ce426f
-	.short     L(P6QD)-L(Got0)
ce426f
-	.short     L(P7QD)-L(Got0)
ce426f
-
ce426f
-	.short     L(P0QE)-L(Got0)
ce426f
-	.short     L(P1QE)-L(Got0)
ce426f
-	.short     L(P2QE)-L(Got0)
ce426f
-	.short     L(P3QE)-L(Got0)
ce426f
-	.short     L(P4QE)-L(Got0)
ce426f
-	.short     L(P5QE)-L(Got0)
ce426f
-	.short     L(P6QE)-L(Got0)
ce426f
-	.short     L(P7QE)-L(Got0)
ce426f
-
ce426f
-	.short     L(P0QF)-L(Got0)
ce426f
-	.short     L(P1QF)-L(Got0)
ce426f
-	.short     L(P2QF)-L(Got0)
ce426f
-	.short     L(P3QF)-L(Got0)
ce426f
-	.short     L(P4QF)-L(Got0)
ce426f
-	.short     L(P5QF)-L(Got0)
ce426f
-	.short     L(P6QF)-L(Got0)
ce426f
-	.short     L(P7QF)-L(Got0)
ce426f
-
ce426f
-	.short     L(P0QG)-L(Got0)
ce426f
-	.short     L(P1QG)-L(Got0)
ce426f
-	.short     L(P2QG)-L(Got0)
ce426f
-	.short     L(P3QG)-L(Got0)
ce426f
-	.short     L(P4QG)-L(Got0)
ce426f
-	.short     L(P5QG)-L(Got0)
ce426f
-	.short     L(P6QG)-L(Got0)
ce426f
-	.short     L(P7QG)-L(Got0)
ce426f
-
ce426f
-	.short     L(P0QH)-L(Got0)
ce426f
-	.short     L(P1QH)-L(Got0)
ce426f
-	.short     L(P2QH)-L(Got0)
ce426f
-	.short     L(P3QH)-L(Got0)
ce426f
-	.short     L(P4QH)-L(Got0)
ce426f
-	.short     L(P5QH)-L(Got0)
ce426f
-	.short     L(P6QH)-L(Got0)
ce426f
-	.short     L(P7QH)-L(Got0)
ce426f
-
ce426f
-	.short     L(P0QI)-L(Got0)
ce426f
-# ifdef USE_EXTRA_TABLE
ce426f
-	.short     L(P1QI)-L(Got0)
ce426f
-	.short     L(P2QI)-L(Got0)
ce426f
-	.short     L(P3QI)-L(Got0)
ce426f
-	.short     L(P4QI)-L(Got0)
ce426f
-	.short     L(P5QI)-L(Got0)
ce426f
-	.short     L(P6QI)-L(Got0)
ce426f
-	.short     L(P7QI)-L(Got0)
ce426f
-# endif
ce426f
-#endif
ce426f
-	.popsection
ce426f
-
ce426f
-	.balign     16
ce426f
-#ifdef USE_EXTRA_TABLE
ce426f
-L(P1QI): mov    %rdx,-0x91(%rdi)
ce426f
-#endif
ce426f
-L(P1QH): mov    %rdx,-0x89(%rdi)
ce426f
-L(P1QG): mov    %rdx,-0x81(%rdi)
ce426f
-#		   .balign     16
ce426f
-L(P1QF): mov    %rdx,-0x79(%rdi)
ce426f
-L(P1QE): mov    %rdx,-0x71(%rdi)
ce426f
-L(P1QD): mov    %rdx,-0x69(%rdi)
ce426f
-L(P1QC): mov    %rdx,-0x61(%rdi)
ce426f
-L(P1QB): mov    %rdx,-0x59(%rdi)
ce426f
-L(P1QA): mov    %rdx,-0x51(%rdi)
ce426f
-L(P1Q9): mov    %rdx,-0x49(%rdi)
ce426f
-L(P1Q8): mov    %rdx,-0x41(%rdi)
ce426f
-L(P1Q7): mov    %rdx,-0x39(%rdi)
ce426f
-L(P1Q6): mov    %rdx,-0x31(%rdi)
ce426f
-L(P1Q5): mov    %rdx,-0x29(%rdi)
ce426f
-L(P1Q4): mov    %rdx,-0x21(%rdi)
ce426f
-L(P1Q3): mov    %rdx,-0x19(%rdi)
ce426f
-L(P1Q2): mov    %rdx,-0x11(%rdi)
ce426f
-L(P1Q1): mov    %rdx,-0x9(%rdi)
ce426f
-L(P1Q0): mov    %dl,-0x1(%rdi)
ce426f
-		retq
ce426f
-
ce426f
-	.balign     16
ce426f
-L(P0QI): mov    %rdx,-0x90(%rdi)
ce426f
-L(P0QH): mov    %rdx,-0x88(%rdi)
ce426f
-#		   .balign     16
ce426f
-L(P0QG): mov    %rdx,-0x80(%rdi)
ce426f
-L(P0QF): mov    %rdx,-0x78(%rdi)
ce426f
-L(P0QE): mov    %rdx,-0x70(%rdi)
ce426f
-L(P0QD): mov    %rdx,-0x68(%rdi)
ce426f
-L(P0QC): mov    %rdx,-0x60(%rdi)
ce426f
-L(P0QB): mov    %rdx,-0x58(%rdi)
ce426f
-L(P0QA): mov    %rdx,-0x50(%rdi)
ce426f
-L(P0Q9): mov    %rdx,-0x48(%rdi)
ce426f
-L(P0Q8): mov    %rdx,-0x40(%rdi)
ce426f
-L(P0Q7): mov    %rdx,-0x38(%rdi)
ce426f
-L(P0Q6): mov    %rdx,-0x30(%rdi)
ce426f
-L(P0Q5): mov    %rdx,-0x28(%rdi)
ce426f
-L(P0Q4): mov    %rdx,-0x20(%rdi)
ce426f
-L(P0Q3): mov    %rdx,-0x18(%rdi)
ce426f
-L(P0Q2): mov    %rdx,-0x10(%rdi)
ce426f
-L(P0Q1): mov    %rdx,-0x8(%rdi)
ce426f
-L(P0Q0): retq
ce426f
-
ce426f
-
ce426f
-	.balign     16
ce426f
-#ifdef USE_EXTRA_TABLE
ce426f
-L(P2QI): mov    %rdx,-0x92(%rdi)
ce426f
-#endif
ce426f
-L(P2QH): mov    %rdx,-0x8a(%rdi)
ce426f
-L(P2QG): mov    %rdx,-0x82(%rdi)
ce426f
-#		   .balign     16
ce426f
-L(P2QF): mov    %rdx,-0x7a(%rdi)
ce426f
-L(P2QE): mov    %rdx,-0x72(%rdi)
ce426f
-L(P2QD): mov    %rdx,-0x6a(%rdi)
ce426f
-L(P2QC): mov    %rdx,-0x62(%rdi)
ce426f
-L(P2QB): mov    %rdx,-0x5a(%rdi)
ce426f
-L(P2QA): mov    %rdx,-0x52(%rdi)
ce426f
-L(P2Q9): mov    %rdx,-0x4a(%rdi)
ce426f
-L(P2Q8): mov    %rdx,-0x42(%rdi)
ce426f
-L(P2Q7): mov    %rdx,-0x3a(%rdi)
ce426f
-L(P2Q6): mov    %rdx,-0x32(%rdi)
ce426f
-L(P2Q5): mov    %rdx,-0x2a(%rdi)
ce426f
-L(P2Q4): mov    %rdx,-0x22(%rdi)
ce426f
-L(P2Q3): mov    %rdx,-0x1a(%rdi)
ce426f
-L(P2Q2): mov    %rdx,-0x12(%rdi)
ce426f
-L(P2Q1): mov    %rdx,-0xa(%rdi)
ce426f
-L(P2Q0): mov    %dx,-0x2(%rdi)
ce426f
-		retq
ce426f
-
ce426f
-	.balign     16
ce426f
-#ifdef USE_EXTRA_TABLE
ce426f
-L(P3QI): mov    %rdx,-0x93(%rdi)
ce426f
-#endif
ce426f
-L(P3QH): mov    %rdx,-0x8b(%rdi)
ce426f
-L(P3QG): mov    %rdx,-0x83(%rdi)
ce426f
-#		   .balign     16
ce426f
-L(P3QF): mov    %rdx,-0x7b(%rdi)
ce426f
-L(P3QE): mov    %rdx,-0x73(%rdi)
ce426f
-L(P3QD): mov    %rdx,-0x6b(%rdi)
ce426f
-L(P3QC): mov    %rdx,-0x63(%rdi)
ce426f
-L(P3QB): mov    %rdx,-0x5b(%rdi)
ce426f
-L(P3QA): mov    %rdx,-0x53(%rdi)
ce426f
-L(P3Q9): mov    %rdx,-0x4b(%rdi)
ce426f
-L(P3Q8): mov    %rdx,-0x43(%rdi)
ce426f
-L(P3Q7): mov    %rdx,-0x3b(%rdi)
ce426f
-L(P3Q6): mov    %rdx,-0x33(%rdi)
ce426f
-L(P3Q5): mov    %rdx,-0x2b(%rdi)
ce426f
-L(P3Q4): mov    %rdx,-0x23(%rdi)
ce426f
-L(P3Q3): mov    %rdx,-0x1b(%rdi)
ce426f
-L(P3Q2): mov    %rdx,-0x13(%rdi)
ce426f
-L(P3Q1): mov    %rdx,-0xb(%rdi)
ce426f
-L(P3Q0): mov    %dx,-0x3(%rdi)
ce426f
-		mov    %dl,-0x1(%rdi)
ce426f
-		retq
ce426f
-
ce426f
-	.balign     16
ce426f
-#ifdef USE_EXTRA_TABLE
ce426f
-L(P4QI): mov    %rdx,-0x94(%rdi)
ce426f
-#endif
ce426f
-L(P4QH): mov    %rdx,-0x8c(%rdi)
ce426f
-L(P4QG): mov    %rdx,-0x84(%rdi)
ce426f
-#		   .balign     16
ce426f
-L(P4QF): mov    %rdx,-0x7c(%rdi)
ce426f
-L(P4QE): mov    %rdx,-0x74(%rdi)
ce426f
-L(P4QD): mov    %rdx,-0x6c(%rdi)
ce426f
-L(P4QC): mov    %rdx,-0x64(%rdi)
ce426f
-L(P4QB): mov    %rdx,-0x5c(%rdi)
ce426f
-L(P4QA): mov    %rdx,-0x54(%rdi)
ce426f
-L(P4Q9): mov    %rdx,-0x4c(%rdi)
ce426f
-L(P4Q8): mov    %rdx,-0x44(%rdi)
ce426f
-L(P4Q7): mov    %rdx,-0x3c(%rdi)
ce426f
-L(P4Q6): mov    %rdx,-0x34(%rdi)
ce426f
-L(P4Q5): mov    %rdx,-0x2c(%rdi)
ce426f
-L(P4Q4): mov    %rdx,-0x24(%rdi)
ce426f
-L(P4Q3): mov    %rdx,-0x1c(%rdi)
ce426f
-L(P4Q2): mov    %rdx,-0x14(%rdi)
ce426f
-L(P4Q1): mov    %rdx,-0xc(%rdi)
ce426f
-L(P4Q0): mov    %edx,-0x4(%rdi)
ce426f
-		retq
ce426f
-
ce426f
-	.balign     16
ce426f
-#ifdef USE_EXTRA_TABLE
ce426f
-L(P5QI): mov    %rdx,-0x95(%rdi)
ce426f
-#endif
ce426f
-L(P5QH): mov    %rdx,-0x8d(%rdi)
ce426f
-L(P5QG): mov    %rdx,-0x85(%rdi)
ce426f
-#		   .balign     16
ce426f
-L(P5QF): mov    %rdx,-0x7d(%rdi)
ce426f
-L(P5QE): mov    %rdx,-0x75(%rdi)
ce426f
-L(P5QD): mov    %rdx,-0x6d(%rdi)
ce426f
-L(P5QC): mov    %rdx,-0x65(%rdi)
ce426f
-L(P5QB): mov    %rdx,-0x5d(%rdi)
ce426f
-L(P5QA): mov    %rdx,-0x55(%rdi)
ce426f
-L(P5Q9): mov    %rdx,-0x4d(%rdi)
ce426f
-L(P5Q8): mov    %rdx,-0x45(%rdi)
ce426f
-L(P5Q7): mov    %rdx,-0x3d(%rdi)
ce426f
-L(P5Q6): mov    %rdx,-0x35(%rdi)
ce426f
-L(P5Q5): mov    %rdx,-0x2d(%rdi)
ce426f
-L(P5Q4): mov    %rdx,-0x25(%rdi)
ce426f
-L(P5Q3): mov    %rdx,-0x1d(%rdi)
ce426f
-L(P5Q2): mov    %rdx,-0x15(%rdi)
ce426f
-L(P5Q1): mov    %rdx,-0xd(%rdi)
ce426f
-L(P5Q0): mov    %edx,-0x5(%rdi)
ce426f
-		mov    %dl,-0x1(%rdi)
ce426f
-		retq
ce426f
-
ce426f
-	.balign     16
ce426f
-#ifdef USE_EXTRA_TABLE
ce426f
-L(P6QI): mov    %rdx,-0x96(%rdi)
ce426f
-#endif
ce426f
-L(P6QH): mov    %rdx,-0x8e(%rdi)
ce426f
-L(P6QG): mov    %rdx,-0x86(%rdi)
ce426f
-#		   .balign     16
ce426f
-L(P6QF): mov    %rdx,-0x7e(%rdi)
ce426f
-L(P6QE): mov    %rdx,-0x76(%rdi)
ce426f
-L(P6QD): mov    %rdx,-0x6e(%rdi)
ce426f
-L(P6QC): mov    %rdx,-0x66(%rdi)
ce426f
-L(P6QB): mov    %rdx,-0x5e(%rdi)
ce426f
-L(P6QA): mov    %rdx,-0x56(%rdi)
ce426f
-L(P6Q9): mov    %rdx,-0x4e(%rdi)
ce426f
-L(P6Q8): mov    %rdx,-0x46(%rdi)
ce426f
-L(P6Q7): mov    %rdx,-0x3e(%rdi)
ce426f
-L(P6Q6): mov    %rdx,-0x36(%rdi)
ce426f
-L(P6Q5): mov    %rdx,-0x2e(%rdi)
ce426f
-L(P6Q4): mov    %rdx,-0x26(%rdi)
ce426f
-L(P6Q3): mov    %rdx,-0x1e(%rdi)
ce426f
-L(P6Q2): mov    %rdx,-0x16(%rdi)
ce426f
-L(P6Q1): mov    %rdx,-0xe(%rdi)
ce426f
-L(P6Q0): mov    %edx,-0x6(%rdi)
ce426f
-		mov    %dx,-0x2(%rdi)
ce426f
-		retq
ce426f
-
ce426f
-	.balign     16
ce426f
-#ifdef USE_EXTRA_TABLE
ce426f
-L(P7QI): mov    %rdx,-0x97(%rdi)
ce426f
-#endif
ce426f
-L(P7QH): mov    %rdx,-0x8f(%rdi)
ce426f
-L(P7QG): mov    %rdx,-0x87(%rdi)
ce426f
-#		   .balign     16
ce426f
-L(P7QF): mov    %rdx,-0x7f(%rdi)
ce426f
-L(P7QE): mov    %rdx,-0x77(%rdi)
ce426f
-L(P7QD): mov    %rdx,-0x6f(%rdi)
ce426f
-L(P7QC): mov    %rdx,-0x67(%rdi)
ce426f
-L(P7QB): mov    %rdx,-0x5f(%rdi)
ce426f
-L(P7QA): mov    %rdx,-0x57(%rdi)
ce426f
-L(P7Q9): mov    %rdx,-0x4f(%rdi)
ce426f
-L(P7Q8): mov    %rdx,-0x47(%rdi)
ce426f
-L(P7Q7): mov    %rdx,-0x3f(%rdi)
ce426f
-L(P7Q6): mov    %rdx,-0x37(%rdi)
ce426f
-L(P7Q5): mov    %rdx,-0x2f(%rdi)
ce426f
-L(P7Q4): mov    %rdx,-0x27(%rdi)
ce426f
-L(P7Q3): mov    %rdx,-0x1f(%rdi)
ce426f
-L(P7Q2): mov    %rdx,-0x17(%rdi)
ce426f
-L(P7Q1): mov    %rdx,-0xf(%rdi)
ce426f
-L(P7Q0): mov    %edx,-0x7(%rdi)
ce426f
-		mov    %dx,-0x3(%rdi)
ce426f
-		mov    %dl,-0x1(%rdi)
ce426f
-		retq
ce426f
-
ce426f
-	.balign     16
ce426f
-L(ck_mem_ops_method):
ce426f
-
ce426f
-# align to 16 byte boundary first
ce426f
-	#test $0xf,%rdi
ce426f
-	#jz L(aligned_now)
ce426f
-	mov    $0x10,%r10
ce426f
-	mov    %rdi,%r9
ce426f
-	and    $0xf,%r9
ce426f
-	sub    %r9,%r10
ce426f
-	and    $0xf,%r10
ce426f
-	add    %r10,%rdi
ce426f
-	sub    %r10,%r8
ce426f
-#ifndef PIC
ce426f
-	lea    L(AliPxQx)(%rip),%r11
ce426f
-	jmpq   *(%r11,%r10,8)
ce426f
-#else
ce426f
-	lea    L(aligned_now)(%rip), %r11
ce426f
-	lea    L(AliPxQx)(%rip),%rcx
ce426f
-	movswq (%rcx,%r10,2),%rcx
ce426f
-	lea    (%rcx,%r11,1),%r11
ce426f
-	jmpq   *%r11
ce426f
-#endif
ce426f
-
ce426f
-	.pushsection .rodata
ce426f
-	.balign     16
ce426f
-#ifndef PIC
ce426f
-L(AliPxQx):
ce426f
-	.quad       L(aligned_now), L(A1Q0), L(A2Q0), L(A3Q0)
ce426f
-	.quad	    L(A4Q0), L(A5Q0), L(A6Q0), L(A7Q0)
ce426f
-	.quad       L(A0Q1), L(A1Q1), L(A2Q1), L(A3Q1)
ce426f
-	.quad       L(A4Q1), L(A5Q1), L(A6Q1), L(A7Q1)
ce426f
-#else
ce426f
-L(AliPxQx):
ce426f
-	.short     L(aligned_now)-L(aligned_now)
ce426f
-	.short     L(A1Q0)-L(aligned_now)
ce426f
-	.short     L(A2Q0)-L(aligned_now)
ce426f
-	.short     L(A3Q0)-L(aligned_now)
ce426f
-	.short     L(A4Q0)-L(aligned_now)
ce426f
-	.short     L(A5Q0)-L(aligned_now)
ce426f
-	.short     L(A6Q0)-L(aligned_now)
ce426f
-	.short     L(A7Q0)-L(aligned_now)
ce426f
-
ce426f
-	.short     L(A0Q1)-L(aligned_now)
ce426f
-	.short     L(A1Q1)-L(aligned_now)
ce426f
-	.short     L(A2Q1)-L(aligned_now)
ce426f
-	.short     L(A3Q1)-L(aligned_now)
ce426f
-	.short     L(A4Q1)-L(aligned_now)
ce426f
-	.short     L(A5Q1)-L(aligned_now)
ce426f
-	.short     L(A6Q1)-L(aligned_now)
ce426f
-	.short     L(A7Q1)-L(aligned_now)
ce426f
-#endif
ce426f
-	.popsection
ce426f
-
ce426f
-	.balign     16
ce426f
-L(A5Q1):    mov    %dl,-0xd(%rdi)
ce426f
-L(A4Q1):    mov    %edx,-0xc(%rdi)
ce426f
-L(A0Q1):    mov    %rdx,-0x8(%rdi)
ce426f
-L(A0Q0):    jmp     L(aligned_now)
ce426f
-
ce426f
-	.balign     16
ce426f
-L(A1Q1):   mov    %dl,-0x9(%rdi)
ce426f
-	mov    %rdx,-0x8(%rdi)
ce426f
-	jmp    L(aligned_now)
ce426f
-
ce426f
-	.balign     16
ce426f
-L(A1Q0):   mov    %dl,-0x1(%rdi)
ce426f
-	jmp    L(aligned_now)
ce426f
-
ce426f
-	.balign     16
ce426f
-L(A3Q1):    mov    %dl,-0xb(%rdi)
ce426f
-L(A2Q1):    mov    %dx,-0xa(%rdi)
ce426f
-	mov    %rdx,-0x8(%rdi)
ce426f
-	jmp    L(aligned_now)
ce426f
-
ce426f
-	.balign     16
ce426f
-L(A3Q0):    mov    %dl,-0x3(%rdi)
ce426f
-L(A2Q0):    mov    %dx,-0x2(%rdi)
ce426f
-	jmp    L(aligned_now)
ce426f
-
ce426f
-	.balign     16
ce426f
-L(A5Q0):    mov    %dl,-0x5(%rdi)
ce426f
-L(A4Q0):    mov    %edx,-0x4(%rdi)
ce426f
-	jmp    L(aligned_now)
ce426f
-
ce426f
-	.balign     16
ce426f
-L(A7Q1):    mov    %dl,-0xf(%rdi)
ce426f
-L(A6Q1):    mov    %dx,-0xe(%rdi)
ce426f
-	mov    %edx,-0xc(%rdi)
ce426f
-	mov    %rdx,-0x8(%rdi)
ce426f
-	jmp    L(aligned_now)
ce426f
-
ce426f
-	.balign     16
ce426f
-L(A7Q0):    mov    %dl,-0x7(%rdi)
ce426f
-L(A6Q0):    mov    %dx,-0x6(%rdi)
ce426f
-	mov    %edx,-0x4(%rdi)
ce426f
-
ce426f
-#ifndef USE_MULTIARCH
ce426f
-	jmp    L(aligned_now)
ce426f
-
ce426f
-L(SSE_pre):
ce426f
-#else
ce426f
-L(aligned_now):
ce426f
-#endif
ce426f
-#if !defined USE_MULTIARCH || defined USE_SSE2
ce426f
-	 # fill RegXMM0 with the pattern
ce426f
-	 movd   %rdx,%xmm0
ce426f
-	 punpcklqdq %xmm0,%xmm0
ce426f
-
ce426f
-	 cmp    $0xb0,%r8 # 176
ce426f
-	 jae    L(byte32sse2_pre)
ce426f
-
ce426f
-	 add    %r8,%rdi
ce426f
-# ifndef PIC
ce426f
-	 lea    L(SSExDx)(%rip),%r9
ce426f
-	 jmpq   *(%r9,%r8,8)
ce426f
-# else
ce426f
-	 lea    L(SSE0Q0)(%rip),%r9
ce426f
-	 lea    L(SSExDx)(%rip),%rcx
ce426f
-	 movswq (%rcx,%r8,2),%rcx
ce426f
-	 lea    (%rcx,%r9,1),%r9
ce426f
-	 jmpq   *%r9
ce426f
-# endif
ce426f
-
ce426f
-L(SSE0QB):  movdqa %xmm0,-0xb0(%rdi)
ce426f
-L(SSE0QA):  movdqa %xmm0,-0xa0(%rdi)
ce426f
-L(SSE0Q9):  movdqa %xmm0,-0x90(%rdi)
ce426f
-L(SSE0Q8):  movdqa %xmm0,-0x80(%rdi)
ce426f
-L(SSE0Q7):  movdqa %xmm0,-0x70(%rdi)
ce426f
-L(SSE0Q6):  movdqa %xmm0,-0x60(%rdi)
ce426f
-L(SSE0Q5):  movdqa %xmm0,-0x50(%rdi)
ce426f
-L(SSE0Q4):  movdqa %xmm0,-0x40(%rdi)
ce426f
-L(SSE0Q3):  movdqa %xmm0,-0x30(%rdi)
ce426f
-L(SSE0Q2):  movdqa %xmm0,-0x20(%rdi)
ce426f
-L(SSE0Q1):  movdqa %xmm0,-0x10(%rdi)
ce426f
-L(SSE0Q0):  retq
ce426f
-
ce426f
-L(SSE1QB):  movdqa %xmm0,-0xb1(%rdi)
ce426f
-L(SSE1QA):  movdqa %xmm0,-0xa1(%rdi)
ce426f
-L(SSE1Q9):  movdqa %xmm0,-0x91(%rdi)
ce426f
-L(SSE1Q8):  movdqa %xmm0,-0x81(%rdi)
ce426f
-L(SSE1Q7):  movdqa %xmm0,-0x71(%rdi)
ce426f
-L(SSE1Q6):  movdqa %xmm0,-0x61(%rdi)
ce426f
-L(SSE1Q5):  movdqa %xmm0,-0x51(%rdi)
ce426f
-L(SSE1Q4):  movdqa %xmm0,-0x41(%rdi)
ce426f
-L(SSE1Q3):  movdqa %xmm0,-0x31(%rdi)
ce426f
-L(SSE1Q2):  movdqa %xmm0,-0x21(%rdi)
ce426f
-L(SSE1Q1):  movdqa %xmm0,-0x11(%rdi)
ce426f
-L(SSE1Q0):  mov    %dl,-0x1(%rdi)
ce426f
-	retq
ce426f
-
ce426f
-L(SSE2QB):  movdqa %xmm0,-0xb2(%rdi)
ce426f
-L(SSE2QA):  movdqa %xmm0,-0xa2(%rdi)
ce426f
-L(SSE2Q9):  movdqa %xmm0,-0x92(%rdi)
ce426f
-L(SSE2Q8):  movdqa %xmm0,-0x82(%rdi)
ce426f
-L(SSE2Q7):  movdqa %xmm0,-0x72(%rdi)
ce426f
-L(SSE2Q6):  movdqa %xmm0,-0x62(%rdi)
ce426f
-L(SSE2Q5):  movdqa %xmm0,-0x52(%rdi)
ce426f
-L(SSE2Q4):  movdqa %xmm0,-0x42(%rdi)
ce426f
-L(SSE2Q3):  movdqa %xmm0,-0x32(%rdi)
ce426f
-L(SSE2Q2):  movdqa %xmm0,-0x22(%rdi)
ce426f
-L(SSE2Q1):  movdqa %xmm0,-0x12(%rdi)
ce426f
-L(SSE2Q0):  mov    %dx,-0x2(%rdi)
ce426f
-	retq
ce426f
-
ce426f
-L(SSE3QB):  movdqa %xmm0,-0xb3(%rdi)
ce426f
-L(SSE3QA):  movdqa %xmm0,-0xa3(%rdi)
ce426f
-L(SSE3Q9):  movdqa %xmm0,-0x93(%rdi)
ce426f
-L(SSE3Q8):  movdqa %xmm0,-0x83(%rdi)
ce426f
-L(SSE3Q7):  movdqa %xmm0,-0x73(%rdi)
ce426f
-L(SSE3Q6):  movdqa %xmm0,-0x63(%rdi)
ce426f
-L(SSE3Q5):  movdqa %xmm0,-0x53(%rdi)
ce426f
-L(SSE3Q4):  movdqa %xmm0,-0x43(%rdi)
ce426f
-L(SSE3Q3):  movdqa %xmm0,-0x33(%rdi)
ce426f
-L(SSE3Q2):  movdqa %xmm0,-0x23(%rdi)
ce426f
-L(SSE3Q1):  movdqa %xmm0,-0x13(%rdi)
ce426f
-L(SSE3Q0):  mov    %dx,-0x3(%rdi)
ce426f
-	mov    %dl,-0x1(%rdi)
ce426f
-	retq
ce426f
-
ce426f
-L(SSE4QB):  movdqa %xmm0,-0xb4(%rdi)
ce426f
-L(SSE4QA):  movdqa %xmm0,-0xa4(%rdi)
ce426f
-L(SSE4Q9):  movdqa %xmm0,-0x94(%rdi)
ce426f
-L(SSE4Q8):  movdqa %xmm0,-0x84(%rdi)
ce426f
-L(SSE4Q7):  movdqa %xmm0,-0x74(%rdi)
ce426f
-L(SSE4Q6):  movdqa %xmm0,-0x64(%rdi)
ce426f
-L(SSE4Q5):  movdqa %xmm0,-0x54(%rdi)
ce426f
-L(SSE4Q4):  movdqa %xmm0,-0x44(%rdi)
ce426f
-L(SSE4Q3):  movdqa %xmm0,-0x34(%rdi)
ce426f
-L(SSE4Q2):  movdqa %xmm0,-0x24(%rdi)
ce426f
-L(SSE4Q1):  movdqa %xmm0,-0x14(%rdi)
ce426f
-L(SSE4Q0):  mov    %edx,-0x4(%rdi)
ce426f
-	retq
ce426f
-
ce426f
-L(SSE5QB):  movdqa %xmm0,-0xb5(%rdi)
ce426f
-L(SSE5QA):  movdqa %xmm0,-0xa5(%rdi)
ce426f
-L(SSE5Q9):  movdqa %xmm0,-0x95(%rdi)
ce426f
-L(SSE5Q8):  movdqa %xmm0,-0x85(%rdi)
ce426f
-L(SSE5Q7):  movdqa %xmm0,-0x75(%rdi)
ce426f
-L(SSE5Q6):  movdqa %xmm0,-0x65(%rdi)
ce426f
-L(SSE5Q5):  movdqa %xmm0,-0x55(%rdi)
ce426f
-L(SSE5Q4):  movdqa %xmm0,-0x45(%rdi)
ce426f
-L(SSE5Q3):  movdqa %xmm0,-0x35(%rdi)
ce426f
-L(SSE5Q2):  movdqa %xmm0,-0x25(%rdi)
ce426f
-L(SSE5Q1):  movdqa %xmm0,-0x15(%rdi)
ce426f
-L(SSE5Q0):  mov    %edx,-0x5(%rdi)
ce426f
-	mov    %dl,-0x1(%rdi)
ce426f
-	retq
ce426f
-
ce426f
-
ce426f
-L(SSE6QB):  movdqa %xmm0,-0xb6(%rdi)
ce426f
-L(SSE6QA):  movdqa %xmm0,-0xa6(%rdi)
ce426f
-L(SSE6Q9):  movdqa %xmm0,-0x96(%rdi)
ce426f
-L(SSE6Q8):  movdqa %xmm0,-0x86(%rdi)
ce426f
-L(SSE6Q7):  movdqa %xmm0,-0x76(%rdi)
ce426f
-L(SSE6Q6):  movdqa %xmm0,-0x66(%rdi)
ce426f
-L(SSE6Q5):  movdqa %xmm0,-0x56(%rdi)
ce426f
-L(SSE6Q4):  movdqa %xmm0,-0x46(%rdi)
ce426f
-L(SSE6Q3):  movdqa %xmm0,-0x36(%rdi)
ce426f
-L(SSE6Q2):  movdqa %xmm0,-0x26(%rdi)
ce426f
-L(SSE6Q1):  movdqa %xmm0,-0x16(%rdi)
ce426f
-L(SSE6Q0):  mov    %edx,-0x6(%rdi)
ce426f
-	mov    %dx,-0x2(%rdi)
ce426f
-	retq
ce426f
-
ce426f
-L(SSE7QB):  movdqa %xmm0,-0xb7(%rdi)
ce426f
-L(SSE7QA):  movdqa %xmm0,-0xa7(%rdi)
ce426f
-L(SSE7Q9):  movdqa %xmm0,-0x97(%rdi)
ce426f
-L(SSE7Q8):  movdqa %xmm0,-0x87(%rdi)
ce426f
-L(SSE7Q7):  movdqa %xmm0,-0x77(%rdi)
ce426f
-L(SSE7Q6):  movdqa %xmm0,-0x67(%rdi)
ce426f
-L(SSE7Q5):  movdqa %xmm0,-0x57(%rdi)
ce426f
-L(SSE7Q4):  movdqa %xmm0,-0x47(%rdi)
ce426f
-L(SSE7Q3):  movdqa %xmm0,-0x37(%rdi)
ce426f
-L(SSE7Q2):  movdqa %xmm0,-0x27(%rdi)
ce426f
-L(SSE7Q1):  movdqa %xmm0,-0x17(%rdi)
ce426f
-L(SSE7Q0):  mov    %edx,-0x7(%rdi)
ce426f
-	mov    %dx,-0x3(%rdi)
ce426f
-	mov    %dl,-0x1(%rdi)
ce426f
-	retq
ce426f
-
ce426f
-L(SSE8QB):  movdqa %xmm0,-0xb8(%rdi)
ce426f
-L(SSE8QA):  movdqa %xmm0,-0xa8(%rdi)
ce426f
-L(SSE8Q9):  movdqa %xmm0,-0x98(%rdi)
ce426f
-L(SSE8Q8):  movdqa %xmm0,-0x88(%rdi)
ce426f
-L(SSE8Q7):  movdqa %xmm0,-0x78(%rdi)
ce426f
-L(SSE8Q6):  movdqa %xmm0,-0x68(%rdi)
ce426f
-L(SSE8Q5):  movdqa %xmm0,-0x58(%rdi)
ce426f
-L(SSE8Q4):  movdqa %xmm0,-0x48(%rdi)
ce426f
-L(SSE8Q3):  movdqa %xmm0,-0x38(%rdi)
ce426f
-L(SSE8Q2):  movdqa %xmm0,-0x28(%rdi)
ce426f
-L(SSE8Q1):  movdqa %xmm0,-0x18(%rdi)
ce426f
-L(SSE8Q0):  mov    %rdx,-0x8(%rdi)
ce426f
-	retq
ce426f
-
ce426f
-L(SSE9QB):  movdqa %xmm0,-0xb9(%rdi)
ce426f
-L(SSE9QA):  movdqa %xmm0,-0xa9(%rdi)
ce426f
-L(SSE9Q9):  movdqa %xmm0,-0x99(%rdi)
ce426f
-L(SSE9Q8):  movdqa %xmm0,-0x89(%rdi)
ce426f
-L(SSE9Q7):  movdqa %xmm0,-0x79(%rdi)
ce426f
-L(SSE9Q6):  movdqa %xmm0,-0x69(%rdi)
ce426f
-L(SSE9Q5):  movdqa %xmm0,-0x59(%rdi)
ce426f
-L(SSE9Q4):  movdqa %xmm0,-0x49(%rdi)
ce426f
-L(SSE9Q3):  movdqa %xmm0,-0x39(%rdi)
ce426f
-L(SSE9Q2):  movdqa %xmm0,-0x29(%rdi)
ce426f
-L(SSE9Q1):  movdqa %xmm0,-0x19(%rdi)
ce426f
-L(SSE9Q0):  mov    %rdx,-0x9(%rdi)
ce426f
-	mov    %dl,-0x1(%rdi)
ce426f
-	retq
ce426f
-
ce426f
-L(SSE10QB): movdqa %xmm0,-0xba(%rdi)
ce426f
-L(SSE10QA): movdqa %xmm0,-0xaa(%rdi)
ce426f
-L(SSE10Q9): movdqa %xmm0,-0x9a(%rdi)
ce426f
-L(SSE10Q8): movdqa %xmm0,-0x8a(%rdi)
ce426f
-L(SSE10Q7): movdqa %xmm0,-0x7a(%rdi)
ce426f
-L(SSE10Q6): movdqa %xmm0,-0x6a(%rdi)
ce426f
-L(SSE10Q5): movdqa %xmm0,-0x5a(%rdi)
ce426f
-L(SSE10Q4): movdqa %xmm0,-0x4a(%rdi)
ce426f
-L(SSE10Q3): movdqa %xmm0,-0x3a(%rdi)
ce426f
-L(SSE10Q2): movdqa %xmm0,-0x2a(%rdi)
ce426f
-L(SSE10Q1): movdqa %xmm0,-0x1a(%rdi)
ce426f
-L(SSE10Q0): mov    %rdx,-0xa(%rdi)
ce426f
-	mov    %dx,-0x2(%rdi)
ce426f
-	retq
ce426f
-
ce426f
-L(SSE11QB): movdqa %xmm0,-0xbb(%rdi)
ce426f
-L(SSE11QA): movdqa %xmm0,-0xab(%rdi)
ce426f
-L(SSE11Q9): movdqa %xmm0,-0x9b(%rdi)
ce426f
-L(SSE11Q8): movdqa %xmm0,-0x8b(%rdi)
ce426f
-L(SSE11Q7): movdqa %xmm0,-0x7b(%rdi)
ce426f
-L(SSE11Q6): movdqa %xmm0,-0x6b(%rdi)
ce426f
-L(SSE11Q5): movdqa %xmm0,-0x5b(%rdi)
ce426f
-L(SSE11Q4): movdqa %xmm0,-0x4b(%rdi)
ce426f
-L(SSE11Q3): movdqa %xmm0,-0x3b(%rdi)
ce426f
-L(SSE11Q2): movdqa %xmm0,-0x2b(%rdi)
ce426f
-L(SSE11Q1): movdqa %xmm0,-0x1b(%rdi)
ce426f
-L(SSE11Q0): mov    %rdx,-0xb(%rdi)
ce426f
-	mov    %dx,-0x3(%rdi)
ce426f
-	mov    %dl,-0x1(%rdi)
ce426f
-	retq
ce426f
-
ce426f
-L(SSE12QB): movdqa %xmm0,-0xbc(%rdi)
ce426f
-L(SSE12QA): movdqa %xmm0,-0xac(%rdi)
ce426f
-L(SSE12Q9): movdqa %xmm0,-0x9c(%rdi)
ce426f
-L(SSE12Q8): movdqa %xmm0,-0x8c(%rdi)
ce426f
-L(SSE12Q7): movdqa %xmm0,-0x7c(%rdi)
ce426f
-L(SSE12Q6): movdqa %xmm0,-0x6c(%rdi)
ce426f
-L(SSE12Q5): movdqa %xmm0,-0x5c(%rdi)
ce426f
-L(SSE12Q4): movdqa %xmm0,-0x4c(%rdi)
ce426f
-L(SSE12Q3): movdqa %xmm0,-0x3c(%rdi)
ce426f
-L(SSE12Q2): movdqa %xmm0,-0x2c(%rdi)
ce426f
-L(SSE12Q1): movdqa %xmm0,-0x1c(%rdi)
ce426f
-L(SSE12Q0): mov    %rdx,-0xc(%rdi)
ce426f
-	mov    %edx,-0x4(%rdi)
ce426f
-	retq
ce426f
-
ce426f
-L(SSE13QB): movdqa %xmm0,-0xbd(%rdi)
ce426f
-L(SSE13QA): movdqa %xmm0,-0xad(%rdi)
ce426f
-L(SSE13Q9): movdqa %xmm0,-0x9d(%rdi)
ce426f
-L(SSE13Q8): movdqa %xmm0,-0x8d(%rdi)
ce426f
-L(SSE13Q7): movdqa %xmm0,-0x7d(%rdi)
ce426f
-L(SSE13Q6): movdqa %xmm0,-0x6d(%rdi)
ce426f
-L(SSE13Q5): movdqa %xmm0,-0x5d(%rdi)
ce426f
-L(SSE13Q4): movdqa %xmm0,-0x4d(%rdi)
ce426f
-L(SSE13Q3): movdqa %xmm0,-0x3d(%rdi)
ce426f
-L(SSE13Q2): movdqa %xmm0,-0x2d(%rdi)
ce426f
-L(SSE13Q1): movdqa %xmm0,-0x1d(%rdi)
ce426f
-L(SSE13Q0): mov    %rdx,-0xd(%rdi)
ce426f
-	mov    %edx,-0x5(%rdi)
ce426f
-	mov    %dl,-0x1(%rdi)
ce426f
-	retq
ce426f
-
ce426f
-L(SSE14QB): movdqa %xmm0,-0xbe(%rdi)
ce426f
-L(SSE14QA): movdqa %xmm0,-0xae(%rdi)
ce426f
-L(SSE14Q9): movdqa %xmm0,-0x9e(%rdi)
ce426f
-L(SSE14Q8): movdqa %xmm0,-0x8e(%rdi)
ce426f
-L(SSE14Q7): movdqa %xmm0,-0x7e(%rdi)
ce426f
-L(SSE14Q6): movdqa %xmm0,-0x6e(%rdi)
ce426f
-L(SSE14Q5): movdqa %xmm0,-0x5e(%rdi)
ce426f
-L(SSE14Q4): movdqa %xmm0,-0x4e(%rdi)
ce426f
-L(SSE14Q3): movdqa %xmm0,-0x3e(%rdi)
ce426f
-L(SSE14Q2): movdqa %xmm0,-0x2e(%rdi)
ce426f
-L(SSE14Q1): movdqa %xmm0,-0x1e(%rdi)
ce426f
-L(SSE14Q0): mov    %rdx,-0xe(%rdi)
ce426f
-	mov    %edx,-0x6(%rdi)
ce426f
-	mov    %dx,-0x2(%rdi)
ce426f
-	retq
ce426f
-
ce426f
-L(SSE15QB): movdqa %xmm0,-0xbf(%rdi)
ce426f
-L(SSE15QA): movdqa %xmm0,-0xaf(%rdi)
ce426f
-L(SSE15Q9): movdqa %xmm0,-0x9f(%rdi)
ce426f
-L(SSE15Q8): movdqa %xmm0,-0x8f(%rdi)
ce426f
-L(SSE15Q7): movdqa %xmm0,-0x7f(%rdi)
ce426f
-L(SSE15Q6): movdqa %xmm0,-0x6f(%rdi)
ce426f
-L(SSE15Q5): movdqa %xmm0,-0x5f(%rdi)
ce426f
-L(SSE15Q4): movdqa %xmm0,-0x4f(%rdi)
ce426f
-L(SSE15Q3): movdqa %xmm0,-0x3f(%rdi)
ce426f
-L(SSE15Q2): movdqa %xmm0,-0x2f(%rdi)
ce426f
-L(SSE15Q1): movdqa %xmm0,-0x1f(%rdi)
ce426f
-L(SSE15Q0): mov    %rdx,-0xf(%rdi)
ce426f
-	mov    %edx,-0x7(%rdi)
ce426f
-	mov    %dx,-0x3(%rdi)
ce426f
-	mov    %dl,-0x1(%rdi)
ce426f
-	retq
ce426f
-
ce426f
-	.balign     16
ce426f
-L(byte32sse2_pre):
ce426f
-
ce426f
-	mov    __x86_64_shared_cache_size(%rip),%r9d  # The largest cache size
ce426f
-	cmp    %r9,%r8
ce426f
-	ja     L(sse2_nt_move_pre)
ce426f
-	#jmp    L(byte32sse2)
ce426f
-	.balign     16
ce426f
-L(byte32sse2):
ce426f
-	lea    -0x80(%r8),%r8 # 128
ce426f
-	cmp    $0x80,%r8   # 128
ce426f
-	movdqa %xmm0,(%rdi)
ce426f
-	movdqa %xmm0,0x10(%rdi)
ce426f
-	movdqa %xmm0,0x20(%rdi)
ce426f
-	movdqa %xmm0,0x30(%rdi)
ce426f
-	movdqa %xmm0,0x40(%rdi)
ce426f
-	movdqa %xmm0,0x50(%rdi)
ce426f
-	movdqa %xmm0,0x60(%rdi)
ce426f
-	movdqa %xmm0,0x70(%rdi)
ce426f
-
ce426f
-	lea    0x80(%rdi),%rdi
ce426f
-	jae    L(byte32sse2)
ce426f
-	add    %r8,%rdi
ce426f
-# ifndef PIC
ce426f
-	lea    L(SSExDx)(%rip),%r11
ce426f
-	jmpq   *(%r11,%r8,8)
ce426f
-# else
ce426f
-	lea    L(SSE0Q0)(%rip),%r11
ce426f
-	lea    L(SSExDx)(%rip),%rcx
ce426f
-	movswq (%rcx,%r8,2),%rcx
ce426f
-	lea    (%rcx,%r11,1),%r11
ce426f
-	jmpq   *%r11
ce426f
-# endif
ce426f
-
ce426f
-	.balign     16
ce426f
-L(sse2_nt_move_pre):
ce426f
-	cmp    $0x0,%r9
ce426f
-	je     L(byte32sse2)
ce426f
-	jmp    L(sse2_nt_move)
ce426f
-
ce426f
-	.balign     16
ce426f
-L(sse2_nt_move):
ce426f
-	lea    -0x80(%r8),%r8
ce426f
-	cmp    $0x80,%r8
ce426f
-
ce426f
-	movntdq %xmm0,(%rdi)
ce426f
-	movntdq %xmm0,0x10(%rdi)
ce426f
-	movntdq %xmm0,0x20(%rdi)
ce426f
-	movntdq %xmm0,0x30(%rdi)
ce426f
-	movntdq %xmm0,0x40(%rdi)
ce426f
-	movntdq %xmm0,0x50(%rdi)
ce426f
-	movntdq %xmm0,0x60(%rdi)
ce426f
-	movntdq %xmm0,0x70(%rdi)
ce426f
-
ce426f
-	lea    0x80(%rdi),%rdi
ce426f
-	jae    L(sse2_nt_move)
ce426f
-	sfence
ce426f
-	add    %r8,%rdi
ce426f
-# ifndef PIC
ce426f
-	lea    L(SSExDx)(%rip),%r11
ce426f
-	jmpq   *(%r11,%r8,8)
ce426f
-# else
ce426f
-	lea    L(SSE0Q0)(%rip),%r11
ce426f
-	lea    L(SSExDx)(%rip),%rcx
ce426f
-	movswq (%rcx,%r8,2),%rcx
ce426f
-	lea   (%rcx,%r11,1),%r11
ce426f
-	jmpq   *%r11
ce426f
-# endif
ce426f
-
ce426f
-	.pushsection .rodata
ce426f
-	.balign     16
ce426f
-# ifndef PIC
ce426f
-L(SSExDx):
ce426f
-	.quad       L(SSE0Q0), L(SSE1Q0), L(SSE2Q0), L(SSE3Q0)
ce426f
-	.quad       L(SSE4Q0), L(SSE5Q0), L(SSE6Q0), L(SSE7Q0)
ce426f
-	.quad       L(SSE8Q0), L(SSE9Q0), L(SSE10Q0), L(SSE11Q0)
ce426f
-	.quad       L(SSE12Q0), L(SSE13Q0), L(SSE14Q0), L(SSE15Q0)
ce426f
-	.quad       L(SSE0Q1), L(SSE1Q1), L(SSE2Q1), L(SSE3Q1)
ce426f
-	.quad       L(SSE4Q1), L(SSE5Q1), L(SSE6Q1), L(SSE7Q1)
ce426f
-	.quad       L(SSE8Q1), L(SSE9Q1), L(SSE10Q1), L(SSE11Q1)
ce426f
-	.quad       L(SSE12Q1), L(SSE13Q1), L(SSE14Q1), L(SSE15Q1)
ce426f
-	.quad       L(SSE0Q2), L(SSE1Q2), L(SSE2Q2), L(SSE3Q2)
ce426f
-	.quad       L(SSE4Q2), L(SSE5Q2), L(SSE6Q2), L(SSE7Q2)
ce426f
-	.quad       L(SSE8Q2), L(SSE9Q2), L(SSE10Q2), L(SSE11Q2)
ce426f
-	.quad       L(SSE12Q2), L(SSE13Q2), L(SSE14Q2), L(SSE15Q2)
ce426f
-	.quad       L(SSE0Q3), L(SSE1Q3), L(SSE2Q3), L(SSE3Q3)
ce426f
-	.quad       L(SSE4Q3), L(SSE5Q3), L(SSE6Q3), L(SSE7Q3)
ce426f
-	.quad       L(SSE8Q3), L(SSE9Q3), L(SSE10Q3), L(SSE11Q3)
ce426f
-	.quad       L(SSE12Q3), L(SSE13Q3), L(SSE14Q3), L(SSE15Q3)
ce426f
-	.quad       L(SSE0Q4), L(SSE1Q4), L(SSE2Q4), L(SSE3Q4)
ce426f
-	.quad       L(SSE4Q4), L(SSE5Q4), L(SSE6Q4), L(SSE7Q4)
ce426f
-	.quad       L(SSE8Q4), L(SSE9Q4), L(SSE10Q4), L(SSE11Q4)
ce426f
-	.quad       L(SSE12Q4), L(SSE13Q4), L(SSE14Q4), L(SSE15Q4)
ce426f
-	.quad       L(SSE0Q5), L(SSE1Q5), L(SSE2Q5), L(SSE3Q5)
ce426f
-	.quad       L(SSE4Q5), L(SSE5Q5), L(SSE6Q5), L(SSE7Q5)
ce426f
-	.quad       L(SSE8Q5), L(SSE9Q5), L(SSE10Q5), L(SSE11Q5)
ce426f
-	.quad       L(SSE12Q5), L(SSE13Q5), L(SSE14Q5), L(SSE15Q5)
ce426f
-	.quad       L(SSE0Q6), L(SSE1Q6), L(SSE2Q6), L(SSE3Q6)
ce426f
-	.quad       L(SSE4Q6), L(SSE5Q6), L(SSE6Q6), L(SSE7Q6)
ce426f
-	.quad       L(SSE8Q6), L(SSE9Q6), L(SSE10Q6), L(SSE11Q6)
ce426f
-	.quad       L(SSE12Q6), L(SSE13Q6), L(SSE14Q6), L(SSE15Q6)
ce426f
-	.quad       L(SSE0Q7), L(SSE1Q7), L(SSE2Q7), L(SSE3Q7)
ce426f
-	.quad       L(SSE4Q7), L(SSE5Q7), L(SSE6Q7), L(SSE7Q7)
ce426f
-	.quad       L(SSE8Q7), L(SSE9Q7), L(SSE10Q7), L(SSE11Q7)
ce426f
-	.quad       L(SSE12Q7), L(SSE13Q7), L(SSE14Q7), L(SSE15Q7)
ce426f
-	.quad       L(SSE0Q8), L(SSE1Q8), L(SSE2Q8), L(SSE3Q8)
ce426f
-	.quad       L(SSE4Q8), L(SSE5Q8), L(SSE6Q8), L(SSE7Q8)
ce426f
-	.quad       L(SSE8Q8), L(SSE9Q8), L(SSE10Q8), L(SSE11Q8)
ce426f
-	.quad       L(SSE12Q8), L(SSE13Q8), L(SSE14Q8), L(SSE15Q8)
ce426f
-	.quad       L(SSE0Q9), L(SSE1Q9), L(SSE2Q9), L(SSE3Q9)
ce426f
-	.quad       L(SSE4Q9), L(SSE5Q9), L(SSE6Q9), L(SSE7Q9)
ce426f
-	.quad       L(SSE8Q9), L(SSE9Q9), L(SSE10Q9), L(SSE11Q9)
ce426f
-	.quad       L(SSE12Q9), L(SSE13Q9), L(SSE14Q9), L(SSE15Q9)
ce426f
-	.quad       L(SSE0QA), L(SSE1QA), L(SSE2QA), L(SSE3QA)
ce426f
-	.quad       L(SSE4QA), L(SSE5QA), L(SSE6QA), L(SSE7QA)
ce426f
-	.quad       L(SSE8QA), L(SSE9QA), L(SSE10QA), L(SSE11QA)
ce426f
-	.quad       L(SSE12QA), L(SSE13QA), L(SSE14QA), L(SSE15QA)
ce426f
-	.quad       L(SSE0QB), L(SSE1QB), L(SSE2QB), L(SSE3QB)
ce426f
-	.quad       L(SSE4QB), L(SSE5QB), L(SSE6QB), L(SSE7QB)
ce426f
-	.quad       L(SSE8QB), L(SSE9QB), L(SSE10QB), L(SSE11QB)
ce426f
-	.quad       L(SSE12QB), L(SSE13QB), L(SSE14QB), L(SSE15QB)
ce426f
-# else
ce426f
-L(SSExDx):
ce426f
-	.short     L(SSE0Q0) -L(SSE0Q0)
ce426f
-	.short     L(SSE1Q0) -L(SSE0Q0)
ce426f
-	.short     L(SSE2Q0) -L(SSE0Q0)
ce426f
-	.short     L(SSE3Q0) -L(SSE0Q0)
ce426f
-	.short     L(SSE4Q0) -L(SSE0Q0)
ce426f
-	.short     L(SSE5Q0) -L(SSE0Q0)
ce426f
-	.short     L(SSE6Q0) -L(SSE0Q0)
ce426f
-	.short     L(SSE7Q0) -L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE8Q0) -L(SSE0Q0)
ce426f
-	.short     L(SSE9Q0) -L(SSE0Q0)
ce426f
-	.short     L(SSE10Q0)-L(SSE0Q0)
ce426f
-	.short     L(SSE11Q0)-L(SSE0Q0)
ce426f
-	.short     L(SSE12Q0)-L(SSE0Q0)
ce426f
-	.short     L(SSE13Q0)-L(SSE0Q0)
ce426f
-	.short     L(SSE14Q0)-L(SSE0Q0)
ce426f
-	.short     L(SSE15Q0)-L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE0Q1) -L(SSE0Q0)
ce426f
-	.short     L(SSE1Q1) -L(SSE0Q0)
ce426f
-	.short     L(SSE2Q1) -L(SSE0Q0)
ce426f
-	.short     L(SSE3Q1) -L(SSE0Q0)
ce426f
-	.short     L(SSE4Q1) -L(SSE0Q0)
ce426f
-	.short     L(SSE5Q1) -L(SSE0Q0)
ce426f
-	.short     L(SSE6Q1) -L(SSE0Q0)
ce426f
-	.short     L(SSE7Q1) -L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE8Q1) -L(SSE0Q0)
ce426f
-	.short     L(SSE9Q1) -L(SSE0Q0)
ce426f
-	.short     L(SSE10Q1)-L(SSE0Q0)
ce426f
-	.short     L(SSE11Q1)-L(SSE0Q0)
ce426f
-	.short     L(SSE12Q1)-L(SSE0Q0)
ce426f
-	.short     L(SSE13Q1)-L(SSE0Q0)
ce426f
-	.short     L(SSE14Q1)-L(SSE0Q0)
ce426f
-	.short     L(SSE15Q1)-L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE0Q2) -L(SSE0Q0)
ce426f
-	.short     L(SSE1Q2) -L(SSE0Q0)
ce426f
-	.short     L(SSE2Q2) -L(SSE0Q0)
ce426f
-	.short     L(SSE3Q2) -L(SSE0Q0)
ce426f
-	.short     L(SSE4Q2) -L(SSE0Q0)
ce426f
-	.short     L(SSE5Q2) -L(SSE0Q0)
ce426f
-	.short     L(SSE6Q2) -L(SSE0Q0)
ce426f
-	.short     L(SSE7Q2) -L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE8Q2) -L(SSE0Q0)
ce426f
-	.short     L(SSE9Q2) -L(SSE0Q0)
ce426f
-	.short     L(SSE10Q2)-L(SSE0Q0)
ce426f
-	.short     L(SSE11Q2)-L(SSE0Q0)
ce426f
-	.short     L(SSE12Q2)-L(SSE0Q0)
ce426f
-	.short     L(SSE13Q2)-L(SSE0Q0)
ce426f
-	.short     L(SSE14Q2)-L(SSE0Q0)
ce426f
-	.short     L(SSE15Q2)-L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE0Q3) -L(SSE0Q0)
ce426f
-	.short     L(SSE1Q3) -L(SSE0Q0)
ce426f
-	.short     L(SSE2Q3) -L(SSE0Q0)
ce426f
-	.short     L(SSE3Q3) -L(SSE0Q0)
ce426f
-	.short     L(SSE4Q3) -L(SSE0Q0)
ce426f
-	.short     L(SSE5Q3) -L(SSE0Q0)
ce426f
-	.short     L(SSE6Q3) -L(SSE0Q0)
ce426f
-	.short     L(SSE7Q3) -L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE8Q3) -L(SSE0Q0)
ce426f
-	.short     L(SSE9Q3) -L(SSE0Q0)
ce426f
-	.short     L(SSE10Q3)-L(SSE0Q0)
ce426f
-	.short     L(SSE11Q3)-L(SSE0Q0)
ce426f
-	.short     L(SSE12Q3)-L(SSE0Q0)
ce426f
-	.short     L(SSE13Q3)-L(SSE0Q0)
ce426f
-	.short     L(SSE14Q3)-L(SSE0Q0)
ce426f
-	.short     L(SSE15Q3)-L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE0Q4) -L(SSE0Q0)
ce426f
-	.short     L(SSE1Q4) -L(SSE0Q0)
ce426f
-	.short     L(SSE2Q4) -L(SSE0Q0)
ce426f
-	.short     L(SSE3Q4) -L(SSE0Q0)
ce426f
-	.short     L(SSE4Q4) -L(SSE0Q0)
ce426f
-	.short     L(SSE5Q4) -L(SSE0Q0)
ce426f
-	.short     L(SSE6Q4) -L(SSE0Q0)
ce426f
-	.short     L(SSE7Q4) -L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE8Q4) -L(SSE0Q0)
ce426f
-	.short     L(SSE9Q4) -L(SSE0Q0)
ce426f
-	.short     L(SSE10Q4)-L(SSE0Q0)
ce426f
-	.short     L(SSE11Q4)-L(SSE0Q0)
ce426f
-	.short     L(SSE12Q4)-L(SSE0Q0)
ce426f
-	.short     L(SSE13Q4)-L(SSE0Q0)
ce426f
-	.short     L(SSE14Q4)-L(SSE0Q0)
ce426f
-	.short     L(SSE15Q4)-L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE0Q5) -L(SSE0Q0)
ce426f
-	.short     L(SSE1Q5) -L(SSE0Q0)
ce426f
-	.short     L(SSE2Q5) -L(SSE0Q0)
ce426f
-	.short     L(SSE3Q5) -L(SSE0Q0)
ce426f
-	.short     L(SSE4Q5) -L(SSE0Q0)
ce426f
-	.short     L(SSE5Q5) -L(SSE0Q0)
ce426f
-	.short     L(SSE6Q5) -L(SSE0Q0)
ce426f
-	.short     L(SSE7Q5) -L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE8Q5) -L(SSE0Q0)
ce426f
-	.short     L(SSE9Q5) -L(SSE0Q0)
ce426f
-	.short     L(SSE10Q5)-L(SSE0Q0)
ce426f
-	.short     L(SSE11Q5)-L(SSE0Q0)
ce426f
-	.short     L(SSE12Q5)-L(SSE0Q0)
ce426f
-	.short     L(SSE13Q5)-L(SSE0Q0)
ce426f
-	.short     L(SSE14Q5)-L(SSE0Q0)
ce426f
-	.short     L(SSE15Q5)-L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE0Q6) -L(SSE0Q0)
ce426f
-	.short     L(SSE1Q6) -L(SSE0Q0)
ce426f
-	.short     L(SSE2Q6) -L(SSE0Q0)
ce426f
-	.short     L(SSE3Q6) -L(SSE0Q0)
ce426f
-	.short     L(SSE4Q6) -L(SSE0Q0)
ce426f
-	.short     L(SSE5Q6) -L(SSE0Q0)
ce426f
-	.short     L(SSE6Q6) -L(SSE0Q0)
ce426f
-	.short     L(SSE7Q6) -L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE8Q6) -L(SSE0Q0)
ce426f
-	.short     L(SSE9Q6) -L(SSE0Q0)
ce426f
-	.short     L(SSE10Q6)-L(SSE0Q0)
ce426f
-	.short     L(SSE11Q6)-L(SSE0Q0)
ce426f
-	.short     L(SSE12Q6)-L(SSE0Q0)
ce426f
-	.short     L(SSE13Q6)-L(SSE0Q0)
ce426f
-	.short     L(SSE14Q6)-L(SSE0Q0)
ce426f
-	.short     L(SSE15Q6)-L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE0Q7) -L(SSE0Q0)
ce426f
-	.short     L(SSE1Q7) -L(SSE0Q0)
ce426f
-	.short     L(SSE2Q7) -L(SSE0Q0)
ce426f
-	.short     L(SSE3Q7) -L(SSE0Q0)
ce426f
-	.short     L(SSE4Q7) -L(SSE0Q0)
ce426f
-	.short     L(SSE5Q7) -L(SSE0Q0)
ce426f
-	.short     L(SSE6Q7) -L(SSE0Q0)
ce426f
-	.short     L(SSE7Q7) -L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE8Q7) -L(SSE0Q0)
ce426f
-	.short     L(SSE9Q7) -L(SSE0Q0)
ce426f
-	.short     L(SSE10Q7)-L(SSE0Q0)
ce426f
-	.short     L(SSE11Q7)-L(SSE0Q0)
ce426f
-	.short     L(SSE12Q7)-L(SSE0Q0)
ce426f
-	.short     L(SSE13Q7)-L(SSE0Q0)
ce426f
-	.short     L(SSE14Q7)-L(SSE0Q0)
ce426f
-	.short     L(SSE15Q7)-L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE0Q8) -L(SSE0Q0)
ce426f
-	.short     L(SSE1Q8) -L(SSE0Q0)
ce426f
-	.short     L(SSE2Q8) -L(SSE0Q0)
ce426f
-	.short     L(SSE3Q8) -L(SSE0Q0)
ce426f
-	.short     L(SSE4Q8) -L(SSE0Q0)
ce426f
-	.short     L(SSE5Q8) -L(SSE0Q0)
ce426f
-	.short     L(SSE6Q8) -L(SSE0Q0)
ce426f
-	.short     L(SSE7Q8) -L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE8Q8) -L(SSE0Q0)
ce426f
-	.short     L(SSE9Q8) -L(SSE0Q0)
ce426f
-	.short     L(SSE10Q8)-L(SSE0Q0)
ce426f
-	.short     L(SSE11Q8)-L(SSE0Q0)
ce426f
-	.short     L(SSE12Q8)-L(SSE0Q0)
ce426f
-	.short     L(SSE13Q8)-L(SSE0Q0)
ce426f
-	.short     L(SSE14Q8)-L(SSE0Q0)
ce426f
-	.short     L(SSE15Q8)-L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE0Q9) -L(SSE0Q0)
ce426f
-	.short     L(SSE1Q9) -L(SSE0Q0)
ce426f
-	.short     L(SSE2Q9) -L(SSE0Q0)
ce426f
-	.short     L(SSE3Q9) -L(SSE0Q0)
ce426f
-	.short     L(SSE4Q9) -L(SSE0Q0)
ce426f
-	.short     L(SSE5Q9) -L(SSE0Q0)
ce426f
-	.short     L(SSE6Q9) -L(SSE0Q0)
ce426f
-	.short     L(SSE7Q9) -L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE8Q9) -L(SSE0Q0)
ce426f
-	.short     L(SSE9Q9) -L(SSE0Q0)
ce426f
-	.short     L(SSE10Q9)-L(SSE0Q0)
ce426f
-	.short     L(SSE11Q9)-L(SSE0Q0)
ce426f
-	.short     L(SSE12Q9)-L(SSE0Q0)
ce426f
-	.short     L(SSE13Q9)-L(SSE0Q0)
ce426f
-	.short     L(SSE14Q9)-L(SSE0Q0)
ce426f
-	.short     L(SSE15Q9)-L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE0QA) -L(SSE0Q0)
ce426f
-	.short     L(SSE1QA) -L(SSE0Q0)
ce426f
-	.short     L(SSE2QA) -L(SSE0Q0)
ce426f
-	.short     L(SSE3QA) -L(SSE0Q0)
ce426f
-	.short     L(SSE4QA) -L(SSE0Q0)
ce426f
-	.short     L(SSE5QA) -L(SSE0Q0)
ce426f
-	.short     L(SSE6QA) -L(SSE0Q0)
ce426f
-	.short     L(SSE7QA) -L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE8QA) -L(SSE0Q0)
ce426f
-	.short     L(SSE9QA) -L(SSE0Q0)
ce426f
-	.short     L(SSE10QA)-L(SSE0Q0)
ce426f
-	.short     L(SSE11QA)-L(SSE0Q0)
ce426f
-	.short     L(SSE12QA)-L(SSE0Q0)
ce426f
-	.short     L(SSE13QA)-L(SSE0Q0)
ce426f
-	.short     L(SSE14QA)-L(SSE0Q0)
ce426f
-	.short     L(SSE15QA)-L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE0QB) -L(SSE0Q0)
ce426f
-	.short     L(SSE1QB) -L(SSE0Q0)
ce426f
-	.short     L(SSE2QB) -L(SSE0Q0)
ce426f
-	.short     L(SSE3QB) -L(SSE0Q0)
ce426f
-	.short     L(SSE4QB) -L(SSE0Q0)
ce426f
-	.short     L(SSE5QB) -L(SSE0Q0)
ce426f
-	.short     L(SSE6QB) -L(SSE0Q0)
ce426f
-	.short     L(SSE7QB) -L(SSE0Q0)
ce426f
-
ce426f
-	.short     L(SSE8QB) -L(SSE0Q0)
ce426f
-	.short     L(SSE9QB) -L(SSE0Q0)
ce426f
-	.short     L(SSE10QB)-L(SSE0Q0)
ce426f
-	.short     L(SSE11QB)-L(SSE0Q0)
ce426f
-	.short     L(SSE12QB)-L(SSE0Q0)
ce426f
-	.short     L(SSE13QB)-L(SSE0Q0)
ce426f
-	.short     L(SSE14QB)-L(SSE0Q0)
ce426f
-	.short     L(SSE15QB)-L(SSE0Q0)
ce426f
-# endif
ce426f
-	.popsection
ce426f
-#endif /* !defined USE_MULTIARCH || defined USE_SSE2  */
ce426f
-
ce426f
-	.balign     16
ce426f
-#ifndef USE_MULTIARCH
ce426f
-L(aligned_now):
ce426f
-
ce426f
-	 cmpl   $0x1,__x86_64_preferred_memory_instruction(%rip)
ce426f
-	 jg     L(SSE_pre)
ce426f
-#endif /* USE_MULTIARCH */
ce426f
-
ce426f
-L(8byte_move_try):
ce426f
-	cmpq	__STOS_LOWER_BOUNDARY,%r8
ce426f
-	jae	L(8byte_stos_try)
ce426f
-
ce426f
-	.balign     16
ce426f
-L(8byte_move):
ce426f
-	movq	%r8,%rcx
ce426f
-	shrq	$7,%rcx
ce426f
-	jz	L(8byte_move_skip)
ce426f
-
ce426f
-	.p2align 4
ce426f
-
ce426f
-L(8byte_move_loop):
ce426f
-	decq	%rcx
ce426f
-
ce426f
-	movq	%rdx,    (%rdi)
ce426f
-	movq	%rdx,  8 (%rdi)
ce426f
-	movq	%rdx, 16 (%rdi)
ce426f
-	movq	%rdx, 24 (%rdi)
ce426f
-	movq	%rdx, 32 (%rdi)
ce426f
-	movq	%rdx, 40 (%rdi)
ce426f
-	movq	%rdx, 48 (%rdi)
ce426f
-	movq	%rdx, 56 (%rdi)
ce426f
-	movq	%rdx, 64 (%rdi)
ce426f
-	movq	%rdx, 72 (%rdi)
ce426f
-	movq	%rdx, 80 (%rdi)
ce426f
-	movq	%rdx, 88 (%rdi)
ce426f
-	movq	%rdx, 96 (%rdi)
ce426f
-	movq	%rdx, 104 (%rdi)
ce426f
-	movq	%rdx, 112 (%rdi)
ce426f
-	movq	%rdx, 120 (%rdi)
ce426f
-
ce426f
-	leaq	128 (%rdi),%rdi
ce426f
-
ce426f
-	jnz     L(8byte_move_loop)
ce426f
-
ce426f
-L(8byte_move_skip):
ce426f
-	andl	$127,%r8d
ce426f
-	lea	(%rdi,%r8,1),%rdi
ce426f
-
ce426f
-#ifndef PIC
ce426f
-	lea	L(setPxQx)(%rip),%r11
ce426f
-	jmpq	*(%r11,%r8,8) # old scheme remained for nonPIC
ce426f
-#else
ce426f
-	lea	L(Got0)(%rip),%r11
ce426f
-	lea	L(setPxQx)(%rip),%rcx
ce426f
-	movswq	(%rcx,%r8,2),%rcx
ce426f
-	lea	(%rcx,%r11,1),%r11
ce426f
-	jmpq	*%r11
ce426f
-#endif
ce426f
-
ce426f
-	.balign     16
ce426f
-L(8byte_stos_try):
ce426f
-	mov    __x86_64_shared_cache_size(%rip),%r9d // ck largest cache size
ce426f
-	cmpq	%r8,%r9		// calculate the lesser of remaining
ce426f
-	cmovaq	%r8,%r9		// bytes and largest cache size
ce426f
-	jbe	L(8byte_stos)
ce426f
-
ce426f
-L(8byte_move_reuse_try):
ce426f
-	cmp	__STOS_UPPER_BOUNDARY,%r8
ce426f
-	jae	L(8byte_move)
ce426f
-
ce426f
-	.balign     16
ce426f
-L(8byte_stos):
ce426f
-	movq	%r9,%rcx
ce426f
-	andq	$-8,%r9
ce426f
-
ce426f
-	shrq	$3,%rcx
ce426f
-	jz	L(8byte_stos_skip)
ce426f
-
ce426f
-	xchgq	%rax,%rdx
ce426f
 
ce426f
+ENTRY (memset)	/* In: %rdi = dst, %esi = fill byte, %rdx = n.  Out: %rax = dst.  */
ce426f
+	movd	%esi, %xmm8	/* Low dword of %xmm8 = %esi.  */
ce426f
+	movq	%rdi, %rax	/* Return value is the destination pointer.  */
ce426f
+	punpcklbw	%xmm8, %xmm8	/* Low word = fill byte twice.  */
ce426f
+	punpcklwd	%xmm8, %xmm8	/* Low dword = fill byte four times.  */
ce426f
+	pshufd	$0, %xmm8, %xmm8	/* Broadcast low dword: all 16 bytes = fill byte.  */
ce426f
+L(entry_from_bzero):	/* Here: %rdi = dst, %rdx = n, %xmm8 = 16-byte pattern, %rax = return value.  */
ce426f
+	cmpq	$64, %rdx
ce426f
+	ja	L(loop_start)	/* n > 64: head/tail stores plus aligned loop.  */
ce426f
+	cmpq	$16, %rdx
ce426f
+	jbe	L(less_16_bytes)	/* n <= 16: scalar stores.  */
ce426f
+	cmpq	$32, %rdx	/* Flags are consumed by the ja below; the SSE stores leave them intact.  */
ce426f
+	movdqu	%xmm8, (%rdi)	/* First 16 bytes.  */
ce426f
+	movdqu	%xmm8, -16(%rdi,%rdx)	/* Last 16 bytes; overlaps the first store when n < 32.  */
ce426f
+	ja	L(between_32_64_bytes)	/* 32 < n <= 64 needs two more stores.  */
ce426f
+L(return):	/* Shared exit: the rep below pairs with the following ret.  */
ce426f
 	rep
ce426f
-	stosq
ce426f
-
ce426f
-	xchgq	%rax,%rdx
ce426f
-
ce426f
-L(8byte_stos_skip):
ce426f
-	subq	%r9,%r8
ce426f
-	ja	L(8byte_nt_move)
ce426f
-
ce426f
-	andl	$7,%r8d
ce426f
-	lea	(%rdi,%r8,1),%rdi
ce426f
-#ifndef PIC
ce426f
-	lea	L(setPxQx)(%rip),%r11
ce426f
-	jmpq	*(%r11,%r8,8) # old scheme remained for nonPIC
ce426f
-#else
ce426f
-	lea	L(Got0)(%rip),%r11
ce426f
-	lea     L(setPxQx)(%rip),%rcx
ce426f
-	movswq	(%rcx,%r8,2),%rcx
ce426f
-	lea	(%rcx,%r11,1),%r11
ce426f
-	jmpq	*%r11
ce426f
-#endif
ce426f
-
ce426f
-	.balign     16
ce426f
-L(8byte_nt_move):
ce426f
-	movq	%r8,%rcx
ce426f
-	shrq	$7,%rcx
ce426f
-	jz      L(8byte_nt_move_skip)
ce426f
-
ce426f
-	.balign     16
ce426f
-L(8byte_nt_move_loop):
ce426f
-	decq	%rcx
ce426f
+	ret	/* Forms "rep ret" with the preceding rep prefix.  */
ce426f
 
ce426f
-	movntiq	%rdx,     (%rdi)
ce426f
-	movntiq	%rdx,   8 (%rdi)
ce426f
-	movntiq	%rdx,  16 (%rdi)
ce426f
-	movntiq	%rdx,  24 (%rdi)
ce426f
-	movntiq	%rdx,  32 (%rdi)
ce426f
-	movntiq	%rdx,  40 (%rdi)
ce426f
-	movntiq	%rdx,  48 (%rdi)
ce426f
-	movntiq	%rdx,  56 (%rdi)
ce426f
-	movntiq	%rdx,  64 (%rdi)
ce426f
-	movntiq	%rdx,  72 (%rdi)
ce426f
-	movntiq	%rdx,  80 (%rdi)
ce426f
-	movntiq	%rdx,  88 (%rdi)
ce426f
-	movntiq	%rdx,  96 (%rdi)
ce426f
-	movntiq	%rdx, 104 (%rdi)
ce426f
-	movntiq	%rdx, 112 (%rdi)
ce426f
-	movntiq	%rdx, 120 (%rdi)
ce426f
-
ce426f
-	leaq	128 (%rdi),%rdi
ce426f
-
ce426f
-	jnz     L(8byte_nt_move_loop)
ce426f
-
ce426f
-	sfence
ce426f
-
ce426f
-L(8byte_nt_move_skip):
ce426f
-	andl	$127,%r8d
ce426f
-
ce426f
-	lea	(%rdi,%r8,1),%rdi
ce426f
-#ifndef PIC
ce426f
-	lea	L(setPxQx)(%rip),%r11
ce426f
-	jmpq	*(%r11,%r8,8) # old scheme remained for nonPIC
ce426f
-#else
ce426f
-	lea	L(Got0)(%rip),%r11
ce426f
-	lea     L(setPxQx)(%rip),%rcx
ce426f
-	movswq	(%rcx,%r8,2),%rcx
ce426f
-	lea	(%rcx,%r11,1),%r11
ce426f
-	jmpq	*%r11
ce426f
-#endif
ce426f
+	ALIGN (4)
ce426f
+L(between_32_64_bytes):	/* Bytes [0,16) and [n-16,n) are already stored.  */
ce426f
+	movdqu	%xmm8, 16(%rdi)	/* Bytes [16,32).  */
ce426f
+	movdqu	%xmm8, -32(%rdi,%rdx)	/* Bytes [n-32,n-16).  */
ce426f
+	ret
ce426f
+	ALIGN (4)
ce426f
+L(loop_start):	/* n > 64: unaligned 64-byte head and tail, aligned middle.  */
ce426f
+	leaq	64(%rdi), %rcx	/* With the andq below: %rcx = first 64-byte boundary above dst.  */
ce426f
+	movdqu	%xmm8, (%rdi)
ce426f
+	andq	$-64, %rcx
ce426f
+	movdqu	%xmm8, -16(%rdi,%rdx)
ce426f
+	movdqu	%xmm8, 16(%rdi)
ce426f
+	movdqu	%xmm8, -32(%rdi,%rdx)
ce426f
+	movdqu	%xmm8, 32(%rdi)
ce426f
+	movdqu	%xmm8, -48(%rdi,%rdx)
ce426f
+	movdqu	%xmm8, 48(%rdi)
ce426f
+	movdqu	%xmm8, -64(%rdi,%rdx)	/* Head [0,64) and tail [n-64,n) are now filled.  */
ce426f
+	addq	%rdi, %rdx	/* %rdx = dst + n (end pointer).  */
ce426f
+	andq	$-64, %rdx	/* Round the end pointer down to a 64-byte boundary.  */
ce426f
+	cmpq	%rdx, %rcx
ce426f
+	je	L(return)	/* No aligned middle left to fill.  */
ce426f
+	ALIGN (4)
ce426f
+L(loop):	/* Fill [%rcx,%rdx) 64 bytes at a time; both bounds are 64-byte aligned.  */
ce426f
+	movdqa	%xmm8, (%rcx)
ce426f
+	movdqa	%xmm8, 16(%rcx)
ce426f
+	movdqa	%xmm8, 32(%rcx)
ce426f
+	movdqa	%xmm8, 48(%rcx)
ce426f
+	addq	$64, %rcx
ce426f
+	cmpq	%rcx, %rdx
ce426f
+	jne	L(loop)
ce426f
+	rep
ce426f
+	ret
ce426f
+L(less_16_bytes):	/* n <= 16.  */
ce426f
+	movq %xmm8, %rcx	/* %rcx = fill byte replicated eight times.  */
ce426f
+	testb	$24, %dl	/* Bit 3 or bit 4 of n set: 8 <= n <= 16.  */
ce426f
+	jne	L(between8_16bytes)
ce426f
+	testb	$4, %dl	/* Bit 2 of n set: 4 <= n <= 7.  */
ce426f
+	jne	L(between4_7bytes)
ce426f
+	testb	$1, %dl	/* n <= 3 here; odd n stores the first byte.  */
ce426f
+	je	L(odd_byte)
ce426f
+	movb	%cl, (%rdi)
ce426f
+L(odd_byte):
ce426f
+	testb	$2, %dl	/* Bit 1 clear means n is 0 or 1: nothing more to store.  */
ce426f
+	je	L(return)
ce426f
+	movw	%cx, -2(%rdi,%rdx)	/* Last two bytes.  Use %rdi, not %rax: __memset_tail loads %rax from %rcx, so %rax need not equal dst.  */
ce426f
+	ret
ce426f
+L(between4_7bytes):
ce426f
+	movl	%ecx, (%rdi)	/* First 4 bytes.  */
ce426f
+	movl	%ecx, -4(%rdi,%rdx)	/* Last 4 bytes; overlaps the first store when n < 8.  */
ce426f
+	ret
ce426f
+L(between8_16bytes):
ce426f
+	movq	%rcx, (%rdi)	/* First 8 bytes.  */
ce426f
+	movq	%rcx, -8(%rdi,%rdx)	/* Last 8 bytes; overlaps the first store when n < 16.  */
ce426f
+	ret
ce426f
 
ce426f
 END (memset)
ce426f
 libc_hidden_builtin_def (memset)