From 71ae86478edc7b21872464f43fb29ff650c1681a Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Tue, 15 Jul 2014 12:19:09 -0400
Subject: [PATCH] PowerPC: memset optimization for POWER8/PPC64

This patch adds an optimized memset implementation for POWER8.  For
sizes from 0 to 255 bytes, a word/doubleword algorithm similar to the
POWER7 optimized one is used.

For sizes higher than 255 bytes, two strategies are used:

1. If the constant is different from 0, the memory is written with
   Altivec vector store instructions;

2. If the constant is 0, dcbz instructions are used.  The loop is
   unrolled to clear 512 bytes at a time.

Using vector instructions increases throughput considerably, roughly
doubling performance for sizes larger than 1024 bytes.  The unrolled
dcbz loop also shows a performance improvement, doubling throughput
for sizes larger than 8192 bytes.
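
The selection logic above, expressed as a rough C sketch (illustrative
only; the enum and function names are invented for the description, the
thresholds are the ones used by the assembly):

  /* Sketch of the dispatch described above -- not the code in the patch.  */
  #include <stddef.h>

  enum memset_path
  {
    SMALL_WORD_DOUBLEWORD,  /* 0..255 bytes: POWER7-like word/doubleword
                               stores.  */
    LARGE_VECTOR_STVX,      /* > 255 bytes, c != 0: splat the byte into a VMX
                               register, then stvx stores, 128 bytes per loop
                               iteration.  */
    LARGE_DCBZ              /* > 255 bytes, c == 0: align DST to the 128-byte
                               cache line, then dcbz, 512 bytes per unrolled
                               iteration.  */
  };

  static enum memset_path
  pick_memset_path (int c, size_t n)
  {
    if (n <= 255)
      return SMALL_WORD_DOUBLEWORD;
    return (c != 0) ? LARGE_VECTOR_STVX : LARGE_DCBZ;
  }

The dcbz path requires the destination to be aligned to a full cache
line, which is why the assembly pre-aligns DST to 128 bytes before
entering the unrolled zeroing loop.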
---
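Note (outside the commit message): the new __memset_power8/__bzero_power8
entries are selected at runtime by the IFUNC resolvers below when the
kernel reports ISA 2.07 (POWER8) support in AT_HWCAP2.  A hypothetical
standalone program that checks the same bit, for anyone wanting to
confirm which path a given machine is eligible for (this is not part of
the patch; the fallback macro values mirror the kernel/glibc headers):

  #include <stdio.h>
  #include <sys/auxv.h>

  #ifndef AT_HWCAP2
  # define AT_HWCAP2 26                       /* Older headers.  */
  #endif
  #ifndef PPC_FEATURE2_ARCH_2_07
  # define PPC_FEATURE2_ARCH_2_07 0x80000000  /* From the kernel headers.  */
  #endif

  int
  main (void)
  {
    unsigned long hwcap2 = getauxval (AT_HWCAP2);
    if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
      puts ("ISA 2.07 (POWER8) memset/bzero would be selected");
    else
      puts ("falling back to the POWER7 or older variants");
    return 0;
  }
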
 ChangeLog                                          |  15 +
 benchtests/bench-memset.c                          |   5 +
 sysdeps/powerpc/powerpc64/multiarch/Makefile       |   2 +-
 sysdeps/powerpc/powerpc64/multiarch/bzero.c        |  11 +-
 .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |   6 +
 .../powerpc/powerpc64/multiarch/memset-power8.S    |  43 ++
 sysdeps/powerpc/powerpc64/multiarch/memset.c       |  11 +-
 sysdeps/powerpc/powerpc64/power8/memset.S          | 449 +++++++++++++++++++++
 8 files changed, 533 insertions(+), 9 deletions(-)
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
 create mode 100644 sysdeps/powerpc/powerpc64/power8/memset.S

diff --git a/ChangeLog b/ChangeLog
index ddaf70f..dc61c87 100644
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 0de3804..abc9d2e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -1,7 +1,8 @@ ifeq ($(subdir),string)
 ifeq ($(subdir),string)
 sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
                   memcpy-power4 memcpy-ppc64 memcmp-power7 memcmp-power4 \
-                  memcmp-ppc64 memset-power7 memset-power6 memset-power4 \
+                  memcmp-ppc64 memset-power8 memset-power7 memset-power6 \
+                  memset-power4 \
                   memset-ppc64 bzero-power4 bzero-power6 bzero-power7 \
                   mempcpy-power7 mempcpy-ppc64 memchr-power7 memchr-ppc64 \
                   memrchr-power7 memrchr-ppc64 rawmemchr-power7 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
index ed83541..298cf00 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
@@ -26,14 +26,17 @@ extern __typeof (bzero) __bzero_ppc attribute_hidden;
 extern __typeof (bzero) __bzero_power4 attribute_hidden;
 extern __typeof (bzero) __bzero_power6 attribute_hidden;
 extern __typeof (bzero) __bzero_power7 attribute_hidden;
+extern __typeof (bzero) __bzero_power8 attribute_hidden;
 
 libc_ifunc (__bzero,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __bzero_power7 :
-	      (hwcap & PPC_FEATURE_ARCH_2_05)
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __bzero_power8 :
+	      (hwcap & PPC_FEATURE_HAS_VSX)
+	      ? __bzero_power7 :
+		(hwcap & PPC_FEATURE_ARCH_2_05)
 		? __bzero_power6 :
 		  (hwcap & PPC_FEATURE_POWER4)
-		? __bzero_power4
+		  ? __bzero_power4
             : __bzero_ppc);
 
 weak_alias (__bzero, bzero)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index a574487..06d5be9 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -69,6 +71,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/memset.c.  */
   IFUNC_IMPL (i, name, memset,
+             IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+                             __memset_power8)
              IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX,
                              __memset_power7)
              IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05,
@@ -134,6 +138,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/bzero.c.  */
   IFUNC_IMPL (i, name, bzero,
+             IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+                             __bzero_power8)
              IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_HAS_VSX,
                              __bzero_power7)
              IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_ARCH_2_05,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
new file mode 100644
index 0000000..e8a604b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
@@ -0,0 +1,43 @@
+/* Optimized memset implementation for PowerPC64/POWER8.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__memset_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__memset_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__memset_power8)
+
+#undef END_GEN_TB
+#define END_GEN_TB(name, mask)					\
+  cfi_endproc;							\
+  TRACEBACK_MASK(__memset_power8,mask)				\
+  END_2(__memset_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#undef __bzero
+#define __bzero __bzero_power8
+
+#include <sysdeps/powerpc/powerpc64/power8/memset.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
index aa2ae70..9c7ed10 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
@@ -32,16 +32,19 @@ extern __typeof (__redirect_memset) __memset_ppc attribute_hidden;
 extern __typeof (__redirect_memset) __memset_power4 attribute_hidden;
 extern __typeof (__redirect_memset) __memset_power6 attribute_hidden;
 extern __typeof (__redirect_memset) __memset_power7 attribute_hidden;
+extern __typeof (__redirect_memset) __memset_power8 attribute_hidden;
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 libc_ifunc (__libc_memset,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __memset_power7 :
-	      (hwcap & PPC_FEATURE_ARCH_2_05)
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __memset_power8 :
+	      (hwcap & PPC_FEATURE_HAS_VSX)
+	      ? __memset_power7 :
+		(hwcap & PPC_FEATURE_ARCH_2_05)
 		? __memset_power6 :
 		  (hwcap & PPC_FEATURE_POWER4)
-		? __memset_power4
+		  ? __memset_power4
             : __memset_ppc);
 
 #undef memset
diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S
new file mode 100644
index 0000000..191a4df
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/memset.S
@@ -0,0 +1,449 @@
+/* Optimized memset implementation for PowerPC64/POWER8.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
+   Returns 's'.  */
+
+	.machine power8
+EALIGN (memset, 5, 0)
+	CALL_MCOUNT 3
+
+L(_memset):
+	cmpldi	cr7,r5,31
+	neg	r0,r3
+	mr	r10,r3
+
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32	/* Replicate byte to word.  */
+	ble	cr7,L(write_LT_32)
+
+	andi.	r11,r10,15	/* Check alignment of DST.  */
+	insrdi	r4,r4,32,0	/* Replicate word to double word.  */
+
+	beq	L(big_aligned)
+
+	mtocrf	0x01,r0
+	clrldi	r0,r0,60
+
+	/* Get DST aligned to 16 bytes.  */
+1:	bf	31,2f
+	stb	r4,0(r10)
+	addi	r10,r10,1
+
+2:	bf	30,4f
+	sth	r4,0(r10)
+	addi	r10,r10,2
+
+4:	bf	29,8f
+	stw	r4,0(r10)
+	addi	r10,r10,4
+
+8:	bf      28,16f
+	std     r4,0(r10)
+	addi    r10,r10,8
+
+16:	subf	r5,r0,r5
+
+	.align	4
+L(big_aligned):
+	/* For sizes larger than 255 two possible paths:
+	   - if constant is '0', zero full cache lines with dcbz
+	   - otherwise uses vector instructions.  */
+	cmpldi	cr5,r5,255
+	dcbtst	0,r10
+	cmpldi	cr6,r4,0
+	crand	27,26,21
+	bt	27,L(huge_dcbz)
+	bge	cr5,L(huge_vector)
+
+
+	/* Size between 32 and 255 bytes with constant different than 0, use
+	   doubleword store instruction to achieve best throughput.  */
+	srdi    r8,r5,5
+	clrldi  r11,r5,59
+	cmpldi  cr6,r11,0
+	cmpdi	r8,0
+	beq     L(tail_bytes)
+	mtctr   r8
+
+	/* Main aligned write loop, writes 32-bytes at a time.  */
+	.align  4
+L(big_loop):
+	std     r4,0(r10)
+	std     r4,8(r10)
+	std     r4,16(r10)
+	std     r4,24(r10)
+	addi    r10,r10,32
+	bdz     L(tail_bytes)
+
+	std     r4,0(r10)
+	std     r4,8(r10)
+	std     r4,16(r10)
+	std     r4,24(r10)
+	addi    r10,r10,32
+	bdnz    L(big_loop)
+
+	b       L(tail_bytes)
+
+	/* Write remaining 1~31 bytes.  */
+	.align  4
+L(tail_bytes):
+	beqlr   cr6
+
+	srdi    r7,r11,4
+	clrldi  r8,r11,60
+	mtocrf  0x01,r7
+
+	.align	4
+	bf	31,8f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	addi	r10,r10,16
+
+	.align	4
+8:	mtocrf	0x1,r8
+	bf	28,4f
+	std	r4,0(r10)
+	addi	r10,r10,8
+
+	.align	4
+4:	bf      29,2f
+	stw     r4,0(r10)
+	addi    r10,r10,4
+
+	.align	4
+2:	bf      30,1f
+	sth     r4,0(r10)
+	addi    r10,r10,2
+
+	.align  4
+1:      bflr    31
+	stb     r4,0(r10)
+	blr
+
+	/* Size larger than 255 bytes with constant different than 0, use
+	   vector instruction to achieve best throughput.  */
+L(huge_vector):
+	/* Replicate set byte to quadword in VMX register.  */
+	mtvsrd	 v1,r4
+	xxpermdi 32,v0,v1,0
+	vspltb	 v2,v0,15
+
+	/* Main aligned write loop: 128 bytes at a time.  */
+	li	r6,16
+	li	r7,32
+	li	r8,48
+	mtocrf	0x02,r5
+	srdi	r12,r5,7
+	cmpdi	r12,0
+	beq	L(aligned_tail)
+	mtctr	r12
+	b	L(aligned_128loop)
+
+	.align  4
+L(aligned_128loop):
+	stvx	v2,0,r10
+	stvx	v2,r10,r6
+	stvx	v2,r10,r7
+	stvx	v2,r10,r8
+	addi	r10,r10,64
+	stvx	v2,0,r10
+	stvx	v2,r10,r6
+	stvx	v2,r10,r7
+	stvx	v2,r10,r8
+	addi	r10,r10,64
+	bdnz	L(aligned_128loop)
+
+	/* Write remaining 1~127 bytes.  */
+L(aligned_tail):
+	mtocrf	0x01,r5
+	bf	25,32f
+	stvx	v2,0,r10
+	stvx	v2,r10,r6
+	stvx	v2,r10,r7
+	stvx	v2,r10,r8
+	addi	r10,r10,64
+
+32:	bf	26,16f
+	stvx	v2,0,r10
+	stvx	v2,r10,r6
+	addi	r10,r10,32
+
+16:	bf	27,8f
+	stvx	v2,0,r10
+	addi	r10,r10,16
+
+8:	bf	28,4f
+	std     r4,0(r10)
+	addi	r10,r10,8
+
+	/* Copies 4~7 bytes.  */
+4:	bf	29,L(tail2)
+	stw     r4,0(r10)
+	bf      30,L(tail5)
+	sth     r4,4(r10)
+	bflr	31
+	stb     r4,6(r10)
+	/* Return original DST pointer.  */
+	blr
+
+	/* Special case when value is 0 and we have a long length to deal
+	   with.  Use dcbz to zero out a full cacheline of 128 bytes at a time.
+	   Before using dcbz though, we need to get the destination 128-byte
+	   aligned.  */
+	.align	4
+L(huge_dcbz):
+	andi.	r11,r10,127
+	neg	r0,r10
+	beq	L(huge_dcbz_aligned)
+
+	clrldi	r0,r0,57
+	subf	r5,r0,r5
+	srdi	r0,r0,3
+	mtocrf	0x01,r0
+
+	/* Write 1~128 bytes until DST is aligned to 128 bytes.  */
+8:	bf	28,4f
+
+	std	r4,0(r10)
+	std	r4,8(r10)
+	std	r4,16(r10)
+	std	r4,24(r10)
+	std	r4,32(r10)
+	std	r4,40(r10)
+	std	r4,48(r10)
+	std	r4,56(r10)
+	addi	r10,r10,64
+
+	.align	4
+4:	bf	29,2f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	std	r4,16(r10)
+	std	r4,24(r10)
+	addi	r10,r10,32
+
+	.align	4
+2:	bf	30,1f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	addi	r10,r10,16
+
+	.align	4
+1:	bf	31,L(huge_dcbz_aligned)
+	std	r4,0(r10)
+	addi	r10,r10,8
+
+L(huge_dcbz_aligned):
+	/* Setup dcbz unroll offsets and count numbers.  */
+	srdi	r8,r5,9
+	clrldi	r11,r5,55
+	cmpldi	cr6,r11,0
+	li	r9,128
+	cmpdi	r8,0
+	beq     L(huge_tail)
+	li	r7,256
+	li	r6,384
+	mtctr	r8
+
+	.align	4
+L(huge_loop):
+	/* Sets 512 bytes to zero in each iteration, the loop unrolling shows
+	   a throughput boost for large sizes (2048 bytes or higher).  */
+	dcbz	0,r10
+	dcbz	r9,r10
+	dcbz	r7,r10
+	dcbz	r6,r10
+	addi	r10,r10,512
+	bdnz	L(huge_loop)
+
+	beqlr	cr6
+
+L(huge_tail):
+	srdi    r6,r11,8
+	srdi    r7,r11,4
+	clrldi  r8,r11,4
+	cmpldi  cr6,r8,0
+	mtocrf  0x01,r6
+
+	beq	cr6,L(tail)
+
+	/* We have 1~511 bytes remaining.  */
+	.align	4
+32:	bf	31,16f
+	dcbz	0,r10
+	dcbz	r9,r10
+	addi	r10,r10,256
+
+	.align	4
+16:	mtocrf  0x01,r7
+	bf	28,8f
+	dcbz	0,r10
+	addi	r10,r10,128
+
+	.align	4
+8:	bf	29,4f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	std	r4,16(r10)
+	std	r4,24(r10)
+	std	r4,32(r10)
+	std	r4,40(r10)
+	std	r4,48(r10)
+	std	r4,56(r10)
+	addi	r10,r10,64
+
+	.align	4
+4:	bf	30,2f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	std	r4,16(r10)
+	std	r4,24(r10)
+	addi	r10,r10,32
+
+	.align	4
+2:	bf	31,L(tail)
+	std	r4,0(r10)
+	std	r4,8(r10)
+	addi	r10,r10,16
+	.align	4
+
+	/* Remaining 1~15 bytes.  */
+L(tail):
+	mtocrf  0x01,r8
+
+	.align	4
+8:	bf	28,4f
+	std	r4,0(r10)
+	addi	r10,r10,8
+
+	.align	4
+4:	bf	29,2f
+	stw	r4,0(r10)
+	addi	r10,r10,4
+
+	.align	4
+2:	bf	30,1f
+	sth	r4,0(r10)
+	addi	r10,r10,2
+
+	.align	4
+1:	bflr	31
+	stb	r4,0(r10)
+	blr
+
+	/* Handle short copies of 0~31 bytes.  Best throughput is achieved
+	   by just unrolling all operations.  */
+	.align	4
+L(write_LT_32):
+	cmpldi	cr6,r5,8
+	mtocrf	0x01,r5
+	ble	cr6,L(write_LE_8)
+
+	/* At least 9 bytes to go.  */
+	neg	r8,r4
+	andi.	r0,r8,3
+	cmpldi	cr1,r5,16
+	beq	L(write_LT_32_aligned)
+
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf	0x01,r0
+	subf	r5,r0,r5
+
+2:	bf	30,1f
+	sth	r4,0(r10)
+	addi	r10,r10,2
+
+1:	bf	31,L(end_4bytes_alignment)
+	stb	r4,0(r10)
+	addi	r10,r10,1
+
+	.align	4
+L(end_4bytes_alignment):
+	cmpldi	cr1,r5,16
+	mtocrf	0x01,r5
+
+L(write_LT_32_aligned):
+	blt	cr1,8f
+
+	stw	r4,0(r10)
+	stw	r4,4(r10)
+	stw	r4,8(r10)
+	stw	r4,12(r10)
+	addi	r10,r10,16
+
+8:	bf	28,L(tail4)
+	stw	r4,0(r10)
+	stw	r4,4(r10)
+	addi	r10,r10,8
+
+	.align	4
+	/* Copies 4~7 bytes.  */
+L(tail4):
+	bf	29,L(tail2)
+	stw	r4,0(r10)
+	bf	30,L(tail5)
+	sth	r4,4(r10)
+	bflr	31
+	stb	r4,6(r10)
+	blr
+
+	.align	4
+	/* Copies 2~3 bytes.  */
+L(tail2):
+	bf	30,1f
+	sth	r4,0(r10)
+	bflr	31
+	stb	r4,2(r10)
+	blr
+
+	.align	4
+L(tail5):
+	bflr	31
+	stb	r4,4(r10)
+	blr
+
+	.align	4
+1:	bflr	31
+	stb	r4,0(r10)
+	blr
+
+	/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(write_LE_8):
+	bne	cr6,L(tail4)
+
+	stw	r4,0(r10)
+	stw	r4,4(r10)
+	blr
+END_GEN_TB (memset,TB_TOCLESS)
+libc_hidden_builtin_def (memset)
+
+/* Copied from bzero.S to prevent the linker from inserting a stub
+   between bzero and memset.  */
+ENTRY (__bzero)
+	CALL_MCOUNT 3
+	mr	r5,r4
+	li	r4,0
+	b	L(_memset)
+END (__bzero)
+#ifndef __bzero
+weak_alias (__bzero, bzero)
+#endif
-- 
2.1.0