8ae002
From 71ae86478edc7b21872464f43fb29ff650c1681a Mon Sep 17 00:00:00 2001
8ae002
From: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
8ae002
Date: Tue, 15 Jul 2014 12:19:09 -0400
8ae002
Subject: [PATCH] PowerPC: memset optimization for POWER8/PPC64
8ae002
8ae002
This patch adds an optimized memset implementation for POWER8.  For
8ae002
sizes from 0 to 255 bytes, a word/doubleword algorithm similar to
8ae002
the POWER7 optimized one is used.
8ae002
8ae002
For sizes larger than 255, two strategies are used:
8ae002
8ae002
1. If the constant is different than 0, the memory is written with
8ae002
   altivec vector instruction;
8ae002
8ae002
2. If constant is 0, dcbz instructions are used.  The loop is unrolled
8ae002
   to clear 512 bytes at a time.
8ae002
8ae002
Using vector instructions increases throughput considerably, with a
8ae002
doubling of performance for sizes larger than 1024.  The dcbz loop unrolling
8ae002
also shows a performance improvement, doubling throughput for sizes
8ae002
larger than 8192 bytes.
8ae002
---
8ae002
 ChangeLog                                          |  15 +
8ae002
 benchtests/bench-memset.c                          |   5 +
8ae002
 sysdeps/powerpc/powerpc64/multiarch/Makefile       |   2 +-
8ae002
 sysdeps/powerpc/powerpc64/multiarch/bzero.c        |  11 +-
8ae002
 .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |   6 +
8ae002
 .../powerpc/powerpc64/multiarch/memset-power8.S    |  43 ++
8ae002
 sysdeps/powerpc/powerpc64/multiarch/memset.c       |  11 +-
8ae002
 sysdeps/powerpc/powerpc64/power8/memset.S          | 449 +++++++++++++++++++++
8ae002
 8 files changed, 533 insertions(+), 9 deletions(-)
8ae002
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
8ae002
 create mode 100644 sysdeps/powerpc/powerpc64/power8/memset.S
8ae002
8ae002
diff --git a/ChangeLog b/ChangeLog
8ae002
index ddaf70f..dc61c87 100644
8ae002
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
8ae002
index 0de3804..abc9d2e 100644
8ae002
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
8ae002
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
8ae002
@@ -1,7 +1,8 @@ ifeq ($(subdir),string)
8ae002
 ifeq ($(subdir),string)
8ae002
 sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
8ae002
                   memcpy-power4 memcpy-ppc64 memcmp-power7 memcmp-power4 \
8ae002
-                  memcmp-ppc64 memset-power7 memset-power6 memset-power4 \
8ae002
+                  memcmp-ppc64 memset-power8 memset-power7 memset-power6 \
8ae002
+                  memset-power4 \
8ae002
                   memset-ppc64 bzero-power4 bzero-power6 bzero-power7 \
8ae002
                   mempcpy-power7 mempcpy-ppc64 memchr-power7 memchr-ppc64 \
8ae002
                   memrchr-power7 memrchr-ppc64 rawmemchr-power7 \
8ae002
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
8ae002
index ed83541..298cf00 100644
8ae002
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c
8ae002
+++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
8ae002
@@ -26,14 +26,17 @@ extern __typeof (bzero) __bzero_ppc attribute_hidden;
8ae002
 extern __typeof (bzero) __bzero_power4 attribute_hidden;
8ae002
 extern __typeof (bzero) __bzero_power6 attribute_hidden;
8ae002
 extern __typeof (bzero) __bzero_power7 attribute_hidden;
8ae002
+extern __typeof (bzero) __bzero_power8 attribute_hidden;
8ae002
 
8ae002
 libc_ifunc (__bzero,
8ae002
-            (hwcap & PPC_FEATURE_HAS_VSX)
8ae002
-            ? __bzero_power7 :
8ae002
-	      (hwcap & PPC_FEATURE_ARCH_2_05)
8ae002
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
8ae002
+            ? __bzero_power8 :
8ae002
+	      (hwcap & PPC_FEATURE_HAS_VSX)
8ae002
+	      ? __bzero_power7 :
8ae002
+		(hwcap & PPC_FEATURE_ARCH_2_05)
8ae002
 		? __bzero_power6 :
8ae002
 		  (hwcap & PPC_FEATURE_POWER4)
8ae002
-		? __bzero_power4
8ae002
+		  ? __bzero_power4
8ae002
             : __bzero_ppc);
8ae002
 
8ae002
 weak_alias (__bzero, bzero)
8ae002
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
8ae002
index a574487..06d5be9 100644
8ae002
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
8ae002
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
8ae002
@@ -69,6 +71,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
8ae002
 
8ae002
   /* Support sysdeps/powerpc/powerpc64/multiarch/memset.c.  */
8ae002
   IFUNC_IMPL (i, name, memset,
8ae002
+             IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07,
8ae002
+                             __memset_power8)
8ae002
              IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX,
8ae002
                              __memset_power7)
8ae002
              IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05,
8ae002
@@ -134,6 +138,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
8ae002
 
8ae002
   /* Support sysdeps/powerpc/powerpc64/multiarch/bzero.c.  */
8ae002
   IFUNC_IMPL (i, name, bzero,
8ae002
+             IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07,
8ae002
+                             __bzero_power8)
8ae002
              IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_HAS_VSX,
8ae002
                              __bzero_power7)
8ae002
              IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_ARCH_2_05,
8ae002
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
8ae002
new file mode 100644
8ae002
index 0000000..e8a604b
8ae002
--- /dev/null
8ae002
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
8ae002
@@ -0,0 +1,43 @@
8ae002
+/* Optimized memset implementation for PowerPC64/POWER8.
8ae002
+   Copyright (C) 2014 Free Software Foundation, Inc.
8ae002
+   This file is part of the GNU C Library.
8ae002
+
8ae002
+   The GNU C Library is free software; you can redistribute it and/or
8ae002
+   modify it under the terms of the GNU Lesser General Public
8ae002
+   License as published by the Free Software Foundation; either
8ae002
+   version 2.1 of the License, or (at your option) any later version.
8ae002
+
8ae002
+   The GNU C Library is distributed in the hope that it will be useful,
8ae002
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
8ae002
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
8ae002
+   Lesser General Public License for more details.
8ae002
+
8ae002
+   You should have received a copy of the GNU Lesser General Public
8ae002
+   License along with the GNU C Library; if not, see
8ae002
+   <http://www.gnu.org/licenses/>.  */
8ae002
+
8ae002
+#include <sysdep.h>
8ae002
+
8ae002
+#undef EALIGN
8ae002
+#define EALIGN(name, alignt, words)				\
8ae002
+  .section ".text";						\
8ae002
+  ENTRY_2(__memset_power8)					\
8ae002
+  .align ALIGNARG(alignt);					\
8ae002
+  EALIGN_W_##words;						\
8ae002
+  BODY_LABEL(__memset_power8):					\
8ae002
+  cfi_startproc;						\
8ae002
+  LOCALENTRY(__memset_power8)
8ae002
+
8ae002
+#undef END_GEN_TB
8ae002
+#define END_GEN_TB(name, mask)					\
8ae002
+  cfi_endproc;							\
8ae002
+  TRACEBACK_MASK(__memset_power8,mask)				\
8ae002
+  END_2(__memset_power8)
8ae002
+
8ae002
+#undef libc_hidden_builtin_def
8ae002
+#define libc_hidden_builtin_def(name)
8ae002
+
8ae002
+#undef __bzero
8ae002
+#define __bzero __bzero_power8
8ae002
+
8ae002
+#include <sysdeps/powerpc/powerpc64/power8/memset.S>
8ae002
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
8ae002
index aa2ae70..9c7ed10 100644
8ae002
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
8ae002
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
8ae002
@@ -32,16 +32,19 @@ extern __typeof (__redirect_memset) __memset_ppc attribute_hidden;
8ae002
 extern __typeof (__redirect_memset) __memset_power4 attribute_hidden;
8ae002
 extern __typeof (__redirect_memset) __memset_power6 attribute_hidden;
8ae002
 extern __typeof (__redirect_memset) __memset_power7 attribute_hidden;
8ae002
+extern __typeof (__redirect_memset) __memset_power8 attribute_hidden;
8ae002
 
8ae002
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
8ae002
    ifunc symbol properly.  */
8ae002
 libc_ifunc (__libc_memset,
8ae002
-            (hwcap & PPC_FEATURE_HAS_VSX)
8ae002
-            ? __memset_power7 :
8ae002
-	      (hwcap & PPC_FEATURE_ARCH_2_05)
8ae002
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
8ae002
+            ? __memset_power8 :
8ae002
+	      (hwcap & PPC_FEATURE_HAS_VSX)
8ae002
+	      ? __memset_power7 :
8ae002
+		(hwcap & PPC_FEATURE_ARCH_2_05)
8ae002
 		? __memset_power6 :
8ae002
 		  (hwcap & PPC_FEATURE_POWER4)
8ae002
-		? __memset_power4
8ae002
+		  ? __memset_power4
8ae002
             : __memset_ppc);
8ae002
 
8ae002
 #undef memset
8ae002
diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S
8ae002
new file mode 100644
8ae002
index 0000000..191a4df
8ae002
--- /dev/null
8ae002
+++ b/sysdeps/powerpc/powerpc64/power8/memset.S
8ae002
@@ -0,0 +1,449 @@
8ae002
+/* Optimized memset implementation for PowerPC64/POWER8.
8ae002
+   Copyright (C) 2014 Free Software Foundation, Inc.
8ae002
+   This file is part of the GNU C Library.
8ae002
+
8ae002
+   The GNU C Library is free software; you can redistribute it and/or
8ae002
+   modify it under the terms of the GNU Lesser General Public
8ae002
+   License as published by the Free Software Foundation; either
8ae002
+   version 2.1 of the License, or (at your option) any later version.
8ae002
+
8ae002
+   The GNU C Library is distributed in the hope that it will be useful,
8ae002
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
8ae002
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
8ae002
+   Lesser General Public License for more details.
8ae002
+
8ae002
+   You should have received a copy of the GNU Lesser General Public
8ae002
+   License along with the GNU C Library; if not, see
8ae002
+   <http://www.gnu.org/licenses/>.  */
8ae002
+
8ae002
+#include <sysdep.h>
8ae002
+
8ae002
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
8ae002
+   Returns 's'.  */
8ae002
+
8ae002
+	.machine power8
8ae002
+EALIGN (memset, 5, 0)
8ae002
+	CALL_MCOUNT 3
8ae002
+
8ae002
+L(_memset):
8ae002
+	cmpldi	cr7,r5,31
8ae002
+	neg	r0,r3
8ae002
+	mr	r10,r3
8ae002
+
8ae002
+	insrdi	r4,r4,8,48
8ae002
+	insrdi	r4,r4,16,32	/* Replicate byte to word.  */
8ae002
+	ble	cr7,L(write_LT_32)
8ae002
+
8ae002
+	andi.	r11,r10,15	/* Check alignment of DST.  */
8ae002
+	insrdi	r4,r4,32,0	/* Replicate word to double word.  */
8ae002
+
8ae002
+	beq	L(big_aligned)
8ae002
+
8ae002
+	mtocrf	0x01,r0
8ae002
+	clrldi	r0,r0,60
8ae002
+
8ae002
+	/* Get DST aligned to 16 bytes.  */
8ae002
+1:	bf	31,2f
8ae002
+	stb	r4,0(r10)
8ae002
+	addi	r10,r10,1
8ae002
+
8ae002
+2:	bf	30,4f
8ae002
+	sth	r4,0(r10)
8ae002
+	addi	r10,r10,2
8ae002
+
8ae002
+4:	bf	29,8f
8ae002
+	stw	r4,0(r10)
8ae002
+	addi	r10,r10,4
8ae002
+
8ae002
+8:	bf      28,16f
8ae002
+	std     r4,0(r10)
8ae002
+	addi    r10,r10,8
8ae002
+
8ae002
+16:	subf	r5,r0,r5
8ae002
+
8ae002
+	.align	4
8ae002
+L(big_aligned):
8ae002
+	/* For sizes larger than 255 two possible paths:
8ae002
+	   - if constant is '0', zero full cache lines with dcbz
8ae002
+	   - otherwise uses vector instructions.  */
8ae002
+	cmpldi	cr5,r5,255
8ae002
+	dcbtst	0,r10
8ae002
+	cmpldi	cr6,r4,0
8ae002
+	crand	27,26,21
8ae002
+	bt	27,L(huge_dcbz)
8ae002
+	bge	cr5,L(huge_vector)
8ae002
+
8ae002
+
8ae002
+	/* Size between 32 and 255 bytes with constant different than 0, use
8ae002
+	   doubleword store instruction to achieve best throughput.  */
8ae002
+	srdi    r8,r5,5
8ae002
+	clrldi  r11,r5,59
8ae002
+	cmpldi  cr6,r11,0
8ae002
+	cmpdi	r8,0
8ae002
+	beq     L(tail_bytes)
8ae002
+	mtctr   r8
8ae002
+
8ae002
+	/* Main aligned write loop, writes 32-bytes at a time.  */
8ae002
+	.align  4
8ae002
+L(big_loop):
8ae002
+	std     r4,0(r10)
8ae002
+	std     r4,8(r10)
8ae002
+	std     r4,16(r10)
8ae002
+	std     r4,24(r10)
8ae002
+	addi    r10,r10,32
8ae002
+	bdz     L(tail_bytes)
8ae002
+
8ae002
+	std     r4,0(r10)
8ae002
+	std     r4,8(r10)
8ae002
+	std     r4,16(r10)
8ae002
+	std     r4,24(r10)
8ae002
+	addi    r10,r10,32
8ae002
+	bdnz    L(big_loop)
8ae002
+
8ae002
+	b       L(tail_bytes)
8ae002
+
8ae002
+	/* Write remaining 1~31 bytes.  */
8ae002
+	.align  4
8ae002
+L(tail_bytes):
8ae002
+	beqlr   cr6
8ae002
+
8ae002
+	srdi    r7,r11,4
8ae002
+	clrldi  r8,r11,60
8ae002
+	mtocrf  0x01,r7
8ae002
+
8ae002
+	.align	4
8ae002
+	bf	31,8f
8ae002
+	std	r4,0(r10)
8ae002
+	std	r4,8(r10)
8ae002
+	addi	r10,r10,16
8ae002
+
8ae002
+	.align	4
8ae002
+8:	mtocrf	0x1,r8
8ae002
+	bf	28,4f
8ae002
+	std	r4,0(r10)
8ae002
+	addi	r10,r10,8
8ae002
+
8ae002
+	.align	4
8ae002
+4:	bf      29,2f
8ae002
+	stw     r4,0(r10)
8ae002
+	addi    r10,r10,4
8ae002
+
8ae002
+	.align 	4
8ae002
+2:	bf      30,1f
8ae002
+	sth     r4,0(r10)
8ae002
+	addi    r10,r10,2
8ae002
+
8ae002
+	.align  4
8ae002
+1:      bflr    31
8ae002
+	stb     r4,0(r10)
8ae002
+	blr
8ae002
+
8ae002
+	/* Size larger than 255 bytes with constant different than 0, use
8ae002
+	   vector instruction to achieve best throughput.  */
8ae002
+L(huge_vector):
8ae002
+	/* Replicate set byte to quadword in VMX register.  */
8ae002
+	mtvsrd	 v1,r4
8ae002
+	xxpermdi 32,v0,v1,0
8ae002
+	vspltb	 v2,v0,15
8ae002
+
8ae002
+	/* Main aligned write loop: 128 bytes at a time.  */
8ae002
+	li	r6,16
8ae002
+	li	r7,32
8ae002
+	li	r8,48
8ae002
+	mtocrf	0x02,r5
8ae002
+	srdi	r12,r5,7
8ae002
+	cmpdi	r12,0
8ae002
+	beq	L(aligned_tail)
8ae002
+	mtctr	r12
8ae002
+	b	L(aligned_128loop)
8ae002
+
8ae002
+	.align  4
8ae002
+L(aligned_128loop):
8ae002
+	stvx	v2,0,r10
8ae002
+	stvx	v2,r10,r6
8ae002
+	stvx	v2,r10,r7
8ae002
+	stvx	v2,r10,r8
8ae002
+	addi	r10,r10,64
8ae002
+	stvx	v2,0,r10
8ae002
+	stvx	v2,r10,r6
8ae002
+	stvx	v2,r10,r7
8ae002
+	stvx	v2,r10,r8
8ae002
+	addi	r10,r10,64
8ae002
+	bdnz	L(aligned_128loop)
8ae002
+
8ae002
+	/* Write remaining 1~127 bytes.  */
8ae002
+L(aligned_tail):
8ae002
+	mtocrf	0x01,r5
8ae002
+	bf	25,32f
8ae002
+	stvx	v2,0,r10
8ae002
+	stvx	v2,r10,r6
8ae002
+	stvx	v2,r10,r7
8ae002
+	stvx	v2,r10,r8
8ae002
+	addi	r10,r10,64
8ae002
+
8ae002
+32:	bf	26,16f
8ae002
+	stvx	v2,0,r10
8ae002
+	stvx	v2,r10,r6
8ae002
+	addi	r10,r10,32
8ae002
+
8ae002
+16:	bf	27,8f
8ae002
+	stvx	v2,0,r10
8ae002
+	addi	r10,r10,16
8ae002
+
8ae002
+8:	bf	28,4f
8ae002
+	std     r4,0(r10)
8ae002
+	addi	r10,r10,8
8ae002
+
8ae002
+	/* Copies 4~7 bytes.  */
8ae002
+4:	bf	29,L(tail2)
8ae002
+	stw     r4,0(r10)
8ae002
+	bf      30,L(tail5)
8ae002
+	sth     r4,4(r10)
8ae002
+	bflr	31
8ae002
+	stb     r4,6(r10)
8ae002
+	/* Return original DST pointer.  */
8ae002
+	blr
8ae002
+
8ae002
+	/* Special case when value is 0 and we have a long length to deal
8ae002
+	   with.  Use dcbz to zero out a full cacheline of 128 bytes at a time.
8ae002
+	   Before using dcbz though, we need to get the destination 128-byte
8ae002
+	   aligned.  */
8ae002
+	.align	4
8ae002
+L(huge_dcbz):
8ae002
+	andi.	r11,r10,127
8ae002
+	neg	r0,r10
8ae002
+	beq	L(huge_dcbz_aligned)
8ae002
+
8ae002
+	clrldi	r0,r0,57
8ae002
+	subf	r5,r0,r5
8ae002
+	srdi	r0,r0,3
8ae002
+	mtocrf	0x01,r0
8ae002
+
8ae002
+	/* Write 1~128 bytes until DST is aligned to 128 bytes.  */
8ae002
+8:	bf	28,4f
8ae002
+
8ae002
+	std	r4,0(r10)
8ae002
+	std	r4,8(r10)
8ae002
+	std	r4,16(r10)
8ae002
+	std	r4,24(r10)
8ae002
+	std	r4,32(r10)
8ae002
+	std	r4,40(r10)
8ae002
+	std	r4,48(r10)
8ae002
+	std	r4,56(r10)
8ae002
+	addi	r10,r10,64
8ae002
+
8ae002
+	.align	4
8ae002
+4:	bf	29,2f
8ae002
+	std	r4,0(r10)
8ae002
+	std	r4,8(r10)
8ae002
+	std	r4,16(r10)
8ae002
+	std	r4,24(r10)
8ae002
+	addi	r10,r10,32
8ae002
+
8ae002
+	.align	4
8ae002
+2:	bf	30,1f
8ae002
+	std	r4,0(r10)
8ae002
+	std	r4,8(r10)
8ae002
+	addi	r10,r10,16
8ae002
+
8ae002
+	.align	4
8ae002
+1:	bf	31,L(huge_dcbz_aligned)
8ae002
+	std	r4,0(r10)
8ae002
+	addi	r10,r10,8
8ae002
+
8ae002
+L(huge_dcbz_aligned):
8ae002
+	/* Setup dcbz unroll offsets and count numbers.  */
8ae002
+	srdi	r8,r5,9
8ae002
+	clrldi	r11,r5,55
8ae002
+	cmpldi	cr6,r11,0
8ae002
+	li	r9,128
8ae002
+	cmpdi	r8,0
8ae002
+	beq     L(huge_tail)
8ae002
+	li	r7,256
8ae002
+	li	r6,384
8ae002
+	mtctr	r8
8ae002
+
8ae002
+	.align	4
8ae002
+L(huge_loop):
8ae002
+	/* Sets 512 bytes to zero in each iteration, the loop unrolling shows
8ae002
+	   a throughput boost for large sizes (2048 bytes or higher).  */
8ae002
+	dcbz	0,r10
8ae002
+	dcbz	r9,r10
8ae002
+	dcbz	r7,r10
8ae002
+	dcbz	r6,r10
8ae002
+	addi	r10,r10,512
8ae002
+	bdnz	L(huge_loop)
8ae002
+
8ae002
+	beqlr	cr6
8ae002
+
8ae002
+L(huge_tail):
8ae002
+	srdi    r6,r11,8
8ae002
+	srdi    r7,r11,4
8ae002
+	clrldi  r8,r11,4
8ae002
+	cmpldi  cr6,r8,0
8ae002
+	mtocrf  0x01,r6
8ae002
+
8ae002
+	beq	cr6,L(tail)
8ae002
+
8ae002
+	/* We have 1~511 bytes remaining.  */
8ae002
+	.align	4
8ae002
+32:	bf	31,16f
8ae002
+	dcbz	0,r10
8ae002
+	dcbz	r9,r10
8ae002
+	addi	r10,r10,256
8ae002
+
8ae002
+	.align	4
8ae002
+16:	mtocrf  0x01,r7
8ae002
+	bf	28,8f
8ae002
+	dcbz	0,r10
8ae002
+	addi	r10,r10,128
8ae002
+
8ae002
+	.align 	4
8ae002
+8:	bf	29,4f
8ae002
+	std	r4,0(r10)
8ae002
+	std	r4,8(r10)
8ae002
+	std	r4,16(r10)
8ae002
+	std	r4,24(r10)
8ae002
+	std	r4,32(r10)
8ae002
+	std	r4,40(r10)
8ae002
+	std	r4,48(r10)
8ae002
+	std	r4,56(r10)
8ae002
+	addi	r10,r10,64
8ae002
+
8ae002
+	.align	4
8ae002
+4:	bf	30,2f
8ae002
+	std	r4,0(r10)
8ae002
+	std	r4,8(r10)
8ae002
+	std	r4,16(r10)
8ae002
+	std	r4,24(r10)
8ae002
+	addi	r10,r10,32
8ae002
+
8ae002
+	.align	4
8ae002
+2:	bf	31,L(tail)
8ae002
+	std	r4,0(r10)
8ae002
+	std	r4,8(r10)
8ae002
+	addi	r10,r10,16
8ae002
+	.align	4
8ae002
+
8ae002
+	/* Remaining 1~15 bytes.  */
8ae002
+L(tail):
8ae002
+	mtocrf  0x01,r8
8ae002
+
8ae002
+	.align	4
8ae002
+8:	bf	28,4f
8ae002
+	std	r4,0(r10)
8ae002
+	addi	r10,r10,8
8ae002
+
8ae002
+	.align	4
8ae002
+4:	bf	29,2f
8ae002
+	stw	r4,0(r10)
8ae002
+	addi	r10,r10,4
8ae002
+
8ae002
+	.align	4
8ae002
+2:	bf	30,1f
8ae002
+	sth	r4,0(r10)
8ae002
+	addi	r10,r10,2
8ae002
+
8ae002
+	.align	4
8ae002
+1:	bflr	31
8ae002
+	stb	r4,0(r10)
8ae002
+	blr
8ae002
+
8ae002
+	/* Handle short copies of 0~31 bytes.  Best throughput is achieved
8ae002
+	   by just unrolling all operations.  */
8ae002
+	.align	4
8ae002
+L(write_LT_32):
8ae002
+	cmpldi	cr6,r5,8
8ae002
+	mtocrf	0x01,r5
8ae002
+	ble	cr6,L(write_LE_8)
8ae002
+
8ae002
+	/* At least 9 bytes to go.  */
8ae002
+	neg	r8,r4
8ae002
+	andi.	r0,r8,3
8ae002
+	cmpldi	cr1,r5,16
8ae002
+	beq	L(write_LT_32_aligned)
8ae002
+
8ae002
+	/* Force 4-byte alignment for SRC.  */
8ae002
+	mtocrf	0x01,r0
8ae002
+	subf	r5,r0,r5
8ae002
+
8ae002
+2:	bf	30,1f
8ae002
+	sth	r4,0(r10)
8ae002
+	addi	r10,r10,2
8ae002
+
8ae002
+1:	bf	31,L(end_4bytes_alignment)
8ae002
+	stb	r4,0(r10)
8ae002
+	addi	r10,r10,1
8ae002
+
8ae002
+	.align	4
8ae002
+L(end_4bytes_alignment):
8ae002
+	cmpldi	cr1,r5,16
8ae002
+	mtocrf	0x01,r5
8ae002
+
8ae002
+L(write_LT_32_aligned):
8ae002
+	blt	cr1,8f
8ae002
+
8ae002
+	stw	r4,0(r10)
8ae002
+	stw	r4,4(r10)
8ae002
+	stw	r4,8(r10)
8ae002
+	stw	r4,12(r10)
8ae002
+	addi	r10,r10,16
8ae002
+
8ae002
+8:	bf	28,L(tail4)
8ae002
+	stw	r4,0(r10)
8ae002
+	stw	r4,4(r10)
8ae002
+	addi	r10,r10,8
8ae002
+
8ae002
+	.align	4
8ae002
+	/* Copies 4~7 bytes.  */
8ae002
+L(tail4):
8ae002
+	bf	29,L(tail2)
8ae002
+	stw	r4,0(r10)
8ae002
+	bf	30,L(tail5)
8ae002
+	sth	r4,4(r10)
8ae002
+	bflr	31
8ae002
+	stb	r4,6(r10)
8ae002
+	blr
8ae002
+
8ae002
+	.align	4
8ae002
+	/* Copies 2~3 bytes.  */
8ae002
+L(tail2):
8ae002
+	bf	30,1f
8ae002
+	sth	r4,0(r10)
8ae002
+	bflr	31
8ae002
+	stb	r4,2(r10)
8ae002
+	blr
8ae002
+
8ae002
+	.align	4
8ae002
+L(tail5):
8ae002
+	bflr	31
8ae002
+	stb	r4,4(r10)
8ae002
+	blr
8ae002
+
8ae002
+	.align	4
8ae002
+1: 	bflr	31
8ae002
+	stb	r4,0(r10)
8ae002
+	blr
8ae002
+
8ae002
+	/* Handles copies of 0~8 bytes.  */
8ae002
+	.align	4
8ae002
+L(write_LE_8):
8ae002
+	bne	cr6,L(tail4)
8ae002
+
8ae002
+	stw	r4,0(r10)
8ae002
+	stw	r4,4(r10)
8ae002
+	blr
8ae002
+END_GEN_TB (memset,TB_TOCLESS)
8ae002
+libc_hidden_builtin_def (memset)
8ae002
+
8ae002
+/* Copied from bzero.S to prevent the linker from inserting a stub
8ae002
+   between bzero and memset.  */
8ae002
+ENTRY (__bzero)
8ae002
+	CALL_MCOUNT 3
8ae002
+	mr	r5,r4
8ae002
+	li	r4,0
8ae002
+	b	L(_memset)
8ae002
+END (__bzero)
8ae002
+#ifndef __bzero
8ae002
+weak_alias (__bzero, bzero)
8ae002
+#endif
8ae002
-- 
8ae002
2.1.0
8ae002