00db10
From 71ae86478edc7b21872464f43fb29ff650c1681a Mon Sep 17 00:00:00 2001
00db10
From: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
00db10
Date: Tue, 15 Jul 2014 12:19:09 -0400
00db10
Subject: [PATCH] PowerPC: memset optimization for POWER8/PPC64
00db10
00db10
This patch adds an optimized memset implementation for POWER8.  For
00db10
sizes from 0 to 255 bytes, a word/doubleword algorithm similar to
00db10
POWER7 optimized one is used.
00db10
00db10
For size higher than 255 two strategies are used:
00db10
00db10
1. If the constant is different from 0, the memory is written with
00db10
   altivec vector instruction;
00db10
00db10
2. If constant is 0, dcbz instructions are used.  The loop is unrolled
00db10
   to clear 512 bytes at a time.
00db10
00db10
Using vector instructions increases throughput considerably, with a
00db10
double performance for sizes larger than 1024 bytes.  The dcbz loop unrolling
00db10
also shows performance improvement, by doubling throughput for sizes
00db10
larger than 8192 bytes.
00db10
---
00db10
 ChangeLog                                          |  15 +
00db10
 benchtests/bench-memset.c                          |   5 +
00db10
 sysdeps/powerpc/powerpc64/multiarch/Makefile       |   2 +-
00db10
 sysdeps/powerpc/powerpc64/multiarch/bzero.c        |  11 +-
00db10
 .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |   6 +
00db10
 .../powerpc/powerpc64/multiarch/memset-power8.S    |  43 ++
00db10
 sysdeps/powerpc/powerpc64/multiarch/memset.c       |  11 +-
00db10
 sysdeps/powerpc/powerpc64/power8/memset.S          | 449 +++++++++++++++++++++
00db10
 8 files changed, 533 insertions(+), 9 deletions(-)
00db10
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
00db10
 create mode 100644 sysdeps/powerpc/powerpc64/power8/memset.S
00db10
00db10
diff --git a/ChangeLog b/ChangeLog
00db10
index ddaf70f..dc61c87 100644
00db10
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
00db10
index 0de3804..abc9d2e 100644
00db10
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
00db10
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
00db10
@@ -1,7 +1,8 @@ ifeq ($(subdir),string)
00db10
 ifeq ($(subdir),string)
00db10
 sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
00db10
                   memcpy-power4 memcpy-ppc64 memcmp-power7 memcmp-power4 \
00db10
-                  memcmp-ppc64 memset-power7 memset-power6 memset-power4 \
00db10
+                  memcmp-ppc64 memset-power8 memset-power7 memset-power6 \
00db10
+                  memset-power4 \
00db10
                   memset-ppc64 bzero-power4 bzero-power6 bzero-power7 \
00db10
                   mempcpy-power7 mempcpy-ppc64 memchr-power7 memchr-ppc64 \
00db10
                   memrchr-power7 memrchr-ppc64 rawmemchr-power7 \
00db10
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
00db10
index ed83541..298cf00 100644
00db10
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c
00db10
+++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
00db10
@@ -26,14 +26,17 @@ extern __typeof (bzero) __bzero_ppc attribute_hidden;
00db10
 extern __typeof (bzero) __bzero_power4 attribute_hidden;
00db10
 extern __typeof (bzero) __bzero_power6 attribute_hidden;
00db10
 extern __typeof (bzero) __bzero_power7 attribute_hidden;
00db10
+extern __typeof (bzero) __bzero_power8 attribute_hidden;
00db10
 
00db10
 libc_ifunc (__bzero,
00db10
-            (hwcap & PPC_FEATURE_HAS_VSX)
00db10
-            ? __bzero_power7 :
00db10
-	      (hwcap & PPC_FEATURE_ARCH_2_05)
00db10
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
00db10
+            ? __bzero_power8 :
00db10
+	      (hwcap & PPC_FEATURE_HAS_VSX)
00db10
+	      ? __bzero_power7 :
00db10
+		(hwcap & PPC_FEATURE_ARCH_2_05)
00db10
 		? __bzero_power6 :
00db10
 		  (hwcap & PPC_FEATURE_POWER4)
00db10
-		? __bzero_power4
00db10
+		  ? __bzero_power4
00db10
             : __bzero_ppc);
00db10
 
00db10
 weak_alias (__bzero, bzero)
00db10
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
00db10
index a574487..06d5be9 100644
00db10
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
00db10
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
00db10
@@ -69,6 +71,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
00db10
 
00db10
   /* Support sysdeps/powerpc/powerpc64/multiarch/memset.c.  */
00db10
   IFUNC_IMPL (i, name, memset,
00db10
+             IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07,
00db10
+                             __memset_power8)
00db10
              IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX,
00db10
                              __memset_power7)
00db10
              IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05,
00db10
@@ -134,6 +138,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
00db10
 
00db10
   /* Support sysdeps/powerpc/powerpc64/multiarch/bzero.c.  */
00db10
   IFUNC_IMPL (i, name, bzero,
00db10
+             IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07,
00db10
+                             __bzero_power8)
00db10
              IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_HAS_VSX,
00db10
                              __bzero_power7)
00db10
              IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_ARCH_2_05,
00db10
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
00db10
new file mode 100644
00db10
index 0000000..e8a604b
00db10
--- /dev/null
00db10
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
00db10
@@ -0,0 +1,43 @@
00db10
+/* Optimized memset implementation for PowerPC64/POWER8.
00db10
+   Copyright (C) 2014 Free Software Foundation, Inc.
00db10
+   This file is part of the GNU C Library.
00db10
+
00db10
+   The GNU C Library is free software; you can redistribute it and/or
00db10
+   modify it under the terms of the GNU Lesser General Public
00db10
+   License as published by the Free Software Foundation; either
00db10
+   version 2.1 of the License, or (at your option) any later version.
00db10
+
00db10
+   The GNU C Library is distributed in the hope that it will be useful,
00db10
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
00db10
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00db10
+   Lesser General Public License for more details.
00db10
+
00db10
+   You should have received a copy of the GNU Lesser General Public
00db10
+   License along with the GNU C Library; if not, see
00db10
+   <http://www.gnu.org/licenses/>.  */
00db10
+
00db10
+#include <sysdep.h>
00db10
+
00db10
+#undef EALIGN
00db10
+#define EALIGN(name, alignt, words)				\
00db10
+  .section ".text";						\
00db10
+  ENTRY_2(__memset_power8)					\
00db10
+  .align ALIGNARG(alignt);					\
00db10
+  EALIGN_W_##words;						\
00db10
+  BODY_LABEL(__memset_power8):					\
00db10
+  cfi_startproc;						\
00db10
+  LOCALENTRY(__memset_power8)
00db10
+
00db10
+#undef END_GEN_TB
00db10
+#define END_GEN_TB(name, mask)					\
00db10
+  cfi_endproc;							\
00db10
+  TRACEBACK_MASK(__memset_power8,mask)				\
00db10
+  END_2(__memset_power8)
00db10
+
00db10
+#undef libc_hidden_builtin_def
00db10
+#define libc_hidden_builtin_def(name)
00db10
+
00db10
+#undef __bzero
00db10
+#define __bzero __bzero_power8
00db10
+
00db10
+#include <sysdeps/powerpc/powerpc64/power8/memset.S>
00db10
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
00db10
index aa2ae70..9c7ed10 100644
00db10
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
00db10
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
00db10
@@ -32,16 +32,19 @@ extern __typeof (__redirect_memset) __memset_ppc attribute_hidden;
00db10
 extern __typeof (__redirect_memset) __memset_power4 attribute_hidden;
00db10
 extern __typeof (__redirect_memset) __memset_power6 attribute_hidden;
00db10
 extern __typeof (__redirect_memset) __memset_power7 attribute_hidden;
00db10
+extern __typeof (__redirect_memset) __memset_power8 attribute_hidden;
00db10
 
00db10
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
00db10
    ifunc symbol properly.  */
00db10
 libc_ifunc (__libc_memset,
00db10
-            (hwcap & PPC_FEATURE_HAS_VSX)
00db10
-            ? __memset_power7 :
00db10
-	      (hwcap & PPC_FEATURE_ARCH_2_05)
00db10
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
00db10
+            ? __memset_power8 :
00db10
+	      (hwcap & PPC_FEATURE_HAS_VSX)
00db10
+	      ? __memset_power7 :
00db10
+		(hwcap & PPC_FEATURE_ARCH_2_05)
00db10
 		? __memset_power6 :
00db10
 		  (hwcap & PPC_FEATURE_POWER4)
00db10
-		? __memset_power4
00db10
+		  ? __memset_power4
00db10
             : __memset_ppc);
00db10
 
00db10
 #undef memset
00db10
diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S
00db10
new file mode 100644
00db10
index 0000000..191a4df
00db10
--- /dev/null
00db10
+++ b/sysdeps/powerpc/powerpc64/power8/memset.S
00db10
@@ -0,0 +1,449 @@
00db10
+/* Optimized memset implementation for PowerPC64/POWER8.
00db10
+   Copyright (C) 2014 Free Software Foundation, Inc.
00db10
+   This file is part of the GNU C Library.
00db10
+
00db10
+   The GNU C Library is free software; you can redistribute it and/or
00db10
+   modify it under the terms of the GNU Lesser General Public
00db10
+   License as published by the Free Software Foundation; either
00db10
+   version 2.1 of the License, or (at your option) any later version.
00db10
+
00db10
+   The GNU C Library is distributed in the hope that it will be useful,
00db10
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
00db10
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00db10
+   Lesser General Public License for more details.
00db10
+
00db10
+   You should have received a copy of the GNU Lesser General Public
00db10
+   License along with the GNU C Library; if not, see
00db10
+   <http://www.gnu.org/licenses/>.  */
00db10
+
00db10
+#include <sysdep.h>
00db10
+
00db10
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
00db10
+   Returns 's'.  */
00db10
+
00db10
+	.machine power8
00db10
+EALIGN (memset, 5, 0)
00db10
+	CALL_MCOUNT 3
00db10
+
00db10
+L(_memset):
00db10
+	cmpldi	cr7,r5,31
00db10
+	neg	r0,r3
00db10
+	mr	r10,r3
00db10
+
00db10
+	insrdi	r4,r4,8,48
00db10
+	insrdi	r4,r4,16,32	/* Replicate byte to word.  */
00db10
+	ble	cr7,L(write_LT_32)
00db10
+
00db10
+	andi.	r11,r10,15	/* Check alignment of DST.  */
00db10
+	insrdi	r4,r4,32,0	/* Replicate word to double word.  */
00db10
+
00db10
+	beq	L(big_aligned)
00db10
+
00db10
+	mtocrf	0x01,r0
00db10
+	clrldi	r0,r0,60
00db10
+
00db10
+	/* Get DST aligned to 16 bytes.  */
00db10
+1:	bf	31,2f
00db10
+	stb	r4,0(r10)
00db10
+	addi	r10,r10,1
00db10
+
00db10
+2:	bf	30,4f
00db10
+	sth	r4,0(r10)
00db10
+	addi	r10,r10,2
00db10
+
00db10
+4:	bf	29,8f
00db10
+	stw	r4,0(r10)
00db10
+	addi	r10,r10,4
00db10
+
00db10
+8:	bf      28,16f
00db10
+	std     r4,0(r10)
00db10
+	addi    r10,r10,8
00db10
+
00db10
+16:	subf	r5,r0,r5
00db10
+
00db10
+	.align	4
00db10
+L(big_aligned):
00db10
+	/* For sizes larger than 255 two possible paths:
00db10
+	   - if constant is '0', zero full cache lines with dcbz
00db10
+	   - otherwise uses vector instructions.  */
00db10
+	cmpldi	cr5,r5,255
00db10
+	dcbtst	0,r10
00db10
+	cmpldi	cr6,r4,0
00db10
+	crand	27,26,21
00db10
+	bt	27,L(huge_dcbz)
00db10
+	bge	cr5,L(huge_vector)
00db10
+
00db10
+
00db10
+	/* Size between 32 and 255 bytes with constant different than 0, use
00db10
+	   doubleword store instruction to achieve best throughput.  */
00db10
+	srdi    r8,r5,5
00db10
+	clrldi  r11,r5,59
00db10
+	cmpldi  cr6,r11,0
00db10
+	cmpdi	r8,0
00db10
+	beq     L(tail_bytes)
00db10
+	mtctr   r8
00db10
+
00db10
+	/* Main aligned write loop, writes 32-bytes at a time.  */
00db10
+	.align  4
00db10
+L(big_loop):
00db10
+	std     r4,0(r10)
00db10
+	std     r4,8(r10)
00db10
+	std     r4,16(r10)
00db10
+	std     r4,24(r10)
00db10
+	addi    r10,r10,32
00db10
+	bdz     L(tail_bytes)
00db10
+
00db10
+	std     r4,0(r10)
00db10
+	std     r4,8(r10)
00db10
+	std     r4,16(r10)
00db10
+	std     r4,24(r10)
00db10
+	addi    r10,r10,32
00db10
+	bdnz    L(big_loop)
00db10
+
00db10
+	b       L(tail_bytes)
00db10
+
00db10
+	/* Write remaining 1~31 bytes.  */
00db10
+	.align  4
00db10
+L(tail_bytes):
00db10
+	beqlr   cr6
00db10
+
00db10
+	srdi    r7,r11,4
00db10
+	clrldi  r8,r11,60
00db10
+	mtocrf  0x01,r7
00db10
+
00db10
+	.align	4
00db10
+	bf	31,8f
00db10
+	std	r4,0(r10)
00db10
+	std	r4,8(r10)
00db10
+	addi	r10,r10,16
00db10
+
00db10
+	.align	4
00db10
+8:	mtocrf	0x1,r8
00db10
+	bf	28,4f
00db10
+	std	r4,0(r10)
00db10
+	addi	r10,r10,8
00db10
+
00db10
+	.align	4
00db10
+4:	bf      29,2f
00db10
+	stw     r4,0(r10)
00db10
+	addi    r10,r10,4
00db10
+
00db10
+	.align 	4
00db10
+2:	bf      30,1f
00db10
+	sth     r4,0(r10)
00db10
+	addi    r10,r10,2
00db10
+
00db10
+	.align  4
00db10
+1:      bflr    31
00db10
+	stb     r4,0(r10)
00db10
+	blr
00db10
+
00db10
+	/* Size larger than 255 bytes with constant different than 0, use
00db10
+	   vector instruction to achieve best throughput.  */
00db10
+L(huge_vector):
00db10
+	/* Replicate set byte to quadword in VMX register.  */
00db10
+	mtvsrd	 v1,r4
00db10
+	xxpermdi 32,v0,v1,0
00db10
+	vspltb	 v2,v0,15
00db10
+
00db10
+	/* Main aligned write loop: 128 bytes at a time.  */
00db10
+	li	r6,16
00db10
+	li	r7,32
00db10
+	li	r8,48
00db10
+	mtocrf	0x02,r5
00db10
+	srdi	r12,r5,7
00db10
+	cmpdi	r12,0
00db10
+	beq	L(aligned_tail)
00db10
+	mtctr	r12
00db10
+	b	L(aligned_128loop)
00db10
+
00db10
+	.align  4
00db10
+L(aligned_128loop):
00db10
+	stvx	v2,0,r10
00db10
+	stvx	v2,r10,r6
00db10
+	stvx	v2,r10,r7
00db10
+	stvx	v2,r10,r8
00db10
+	addi	r10,r10,64
00db10
+	stvx	v2,0,r10
00db10
+	stvx	v2,r10,r6
00db10
+	stvx	v2,r10,r7
00db10
+	stvx	v2,r10,r8
00db10
+	addi	r10,r10,64
00db10
+	bdnz	L(aligned_128loop)
00db10
+
00db10
+	/* Write remaining 1~127 bytes.  */
00db10
+L(aligned_tail):
00db10
+	mtocrf	0x01,r5
00db10
+	bf	25,32f
00db10
+	stvx	v2,0,r10
00db10
+	stvx	v2,r10,r6
00db10
+	stvx	v2,r10,r7
00db10
+	stvx	v2,r10,r8
00db10
+	addi	r10,r10,64
00db10
+
00db10
+32:	bf	26,16f
00db10
+	stvx	v2,0,r10
00db10
+	stvx	v2,r10,r6
00db10
+	addi	r10,r10,32
00db10
+
00db10
+16:	bf	27,8f
00db10
+	stvx	v2,0,r10
00db10
+	addi	r10,r10,16
00db10
+
00db10
+8:	bf	28,4f
00db10
+	std     r4,0(r10)
00db10
+	addi	r10,r10,8
00db10
+
00db10
+	/* Copies 4~7 bytes.  */
00db10
+4:	bf	29,L(tail2)
00db10
+	stw     r4,0(r10)
00db10
+	bf      30,L(tail5)
00db10
+	sth     r4,4(r10)
00db10
+	bflr	31
00db10
+	stb     r4,6(r10)
00db10
+	/* Return original DST pointer.  */
00db10
+	blr
00db10
+
00db10
+	/* Special case when value is 0 and we have a long length to deal
00db10
+	   with.  Use dcbz to zero out a full cacheline of 128 bytes at a time.
00db10
+	   Before using dcbz though, we need to get the destination 128-byte
00db10
+	   aligned.  */
00db10
+	.align	4
00db10
+L(huge_dcbz):
00db10
+	andi.	r11,r10,127
00db10
+	neg	r0,r10
00db10
+	beq	L(huge_dcbz_aligned)
00db10
+
00db10
+	clrldi	r0,r0,57
00db10
+	subf	r5,r0,r5
00db10
+	srdi	r0,r0,3
00db10
+	mtocrf	0x01,r0
00db10
+
00db10
+	/* Write 1~128 bytes until DST is aligned to 128 bytes.  */
00db10
+8:	bf	28,4f
00db10
+
00db10
+	std	r4,0(r10)
00db10
+	std	r4,8(r10)
00db10
+	std	r4,16(r10)
00db10
+	std	r4,24(r10)
00db10
+	std	r4,32(r10)
00db10
+	std	r4,40(r10)
00db10
+	std	r4,48(r10)
00db10
+	std	r4,56(r10)
00db10
+	addi	r10,r10,64
00db10
+
00db10
+	.align	4
00db10
+4:	bf	29,2f
00db10
+	std	r4,0(r10)
00db10
+	std	r4,8(r10)
00db10
+	std	r4,16(r10)
00db10
+	std	r4,24(r10)
00db10
+	addi	r10,r10,32
00db10
+
00db10
+	.align	4
00db10
+2:	bf	30,1f
00db10
+	std	r4,0(r10)
00db10
+	std	r4,8(r10)
00db10
+	addi	r10,r10,16
00db10
+
00db10
+	.align	4
00db10
+1:	bf	31,L(huge_dcbz_aligned)
00db10
+	std	r4,0(r10)
00db10
+	addi	r10,r10,8
00db10
+
00db10
+L(huge_dcbz_aligned):
00db10
+	/* Setup dcbz unroll offsets and count numbers.  */
00db10
+	srdi	r8,r5,9
00db10
+	clrldi	r11,r5,55
00db10
+	cmpldi	cr6,r11,0
00db10
+	li	r9,128
00db10
+	cmpdi	r8,0
00db10
+	beq     L(huge_tail)
00db10
+	li	r7,256
00db10
+	li	r6,384
00db10
+	mtctr	r8
00db10
+
00db10
+	.align	4
00db10
+L(huge_loop):
00db10
+	/* Sets 512 bytes to zero in each iteration, the loop unrolling shows
00db10
+	   a throughput boost for large sizes (2048 bytes or higher).  */
00db10
+	dcbz	0,r10
00db10
+	dcbz	r9,r10
00db10
+	dcbz	r7,r10
00db10
+	dcbz	r6,r10
00db10
+	addi	r10,r10,512
00db10
+	bdnz	L(huge_loop)
00db10
+
00db10
+	beqlr	cr6
00db10
+
00db10
+L(huge_tail):
00db10
+	srdi    r6,r11,8
00db10
+	srdi    r7,r11,4
00db10
+	clrldi  r8,r11,4
00db10
+	cmpldi  cr6,r8,0
00db10
+	mtocrf  0x01,r6
00db10
+
00db10
+	beq	cr6,L(tail)
00db10
+
00db10
+	/* We have 1~511 bytes remaining.  */
00db10
+	.align	4
00db10
+32:	bf	31,16f
00db10
+	dcbz	0,r10
00db10
+	dcbz	r9,r10
00db10
+	addi	r10,r10,256
00db10
+
00db10
+	.align	4
00db10
+16:	mtocrf  0x01,r7
00db10
+	bf	28,8f
00db10
+	dcbz	0,r10
00db10
+	addi	r10,r10,128
00db10
+
00db10
+	.align 	4
00db10
+8:	bf	29,4f
00db10
+	std	r4,0(r10)
00db10
+	std	r4,8(r10)
00db10
+	std	r4,16(r10)
00db10
+	std	r4,24(r10)
00db10
+	std	r4,32(r10)
00db10
+	std	r4,40(r10)
00db10
+	std	r4,48(r10)
00db10
+	std	r4,56(r10)
00db10
+	addi	r10,r10,64
00db10
+
00db10
+	.align	4
00db10
+4:	bf	30,2f
00db10
+	std	r4,0(r10)
00db10
+	std	r4,8(r10)
00db10
+	std	r4,16(r10)
00db10
+	std	r4,24(r10)
00db10
+	addi	r10,r10,32
00db10
+
00db10
+	.align	4
00db10
+2:	bf	31,L(tail)
00db10
+	std	r4,0(r10)
00db10
+	std	r4,8(r10)
00db10
+	addi	r10,r10,16
00db10
+	.align	4
00db10
+
00db10
+	/* Remaining 1~15 bytes.  */
00db10
+L(tail):
00db10
+	mtocrf  0x01,r8
00db10
+
00db10
+	.align	4
00db10
+8:	bf	28,4f
00db10
+	std	r4,0(r10)
00db10
+	addi	r10,r10,8
00db10
+
00db10
+	.align	4
00db10
+4:	bf	29,2f
00db10
+	stw	r4,0(r10)
00db10
+	addi	r10,r10,4
00db10
+
00db10
+	.align	4
00db10
+2:	bf	30,1f
00db10
+	sth	r4,0(r10)
00db10
+	addi	r10,r10,2
00db10
+
00db10
+	.align	4
00db10
+1:	bflr	31
00db10
+	stb	r4,0(r10)
00db10
+	blr
00db10
+
00db10
+	/* Handle short copies of 0~31 bytes.  Best throughput is achieved
00db10
+	   by just unrolling all operations.  */
00db10
+	.align	4
00db10
+L(write_LT_32):
00db10
+	cmpldi	cr6,r5,8
00db10
+	mtocrf	0x01,r5
00db10
+	ble	cr6,L(write_LE_8)
00db10
+
00db10
+	/* At least 9 bytes to go.  */
00db10
+	neg	r8,r4
00db10
+	andi.	r0,r8,3
00db10
+	cmpldi	cr1,r5,16
00db10
+	beq	L(write_LT_32_aligned)
00db10
+
00db10
+	/* Force 4-byte alignment for SRC.  */
00db10
+	mtocrf	0x01,r0
00db10
+	subf	r5,r0,r5
00db10
+
00db10
+2:	bf	30,1f
00db10
+	sth	r4,0(r10)
00db10
+	addi	r10,r10,2
00db10
+
00db10
+1:	bf	31,L(end_4bytes_alignment)
00db10
+	stb	r4,0(r10)
00db10
+	addi	r10,r10,1
00db10
+
00db10
+	.align	4
00db10
+L(end_4bytes_alignment):
00db10
+	cmpldi	cr1,r5,16
00db10
+	mtocrf	0x01,r5
00db10
+
00db10
+L(write_LT_32_aligned):
00db10
+	blt	cr1,8f
00db10
+
00db10
+	stw	r4,0(r10)
00db10
+	stw	r4,4(r10)
00db10
+	stw	r4,8(r10)
00db10
+	stw	r4,12(r10)
00db10
+	addi	r10,r10,16
00db10
+
00db10
+8:	bf	28,L(tail4)
00db10
+	stw	r4,0(r10)
00db10
+	stw	r4,4(r10)
00db10
+	addi	r10,r10,8
00db10
+
00db10
+	.align	4
00db10
+	/* Copies 4~7 bytes.  */
00db10
+L(tail4):
00db10
+	bf	29,L(tail2)
00db10
+	stw	r4,0(r10)
00db10
+	bf	30,L(tail5)
00db10
+	sth	r4,4(r10)
00db10
+	bflr	31
00db10
+	stb	r4,6(r10)
00db10
+	blr
00db10
+
00db10
+	.align	4
00db10
+	/* Copies 2~3 bytes.  */
00db10
+L(tail2):
00db10
+	bf	30,1f
00db10
+	sth	r4,0(r10)
00db10
+	bflr	31
00db10
+	stb	r4,2(r10)
00db10
+	blr
00db10
+
00db10
+	.align	4
00db10
+L(tail5):
00db10
+	bflr	31
00db10
+	stb	r4,4(r10)
00db10
+	blr
00db10
+
00db10
+	.align	4
00db10
+1: 	bflr	31
00db10
+	stb	r4,0(r10)
00db10
+	blr
00db10
+
00db10
+	/* Handles copies of 0~8 bytes.  */
00db10
+	.align	4
00db10
+L(write_LE_8):
00db10
+	bne	cr6,L(tail4)
00db10
+
00db10
+	stw	r4,0(r10)
00db10
+	stw	r4,4(r10)
00db10
+	blr
00db10
+END_GEN_TB (memset,TB_TOCLESS)
00db10
+libc_hidden_builtin_def (memset)
00db10
+
00db10
+/* Copied from bzero.S to prevent the linker from inserting a stub
00db10
+   between bzero and memset.  */
00db10
+ENTRY (__bzero)
00db10
+	CALL_MCOUNT 3
00db10
+	mr	r5,r4
00db10
+	li	r4,0
00db10
+	b	L(_memset)
00db10
+END (__bzero)
00db10
+#ifndef __bzero
00db10
+weak_alias (__bzero, bzero)
00db10
+#endif
00db10
-- 
00db10
2.1.0
00db10