|
|
8ae002 |
From 2ffa8b8660a7a17572ae5a398171c8be59985eb3 Mon Sep 17 00:00:00 2001
|
|
|
8ae002 |
From: "Gabriel F. T. Gomes" <gftg@linux.vnet.ibm.com>
|
|
|
8ae002 |
Date: Mon, 25 Jan 2016 10:50:34 -0500
|
|
|
8ae002 |
Subject: [PATCH] powerpc: Zero pad using memset in strncpy/stpncpy
|
|
|
8ae002 |
|
|
|
8ae002 |
Call __memset_power8 to pad, with zeros, the remaining bytes in the
|
|
|
8ae002 |
dest string on __strncpy_power8 and __stpncpy_power8. This improves
|
|
|
8ae002 |
performance when n is larger than the input string, giving ~30% gain for
|
|
|
8ae002 |
larger strings without much impact on shorter strings.
|
|
|
8ae002 |
|
|
|
8ae002 |
(cherry picked from commit 72c11b353ede72931cc474c9071d143d9a05c0d7)
|
|
|
8ae002 |
---
|
|
|
8ae002 |
ChangeLog | 5 ++
|
|
|
8ae002 |
sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++----------------
|
|
|
8ae002 |
2 files changed, 61 insertions(+), 67 deletions(-)
|
|
|
8ae002 |
|
|
|
8ae002 |
diff --git a/ChangeLog b/ChangeLog
|
|
|
8ae002 |
index 5537fc6..8d0e296 100644
|
|
|
8ae002 |
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
|
|
|
8ae002 |
index 5fda953..80136cc 100644
|
|
|
8ae002 |
--- a/sysdeps/powerpc/powerpc64/power8/strncpy.S
|
|
|
8ae002 |
+++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
|
|
|
8ae002 |
@@ -24,6 +24,8 @@
|
|
|
8ae002 |
# define FUNC_NAME strncpy
|
|
|
8ae002 |
#endif
|
|
|
8ae002 |
|
|
|
8ae002 |
+#define FRAMESIZE (FRAME_MIN_SIZE+48)
|
|
|
8ae002 |
+
|
|
|
8ae002 |
/* Implements the function
|
|
|
8ae002 |
|
|
|
8ae002 |
char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
|
|
|
8ae002 |
@@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0)
|
|
|
8ae002 |
addi r10,r4,16
|
|
|
8ae002 |
rlwinm r9,r4,0,19,19
|
|
|
8ae002 |
|
|
|
8ae002 |
- /* Since it is a leaf function, save some non-volatile registers on the
|
|
|
8ae002 |
- protected/red zone. */
|
|
|
8ae002 |
+ /* Save some non-volatile registers on the stack. */
|
|
|
8ae002 |
std r26,-48(r1)
|
|
|
8ae002 |
std r27,-40(r1)
|
|
|
8ae002 |
|
|
|
8ae002 |
@@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0)
|
|
|
8ae002 |
std r30,-16(r1)
|
|
|
8ae002 |
std r31,-8(r1)
|
|
|
8ae002 |
|
|
|
8ae002 |
+ /* Update CFI. */
|
|
|
8ae002 |
+ cfi_offset(r26, -48)
|
|
|
8ae002 |
+ cfi_offset(r27, -40)
|
|
|
8ae002 |
+ cfi_offset(r28, -32)
|
|
|
8ae002 |
+ cfi_offset(r29, -24)
|
|
|
8ae002 |
+ cfi_offset(r30, -16)
|
|
|
8ae002 |
+ cfi_offset(r31, -8)
|
|
|
8ae002 |
+
|
|
|
8ae002 |
beq cr7,L(unaligned_lt_16)
|
|
|
8ae002 |
rldicl r9,r4,0,61
|
|
|
8ae002 |
subfic r8,r9,8
|
|
|
8ae002 |
@@ -144,74 +153,58 @@ L(short_path_loop_end):
|
|
|
8ae002 |
ld r31,-8(r1)
|
|
|
8ae002 |
blr
|
|
|
8ae002 |
|
|
|
8ae002 |
- /* This code pads the remainder dest with NULL bytes. The algorithm
|
|
|
8ae002 |
- calculate the remanining size and issues a doubleword unrolled
|
|
|
8ae002 |
- loops followed by a byte a byte set. */
|
|
|
8ae002 |
+ /* This code pads the remainder of dest with NULL bytes. The algorithm
|
|
|
8ae002 |
+ calculates the remaining size and calls memset. */
|
|
|
8ae002 |
.align 4
|
|
|
8ae002 |
L(zero_pad_start):
|
|
|
8ae002 |
mr r5,r10
|
|
|
8ae002 |
mr r9,r6
|
|
|
8ae002 |
L(zero_pad_start_1):
|
|
|
8ae002 |
- srdi. r8,r5,r3
|
|
|
8ae002 |
- mr r10,r9
|
|
|
8ae002 |
-#ifdef USE_AS_STPNCPY
|
|
|
8ae002 |
- mr r3,r9
|
|
|
8ae002 |
+ /* At this point:
|
|
|
8ae002 |
+ - r5 holds the number of bytes that still have to be written to
|
|
|
8ae002 |
+ dest.
|
|
|
8ae002 |
+ - r9 points to the position, in dest, where the first null byte
|
|
|
8ae002 |
+ will be written.
|
|
|
8ae002 |
+ The above statements are true both when control reaches this label
|
|
|
8ae002 |
+ from a branch and when falling through the previous lines. */
|
|
|
8ae002 |
+#ifndef USE_AS_STPNCPY
|
|
|
8ae002 |
+ mr r30,r3 /* Save the return value of strncpy. */
|
|
|
8ae002 |
+#endif
|
|
|
8ae002 |
+ /* Prepare the call to memset. */
|
|
|
8ae002 |
+ mr r3,r9 /* Pointer to the area to be zero-filled. */
|
|
|
8ae002 |
+ li r4,0 /* Byte to be written (zero). */
|
|
|
8ae002 |
+
|
|
|
8ae002 |
+ /* We delayed the creation of the stack frame, as well as the saving of
|
|
|
8ae002 |
+ the link register, because only at this point are we sure that
|
|
|
8ae002 |
+ doing so is actually needed. */
|
|
|
8ae002 |
+
|
|
|
8ae002 |
+ /* Save the link register. */
|
|
|
8ae002 |
+ mflr r0
|
|
|
8ae002 |
+ std r0,16(r1)
|
|
|
8ae002 |
+ cfi_offset(lr, 16)
|
|
|
8ae002 |
+
|
|
|
8ae002 |
+ /* Create the stack frame. */
|
|
|
8ae002 |
+ stdu r1,-FRAMESIZE(r1)
|
|
|
8ae002 |
+ cfi_adjust_cfa_offset(FRAMESIZE)
|
|
|
8ae002 |
+
|
|
|
8ae002 |
+ bl __memset_power8
|
|
|
8ae002 |
+ nop
|
|
|
8ae002 |
+
|
|
|
8ae002 |
+ /* Restore the stack frame. */
|
|
|
8ae002 |
+ addi r1,r1,FRAMESIZE
|
|
|
8ae002 |
+ cfi_adjust_cfa_offset(-FRAMESIZE)
|
|
|
8ae002 |
+ /* Restore the link register. */
|
|
|
8ae002 |
+ ld r0,16(r1)
|
|
|
8ae002 |
+ mtlr r0
|
|
|
8ae002 |
+
|
|
|
8ae002 |
+#ifndef USE_AS_STPNCPY
|
|
|
8ae002 |
+ mr r3,r30 /* Restore the return value of strncpy, i.e.:
|
|
|
8ae002 |
+ dest. For stpncpy, the return value is the
|
|
|
8ae002 |
+ same as the return value of memset. */
|
|
|
8ae002 |
#endif
|
|
|
8ae002 |
- beq- cr0,L(zero_pad_loop_b_start)
|
|
|
8ae002 |
- cmpldi cr7,r8,1
|
|
|
8ae002 |
- li cr7,0
|
|
|
8ae002 |
- std r7,0(r9)
|
|
|
8ae002 |
- beq cr7,L(zero_pad_loop_b_prepare)
|
|
|
8ae002 |
- addic. r8,r8,-2
|
|
|
8ae002 |
- addi r10,r9,r16
|
|
|
8ae002 |
- std r7,8(r9)
|
|
|
8ae002 |
- beq cr0,L(zero_pad_loop_dw_2)
|
|
|
8ae002 |
- std r7,16(r9)
|
|
|
8ae002 |
- li r9,0
|
|
|
8ae002 |
- b L(zero_pad_loop_dw_1)
|
|
|
8ae002 |
-
|
|
|
8ae002 |
- .align 4
|
|
|
8ae002 |
-L(zero_pad_loop_dw):
|
|
|
8ae002 |
- addi r10,r10,16
|
|
|
8ae002 |
- std r9,-8(r10)
|
|
|
8ae002 |
- beq cr0,L(zero_pad_loop_dw_2)
|
|
|
8ae002 |
- std r9,0(r10)
|
|
|
8ae002 |
-L(zero_pad_loop_dw_1):
|
|
|
8ae002 |
- cmpldi cr7,r8,1
|
|
|
8ae002 |
- std r9,0(r10)
|
|
|
8ae002 |
- addic. r8,r8,-2
|
|
|
8ae002 |
- bne cr7,L(zero_pad_loop_dw)
|
|
|
8ae002 |
- addi r10,r10,8
|
|
|
8ae002 |
-L(zero_pad_loop_dw_2):
|
|
|
8ae002 |
- rldicl r5,r5,0,61
|
|
|
8ae002 |
-L(zero_pad_loop_b_start):
|
|
|
8ae002 |
- cmpdi cr7,r5,0
|
|
|
8ae002 |
- addi r5,r5,-1
|
|
|
8ae002 |
- addi r9,r10,-1
|
|
|
8ae002 |
- add r10,r10,5
|
|
|
8ae002 |
- subf r10,r9,r10
|
|
|
8ae002 |
- li r8,0
|
|
|
8ae002 |
- beq- cr7,L(short_path_loop_end)
|
|
|
8ae002 |
-
|
|
|
8ae002 |
- /* Write remaining 1-8 bytes. */
|
|
|
8ae002 |
- .align 4
|
|
|
8ae002 |
- addi r9,r9,1
|
|
|
8ae002 |
- mtocrf 0x1,r10
|
|
|
8ae002 |
- bf 29,4f
|
|
|
8ae002 |
- stw r8,0(r9)
|
|
|
8ae002 |
- addi r9,r9,4
|
|
|
8ae002 |
-
|
|
|
8ae002 |
- .align 4
|
|
|
8ae002 |
-4: bf 30,2f
|
|
|
8ae002 |
- sth r8,0(r9)
|
|
|
8ae002 |
- addi r9,r9,2
|
|
|
8ae002 |
-
|
|
|
8ae002 |
- .align 4
|
|
|
8ae002 |
-2: bf 31,1f
|
|
|
8ae002 |
- stb r8,0(r9)
|
|
|
8ae002 |
|
|
|
8ae002 |
- /* Restore non-volatile registers. */
|
|
|
8ae002 |
-1: ld r26,-48(r1)
|
|
|
8ae002 |
+ /* Restore non-volatile registers and return. */
|
|
|
8ae002 |
+ ld r26,-48(r1)
|
|
|
8ae002 |
ld r27,-40(r1)
|
|
|
8ae002 |
ld r28,-32(r1)
|
|
|
8ae002 |
ld r29,-24(r1)
|
|
|
8ae002 |
@@ -407,10 +400,6 @@ L(short_path_prepare_2_3):
|
|
|
8ae002 |
mr r4,r28
|
|
|
8ae002 |
mr r9,r29
|
|
|
8ae002 |
b L(short_path_2)
|
|
|
8ae002 |
-L(zero_pad_loop_b_prepare):
|
|
|
8ae002 |
- addi r10,r9,8
|
|
|
8ae002 |
- rldicl r5,r5,0,61
|
|
|
8ae002 |
- b L(zero_pad_loop_b_start)
|
|
|
8ae002 |
L(zero_pad_start_prepare_1):
|
|
|
8ae002 |
mr r5,r6
|
|
|
8ae002 |
mr r9,r8
|
|
|
8ae002 |
--
|
|
|
8ae002 |
2.1.0
|
|
|
8ae002 |
|