|
|
00db10 |
# commit 466b03933234017473c12dd1d92bda5e7fe49df7
|
|
|
00db10 |
# Author: Alan Modra <amodra@gmail.com>
|
|
|
00db10 |
# Date: Sat Aug 17 18:48:36 2013 +0930
|
|
|
00db10 |
#
|
|
|
00db10 |
# PowerPC LE memchr and memrchr
|
|
|
00db10 |
# http://sourceware.org/ml/libc-alpha/2013-08/msg00105.html
|
|
|
00db10 |
#
|
|
|
00db10 |
# Like strnlen, memchr and memrchr had a number of defects fixed by this
|
|
|
00db10 |
# patch as well as adding little-endian support. The first one I
|
|
|
00db10 |
# noticed was that the entry to the main loop needlessly checked for
|
|
|
00db10 |
# "are we done yet?" when we know the size is large enough that we can't
|
|
|
00db10 |
# be done. The second defect I noticed was that the main loop count was
|
|
|
00db10 |
# wrong, which in turn meant that the small loop needed to handle an
|
|
|
00db10 |
# extra word. Thirdly, there is nothing to say that the string can't
|
|
|
00db10 |
# wrap around zero, except of course that we'd normally hit a segfault
|
|
|
00db10 |
# on trying to read from address zero. Fixing that simplified a number
|
|
|
00db10 |
# of places:
|
|
|
00db10 |
#
|
|
|
00db10 |
# - /* Are we done already? */
|
|
|
00db10 |
# - addi r9,r8,8
|
|
|
00db10 |
# - cmpld r9,r7
|
|
|
00db10 |
# - bge L(null)
|
|
|
00db10 |
#
|
|
|
00db10 |
# becomes
|
|
|
00db10 |
#
|
|
|
00db10 |
# + cmpld r8,r7
|
|
|
00db10 |
# + beqlr
|
|
|
00db10 |
#
|
|
|
00db10 |
# However, the exit gets an extra test because I test for being on the
|
|
|
00db10 |
# last word then if so whether the byte offset is less than the end.
|
|
|
00db10 |
# Overall, the change is a win.
|
|
|
00db10 |
#
|
|
|
00db10 |
# Lastly, memrchr used the wrong cache hint.
|
|
|
00db10 |
#
|
|
|
00db10 |
# * sysdeps/powerpc/powerpc64/power7/memchr.S: Replace rlwimi with
|
|
|
00db10 |
# insrdi. Make better use of reg selection to speed exit slightly.
|
|
|
00db10 |
# Schedule entry path a little better. Remove useless "are we done"
|
|
|
00db10 |
# checks on entry to main loop. Handle wrapping around zero address.
|
|
|
00db10 |
# Correct main loop count. Handle single left-over word from main
|
|
|
00db10 |
# loop inline rather than by using loop_small. Remove extra word
|
|
|
00db10 |
# case in loop_small caused by wrong loop count. Add little-endian
|
|
|
00db10 |
# support.
|
|
|
00db10 |
# * sysdeps/powerpc/powerpc32/power7/memchr.S: Likewise.
|
|
|
00db10 |
# * sysdeps/powerpc/powerpc64/power7/memrchr.S: Likewise. Use proper
|
|
|
00db10 |
# cache hint.
|
|
|
00db10 |
# * sysdeps/powerpc/powerpc32/power7/memrchr.S: Likewise.
|
|
|
00db10 |
# * sysdeps/powerpc/powerpc64/power7/rawmemchr.S: Add little-endian
|
|
|
00db10 |
# support. Avoid rlwimi.
|
|
|
00db10 |
# * sysdeps/powerpc/powerpc32/power7/rawmemchr.S: Likewise.
|
|
|
00db10 |
#
|
|
|
00db10 |
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memchr.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memchr.S
|
|
|
00db10 |
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memchr.S 2014-05-29 13:09:17.000000000 -0500
|
|
|
00db10 |
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memchr.S 2014-05-29 13:13:37.000000000 -0500
|
|
|
00db10 |
@@ -1,5 +1,5 @@
|
|
|
00db10 |
/* Optimized memchr implementation for PowerPC32/POWER7 using cmpb insn.
|
|
|
00db10 |
- Copyright (C) 2010-2012 Free Software Foundation, Inc.
|
|
|
00db10 |
+ Copyright (C) 2010-2014 Free Software Foundation, Inc.
|
|
|
00db10 |
Contributed by Luis Machado <luisgpm@br.ibm.com>.
|
|
|
00db10 |
This file is part of the GNU C Library.
|
|
|
00db10 |
|
|
|
00db10 |
@@ -18,116 +18,118 @@
|
|
|
00db10 |
<http://www.gnu.org/licenses/>. */
|
|
|
00db10 |
|
|
|
00db10 |
#include <sysdep.h>
|
|
|
00db10 |
-#include <bp-sym.h>
|
|
|
00db10 |
-#include <bp-asm.h>
|
|
|
00db10 |
|
|
|
00db10 |
/* int [r3] memchr (char *s [r3], int byte [r4], int size [r5]) */
|
|
|
00db10 |
.machine power7
|
|
|
00db10 |
-ENTRY (BP_SYM (__memchr))
|
|
|
00db10 |
+ENTRY (__memchr)
|
|
|
00db10 |
CALL_MCOUNT
|
|
|
00db10 |
dcbt 0,r3
|
|
|
00db10 |
clrrwi r8,r3,2
|
|
|
00db10 |
- rlwimi r4,r4,8,16,23
|
|
|
00db10 |
- rlwimi r4,r4,16,0,15
|
|
|
00db10 |
+ insrwi r4,r4,8,16 /* Replicate byte to word. */
|
|
|
00db10 |
add r7,r3,r5 /* Calculate the last acceptable address. */
|
|
|
00db10 |
+ insrwi r4,r4,16,0
|
|
|
00db10 |
cmplwi r5,16
|
|
|
00db10 |
+ li r9, -1
|
|
|
00db10 |
+ rlwinm r6,r3,3,27,28 /* Calculate padding. */
|
|
|
00db10 |
+ addi r7,r7,-1
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ slw r9,r9,r6
|
|
|
00db10 |
+#else
|
|
|
00db10 |
+ srw r9,r9,r6
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
ble L(small_range)
|
|
|
00db10 |
|
|
|
00db10 |
- cmplw cr7,r3,r7 /* Compare the starting address (r3) with the
|
|
|
00db10 |
- ending address (r7). If (r3 >= r7), the size
|
|
|
00db10 |
- passed in is zero or negative. */
|
|
|
00db10 |
- ble cr7,L(proceed)
|
|
|
00db10 |
-
|
|
|
00db10 |
- li r7,-1 /* Artificially set our ending address (r7)
|
|
|
00db10 |
- such that we will exit early. */
|
|
|
00db10 |
-L(proceed):
|
|
|
00db10 |
- rlwinm r6,r3,3,27,28 /* Calculate padding. */
|
|
|
00db10 |
- cmpli cr6,r6,0 /* cr6 == Do we have padding? */
|
|
|
00db10 |
lwz r12,0(r8) /* Load word from memory. */
|
|
|
00db10 |
- cmpb r10,r12,r4 /* Check for BYTE's in WORD1. */
|
|
|
00db10 |
- beq cr6,L(proceed_no_padding)
|
|
|
00db10 |
- slw r10,r10,r6
|
|
|
00db10 |
- srw r10,r10,r6
|
|
|
00db10 |
-L(proceed_no_padding):
|
|
|
00db10 |
- cmplwi cr7,r10,0 /* If r10 == 0, no BYTEs have been found. */
|
|
|
00db10 |
+ cmpb r3,r12,r4 /* Check for BYTEs in WORD1. */
|
|
|
00db10 |
+ and r3,r3,r9
|
|
|
00db10 |
+ clrlwi r5,r7,30 /* Byte count - 1 in last word. */
|
|
|
00db10 |
+ clrrwi r7,r7,2 /* Address of last word. */
|
|
|
00db10 |
+ cmplwi cr7,r3,0 /* If r3 == 0, no BYTEs have been found. */
|
|
|
00db10 |
bne cr7,L(done)
|
|
|
00db10 |
|
|
|
00db10 |
- /* Are we done already? */
|
|
|
00db10 |
- addi r9,r8,4
|
|
|
00db10 |
- cmplw cr6,r9,r7
|
|
|
00db10 |
- bge cr6,L(null)
|
|
|
00db10 |
-
|
|
|
00db10 |
mtcrf 0x01,r8
|
|
|
00db10 |
/* Are we now aligned to a doubleword boundary? If so, skip to
|
|
|
00db10 |
the main loop. Otherwise, go through the alignment code. */
|
|
|
00db10 |
-
|
|
|
00db10 |
bt 29,L(loop_setup)
|
|
|
00db10 |
|
|
|
00db10 |
/* Handle WORD2 of pair. */
|
|
|
00db10 |
lwzu r12,4(r8)
|
|
|
00db10 |
- cmpb r10,r12,r4
|
|
|
00db10 |
- cmplwi cr7,r10,0
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
+ cmplwi cr7,r3,0
|
|
|
00db10 |
bne cr7,L(done)
|
|
|
00db10 |
|
|
|
00db10 |
- /* Are we done already? */
|
|
|
00db10 |
- addi r9,r8,4
|
|
|
00db10 |
- cmplw cr6,r9,r7
|
|
|
00db10 |
- bge cr6,L(null)
|
|
|
00db10 |
-
|
|
|
00db10 |
L(loop_setup):
|
|
|
00db10 |
- sub r5,r7,r9
|
|
|
00db10 |
- srwi r6,r5,3 /* Number of loop iterations. */
|
|
|
00db10 |
+ /* The last word we want to read in the loop below is the one
|
|
|
00db10 |
+ containing the last byte of the string, ie. the word at
|
|
|
00db10 |
+ (s + size - 1) & ~3, or r7. The first word read is at
|
|
|
00db10 |
+ r8 + 4, we read 2 * cnt words, so the last word read will
|
|
|
00db10 |
+ be at r8 + 4 + 8 * cnt - 4. Solving for cnt gives
|
|
|
00db10 |
+ cnt = (r7 - r8) / 8 */
|
|
|
00db10 |
+ sub r6,r7,r8
|
|
|
00db10 |
+ srwi r6,r6,3 /* Number of loop iterations. */
|
|
|
00db10 |
mtctr r6 /* Setup the counter. */
|
|
|
00db10 |
- b L(loop)
|
|
|
00db10 |
- /* Main loop to look for BYTE backwards in the string. Since
|
|
|
00db10 |
- it's a small loop (< 8 instructions), align it to 32-bytes. */
|
|
|
00db10 |
- .p2align 5
|
|
|
00db10 |
+
|
|
|
00db10 |
+ /* Main loop to look for BYTE in the string. Since
|
|
|
00db10 |
+ it's a small loop (8 instructions), align it to 32-bytes. */
|
|
|
00db10 |
+ .align 5
|
|
|
00db10 |
L(loop):
|
|
|
00db10 |
/* Load two words, compare and merge in a
|
|
|
00db10 |
single register for speed. This is an attempt
|
|
|
00db10 |
to speed up the byte-checking process for bigger strings. */
|
|
|
00db10 |
lwz r12,4(r8)
|
|
|
00db10 |
lwzu r11,8(r8)
|
|
|
00db10 |
- cmpb r10,r12,r4
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
cmpb r9,r11,r4
|
|
|
00db10 |
- or r5,r9,r10 /* Merge everything in one word. */
|
|
|
00db10 |
- cmplwi cr7,r5,0
|
|
|
00db10 |
+ or r6,r9,r3 /* Merge everything in one word. */
|
|
|
00db10 |
+ cmplwi cr7,r6,0
|
|
|
00db10 |
bne cr7,L(found)
|
|
|
00db10 |
bdnz L(loop)
|
|
|
00db10 |
|
|
|
00db10 |
- /* We're here because the counter reached 0, and that means we
|
|
|
00db10 |
- didn't have any matches for BYTE in the whole range. */
|
|
|
00db10 |
- subi r11,r7,4
|
|
|
00db10 |
- cmplw cr6,r8,r11
|
|
|
00db10 |
- blt cr6,L(loop_small)
|
|
|
00db10 |
- b L(null)
|
|
|
00db10 |
+ /* We may have one more word to read. */
|
|
|
00db10 |
+ cmplw r8,r7
|
|
|
00db10 |
+ beqlr
|
|
|
00db10 |
|
|
|
00db10 |
+ lwzu r12,4(r8)
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
+ cmplwi cr6,r3,0
|
|
|
00db10 |
+ bne cr6,L(done)
|
|
|
00db10 |
+ blr
|
|
|
00db10 |
+
|
|
|
00db10 |
+ .align 4
|
|
|
00db10 |
+L(found):
|
|
|
00db10 |
/* OK, one (or both) of the words contains BYTE. Check
|
|
|
00db10 |
the first word and decrement the address in case the first
|
|
|
00db10 |
word really contains BYTE. */
|
|
|
00db10 |
- .align 4
|
|
|
00db10 |
-L(found):
|
|
|
00db10 |
- cmplwi cr6,r10,0
|
|
|
00db10 |
+ cmplwi cr6,r3,0
|
|
|
00db10 |
addi r8,r8,-4
|
|
|
00db10 |
bne cr6,L(done)
|
|
|
00db10 |
|
|
|
00db10 |
/* BYTE must be in the second word. Adjust the address
|
|
|
00db10 |
- again and move the result of cmpb to r10 so we can calculate the
|
|
|
00db10 |
+ again and move the result of cmpb to r3 so we can calculate the
|
|
|
00db10 |
pointer. */
|
|
|
00db10 |
|
|
|
00db10 |
- mr r10,r9
|
|
|
00db10 |
+ mr r3,r9
|
|
|
00db10 |
addi r8,r8,4
|
|
|
00db10 |
|
|
|
00db10 |
- /* r10 has the output of the cmpb instruction, that is, it contains
|
|
|
00db10 |
+ /* r3 has the output of the cmpb instruction, that is, it contains
|
|
|
00db10 |
0xff in the same position as BYTE in the original
|
|
|
00db10 |
word from the string. Use that to calculate the pointer.
|
|
|
00db10 |
We need to make sure BYTE is *before* the end of the range. */
|
|
|
00db10 |
L(done):
|
|
|
00db10 |
- cntlzw r0,r10 /* Count leading zeroes before the match. */
|
|
|
00db10 |
- srwi r0,r0,3 /* Convert leading zeroes to bytes. */
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ addi r0,r3,-1
|
|
|
00db10 |
+ andc r0,r0,r3
|
|
|
00db10 |
+ popcntw r0,r0 /* Count trailing zeros. */
|
|
|
00db10 |
+#else
|
|
|
00db10 |
+ cntlzw r0,r3 /* Count leading zeros before the match. */
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
+ cmplw r8,r7 /* Are we on the last word? */
|
|
|
00db10 |
+ srwi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
|
|
|
00db10 |
add r3,r8,r0
|
|
|
00db10 |
- cmplw r3,r7
|
|
|
00db10 |
- bge L(null)
|
|
|
00db10 |
+ cmplw cr7,r0,r5 /* If on the last word, check byte offset. */
|
|
|
00db10 |
+ bnelr
|
|
|
00db10 |
+ blelr cr7
|
|
|
00db10 |
+ li r3,0
|
|
|
00db10 |
blr
|
|
|
00db10 |
|
|
|
00db10 |
.align 4
|
|
|
00db10 |
@@ -139,69 +141,44 @@
|
|
|
00db10 |
.align 4
|
|
|
00db10 |
L(small_range):
|
|
|
00db10 |
cmplwi r5,0
|
|
|
00db10 |
- rlwinm r6,r3,3,27,28 /* Calculate padding. */
|
|
|
00db10 |
- beq L(null) /* This branch is for the cmplwi r5,0 above */
|
|
|
00db10 |
+ beq L(null)
|
|
|
00db10 |
lwz r12,0(r8) /* Load word from memory. */
|
|
|
00db10 |
- cmplwi cr6,r6,0 /* cr6 == Do we have padding? */
|
|
|
00db10 |
- cmpb r10,r12,r4 /* Check for BYTE in DWORD1. */
|
|
|
00db10 |
- beq cr6,L(small_no_padding)
|
|
|
00db10 |
- slw r10,r10,r6
|
|
|
00db10 |
- srw r10,r10,r6
|
|
|
00db10 |
-L(small_no_padding):
|
|
|
00db10 |
- cmplwi cr7,r10,0
|
|
|
00db10 |
+ cmpb r3,r12,r4 /* Check for BYTE in WORD1. */
|
|
|
00db10 |
+ and r3,r3,r9
|
|
|
00db10 |
+ cmplwi cr7,r3,0
|
|
|
00db10 |
+ clrlwi r5,r7,30 /* Byte count - 1 in last word. */
|
|
|
00db10 |
+ clrrwi r7,r7,2 /* Address of last word. */
|
|
|
00db10 |
+ cmplw r8,r7 /* Are we done already? */
|
|
|
00db10 |
bne cr7,L(done)
|
|
|
00db10 |
+ beqlr
|
|
|
00db10 |
|
|
|
00db10 |
- /* Are we done already? */
|
|
|
00db10 |
- addi r9,r8,4
|
|
|
00db10 |
- cmplw r9,r7
|
|
|
00db10 |
- bge L(null)
|
|
|
00db10 |
-
|
|
|
00db10 |
-L(loop_small): /* loop_small has been unrolled. */
|
|
|
00db10 |
lwzu r12,4(r8)
|
|
|
00db10 |
- cmpb r10,r12,r4
|
|
|
00db10 |
- addi r9,r8,4
|
|
|
00db10 |
- cmplwi cr6,r10,0
|
|
|
00db10 |
- cmplw r9,r7
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
+ cmplwi cr6,r3,0
|
|
|
00db10 |
+ cmplw r8,r7
|
|
|
00db10 |
bne cr6,L(done)
|
|
|
00db10 |
- bge L(null)
|
|
|
00db10 |
+ beqlr
|
|
|
00db10 |
|
|
|
00db10 |
lwzu r12,4(r8)
|
|
|
00db10 |
- cmpb r10,r12,r4
|
|
|
00db10 |
- addi r9,r8,4
|
|
|
00db10 |
- cmplwi cr6,r10,0
|
|
|
00db10 |
- cmplw r9,r7
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
+ cmplwi cr6,r3,0
|
|
|
00db10 |
+ cmplw r8,r7
|
|
|
00db10 |
bne cr6,L(done)
|
|
|
00db10 |
- bge L(null)
|
|
|
00db10 |
+ beqlr
|
|
|
00db10 |
|
|
|
00db10 |
lwzu r12,4(r8)
|
|
|
00db10 |
- cmpb r10,r12,r4
|
|
|
00db10 |
- addi r9,r8,4
|
|
|
00db10 |
- cmplwi cr6,r10,0
|
|
|
00db10 |
- cmplw r9,r7
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
+ cmplwi cr6,r3,0
|
|
|
00db10 |
+ cmplw r8,r7
|
|
|
00db10 |
bne cr6,L(done)
|
|
|
00db10 |
- bge L(null)
|
|
|
00db10 |
+ beqlr
|
|
|
00db10 |
|
|
|
00db10 |
lwzu r12,4(r8)
|
|
|
00db10 |
- cmpb r10,r12,r4
|
|
|
00db10 |
- addi r9,r8,4
|
|
|
00db10 |
- cmplwi cr6,r10,0
|
|
|
00db10 |
- cmplw r9,r7
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
+ cmplwi cr6,r3,0
|
|
|
00db10 |
bne cr6,L(done)
|
|
|
00db10 |
- bge L(null)
|
|
|
00db10 |
-
|
|
|
00db10 |
- /* For most cases we will never get here. Under some combinations of
|
|
|
00db10 |
- padding + length there is a leftover word that still needs to be
|
|
|
00db10 |
- checked. */
|
|
|
00db10 |
- lwzu r12,4(r8)
|
|
|
00db10 |
- cmpb r10,r12,r4
|
|
|
00db10 |
- addi r9,r8,4
|
|
|
00db10 |
- cmplwi cr6,r10,0
|
|
|
00db10 |
- bne cr6,L(done)
|
|
|
00db10 |
-
|
|
|
00db10 |
- /* save a branch and exit directly */
|
|
|
00db10 |
- li r3,0
|
|
|
00db10 |
blr
|
|
|
00db10 |
|
|
|
00db10 |
-END (BP_SYM (__memchr))
|
|
|
00db10 |
-weak_alias (BP_SYM (__memchr), BP_SYM(memchr))
|
|
|
00db10 |
+END (__memchr)
|
|
|
00db10 |
+weak_alias (__memchr, memchr)
|
|
|
00db10 |
libc_hidden_builtin_def (memchr)
|
|
|
00db10 |
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memrchr.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memrchr.S
|
|
|
00db10 |
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memrchr.S 2014-05-29 13:09:17.000000000 -0500
|
|
|
00db10 |
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memrchr.S 2014-05-29 13:13:47.000000000 -0500
|
|
|
00db10 |
@@ -1,5 +1,5 @@
|
|
|
00db10 |
/* Optimized memrchr implementation for PowerPC32/POWER7 using cmpb insn.
|
|
|
00db10 |
- Copyright (C) 2010 Free Software Foundation, Inc.
|
|
|
00db10 |
+ Copyright (C) 2010-2014 Free Software Foundation, Inc.
|
|
|
00db10 |
Contributed by Luis Machado <luisgpm@br.ibm.com>.
|
|
|
00db10 |
This file is part of the GNU C Library.
|
|
|
00db10 |
|
|
|
00db10 |
@@ -18,124 +18,136 @@
|
|
|
00db10 |
<http://www.gnu.org/licenses/>. */
|
|
|
00db10 |
|
|
|
00db10 |
#include <sysdep.h>
|
|
|
00db10 |
-#include <bp-sym.h>
|
|
|
00db10 |
-#include <bp-asm.h>
|
|
|
00db10 |
|
|
|
00db10 |
/* int [r3] memrchr (char *s [r3], int byte [r4], int size [r5]) */
|
|
|
00db10 |
.machine power7
|
|
|
00db10 |
-ENTRY (BP_SYM (__memrchr))
|
|
|
00db10 |
+ENTRY (__memrchr)
|
|
|
00db10 |
CALL_MCOUNT
|
|
|
00db10 |
- dcbt 0,r3
|
|
|
00db10 |
- mr r7,r3
|
|
|
00db10 |
- add r3,r7,r5 /* Calculate the last acceptable address. */
|
|
|
00db10 |
- cmplw cr7,r3,r7 /* Is the address equal or less than r3? */
|
|
|
00db10 |
+ add r7,r3,r5 /* Calculate the last acceptable address. */
|
|
|
00db10 |
+ neg r0,r7
|
|
|
00db10 |
+ addi r7,r7,-1
|
|
|
00db10 |
+ mr r10,r3
|
|
|
00db10 |
+ clrrwi r6,r7,7
|
|
|
00db10 |
+ li r9,3<<5
|
|
|
00db10 |
+ dcbt r9,r6,16 /* Stream hint, decreasing addresses. */
|
|
|
00db10 |
|
|
|
00db10 |
/* Replicate BYTE to word. */
|
|
|
00db10 |
- rlwimi r4,r4,8,16,23
|
|
|
00db10 |
- rlwimi r4,r4,16,0,15
|
|
|
00db10 |
- bge cr7,L(proceed)
|
|
|
00db10 |
-
|
|
|
00db10 |
- li r3,-1 /* Make r11 the biggest if r4 <= 0. */
|
|
|
00db10 |
-L(proceed):
|
|
|
00db10 |
+ insrwi r4,r4,8,16
|
|
|
00db10 |
+ insrwi r4,r4,16,0
|
|
|
00db10 |
li r6,-4
|
|
|
00db10 |
- addi r9,r3,-1
|
|
|
00db10 |
- clrrwi r8,r9,2
|
|
|
00db10 |
- addi r8,r8,4
|
|
|
00db10 |
- neg r0,r3
|
|
|
00db10 |
+ li r9,-1
|
|
|
00db10 |
rlwinm r0,r0,3,27,28 /* Calculate padding. */
|
|
|
00db10 |
-
|
|
|
00db10 |
+ clrrwi r8,r7,2
|
|
|
00db10 |
+ srw r9,r9,r0
|
|
|
00db10 |
cmplwi r5,16
|
|
|
00db10 |
+ clrrwi r0,r10,2
|
|
|
00db10 |
ble L(small_range)
|
|
|
00db10 |
|
|
|
00db10 |
- lwbrx r12,r8,r6 /* Load reversed word from memory. */
|
|
|
00db10 |
- cmpb r10,r12,r4 /* Check for BYTE in WORD1. */
|
|
|
00db10 |
- slw r10,r10,r0
|
|
|
00db10 |
- srw r10,r10,r0
|
|
|
00db10 |
- cmplwi cr7,r10,0 /* If r10 == 0, no BYTE's have been found. */
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ lwzx r12,0,r8
|
|
|
00db10 |
+#else
|
|
|
00db10 |
+ lwbrx r12,0,r8 /* Load reversed word from memory. */
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
+ cmpb r3,r12,r4 /* Check for BYTE in WORD1. */
|
|
|
00db10 |
+ and r3,r3,r9
|
|
|
00db10 |
+ cmplwi cr7,r3,0 /* If r3 == 0, no BYTEs have been found. */
|
|
|
00db10 |
bne cr7,L(done)
|
|
|
00db10 |
|
|
|
00db10 |
- /* Are we done already? */
|
|
|
00db10 |
- addi r9,r8,-4
|
|
|
00db10 |
- cmplw cr6,r9,r7
|
|
|
00db10 |
- ble cr6,L(null)
|
|
|
00db10 |
-
|
|
|
00db10 |
mtcrf 0x01,r8
|
|
|
00db10 |
/* Are we now aligned to a doubleword boundary? If so, skip to
|
|
|
00db10 |
the main loop. Otherwise, go through the alignment code. */
|
|
|
00db10 |
- mr r8,r9
|
|
|
00db10 |
- bt 29,L(loop_setup)
|
|
|
00db10 |
+ bf 29,L(loop_setup)
|
|
|
00db10 |
|
|
|
00db10 |
/* Handle WORD2 of pair. */
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ lwzx r12,r8,r6
|
|
|
00db10 |
+#else
|
|
|
00db10 |
lwbrx r12,r8,r6
|
|
|
00db10 |
- cmpb r10,r12,r4
|
|
|
00db10 |
- cmplwi cr7,r10,0
|
|
|
00db10 |
- bne cr7,L(done)
|
|
|
00db10 |
-
|
|
|
00db10 |
- /* Are we done already? */
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
addi r8,r8,-4
|
|
|
00db10 |
- cmplw cr6,r8,r7
|
|
|
00db10 |
- ble cr6,L(null)
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
+ cmplwi cr7,r3,0
|
|
|
00db10 |
+ bne cr7,L(done)
|
|
|
00db10 |
|
|
|
00db10 |
L(loop_setup):
|
|
|
00db10 |
- li r0,-8
|
|
|
00db10 |
- sub r5,r8,r7
|
|
|
00db10 |
- srwi r9,r5,3 /* Number of loop iterations. */
|
|
|
00db10 |
+ /* The last word we want to read in the loop below is the one
|
|
|
00db10 |
+ containing the first byte of the string, ie. the word at
|
|
|
00db10 |
+ s & ~3, or r0. The first word read is at r8 - 4, we
|
|
|
00db10 |
+ read 2 * cnt words, so the last word read will be at
|
|
|
00db10 |
+ r8 - 4 - 8 * cnt + 4. Solving for cnt gives
|
|
|
00db10 |
+ cnt = (r8 - r0) / 8 */
|
|
|
00db10 |
+ sub r5,r8,r0
|
|
|
00db10 |
+ addi r8,r8,-4
|
|
|
00db10 |
+ srwi r9,r5,3 /* Number of loop iterations. */
|
|
|
00db10 |
mtctr r9 /* Setup the counter. */
|
|
|
00db10 |
- b L(loop)
|
|
|
00db10 |
- /* Main loop to look for BYTE backwards in the string. Since it's a
|
|
|
00db10 |
- small loop (< 8 instructions), align it to 32-bytes. */
|
|
|
00db10 |
- .p2align 5
|
|
|
00db10 |
+
|
|
|
00db10 |
+ /* Main loop to look for BYTE backwards in the string.
|
|
|
00db10 |
+ FIXME: Investigate whether 32 byte align helps with this
|
|
|
00db10 |
+ 9 instruction loop. */
|
|
|
00db10 |
+ .align 5
|
|
|
00db10 |
L(loop):
|
|
|
00db10 |
/* Load two words, compare and merge in a
|
|
|
00db10 |
single register for speed. This is an attempt
|
|
|
00db10 |
to speed up the byte-checking process for bigger strings. */
|
|
|
00db10 |
|
|
|
00db10 |
- lwbrx r12,r8,r6
|
|
|
00db10 |
- lwbrx r11,r8,r0
|
|
|
00db10 |
- addi r8,r8,-4
|
|
|
00db10 |
- cmpb r10,r12,r4
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ lwzx r12,0,r8
|
|
|
00db10 |
+ lwzx r11,r8,r6
|
|
|
00db10 |
+#else
|
|
|
00db10 |
+ lwbrx r12,0,r8
|
|
|
00db10 |
+ lwbrx r11,r8,r6
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
cmpb r9,r11,r4
|
|
|
00db10 |
- or r5,r9,r10 /* Merge everything in one word. */
|
|
|
00db10 |
+ or r5,r9,r3 /* Merge everything in one word. */
|
|
|
00db10 |
cmplwi cr7,r5,0
|
|
|
00db10 |
bne cr7,L(found)
|
|
|
00db10 |
- addi r8,r8,-4
|
|
|
00db10 |
+ addi r8,r8,-8
|
|
|
00db10 |
bdnz L(loop)
|
|
|
00db10 |
- /* We're here because the counter reached 0, and that means we
|
|
|
00db10 |
- didn't have any matches for BYTE in the whole range. Just return
|
|
|
00db10 |
- the original range. */
|
|
|
00db10 |
- addi r9,r8,4
|
|
|
00db10 |
- cmplw cr6,r9,r7
|
|
|
00db10 |
- bgt cr6,L(loop_small)
|
|
|
00db10 |
- b L(null)
|
|
|
00db10 |
|
|
|
00db10 |
- /* OK, one (or both) of the words contains BYTE. Check
|
|
|
00db10 |
- the first word and decrement the address in case the first
|
|
|
00db10 |
- word really contains BYTE. */
|
|
|
00db10 |
+ /* We may have one more word to read. */
|
|
|
00db10 |
+ cmplw r8,r0
|
|
|
00db10 |
+ bnelr
|
|
|
00db10 |
+
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ lwzx r12,0,r8
|
|
|
00db10 |
+#else
|
|
|
00db10 |
+ lwbrx r12,0,r8
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
+ cmplwi cr7,r3,0
|
|
|
00db10 |
+ bne cr7,L(done)
|
|
|
00db10 |
+ blr
|
|
|
00db10 |
+
|
|
|
00db10 |
.align 4
|
|
|
00db10 |
L(found):
|
|
|
00db10 |
- cmplwi cr6,r10,0
|
|
|
00db10 |
- addi r8,r8,4
|
|
|
00db10 |
+ /* OK, one (or both) of the words contains BYTE. Check
|
|
|
00db10 |
+ the first word. */
|
|
|
00db10 |
+ cmplwi cr6,r3,0
|
|
|
00db10 |
bne cr6,L(done)
|
|
|
00db10 |
|
|
|
00db10 |
/* BYTE must be in the second word. Adjust the address
|
|
|
00db10 |
- again and move the result of cmpb to r10 so we can calculate the
|
|
|
00db10 |
+ again and move the result of cmpb to r3 so we can calculate the
|
|
|
00db10 |
pointer. */
|
|
|
00db10 |
|
|
|
00db10 |
- mr r10,r9
|
|
|
00db10 |
+ mr r3,r9
|
|
|
00db10 |
addi r8,r8,-4
|
|
|
00db10 |
|
|
|
00db10 |
- /* r10 has the output of the cmpb instruction, that is, it contains
|
|
|
00db10 |
+ /* r3 has the output of the cmpb instruction, that is, it contains
|
|
|
00db10 |
0xff in the same position as BYTE in the original
|
|
|
00db10 |
word from the string. Use that to calculate the pointer.
|
|
|
00db10 |
We need to make sure BYTE is *before* the end of the
|
|
|
00db10 |
range. */
|
|
|
00db10 |
L(done):
|
|
|
00db10 |
- cntlzw r0,r10 /* Count leading zeroes before the match. */
|
|
|
00db10 |
- srwi r6,r0,3 /* Convert leading zeroes to bytes. */
|
|
|
00db10 |
- addi r0,r6,1
|
|
|
00db10 |
+ cntlzw r9,r3 /* Count leading zeros before the match. */
|
|
|
00db10 |
+ cmplw r8,r0 /* Are we on the last word? */
|
|
|
00db10 |
+ srwi r6,r9,3 /* Convert leading zeros to bytes. */
|
|
|
00db10 |
+ addi r0,r6,-3
|
|
|
00db10 |
sub r3,r8,r0
|
|
|
00db10 |
- cmplw r3,r7
|
|
|
00db10 |
- blt L(null)
|
|
|
00db10 |
+ cmplw cr7,r3,r10
|
|
|
00db10 |
+ bnelr
|
|
|
00db10 |
+ bgelr cr7
|
|
|
00db10 |
+ li r3,0
|
|
|
00db10 |
blr
|
|
|
00db10 |
|
|
|
00db10 |
.align 4
|
|
|
00db10 |
@@ -149,29 +161,36 @@
|
|
|
00db10 |
cmplwi r5,0
|
|
|
00db10 |
beq L(null)
|
|
|
00db10 |
|
|
|
00db10 |
- lwbrx r12,r8,r6 /* Load reversed word from memory. */
|
|
|
00db10 |
- cmpb r10,r12,r4 /* Check for null bytes in WORD1. */
|
|
|
00db10 |
- slw r10,r10,r0
|
|
|
00db10 |
- srw r10,r10,r0
|
|
|
00db10 |
- cmplwi cr7,r10,0
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ lwzx r12,0,r8
|
|
|
00db10 |
+#else
|
|
|
00db10 |
+ lwbrx r12,0,r8 /* Load reversed word from memory. */
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
+ cmpb r3,r12,r4 /* Check for BYTE in WORD1. */
|
|
|
00db10 |
+ and r3,r3,r9
|
|
|
00db10 |
+ cmplwi cr7,r3,0
|
|
|
00db10 |
bne cr7,L(done)
|
|
|
00db10 |
|
|
|
00db10 |
+ /* Are we done already? */
|
|
|
00db10 |
+ cmplw r8,r0
|
|
|
00db10 |
addi r8,r8,-4
|
|
|
00db10 |
- cmplw r8,r7
|
|
|
00db10 |
- ble L(null)
|
|
|
00db10 |
- b L(loop_small)
|
|
|
00db10 |
+ beqlr
|
|
|
00db10 |
|
|
|
00db10 |
- .p2align 5
|
|
|
00db10 |
+ .align 5
|
|
|
00db10 |
L(loop_small):
|
|
|
00db10 |
- lwbrx r12,r8,r6
|
|
|
00db10 |
- cmpb r10,r12,r4
|
|
|
00db10 |
- cmplwi cr6,r10,0
|
|
|
00db10 |
- bne cr6,L(done)
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ lwzx r12,0,r8
|
|
|
00db10 |
+#else
|
|
|
00db10 |
+ lwbrx r12,0,r8
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
+ cmplw r8,r0
|
|
|
00db10 |
+ cmplwi cr7,r3,0
|
|
|
00db10 |
+ bne cr7,L(done)
|
|
|
00db10 |
addi r8,r8,-4
|
|
|
00db10 |
- cmplw r8,r7
|
|
|
00db10 |
- ble L(null)
|
|
|
00db10 |
- b L(loop_small)
|
|
|
00db10 |
+ bne L(loop_small)
|
|
|
00db10 |
+ blr
|
|
|
00db10 |
|
|
|
00db10 |
-END (BP_SYM (__memrchr))
|
|
|
00db10 |
-weak_alias (BP_SYM (__memrchr), BP_SYM(memrchr))
|
|
|
00db10 |
+END (__memrchr)
|
|
|
00db10 |
+weak_alias (__memrchr, memrchr)
|
|
|
00db10 |
libc_hidden_builtin_def (memrchr)
|
|
|
00db10 |
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/rawmemchr.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/rawmemchr.S
|
|
|
00db10 |
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/rawmemchr.S 2014-05-29 13:09:17.000000000 -0500
|
|
|
00db10 |
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/rawmemchr.S 2014-05-29 13:09:19.000000000 -0500
|
|
|
00db10 |
@@ -29,16 +29,21 @@
|
|
|
00db10 |
clrrwi r8,r3,2 /* Align the address to word boundary. */
|
|
|
00db10 |
|
|
|
00db10 |
/* Replicate byte to word. */
|
|
|
00db10 |
- rlwimi r4,r4,8,16,23
|
|
|
00db10 |
- rlwimi r4,r4,16,0,15
|
|
|
00db10 |
+ rldimi r4,r4,8,48
|
|
|
00db10 |
+ rldimi r4,r4,16,32
|
|
|
00db10 |
|
|
|
00db10 |
/* Now r4 has a word of c bytes. */
|
|
|
00db10 |
|
|
|
00db10 |
rlwinm r6,r3,3,27,28 /* Calculate padding. */
|
|
|
00db10 |
lwz r12,0(r8) /* Load word from memory. */
|
|
|
00db10 |
cmpb r5,r12,r4 /* Compare each byte against c byte. */
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ srw r5,r5,r6
|
|
|
00db10 |
+ slw r5,r5,r6
|
|
|
00db10 |
+#else
|
|
|
00db10 |
slw r5,r5,r6 /* Move left to discard ignored bits. */
|
|
|
00db10 |
srw r5,r5,r6 /* Bring the bits back as zeros. */
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
cmpwi cr7,r5,0 /* If r5 == 0, no c bytes have been found. */
|
|
|
00db10 |
bne cr7,L(done)
|
|
|
00db10 |
|
|
|
00db10 |
@@ -92,8 +97,14 @@
|
|
|
00db10 |
word from the string. Use that fact to find out what is
|
|
|
00db10 |
the position of the byte inside the string. */
|
|
|
00db10 |
L(done):
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ addi r0,r5,-1
|
|
|
00db10 |
+ andc r0,r0,r5
|
|
|
00db10 |
+ popcntw r0,r0
|
|
|
00db10 |
+#else
|
|
|
00db10 |
cntlzw r0,r5 /* Count leading zeros before the match. */
|
|
|
00db10 |
- srwi r0,r0,3 /* Convert leading zeroes to bytes. */
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
+ srwi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
|
|
|
00db10 |
add r3,r8,r0 /* Return address of the matching char. */
|
|
|
00db10 |
blr
|
|
|
00db10 |
END (BP_SYM (__rawmemchr))
|
|
|
00db10 |
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memchr.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memchr.S
|
|
|
00db10 |
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memchr.S 2014-05-29 13:09:17.000000000 -0500
|
|
|
00db10 |
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memchr.S 2014-05-29 13:13:57.000000000 -0500
|
|
|
00db10 |
@@ -1,5 +1,5 @@
|
|
|
00db10 |
/* Optimized memchr implementation for PowerPC64/POWER7 using cmpb insn.
|
|
|
00db10 |
- Copyright (C) 2010-2012 Free Software Foundation, Inc.
|
|
|
00db10 |
+ Copyright (C) 2010-2014 Free Software Foundation, Inc.
|
|
|
00db10 |
Contributed by Luis Machado <luisgpm@br.ibm.com>.
|
|
|
00db10 |
This file is part of the GNU C Library.
|
|
|
00db10 |
|
|
|
00db10 |
@@ -18,118 +18,119 @@
|
|
|
00db10 |
<http://www.gnu.org/licenses/>. */
|
|
|
00db10 |
|
|
|
00db10 |
#include <sysdep.h>
|
|
|
00db10 |
-#include <bp-sym.h>
|
|
|
00db10 |
-#include <bp-asm.h>
|
|
|
00db10 |
|
|
|
00db10 |
/* int [r3] memchr (char *s [r3], int byte [r4], int size [r5]) */
|
|
|
00db10 |
.machine power7
|
|
|
00db10 |
-ENTRY (BP_SYM (__memchr))
|
|
|
00db10 |
- CALL_MCOUNT 2
|
|
|
00db10 |
+ENTRY (__memchr)
|
|
|
00db10 |
+ CALL_MCOUNT 3
|
|
|
00db10 |
dcbt 0,r3
|
|
|
00db10 |
clrrdi r8,r3,3
|
|
|
00db10 |
- rlwimi r4,r4,8,16,23
|
|
|
00db10 |
- rlwimi r4,r4,16,0,15
|
|
|
00db10 |
+ insrdi r4,r4,8,48
|
|
|
00db10 |
add r7,r3,r5 /* Calculate the last acceptable address. */
|
|
|
00db10 |
+ insrdi r4,r4,16,32
|
|
|
00db10 |
cmpldi r5,32
|
|
|
00db10 |
+ li r9, -1
|
|
|
00db10 |
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
|
|
|
00db10 |
insrdi r4,r4,32,0
|
|
|
00db10 |
+ addi r7,r7,-1
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ sld r9,r9,r6
|
|
|
00db10 |
+#else
|
|
|
00db10 |
+ srd r9,r9,r6
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
ble L(small_range)
|
|
|
00db10 |
|
|
|
00db10 |
- cmpld cr7,r3,r7 /* Compare the starting address (r3) with the
|
|
|
00db10 |
- ending address (r7). If (r3 >= r7),
|
|
|
00db10 |
- the size passed in was zero or negative. */
|
|
|
00db10 |
- ble cr7,L(proceed)
|
|
|
00db10 |
-
|
|
|
00db10 |
- li r7,-1 /* Artificially set our ending address (r7)
|
|
|
00db10 |
- such that we will exit early. */
|
|
|
00db10 |
-
|
|
|
00db10 |
-L(proceed):
|
|
|
00db10 |
- rlwinm r6,r3,3,26,28 /* Calculate padding. */
|
|
|
00db10 |
- cmpldi cr6,r6,0 /* cr6 == Do we have padding? */
|
|
|
00db10 |
ld r12,0(r8) /* Load doubleword from memory. */
|
|
|
00db10 |
- cmpb r10,r12,r4 /* Check for BYTEs in DWORD1. */
|
|
|
00db10 |
- beq cr6,L(proceed_no_padding)
|
|
|
00db10 |
- sld r10,r10,r6
|
|
|
00db10 |
- srd r10,r10,r6
|
|
|
00db10 |
-L(proceed_no_padding):
|
|
|
00db10 |
- cmpldi cr7,r10,0 /* Does r10 indicate we got a hit? */
|
|
|
00db10 |
+ cmpb r3,r12,r4 /* Check for BYTEs in DWORD1. */
|
|
|
00db10 |
+ and r3,r3,r9
|
|
|
00db10 |
+ clrldi r5,r7,61 /* Byte count - 1 in last dword. */
|
|
|
00db10 |
+ clrrdi r7,r7,3 /* Address of last doubleword. */
|
|
|
00db10 |
+ cmpldi cr7,r3,0 /* Does r3 indicate we got a hit? */
|
|
|
00db10 |
bne cr7,L(done)
|
|
|
00db10 |
|
|
|
00db10 |
- /* See if we are at the last acceptable address yet. */
|
|
|
00db10 |
- addi r9,r8,8
|
|
|
00db10 |
- cmpld cr6,r9,r7
|
|
|
00db10 |
- bge cr6,L(null)
|
|
|
00db10 |
-
|
|
|
00db10 |
mtcrf 0x01,r8
|
|
|
00db10 |
/* Are we now aligned to a quadword boundary? If so, skip to
|
|
|
00db10 |
the main loop. Otherwise, go through the alignment code. */
|
|
|
00db10 |
-
|
|
|
00db10 |
bt 28,L(loop_setup)
|
|
|
00db10 |
|
|
|
00db10 |
/* Handle DWORD2 of pair. */
|
|
|
00db10 |
ldu r12,8(r8)
|
|
|
00db10 |
- cmpb r10,r12,r4
|
|
|
00db10 |
- cmpldi cr7,r10,0
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
+ cmpldi cr7,r3,0
|
|
|
00db10 |
bne cr7,L(done)
|
|
|
00db10 |
|
|
|
00db10 |
- /* Are we done already? */
|
|
|
00db10 |
- addi r9,r8,8
|
|
|
00db10 |
- cmpld cr6,r9,r7
|
|
|
00db10 |
- bge cr6,L(null)
|
|
|
00db10 |
-
|
|
|
00db10 |
L(loop_setup):
|
|
|
00db10 |
- sub r5,r7,r9
|
|
|
00db10 |
- srdi r6,r5,4 /* Number of loop iterations. */
|
|
|
00db10 |
+ /* The last dword we want to read in the loop below is the one
|
|
|
00db10 |
+ containing the last byte of the string, ie. the dword at
|
|
|
00db10 |
+ (s + size - 1) & ~7, or r7. The first dword read is at
|
|
|
00db10 |
+ r8 + 8, we read 2 * cnt dwords, so the last dword read will
|
|
|
00db10 |
+ be at r8 + 8 + 16 * cnt - 8. Solving for cnt gives
|
|
|
00db10 |
+ cnt = (r7 - r8) / 16 */
|
|
|
00db10 |
+ sub r6,r7,r8
|
|
|
00db10 |
+ srdi r6,r6,4 /* Number of loop iterations. */
|
|
|
00db10 |
mtctr r6 /* Setup the counter. */
|
|
|
00db10 |
- b L(loop)
|
|
|
00db10 |
- /* Main loop to look for BYTE backwards in the string. Since
|
|
|
00db10 |
- it's a small loop (< 8 instructions), align it to 32-bytes. */
|
|
|
00db10 |
- .p2align 5
|
|
|
00db10 |
+
|
|
|
00db10 |
+ /* Main loop to look for BYTE in the string. Since
|
|
|
00db10 |
+ it's a small loop (8 instructions), align it to 32-bytes. */
|
|
|
00db10 |
+ .align 5
|
|
|
00db10 |
L(loop):
|
|
|
00db10 |
/* Load two doublewords, compare and merge in a
|
|
|
00db10 |
single register for speed. This is an attempt
|
|
|
00db10 |
to speed up the byte-checking process for bigger strings. */
|
|
|
00db10 |
ld r12,8(r8)
|
|
|
00db10 |
ldu r11,16(r8)
|
|
|
00db10 |
- cmpb r10,r12,r4
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
cmpb r9,r11,r4
|
|
|
00db10 |
- or r5,r9,r10 /* Merge everything in one doubleword. */
|
|
|
00db10 |
- cmpldi cr7,r5,0
|
|
|
00db10 |
+ or r6,r9,r3 /* Merge everything in one doubleword. */
|
|
|
00db10 |
+ cmpldi cr7,r6,0
|
|
|
00db10 |
bne cr7,L(found)
|
|
|
00db10 |
bdnz L(loop)
|
|
|
00db10 |
|
|
|
00db10 |
- /* We're here because the counter reached 0, and that means we
|
|
|
00db10 |
- didn't have any matches for BYTE in the whole range. */
|
|
|
00db10 |
- subi r11,r7,8
|
|
|
00db10 |
- cmpld cr6,r8,r11
|
|
|
00db10 |
- blt cr6,L(loop_small)
|
|
|
00db10 |
- b L(null)
|
|
|
00db10 |
+ /* We may have one more dword to read. */
|
|
|
00db10 |
+ cmpld r8,r7
|
|
|
00db10 |
+ beqlr
|
|
|
00db10 |
|
|
|
00db10 |
+ ldu r12,8(r8)
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
+ cmpldi cr6,r3,0
|
|
|
00db10 |
+ bne cr6,L(done)
|
|
|
00db10 |
+ blr
|
|
|
00db10 |
+
|
|
|
00db10 |
+ .align 4
|
|
|
00db10 |
+L(found):
|
|
|
00db10 |
/* OK, one (or both) of the doublewords contains BYTE. Check
|
|
|
00db10 |
the first doubleword and decrement the address in case the first
|
|
|
00db10 |
doubleword really contains BYTE. */
|
|
|
00db10 |
- .align 4
|
|
|
00db10 |
-L(found):
|
|
|
00db10 |
- cmpldi cr6,r10,0
|
|
|
00db10 |
+ cmpldi cr6,r3,0
|
|
|
00db10 |
addi r8,r8,-8
|
|
|
00db10 |
bne cr6,L(done)
|
|
|
00db10 |
|
|
|
00db10 |
/* BYTE must be in the second doubleword. Adjust the address
|
|
|
00db10 |
- again and move the result of cmpb to r10 so we can calculate the
|
|
|
00db10 |
+ again and move the result of cmpb to r3 so we can calculate the
|
|
|
00db10 |
pointer. */
|
|
|
00db10 |
|
|
|
00db10 |
- mr r10,r9
|
|
|
00db10 |
+ mr r3,r9
|
|
|
00db10 |
addi r8,r8,8
|
|
|
00db10 |
|
|
|
00db10 |
- /* r10 has the output of the cmpb instruction, that is, it contains
|
|
|
00db10 |
+ /* r3 has the output of the cmpb instruction, that is, it contains
|
|
|
00db10 |
0xff in the same position as BYTE in the original
|
|
|
00db10 |
doubleword from the string. Use that to calculate the pointer.
|
|
|
00db10 |
We need to make sure BYTE is *before* the end of the range. */
|
|
|
00db10 |
L(done):
|
|
|
00db10 |
- cntlzd r0,r10 /* Count leading zeroes before the match. */
|
|
|
00db10 |
- srdi r0,r0,3 /* Convert leading zeroes to bytes. */
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ addi r0,r3,-1
|
|
|
00db10 |
+ andc r0,r0,r3
|
|
|
00db10 |
+ popcntd r0,r0 /* Count trailing zeros. */
|
|
|
00db10 |
+#else
|
|
|
00db10 |
+ cntlzd r0,r3 /* Count leading zeros before the match. */
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
+ cmpld r8,r7 /* Are we on the last dword? */
|
|
|
00db10 |
+ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
|
|
|
00db10 |
add r3,r8,r0
|
|
|
00db10 |
- cmpld r3,r7
|
|
|
00db10 |
- bge L(null)
|
|
|
00db10 |
+ cmpld cr7,r0,r5 /* If on the last dword, check byte offset. */
|
|
|
00db10 |
+ bnelr
|
|
|
00db10 |
+ blelr cr7
|
|
|
00db10 |
+ li r3,0
|
|
|
00db10 |
blr
|
|
|
00db10 |
|
|
|
00db10 |
.align 4
|
|
|
00db10 |
@@ -141,67 +142,44 @@
|
|
|
00db10 |
.align 4
|
|
|
00db10 |
L(small_range):
|
|
|
00db10 |
cmpldi r5,0
|
|
|
00db10 |
- rlwinm r6,r3,3,26,28 /* Calculate padding. */
|
|
|
00db10 |
- beq L(null) /* This branch is for the cmpldi r5,0 above. */
|
|
|
00db10 |
+ beq L(null)
|
|
|
00db10 |
ld r12,0(r8) /* Load word from memory. */
|
|
|
00db10 |
- cmpldi cr6,r6,0 /* cr6 == Do we have padding? */
|
|
|
00db10 |
- cmpb r10,r12,r4 /* Check for BYTE in DWORD1. */
|
|
|
00db10 |
- /* If no padding, skip the shifts. */
|
|
|
00db10 |
- beq cr6,L(small_no_padding)
|
|
|
00db10 |
- sld r10,r10,r6
|
|
|
00db10 |
- srd r10,r10,r6
|
|
|
00db10 |
-L(small_no_padding):
|
|
|
00db10 |
- cmpldi cr7,r10,0
|
|
|
00db10 |
+ cmpb r3,r12,r4 /* Check for BYTE in DWORD1. */
|
|
|
00db10 |
+ and r3,r3,r9
|
|
|
00db10 |
+ cmpldi cr7,r3,0
|
|
|
00db10 |
+ clrldi r5,r7,61 /* Byte count - 1 in last dword. */
|
|
|
00db10 |
+ clrrdi r7,r7,3 /* Address of last doubleword. */
|
|
|
00db10 |
+ cmpld r8,r7 /* Are we done already? */
|
|
|
00db10 |
bne cr7,L(done)
|
|
|
00db10 |
-
|
|
|
00db10 |
- /* Are we done already? */
|
|
|
00db10 |
- addi r9,r8,8
|
|
|
00db10 |
- cmpld r9,r7
|
|
|
00db10 |
- bge L(null)
|
|
|
00db10 |
- /* If we're not done, drop through into loop_small. */
|
|
|
00db10 |
-
|
|
|
00db10 |
-L(loop_small): /* loop_small has been unrolled. */
|
|
|
00db10 |
- ldu r12,8(r8)
|
|
|
00db10 |
- cmpb r10,r12,r4
|
|
|
00db10 |
- addi r9,r8,8
|
|
|
00db10 |
- cmpldi cr6,r10,0
|
|
|
00db10 |
- cmpld r9,r7
|
|
|
00db10 |
- bne cr6,L(done) /* Found something. */
|
|
|
00db10 |
- bge L(null) /* Hit end of string (length). */
|
|
|
00db10 |
+ beqlr
|
|
|
00db10 |
|
|
|
00db10 |
ldu r12,8(r8)
|
|
|
00db10 |
- cmpb r10,r12,r4
|
|
|
00db10 |
- addi r9,r8,8
|
|
|
00db10 |
- cmpldi cr6,r10,0
|
|
|
00db10 |
- cmpld r9,r7
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
+ cmpldi cr6,r3,0
|
|
|
00db10 |
+ cmpld r8,r7
|
|
|
00db10 |
bne cr6,L(done) /* Found something. */
|
|
|
00db10 |
- bge L(null)
|
|
|
00db10 |
+ beqlr /* Hit end of string (length). */
|
|
|
00db10 |
|
|
|
00db10 |
ldu r12,8(r8)
|
|
|
00db10 |
- subi r11,r7,8
|
|
|
00db10 |
- cmpb r10,r12,r4
|
|
|
00db10 |
- cmpldi cr6,r10,0
|
|
|
00db10 |
- ori r2,r2,0 /* Force a dispatch group. */
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
+ cmpldi cr6,r3,0
|
|
|
00db10 |
+ cmpld r8,r7
|
|
|
00db10 |
bne cr6,L(done)
|
|
|
00db10 |
+ beqlr
|
|
|
00db10 |
|
|
|
00db10 |
- cmpld r8,r11 /* At end of range? */
|
|
|
00db10 |
- bge L(null)
|
|
|
00db10 |
-
|
|
|
00db10 |
- /* For most cases we will never get here. Under some combinations of
|
|
|
00db10 |
- padding + length there is a leftover double that still needs to be
|
|
|
00db10 |
- checked. */
|
|
|
00db10 |
- ldu r12,8(r8)
|
|
|
00db10 |
- cmpb r10,r12,r4
|
|
|
00db10 |
- addi r9,r8,8
|
|
|
00db10 |
- cmpldi cr6,r10,0
|
|
|
00db10 |
- cmpld r9,r7
|
|
|
00db10 |
- bne cr6,L(done) /* Found something. */
|
|
|
00db10 |
+ ldu r12,8(r8)
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
+ cmpldi cr6,r3,0
|
|
|
00db10 |
+ cmpld r8,r7
|
|
|
00db10 |
+ bne cr6,L(done)
|
|
|
00db10 |
+ beqlr
|
|
|
00db10 |
|
|
|
00db10 |
- /* Save a branch and exit directly. */
|
|
|
00db10 |
- li r3,0
|
|
|
00db10 |
+ ldu r12,8(r8)
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
+ cmpldi cr6,r3,0
|
|
|
00db10 |
+ bne cr6,L(done)
|
|
|
00db10 |
blr
|
|
|
00db10 |
|
|
|
00db10 |
-
|
|
|
00db10 |
-END (BP_SYM (__memchr))
|
|
|
00db10 |
-weak_alias (BP_SYM (__memchr), BP_SYM(memchr))
|
|
|
00db10 |
+END (__memchr)
|
|
|
00db10 |
+weak_alias (__memchr, memchr)
|
|
|
00db10 |
libc_hidden_builtin_def (memchr)
|
|
|
00db10 |
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memrchr.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memrchr.S
|
|
|
00db10 |
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memrchr.S 2014-05-29 13:09:17.000000000 -0500
|
|
|
00db10 |
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memrchr.S 2014-05-29 13:14:06.000000000 -0500
|
|
|
00db10 |
@@ -1,5 +1,5 @@
|
|
|
00db10 |
/* Optimized memrchr implementation for PowerPC64/POWER7 using cmpb insn.
|
|
|
00db10 |
- Copyright (C) 2010 Free Software Foundation, Inc.
|
|
|
00db10 |
+ Copyright (C) 2010-2014 Free Software Foundation, Inc.
|
|
|
00db10 |
Contributed by Luis Machado <luisgpm@br.ibm.com>.
|
|
|
00db10 |
This file is part of the GNU C Library.
|
|
|
00db10 |
|
|
|
00db10 |
@@ -18,125 +18,137 @@
|
|
|
00db10 |
<http://www.gnu.org/licenses/>. */
|
|
|
00db10 |
|
|
|
00db10 |
#include <sysdep.h>
|
|
|
00db10 |
-#include <bp-sym.h>
|
|
|
00db10 |
-#include <bp-asm.h>
|
|
|
00db10 |
|
|
|
00db10 |
/* int [r3] memrchr (char *s [r3], int byte [r4], int size [r5]) */
|
|
|
00db10 |
.machine power7
|
|
|
00db10 |
-ENTRY (BP_SYM (__memrchr))
|
|
|
00db10 |
- CALL_MCOUNT
|
|
|
00db10 |
- dcbt 0,r3
|
|
|
00db10 |
- mr r7,r3
|
|
|
00db10 |
- add r3,r7,r5 /* Calculate the last acceptable address. */
|
|
|
00db10 |
- cmpld cr7,r3,r7 /* Is the address equal or less than r3? */
|
|
|
00db10 |
+ENTRY (__memrchr)
|
|
|
00db10 |
+ CALL_MCOUNT 3
|
|
|
00db10 |
+ add r7,r3,r5 /* Calculate the last acceptable address. */
|
|
|
00db10 |
+ neg r0,r7
|
|
|
00db10 |
+ addi r7,r7,-1
|
|
|
00db10 |
+ mr r10,r3
|
|
|
00db10 |
+ clrrdi r6,r7,7
|
|
|
00db10 |
+ li r9,3<<5
|
|
|
00db10 |
+ dcbt r9,r6,8 /* Stream hint, decreasing addresses. */
|
|
|
00db10 |
|
|
|
00db10 |
/* Replicate BYTE to doubleword. */
|
|
|
00db10 |
- rlwimi r4,r4,8,16,23
|
|
|
00db10 |
- rlwimi r4,r4,16,0,15
|
|
|
00db10 |
+ insrdi r4,r4,8,48
|
|
|
00db10 |
+ insrdi r4,r4,16,32
|
|
|
00db10 |
insrdi r4,r4,32,0
|
|
|
00db10 |
- bge cr7,L(proceed)
|
|
|
00db10 |
-
|
|
|
00db10 |
- li r3,-1 /* Make r11 the biggest if r4 <= 0. */
|
|
|
00db10 |
-L(proceed):
|
|
|
00db10 |
li r6,-8
|
|
|
00db10 |
- addi r9,r3,-1
|
|
|
00db10 |
- clrrdi r8,r9,3
|
|
|
00db10 |
- addi r8,r8,8
|
|
|
00db10 |
- neg r0,r3
|
|
|
00db10 |
+ li r9,-1
|
|
|
00db10 |
rlwinm r0,r0,3,26,28 /* Calculate padding. */
|
|
|
00db10 |
-
|
|
|
00db10 |
+ clrrdi r8,r7,3
|
|
|
00db10 |
+ srd r9,r9,r0
|
|
|
00db10 |
cmpldi r5,32
|
|
|
00db10 |
+ clrrdi r0,r10,3
|
|
|
00db10 |
ble L(small_range)
|
|
|
00db10 |
|
|
|
00db10 |
- ldbrx r12,r8,r6 /* Load reversed doubleword from memory. */
|
|
|
00db10 |
- cmpb r10,r12,r4 /* Check for BYTE in DWORD1. */
|
|
|
00db10 |
- sld r10,r10,r0
|
|
|
00db10 |
- srd r10,r10,r0
|
|
|
00db10 |
- cmpldi cr7,r10,0 /* If r10 == 0, no BYTE's have been found. */
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ ldx r12,0,r8
|
|
|
00db10 |
+#else
|
|
|
00db10 |
+ ldbrx r12,0,r8 /* Load reversed doubleword from memory. */
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
+ cmpb r3,r12,r4 /* Check for BYTE in DWORD1. */
|
|
|
00db10 |
+ and r3,r3,r9
|
|
|
00db10 |
+ cmpldi cr7,r3,0 /* If r3 == 0, no BYTEs have been found. */
|
|
|
00db10 |
bne cr7,L(done)
|
|
|
00db10 |
|
|
|
00db10 |
- /* Are we done already? */
|
|
|
00db10 |
- addi r9,r8,-8
|
|
|
00db10 |
- cmpld cr6,r9,r7
|
|
|
00db10 |
- ble cr6,L(null)
|
|
|
00db10 |
-
|
|
|
00db10 |
mtcrf 0x01,r8
|
|
|
00db10 |
- /* Are we now aligned to a doubleword boundary? If so, skip to
|
|
|
00db10 |
+ /* Are we now aligned to a quadword boundary? If so, skip to
|
|
|
00db10 |
the main loop. Otherwise, go through the alignment code. */
|
|
|
00db10 |
- mr r8,r9
|
|
|
00db10 |
- bt 28,L(loop_setup)
|
|
|
00db10 |
+ bf 28,L(loop_setup)
|
|
|
00db10 |
|
|
|
00db10 |
/* Handle DWORD2 of pair. */
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ ldx r12,r8,r6
|
|
|
00db10 |
+#else
|
|
|
00db10 |
ldbrx r12,r8,r6
|
|
|
00db10 |
- cmpb r10,r12,r4
|
|
|
00db10 |
- cmpldi cr7,r10,0
|
|
|
00db10 |
- bne cr7,L(done)
|
|
|
00db10 |
-
|
|
|
00db10 |
- /* Are we done already. */
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
addi r8,r8,-8
|
|
|
00db10 |
- cmpld cr6,r8,r7
|
|
|
00db10 |
- ble cr6,L(null)
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
+ cmpldi cr7,r3,0
|
|
|
00db10 |
+ bne cr7,L(done)
|
|
|
00db10 |
|
|
|
00db10 |
L(loop_setup):
|
|
|
00db10 |
- li r0,-16
|
|
|
00db10 |
- sub r5,r8,r7
|
|
|
00db10 |
- srdi r9,r5,4 /* Number of loop iterations. */
|
|
|
00db10 |
+ /* The last dword we want to read in the loop below is the one
|
|
|
00db10 |
+ containing the first byte of the string, ie. the dword at
|
|
|
00db10 |
+ s & ~7, or r0. The first dword read is at r8 - 8, we
|
|
|
00db10 |
+ read 2 * cnt dwords, so the last dword read will be at
|
|
|
00db10 |
+ r8 - 8 - 16 * cnt + 8. Solving for cnt gives
|
|
|
00db10 |
+ cnt = (r8 - r0) / 16 */
|
|
|
00db10 |
+ sub r5,r8,r0
|
|
|
00db10 |
+ addi r8,r8,-8
|
|
|
00db10 |
+ srdi r9,r5,4 /* Number of loop iterations. */
|
|
|
00db10 |
mtctr r9 /* Setup the counter. */
|
|
|
00db10 |
- b L(loop)
|
|
|
00db10 |
- /* Main loop to look for BYTE backwards in the string. Since it's a
|
|
|
00db10 |
- small loop (< 8 instructions), align it to 32-bytes. */
|
|
|
00db10 |
- .p2align 5
|
|
|
00db10 |
+
|
|
|
00db10 |
+ /* Main loop to look for BYTE backwards in the string.
|
|
|
00db10 |
+ FIXME: Investigate whether 32 byte align helps with this
|
|
|
00db10 |
+ 9 instruction loop. */
|
|
|
00db10 |
+ .align 5
|
|
|
00db10 |
L(loop):
|
|
|
00db10 |
/* Load two doublewords, compare and merge in a
|
|
|
00db10 |
single register for speed. This is an attempt
|
|
|
00db10 |
to speed up the byte-checking process for bigger strings. */
|
|
|
00db10 |
|
|
|
00db10 |
- ldbrx r12,r8,r6
|
|
|
00db10 |
- ldbrx r11,r8,r0
|
|
|
00db10 |
- addi r8,r8,-8
|
|
|
00db10 |
- cmpb r10,r12,r4
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ ldx r12,0,r8
|
|
|
00db10 |
+ ldx r11,r8,r6
|
|
|
00db10 |
+#else
|
|
|
00db10 |
+ ldbrx r12,0,r8
|
|
|
00db10 |
+ ldbrx r11,r8,r6
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
cmpb r9,r11,r4
|
|
|
00db10 |
- or r5,r9,r10 /* Merge everything in one doubleword. */
|
|
|
00db10 |
+ or r5,r9,r3 /* Merge everything in one doubleword. */
|
|
|
00db10 |
cmpldi cr7,r5,0
|
|
|
00db10 |
bne cr7,L(found)
|
|
|
00db10 |
- addi r8,r8,-8
|
|
|
00db10 |
+ addi r8,r8,-16
|
|
|
00db10 |
bdnz L(loop)
|
|
|
00db10 |
- /* We're here because the counter reached 0, and that means we
|
|
|
00db10 |
- didn't have any matches for BYTE in the whole range. Just return
|
|
|
00db10 |
- the original range. */
|
|
|
00db10 |
- addi r9,r8,8
|
|
|
00db10 |
- cmpld cr6,r9,r7
|
|
|
00db10 |
- bgt cr6,L(loop_small)
|
|
|
00db10 |
- b L(null)
|
|
|
00db10 |
-
|
|
|
00db10 |
- /* OK, one (or both) of the words contains BYTE. Check
|
|
|
00db10 |
- the first word and decrement the address in case the first
|
|
|
00db10 |
- word really contains BYTE. */
|
|
|
00db10 |
+
|
|
|
00db10 |
+ /* We may have one more dword to read. */
|
|
|
00db10 |
+ cmpld r8,r0
|
|
|
00db10 |
+ bnelr
|
|
|
00db10 |
+
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ ldx r12,0,r8
|
|
|
00db10 |
+#else
|
|
|
00db10 |
+ ldbrx r12,0,r8
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
+ cmpldi cr7,r3,0
|
|
|
00db10 |
+ bne cr7,L(done)
|
|
|
00db10 |
+ blr
|
|
|
00db10 |
+
|
|
|
00db10 |
.align 4
|
|
|
00db10 |
L(found):
|
|
|
00db10 |
- cmpldi cr6,r10,0
|
|
|
00db10 |
- addi r8,r8,8
|
|
|
00db10 |
+ /* OK, one (or both) of the dwords contains BYTE. Check
|
|
|
00db10 |
+ the first dword. */
|
|
|
00db10 |
+ cmpldi cr6,r3,0
|
|
|
00db10 |
bne cr6,L(done)
|
|
|
00db10 |
|
|
|
00db10 |
/* BYTE must be in the second word. Adjust the address
|
|
|
00db10 |
- again and move the result of cmpb to r10 so we can calculate the
|
|
|
00db10 |
+ again and move the result of cmpb to r3 so we can calculate the
|
|
|
00db10 |
pointer. */
|
|
|
00db10 |
|
|
|
00db10 |
- mr r10,r9
|
|
|
00db10 |
+ mr r3,r9
|
|
|
00db10 |
addi r8,r8,-8
|
|
|
00db10 |
|
|
|
00db10 |
- /* r10 has the output of the cmpb instruction, that is, it contains
|
|
|
00db10 |
- 0xff in the same position as the BYTE in the original
|
|
|
00db10 |
+ /* r3 has the output of the cmpb instruction, that is, it contains
|
|
|
00db10 |
+ 0xff in the same position as BYTE in the original
|
|
|
00db10 |
word from the string. Use that to calculate the pointer.
|
|
|
00db10 |
We need to make sure BYTE is *before* the end of the
|
|
|
00db10 |
range. */
|
|
|
00db10 |
L(done):
|
|
|
00db10 |
- cntlzd r0,r10 /* Count leading zeroes before the match. */
|
|
|
00db10 |
- srdi r6,r0,3 /* Convert leading zeroes to bytes. */
|
|
|
00db10 |
- addi r0,r6,1
|
|
|
00db10 |
+ cntlzd r9,r3 /* Count leading zeros before the match. */
|
|
|
00db10 |
+ cmpld r8,r0 /* Are we on the last dword? */
|
|
|
00db10 |
+ srdi r6,r9,3 /* Convert leading zeros to bytes. */
|
|
|
00db10 |
+ addi r0,r6,-7
|
|
|
00db10 |
sub r3,r8,r0
|
|
|
00db10 |
- cmpld r3,r7
|
|
|
00db10 |
- blt L(null)
|
|
|
00db10 |
+ cmpld cr7,r3,r10
|
|
|
00db10 |
+ bnelr
|
|
|
00db10 |
+ bgelr cr7
|
|
|
00db10 |
+ li r3,0
|
|
|
00db10 |
blr
|
|
|
00db10 |
|
|
|
00db10 |
.align 4
|
|
|
00db10 |
@@ -150,30 +162,36 @@
|
|
|
00db10 |
cmpldi r5,0
|
|
|
00db10 |
beq L(null)
|
|
|
00db10 |
|
|
|
00db10 |
- ldbrx r12,r8,r6 /* Load reversed doubleword from memory. */
|
|
|
00db10 |
- cmpb r10,r12,r4 /* Check for BYTE in DWORD1. */
|
|
|
00db10 |
- sld r10,r10,r0
|
|
|
00db10 |
- srd r10,r10,r0
|
|
|
00db10 |
- cmpldi cr7,r10,0
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ ldx r12,0,r8
|
|
|
00db10 |
+#else
|
|
|
00db10 |
+ ldbrx r12,0,r8 /* Load reversed doubleword from memory. */
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
+ cmpb r3,r12,r4 /* Check for BYTE in DWORD1. */
|
|
|
00db10 |
+ and r3,r3,r9
|
|
|
00db10 |
+ cmpldi cr7,r3,0
|
|
|
00db10 |
bne cr7,L(done)
|
|
|
00db10 |
|
|
|
00db10 |
/* Are we done already? */
|
|
|
00db10 |
+ cmpld r8,r0
|
|
|
00db10 |
addi r8,r8,-8
|
|
|
00db10 |
- cmpld r8,r7
|
|
|
00db10 |
- ble L(null)
|
|
|
00db10 |
- b L(loop_small)
|
|
|
00db10 |
+ beqlr
|
|
|
00db10 |
|
|
|
00db10 |
- .p2align 5
|
|
|
00db10 |
+ .align 5
|
|
|
00db10 |
L(loop_small):
|
|
|
00db10 |
- ldbrx r12,r8,r6
|
|
|
00db10 |
- cmpb r10,r12,r4
|
|
|
00db10 |
- cmpldi cr6,r10,0
|
|
|
00db10 |
- bne cr6,L(done)
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ ldx r12,0,r8
|
|
|
00db10 |
+#else
|
|
|
00db10 |
+ ldbrx r12,0,r8
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
+ cmpb r3,r12,r4
|
|
|
00db10 |
+ cmpld r8,r0
|
|
|
00db10 |
+ cmpldi cr7,r3,0
|
|
|
00db10 |
+ bne cr7,L(done)
|
|
|
00db10 |
addi r8,r8,-8
|
|
|
00db10 |
- cmpld r8,r7
|
|
|
00db10 |
- ble L(null)
|
|
|
00db10 |
- b L(loop_small)
|
|
|
00db10 |
+ bne L(loop_small)
|
|
|
00db10 |
+ blr
|
|
|
00db10 |
|
|
|
00db10 |
-END (BP_SYM (__memrchr))
|
|
|
00db10 |
-weak_alias (BP_SYM (__memrchr), BP_SYM(memrchr))
|
|
|
00db10 |
+END (__memrchr)
|
|
|
00db10 |
+weak_alias (__memrchr, memrchr)
|
|
|
00db10 |
libc_hidden_builtin_def (memrchr)
|
|
|
00db10 |
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/rawmemchr.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/rawmemchr.S
|
|
|
00db10 |
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/rawmemchr.S 2014-05-29 13:09:17.000000000 -0500
|
|
|
00db10 |
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/rawmemchr.S 2014-05-29 13:09:19.000000000 -0500
|
|
|
00db10 |
@@ -29,8 +29,8 @@
|
|
|
00db10 |
clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
|
|
|
00db10 |
|
|
|
00db10 |
/* Replicate byte to doubleword. */
|
|
|
00db10 |
- rlwimi r4,r4,8,16,23
|
|
|
00db10 |
- rlwimi r4,r4,16,0,15
|
|
|
00db10 |
+ insrdi r4,r4,8,48
|
|
|
00db10 |
+ insrdi r4,r4,16,32
|
|
|
00db10 |
insrdi r4,r4,32,0
|
|
|
00db10 |
|
|
|
00db10 |
/* Now r4 has a doubleword of c bytes. */
|
|
|
00db10 |
@@ -38,8 +38,13 @@
|
|
|
00db10 |
rlwinm r6,r3,3,26,28 /* Calculate padding. */
|
|
|
00db10 |
ld r12,0(r8) /* Load doubleword from memory. */
|
|
|
00db10 |
cmpb r5,r12,r4 /* Compare each byte against c byte. */
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ srd r5,r5,r6
|
|
|
00db10 |
+ sld r5,r5,r6
|
|
|
00db10 |
+#else
|
|
|
00db10 |
sld r5,r5,r6 /* Move left to discard ignored bits. */
|
|
|
00db10 |
srd r5,r5,r6 /* Bring the bits back as zeros. */
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
cmpdi cr7,r5,0 /* If r5 == 0, no c bytes have been found. */
|
|
|
00db10 |
bne cr7,L(done)
|
|
|
00db10 |
|
|
|
00db10 |
@@ -93,8 +98,14 @@
|
|
|
00db10 |
doubleword from the string. Use that fact to find out what is
|
|
|
00db10 |
the position of the byte inside the string. */
|
|
|
00db10 |
L(done):
|
|
|
00db10 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
00db10 |
+ addi r0,r5,-1
|
|
|
00db10 |
+ andc r0,r0,r5
|
|
|
00db10 |
+ popcntd r0,r0 /* Count trailing zeros. */
|
|
|
00db10 |
+#else
|
|
|
00db10 |
cntlzd r0,r5 /* Count leading zeros before the match. */
|
|
|
00db10 |
- srdi r0,r0,3 /* Convert leading zeroes to bytes. */
|
|
|
00db10 |
+#endif
|
|
|
00db10 |
+ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
|
|
|
00db10 |
add r3,r8,r0 /* Return address of the matching char. */
|
|
|
00db10 |
blr
|
|
|
00db10 |
END (BP_SYM (__rawmemchr))
|