|
|
8a984d |
commit dd59655e9371af86043b97e38953f43bd9496699
|
|
|
8a984d |
Author: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
|
|
|
8a984d |
Date: Fri Apr 30 18:12:08 2021 -0300
|
|
|
8a984d |
|
|
|
8a984d |
powerpc64le: Optimized memmove for POWER10
|
|
|
8a984d |
|
|
|
8a984d |
This patch was initially based on the __memmove_power7 with some ideas
|
|
|
8a984d |
from strncpy implementation for Power 9.
|
|
|
8a984d |
|
|
|
8a984d |
Improvements from __memmove_power7:
|
|
|
8a984d |
|
|
|
8a984d |
1. Use lxvl/stxvl for alignment code.
|
|
|
8a984d |
|
|
|
8a984d |
The code for Power 7 uses branches when the input is not naturally
|
|
|
8a984d |
aligned to the width of a vector. The new implementation uses
|
|
|
8a984d |
lxvl/stxvl instead which reduces pressure on GPRs. It also allows
|
|
|
8a984d |
the removal of branch instructions, implicitly removing branch stalls
|
|
|
8a984d |
and mispredictions.
|
|
|
8a984d |
|
|
|
8a984d |
2. Use of lxv/stxv and lxvl/stxvl pair is safe to use on Cache Inhibited
|
|
|
8a984d |
memory.
|
|
|
8a984d |
|
|
|
8a984d |
On Power 10 vector load and stores are safe to use on CI memory for
|
|
|
8a984d |
addresses unaligned to 16B. This code takes advantage of this to
|
|
|
8a984d |
do unaligned loads.
|
|
|
8a984d |
|
|
|
8a984d |
The unaligned loads don't have a significant performance impact by
|
|
|
8a984d |
themselves. However doing so decreases register pressure on GPRs
|
|
|
8a984d |
and interdependence stalls on load/store pairs. This also improved
|
|
|
8a984d |
readability as there are now fewer code paths for different alignments.
|
|
|
8a984d |
Finally this reduces the overall code size.
|
|
|
8a984d |
|
|
|
8a984d |
3. Improved performance.
|
|
|
8a984d |
|
|
|
8a984d |
This version runs on average about 30% better than memmove_power7
|
|
|
8a984d |
for lengths larger than 8KB. For input lengths shorter than 8KB
|
|
|
8a984d |
the improvement is smaller, it has on average about 17% better
|
|
|
8a984d |
performance.
|
|
|
8a984d |
|
|
|
8a984d |
This version has a degradation of about 50% for input lengths
|
|
|
8a984d |
in the 0 to 31 bytes range when dest is unaligned.
|
|
|
8a984d |
|
|
|
8a984d |
Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
|
|
|
8a984d |
|
|
|
8a984d |
diff --git a/sysdeps/powerpc/powerpc64/le/power10/memmove.S b/sysdeps/powerpc/powerpc64/le/power10/memmove.S
|
|
|
8a984d |
new file mode 100644
|
|
|
8a984d |
index 0000000000000000..7dfd57edeb37e8e4
|
|
|
8a984d |
--- /dev/null
|
|
|
8a984d |
+++ b/sysdeps/powerpc/powerpc64/le/power10/memmove.S
|
|
|
8a984d |
@@ -0,0 +1,320 @@
|
|
|
8a984d |
+/* Optimized memmove implementation for POWER10.
|
|
|
8a984d |
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
|
8a984d |
+ This file is part of the GNU C Library.
|
|
|
8a984d |
+
|
|
|
8a984d |
+ The GNU C Library is free software; you can redistribute it and/or
|
|
|
8a984d |
+ modify it under the terms of the GNU Lesser General Public
|
|
|
8a984d |
+ License as published by the Free Software Foundation; either
|
|
|
8a984d |
+ version 2.1 of the License, or (at your option) any later version.
|
|
|
8a984d |
+
|
|
|
8a984d |
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
|
8a984d |
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
8a984d |
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
8a984d |
+ Lesser General Public License for more details.
|
|
|
8a984d |
+
|
|
|
8a984d |
+ You should have received a copy of the GNU Lesser General Public
|
|
|
8a984d |
+ License along with the GNU C Library; if not, see
|
|
|
8a984d |
+ <https://www.gnu.org/licenses/>. */
|
|
|
8a984d |
+
|
|
|
8a984d |
+#include <sysdep.h>
|
|
|
8a984d |
+
|
|
|
8a984d |
+
|
|
|
8a984d |
+/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])
|
|
|
8a984d |
+
|
|
|
8a984d |
+ This optimization checks if 'src' and 'dest' overlap. If they do not
|
|
|
8a984d |
+ or 'src' is ahead of 'dest' then it copies forward.
|
|
|
8a984d |
+ Otherwise, an optimized backward copy is used. */
|
|
|
8a984d |
+
|
|
|
8a984d |
+#ifndef MEMMOVE
|
|
|
8a984d |
+# define MEMMOVE memmove
|
|
|
8a984d |
+#endif
|
|
|
8a984d |
+ .machine power9
|
|
|
8a984d |
+ENTRY_TOCLESS (MEMMOVE, 5)
|
|
|
8a984d |
+ CALL_MCOUNT 3
|
|
|
8a984d |
+
|
|
|
8a984d |
+L(_memmove):
|
|
|
8a984d |
+ .p2align 5
|
|
|
8a984d |
+ /* Check if there is overlap, if so it will branch to backward copy. */
|
|
|
8a984d |
+ subf r9,r4,r3
|
|
|
8a984d |
+ cmpld cr7,r9,r5
|
|
|
8a984d |
+ blt cr7,L(memmove_bwd)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ /* Fast path for length shorter than 16 bytes. */
|
|
|
8a984d |
+ sldi r7,r5,56
|
|
|
8a984d |
+ lxvl 32+v2,r4,r7
|
|
|
8a984d |
+ stxvl 32+v2,r3,r7
|
|
|
8a984d |
+ subic. r8,r5,16
|
|
|
8a984d |
+ blelr
|
|
|
8a984d |
+
|
|
|
8a984d |
+ /* For shorter lengths aligning the dest address to 16 bytes either
|
|
|
8a984d |
+ decreases performance or is irrelevant. I'm making use of this
|
|
|
8a984d |
+ comparison to skip the alignment for lengths below 256. */
|
|
|
8a984d |
+ cmpldi cr6,r5,256
|
|
|
8a984d |
+ bge cr6,L(ge_256)
|
|
|
8a984d |
+ /* Account for the first 16-byte copy. */
|
|
|
8a984d |
+ addi r4,r4,16
|
|
|
8a984d |
+ addi r11,r3,16 /* use r11 to keep dest address on r3. */
|
|
|
8a984d |
+ subi r5,r5,16
|
|
|
8a984d |
+ b L(loop_head)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ .p2align 5
|
|
|
8a984d |
+L(ge_256):
|
|
|
8a984d |
+ /* Account for the first copy <= 16 bytes. This is necessary for
|
|
|
8a984d |
+ memmove because at this point the src address can be in front of the
|
|
|
8a984d |
+ dest address. */
|
|
|
8a984d |
+ clrldi r9,r5,56
|
|
|
8a984d |
+ li r8,16
|
|
|
8a984d |
+ cmpldi r9,16
|
|
|
8a984d |
+ iselgt r9,r8,r9
|
|
|
8a984d |
+ add r4,r4,r9
|
|
|
8a984d |
+ add r11,r3,r9 /* use r11 to keep dest address on r3. */
|
|
|
8a984d |
+ sub r5,r5,r9
|
|
|
8a984d |
+
|
|
|
8a984d |
+ /* Align dest to 16 bytes. */
|
|
|
8a984d |
+ neg r7,r3
|
|
|
8a984d |
+ clrldi. r9,r7,60
|
|
|
8a984d |
+ beq L(loop_head)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ .p2align 5
|
|
|
8a984d |
+ sldi r6,r9,56
|
|
|
8a984d |
+ lxvl 32+v0,r4,r6
|
|
|
8a984d |
+ stxvl 32+v0,r11,r6
|
|
|
8a984d |
+ sub r5,r5,r9
|
|
|
8a984d |
+ add r4,r4,r9
|
|
|
8a984d |
+ add r11,r11,r9
|
|
|
8a984d |
+
|
|
|
8a984d |
+L(loop_head):
|
|
|
8a984d |
+ cmpldi r5,63
|
|
|
8a984d |
+ ble L(final_64)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ srdi. r7,r5,7
|
|
|
8a984d |
+ beq L(loop_tail)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ mtctr r7
|
|
|
8a984d |
+
|
|
|
8a984d |
+/* Main loop that copies 128 bytes each iteration. */
|
|
|
8a984d |
+ .p2align 5
|
|
|
8a984d |
+L(loop):
|
|
|
8a984d |
+ addi r9,r4,64
|
|
|
8a984d |
+ addi r10,r11,64
|
|
|
8a984d |
+
|
|
|
8a984d |
+ lxv 32+v0,0(r4)
|
|
|
8a984d |
+ lxv 32+v1,16(r4)
|
|
|
8a984d |
+ lxv 32+v2,32(r4)
|
|
|
8a984d |
+ lxv 32+v3,48(r4)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ stxv 32+v0,0(r11)
|
|
|
8a984d |
+ stxv 32+v1,16(r11)
|
|
|
8a984d |
+ stxv 32+v2,32(r11)
|
|
|
8a984d |
+ stxv 32+v3,48(r11)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ addi r4,r4,128
|
|
|
8a984d |
+ addi r11,r11,128
|
|
|
8a984d |
+
|
|
|
8a984d |
+ lxv 32+v4,0(r9)
|
|
|
8a984d |
+ lxv 32+v5,16(r9)
|
|
|
8a984d |
+ lxv 32+v6,32(r9)
|
|
|
8a984d |
+ lxv 32+v7,48(r9)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ stxv 32+v4,0(r10)
|
|
|
8a984d |
+ stxv 32+v5,16(r10)
|
|
|
8a984d |
+ stxv 32+v6,32(r10)
|
|
|
8a984d |
+ stxv 32+v7,48(r10)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ bdnz L(loop)
|
|
|
8a984d |
+ clrldi. r5,r5,57
|
|
|
8a984d |
+ beqlr
|
|
|
8a984d |
+
|
|
|
8a984d |
+/* Copy 64 bytes. */
|
|
|
8a984d |
+ .p2align 5
|
|
|
8a984d |
+L(loop_tail):
|
|
|
8a984d |
+ cmpldi cr5,r5,63
|
|
|
8a984d |
+ ble cr5,L(final_64)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ lxv 32+v0,0(r4)
|
|
|
8a984d |
+ lxv 32+v1,16(r4)
|
|
|
8a984d |
+ lxv 32+v2,32(r4)
|
|
|
8a984d |
+ lxv 32+v3,48(r4)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ stxv 32+v0,0(r11)
|
|
|
8a984d |
+ stxv 32+v1,16(r11)
|
|
|
8a984d |
+ stxv 32+v2,32(r11)
|
|
|
8a984d |
+ stxv 32+v3,48(r11)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ addi r4,r4,64
|
|
|
8a984d |
+ addi r11,r11,64
|
|
|
8a984d |
+ subi r5,r5,64
|
|
|
8a984d |
+
|
|
|
8a984d |
+/* Copies the last 1-63 bytes. */
|
|
|
8a984d |
+ .p2align 5
|
|
|
8a984d |
+L(final_64):
|
|
|
8a984d |
+ /* r8 holds the number of bytes that will be copied with lxv/stxv. */
|
|
|
8a984d |
+ clrrdi. r8,r5,4
|
|
|
8a984d |
+ beq L(tail1)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ cmpldi cr5,r5,32
|
|
|
8a984d |
+ lxv 32+v0,0(r4)
|
|
|
8a984d |
+ blt cr5,L(tail2)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ cmpldi cr6,r5,48
|
|
|
8a984d |
+ lxv 32+v1,16(r4)
|
|
|
8a984d |
+ blt cr6,L(tail3)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ .p2align 5
|
|
|
8a984d |
+ lxv 32+v2,32(r4)
|
|
|
8a984d |
+ stxv 32+v2,32(r11)
|
|
|
8a984d |
+L(tail3):
|
|
|
8a984d |
+ stxv 32+v1,16(r11)
|
|
|
8a984d |
+L(tail2):
|
|
|
8a984d |
+ stxv 32+v0,0(r11)
|
|
|
8a984d |
+ sub r5,r5,r8
|
|
|
8a984d |
+ add r4,r4,r8
|
|
|
8a984d |
+ add r11,r11,r8
|
|
|
8a984d |
+ .p2align 5
|
|
|
8a984d |
+L(tail1):
|
|
|
8a984d |
+ sldi r6,r5,56
|
|
|
8a984d |
+ lxvl v4,r4,r6
|
|
|
8a984d |
+ stxvl v4,r11,r6
|
|
|
8a984d |
+ blr
|
|
|
8a984d |
+
|
|
|
8a984d |
+/* If dest and src overlap, we should copy backwards. */
|
|
|
8a984d |
+L(memmove_bwd):
|
|
|
8a984d |
+ add r11,r3,r5
|
|
|
8a984d |
+ add r4,r4,r5
|
|
|
8a984d |
+
|
|
|
8a984d |
+ /* Optimization for length smaller than 16 bytes. */
|
|
|
8a984d |
+ cmpldi cr5,r5,15
|
|
|
8a984d |
+ ble cr5,L(tail1_bwd)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ /* For shorter lengths the alignment either slows down or is irrelevant.
|
|
|
8a984d |
+ The forward copy uses an already needed 256 comparison for that. Here
|
|
|
8a984d |
+ it's using 128 as it will reduce code and improve readability. */
|
|
|
8a984d |
+ cmpldi cr7,r5,128
|
|
|
8a984d |
+ blt cr7,L(bwd_loop_tail)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ /* Align dest address to 16 bytes. */
|
|
|
8a984d |
+ .p2align 5
|
|
|
8a984d |
+ clrldi. r9,r11,60
|
|
|
8a984d |
+ beq L(bwd_loop_head)
|
|
|
8a984d |
+ sub r4,r4,r9
|
|
|
8a984d |
+ sub r11,r11,r9
|
|
|
8a984d |
+ lxv 32+v0,0(r4)
|
|
|
8a984d |
+ sldi r6,r9,56
|
|
|
8a984d |
+ stxvl 32+v0,r11,r6
|
|
|
8a984d |
+ sub r5,r5,r9
|
|
|
8a984d |
+
|
|
|
8a984d |
+L(bwd_loop_head):
|
|
|
8a984d |
+ srdi. r7,r5,7
|
|
|
8a984d |
+ beq L(bwd_loop_tail)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ mtctr r7
|
|
|
8a984d |
+
|
|
|
8a984d |
+/* Main loop that copies 128 bytes every iteration. */
|
|
|
8a984d |
+ .p2align 5
|
|
|
8a984d |
+L(bwd_loop):
|
|
|
8a984d |
+ addi r9,r4,-64
|
|
|
8a984d |
+ addi r10,r11,-64
|
|
|
8a984d |
+
|
|
|
8a984d |
+ lxv 32+v0,-16(r4)
|
|
|
8a984d |
+ lxv 32+v1,-32(r4)
|
|
|
8a984d |
+ lxv 32+v2,-48(r4)
|
|
|
8a984d |
+ lxv 32+v3,-64(r4)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ stxv 32+v0,-16(r11)
|
|
|
8a984d |
+ stxv 32+v1,-32(r11)
|
|
|
8a984d |
+ stxv 32+v2,-48(r11)
|
|
|
8a984d |
+ stxv 32+v3,-64(r11)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ addi r4,r4,-128
|
|
|
8a984d |
+ addi r11,r11,-128
|
|
|
8a984d |
+
|
|
|
8a984d |
+ lxv 32+v0,-16(r9)
|
|
|
8a984d |
+ lxv 32+v1,-32(r9)
|
|
|
8a984d |
+ lxv 32+v2,-48(r9)
|
|
|
8a984d |
+ lxv 32+v3,-64(r9)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ stxv 32+v0,-16(r10)
|
|
|
8a984d |
+ stxv 32+v1,-32(r10)
|
|
|
8a984d |
+ stxv 32+v2,-48(r10)
|
|
|
8a984d |
+ stxv 32+v3,-64(r10)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ bdnz L(bwd_loop)
|
|
|
8a984d |
+ clrldi. r5,r5,57
|
|
|
8a984d |
+ beqlr
|
|
|
8a984d |
+
|
|
|
8a984d |
+/* Copy 64 bytes. */
|
|
|
8a984d |
+ .p2align 5
|
|
|
8a984d |
+L(bwd_loop_tail):
|
|
|
8a984d |
+ cmpldi cr5,r5,63
|
|
|
8a984d |
+ ble cr5,L(bwd_final_64)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ addi r4,r4,-64
|
|
|
8a984d |
+ addi r11,r11,-64
|
|
|
8a984d |
+
|
|
|
8a984d |
+ lxv 32+v0,0(r4)
|
|
|
8a984d |
+ lxv 32+v1,16(r4)
|
|
|
8a984d |
+ lxv 32+v2,32(r4)
|
|
|
8a984d |
+ lxv 32+v3,48(r4)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ stxv 32+v0,0(r11)
|
|
|
8a984d |
+ stxv 32+v1,16(r11)
|
|
|
8a984d |
+ stxv 32+v2,32(r11)
|
|
|
8a984d |
+ stxv 32+v3,48(r11)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ subi r5,r5,64
|
|
|
8a984d |
+
|
|
|
8a984d |
+/* Copies the last 1-63 bytes. */
|
|
|
8a984d |
+ .p2align 5
|
|
|
8a984d |
+L(bwd_final_64):
|
|
|
8a984d |
+ /* r8 holds the number of bytes that will be copied with lxv/stxv. */
|
|
|
8a984d |
+ clrrdi. r8,r5,4
|
|
|
8a984d |
+ beq L(tail1_bwd)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ cmpldi cr5,r5,32
|
|
|
8a984d |
+ lxv 32+v2,-16(r4)
|
|
|
8a984d |
+ blt cr5,L(tail2_bwd)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ cmpldi cr6,r5,48
|
|
|
8a984d |
+ lxv 32+v1,-32(r4)
|
|
|
8a984d |
+ blt cr6,L(tail3_bwd)
|
|
|
8a984d |
+
|
|
|
8a984d |
+ .p2align 5
|
|
|
8a984d |
+ lxv 32+v0,-48(r4)
|
|
|
8a984d |
+ stxv 32+v0,-48(r11)
|
|
|
8a984d |
+L(tail3_bwd):
|
|
|
8a984d |
+ stxv 32+v1,-32(r11)
|
|
|
8a984d |
+L(tail2_bwd):
|
|
|
8a984d |
+ stxv 32+v2,-16(r11)
|
|
|
8a984d |
+ sub r4,r4,r5
|
|
|
8a984d |
+ sub r11,r11,r5
|
|
|
8a984d |
+ sub r5,r5,r8
|
|
|
8a984d |
+ sldi r6,r5,56
|
|
|
8a984d |
+ lxvl v4,r4,r6
|
|
|
8a984d |
+ stxvl v4,r11,r6
|
|
|
8a984d |
+ blr
|
|
|
8a984d |
+
|
|
|
8a984d |
+/* Copy the last bytes (fewer than 16). */
|
|
|
8a984d |
+ .p2align 5
|
|
|
8a984d |
+L(tail1_bwd):
|
|
|
8a984d |
+ sub r4,r4,r5
|
|
|
8a984d |
+ sub r11,r11,r5
|
|
|
8a984d |
+ sldi r6,r5,56
|
|
|
8a984d |
+ lxvl v4,r4,r6
|
|
|
8a984d |
+ stxvl v4,r11,r6
|
|
|
8a984d |
+ blr
|
|
|
8a984d |
+
|
|
|
8a984d |
+END_GEN_TB (MEMMOVE,TB_TOCLESS)
|
|
|
8a984d |
+libc_hidden_builtin_def (memmove)
|
|
|
8a984d |
+
|
|
|
8a984d |
+/* void bcopy(const void *src [r3], void *dest [r4], size_t n [r5])
|
|
|
8a984d |
+ Implemented in this file to avoid the linker creating a stub function call
|
|
|
8a984d |
+ in the branch to '_memmove'. */
|
|
|
8a984d |
+ENTRY_TOCLESS (__bcopy)
|
|
|
8a984d |
+ mr r6,r3
|
|
|
8a984d |
+ mr r3,r4
|
|
|
8a984d |
+ mr r4,r6
|
|
|
8a984d |
+ b L(_memmove)
|
|
|
8a984d |
+END (__bcopy)
|
|
|
8a984d |
+#ifndef __bcopy
|
|
|
8a984d |
+weak_alias (__bcopy, bcopy)
|
|
|
8a984d |
+#endif
|
|
|
8a984d |
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
|
|
8a984d |
index 61652b65dd223018..66f8c6ace9824d4a 100644
|
|
|
8a984d |
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
|
|
8a984d |
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
|
|
8a984d |
@@ -32,7 +32,8 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
|
|
|
8a984d |
strncase-power8
|
|
|
8a984d |
|
|
|
8a984d |
ifneq (,$(filter %le,$(config-machine)))
|
|
|
8a984d |
-sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
|
|
|
8a984d |
+sysdep_routines += memmove-power10 \
|
|
|
8a984d |
+ strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
|
|
|
8a984d |
rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
|
|
|
8a984d |
strlen-power10
|
|
|
8a984d |
endif
|
|
|
8a984d |
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
|
|
|
8a984d |
index 1c4a229b1fc5654a..705fef33d4e57557 100644
|
|
|
8a984d |
--- a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
|
|
|
8a984d |
+++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
|
|
|
8a984d |
@@ -22,8 +22,17 @@
|
|
|
8a984d |
extern __typeof (bcopy) __bcopy_ppc attribute_hidden;
|
|
|
8a984d |
/* __bcopy_power7 symbol is implemented at memmove-power7.S */
|
|
|
8a984d |
extern __typeof (bcopy) __bcopy_power7 attribute_hidden;
|
|
|
8a984d |
+#ifdef __LITTLE_ENDIAN__
|
|
|
8a984d |
+extern __typeof (bcopy) __bcopy_power10 attribute_hidden;
|
|
|
8a984d |
+#endif
|
|
|
8a984d |
|
|
|
8a984d |
libc_ifunc (bcopy,
|
|
|
8a984d |
+#ifdef __LITTLE_ENDIAN__
|
|
|
8a984d |
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1) /* Every feature bit below is required. */
|
|
|
8a984d |
+ && (hwcap2 & PPC_FEATURE2_HAS_ISEL)
|
|
|
8a984d |
+ && (hwcap & PPC_FEATURE_HAS_VSX)
|
|
|
8a984d |
+ ? __bcopy_power10 :
|
|
|
8a984d |
+#endif
|
|
|
8a984d |
(hwcap & PPC_FEATURE_HAS_VSX)
|
|
|
8a984d |
? __bcopy_power7
|
|
|
8a984d |
: __bcopy_ppc);
|
|
|
8a984d |
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
|
|
8a984d |
index 46d5956adda72b86..4ce04bc51574cca1 100644
|
|
|
8a984d |
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
|
|
8a984d |
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
|
|
8a984d |
@@ -67,6 +67,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
|
8a984d |
|
|
|
8a984d |
/* Support sysdeps/powerpc/powerpc64/multiarch/memmove.c. */
|
|
|
8a984d |
IFUNC_IMPL (i, name, memmove,
|
|
|
8a984d |
+#ifdef __LITTLE_ENDIAN__
|
|
|
8a984d |
+ IFUNC_IMPL_ADD (array, i, memmove,
|
|
|
8a984d |
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1) /* Every feature bit below is required. */
|
|
|
8a984d |
+ && (hwcap2 & PPC_FEATURE2_HAS_ISEL)
|
|
|
8a984d |
+ && (hwcap & PPC_FEATURE_HAS_VSX),
|
|
|
8a984d |
+ __memmove_power10)
|
|
|
8a984d |
+#endif
|
|
|
8a984d |
IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX,
|
|
|
8a984d |
__memmove_power7)
|
|
|
8a984d |
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ppc))
|
|
|
8a984d |
@@ -186,6 +193,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
|
8a984d |
|
|
|
8a984d |
/* Support sysdeps/powerpc/powerpc64/multiarch/bcopy.c. */
|
|
|
8a984d |
IFUNC_IMPL (i, name, bcopy,
|
|
|
8a984d |
+#ifdef __LITTLE_ENDIAN__
|
|
|
8a984d |
+ IFUNC_IMPL_ADD (array, i, bcopy,
|
|
|
8a984d |
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1) /* Every feature bit below is required. */
|
|
|
8a984d |
+ && (hwcap2 & PPC_FEATURE2_HAS_ISEL)
|
|
|
8a984d |
+ && (hwcap & PPC_FEATURE_HAS_VSX),
|
|
|
8a984d |
+ __bcopy_power10)
|
|
|
8a984d |
+#endif
|
|
|
8a984d |
IFUNC_IMPL_ADD (array, i, bcopy, hwcap & PPC_FEATURE_HAS_VSX,
|
|
|
8a984d |
__bcopy_power7)
|
|
|
8a984d |
IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ppc))
|
|
|
8a984d |
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power10.S
|
|
|
8a984d |
new file mode 100644
|
|
|
8a984d |
index 0000000000000000..171b32921a0a4d47
|
|
|
8a984d |
--- /dev/null
|
|
|
8a984d |
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power10.S
|
|
|
8a984d |
@@ -0,0 +1,27 @@
|
|
|
8a984d |
+/* Optimized memmove implementation for POWER10.
|
|
|
8a984d |
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
|
8a984d |
+ This file is part of the GNU C Library.
|
|
|
8a984d |
+
|
|
|
8a984d |
+ The GNU C Library is free software; you can redistribute it and/or
|
|
|
8a984d |
+ modify it under the terms of the GNU Lesser General Public
|
|
|
8a984d |
+ License as published by the Free Software Foundation; either
|
|
|
8a984d |
+ version 2.1 of the License, or (at your option) any later version.
|
|
|
8a984d |
+
|
|
|
8a984d |
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
|
8a984d |
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
8a984d |
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
8a984d |
+ Lesser General Public License for more details.
|
|
|
8a984d |
+
|
|
|
8a984d |
+ You should have received a copy of the GNU Lesser General Public
|
|
|
8a984d |
+ License along with the GNU C Library; if not, see
|
|
|
8a984d |
+ <https://www.gnu.org/licenses/>. */
|
|
|
8a984d |
+
|
|
|
8a984d |
+#define MEMMOVE __memmove_power10
|
|
|
8a984d |
+
|
|
|
8a984d |
+#undef libc_hidden_builtin_def
|
|
|
8a984d |
+#define libc_hidden_builtin_def(name)
|
|
|
8a984d |
+
|
|
|
8a984d |
+#undef __bcopy
|
|
|
8a984d |
+#define __bcopy __bcopy_power10
|
|
|
8a984d |
+
|
|
|
8a984d |
+#include <sysdeps/powerpc/powerpc64/le/power10/memmove.S>
|
|
|
8a984d |
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
|
|
|
8a984d |
index 0b251d0f5f087874..fb5261ecda64d061 100644
|
|
|
8a984d |
--- a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
|
|
|
8a984d |
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
|
|
|
8a984d |
@@ -21,7 +21,7 @@
|
|
|
8a984d |
#undef libc_hidden_builtin_def
|
|
|
8a984d |
#define libc_hidden_builtin_def(name)
|
|
|
8a984d |
|
|
|
8a984d |
-#undef bcopy
|
|
|
8a984d |
-#define bcopy __bcopy_power7
|
|
|
8a984d |
+#undef __bcopy
|
|
|
8a984d |
+#define __bcopy __bcopy_power7
|
|
|
8a984d |
|
|
|
8a984d |
#include <sysdeps/powerpc/powerpc64/power7/memmove.S>
|
|
|
8a984d |
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove.c b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
|
|
|
8a984d |
index 39987155cc7d3624..2fd7b6d309e4bedd 100644
|
|
|
8a984d |
--- a/sysdeps/powerpc/powerpc64/multiarch/memmove.c
|
|
|
8a984d |
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
|
|
|
8a984d |
@@ -28,14 +28,22 @@
|
|
|
8a984d |
# include "init-arch.h"
|
|
|
8a984d |
|
|
|
8a984d |
extern __typeof (__redirect_memmove) __libc_memmove;
|
|
|
8a984d |
-
|
|
|
8a984d |
extern __typeof (__redirect_memmove) __memmove_ppc attribute_hidden;
|
|
|
8a984d |
extern __typeof (__redirect_memmove) __memmove_power7 attribute_hidden;
|
|
|
8a984d |
+#ifdef __LITTLE_ENDIAN__
|
|
|
8a984d |
+extern __typeof (__redirect_memmove) __memmove_power10 attribute_hidden;
|
|
|
8a984d |
+#endif
|
|
|
8a984d |
|
|
|
8a984d |
libc_ifunc (__libc_memmove,
|
|
|
8a984d |
- (hwcap & PPC_FEATURE_HAS_VSX)
|
|
|
8a984d |
- ? __memmove_power7
|
|
|
8a984d |
- : __memmove_ppc);
|
|
|
8a984d |
+#ifdef __LITTLE_ENDIAN__
|
|
|
8a984d |
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1) /* Every feature bit below is required. */
|
|
|
8a984d |
+ && (hwcap2 & PPC_FEATURE2_HAS_ISEL)
|
|
|
8a984d |
+ && (hwcap & PPC_FEATURE_HAS_VSX)
|
|
|
8a984d |
+ ? __memmove_power10 :
|
|
|
8a984d |
+#endif
|
|
|
8a984d |
+ (hwcap & PPC_FEATURE_HAS_VSX)
|
|
|
8a984d |
+ ? __memmove_power7
|
|
|
8a984d |
+ : __memmove_ppc);
|
|
|
8a984d |
|
|
|
8a984d |
#undef memmove
|
|
|
8a984d |
strong_alias (__libc_memmove, memmove);
|
|
|
8a984d |
diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S
|
|
|
8a984d |
index b7f3dc28d1a8eac3..9e4cabb07ef9b732 100644
|
|
|
8a984d |
--- a/sysdeps/powerpc/powerpc64/power7/memmove.S
|
|
|
8a984d |
+++ b/sysdeps/powerpc/powerpc64/power7/memmove.S
|
|
|
8a984d |
@@ -832,4 +832,6 @@ ENTRY_TOCLESS (__bcopy)
|
|
|
8a984d |
mr r4,r6
|
|
|
8a984d |
b L(_memmove)
|
|
|
8a984d |
END (__bcopy)
|
|
|
8a984d |
+#ifndef __bcopy
|
|
|
8a984d |
weak_alias (__bcopy, bcopy)
|
|
|
8a984d |
+#endif
|