| commit 813c6ec808556553be9d39e900a3fc97ceb32330 |
| Author: Pedro Franco de Carvalho <pedromfc@linux.ibm.com> |
| Date: Wed Jun 30 12:36:07 2021 -0300 |
| |
| powerpc: optimize strcpy/stpcpy for POWER9/10 |
| |
| This patch modifies the current POWER9 implementation of strcpy and |
| stpcpy to optimize it for POWER9/10. |
| |
| Since no new POWER10 instructions are used, the original POWER9 strcpy is |
| modified instead of creating a new implementation for POWER10. This |
| implementation is based on both the original POWER9 implementation of |
| strcpy and the preamble of the new POWER10 implementation of strlen. |
| |
| The changes also affect stpcpy, which uses the same implementation with |
| some additional code before returning. |
| |
| On POWER9, averaging improvements across the benchmark |
| inputs (length/source alignment/destination alignment), for an |
| experiment that ran the benchmark five times, bench-strcpy showed an |
| improvement of 5.23%, and bench-stpcpy showed an improvement of 6.59%. |
| |
| On POWER10, bench-strcpy showed 13.16%, and bench-stpcpy showed 13.59%. |
| |
| The changes are: |
| |
| 1. Removed the null string optimization. |
| |
| Although this results in a few extra cycles for the null string, in |
| combination with the second change, this resulted in improvements for |
| for other cases. |
| |
| 2. Adapted the preamble from strlen for POWER10. |
| |
| This is the part of the function that handles up to the first 16 bytes |
| of the string. |
| |
| 3. Increased number of unrolled iterations in the main loop to 6. |
| |
| Reviewed-by: Matheus Castanho <msc@linux.ibm.com> |
| Tested-by: Matheus Castanho <msc@linux.ibm.com> |
| |
| diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S |
| index ce8f50329177fd06..9845a1d4cf0e1e5d 100644 |
| |
| |
| @@ -45,91 +45,78 @@ |
| The implementation can load bytes past a null terminator, but only |
| up to the next 16B boundary, so it never crosses a page. */ |
| |
| +/* Load quadword at addr+offset to vreg, check for null bytes, |
| + and branch to label if any are found. */ |
| +#define CHECK16(vreg,offset,addr,label) \ |
| + lxv vreg+32,offset(addr); \ |
| + vcmpequb. v6,vreg,v18; \ |
| + bne cr6,L(label); |
| + |
| .machine power9 |
| ENTRY_TOCLESS (FUNC_NAME, 4) |
| CALL_MCOUNT 2 |
| |
| - /* NULL string optimisation */ |
| - lbz r0,0(r4) |
| - stb r0,0(r3) |
| - cmpwi r0,0 |
| - beqlr |
| - |
| - addi r4,r4,1 |
| - addi r11,r3,1 |
| - |
| vspltisb v18,0 /* Zeroes in v18 */ |
| + vspltisb v19,-1 /* 0xFF bytes in v19 */ |
| |
| - neg r5,r4 |
| - rldicl r9,r5,0,60 /* How many bytes to get source 16B aligned? */ |
| + /* Next 16B-aligned address. Prepare address for L(loop). */ |
| + addi r5,r4,16 |
| + clrrdi r5,r5,4 |
| + subf r8,r4,r5 |
| + add r11,r3,r8 |
| |
| - /* Get source 16B aligned */ |
| + /* Align data and fill bytes not loaded with non matching char. */ |
| lvx v0,0,r4 |
| lvsr v1,0,r4 |
| - vperm v0,v18,v0,v1 |
| - |
| - vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */ |
| - vctzlsbb r7,v6 /* Number of trailing zeroes */ |
| - addi r8,r7,1 /* Add null terminator */ |
| + vperm v0,v19,v0,v1 |
| |
| - /* r8 = bytes including null |
| - r9 = bytes to get source 16B aligned |
| - if r8 > r9 |
| - no null, copy r9 bytes |
| - else |
| - there is a null, copy r8 bytes and return. */ |
| - cmpd r8,r9 |
| - bgt L(no_null) |
| + vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */ |
| + beq cr6,L(no_null) |
| |
| - sldi r10,r8,56 /* stxvl wants size in top 8 bits */ |
| - stxvl 32+v0,r11,r10 /* Partial store */ |
| + /* There's a null byte. */ |
| + vctzlsbb r8,v6 /* Number of trailing zeroes */ |
| + addi r9,r8,1 /* Add null byte. */ |
| + sldi r10,r9,56 /* stxvl wants size in top 8 bits. */ |
| + stxvl 32+v0,r3,r10 /* Partial store */ |
| |
| #ifdef USE_AS_STPCPY |
| /* stpcpy returns the dest address plus the size not counting the |
| final '\0'. */ |
| - add r3,r11,r7 |
| + add r3,r3,r8 |
| #endif |
| blr |
| |
| L(no_null): |
| - sldi r10,r9,56 /* stxvl wants size in top 8 bits */ |
| - stxvl 32+v0,r11,r10 /* Partial store */ |
| - |
| - add r4,r4,r9 |
| - add r11,r11,r9 |
| + sldi r10,r8,56 /* stxvl wants size in top 8 bits */ |
| + stxvl 32+v0,r3,r10 /* Partial store */ |
| |
| + .p2align 4 |
| L(loop): |
| - lxv 32+v0,0(r4) |
| - vcmpequb. v6,v0,v18 /* Any zero bytes? */ |
| - bne cr6,L(tail1) |
| - |
| - lxv 32+v1,16(r4) |
| - vcmpequb. v6,v1,v18 /* Any zero bytes? */ |
| - bne cr6,L(tail2) |
| - |
| - lxv 32+v2,32(r4) |
| - vcmpequb. v6,v2,v18 /* Any zero bytes? */ |
| - bne cr6,L(tail3) |
| - |
| - lxv 32+v3,48(r4) |
| - vcmpequb. v6,v3,v18 /* Any zero bytes? */ |
| - bne cr6,L(tail4) |
| + CHECK16(v0,0,r5,tail1) |
| + CHECK16(v1,16,r5,tail2) |
| + CHECK16(v2,32,r5,tail3) |
| + CHECK16(v3,48,r5,tail4) |
| + CHECK16(v4,64,r5,tail5) |
| + CHECK16(v5,80,r5,tail6) |
| |
| stxv 32+v0,0(r11) |
| stxv 32+v1,16(r11) |
| stxv 32+v2,32(r11) |
| stxv 32+v3,48(r11) |
| + stxv 32+v4,64(r11) |
| + stxv 32+v5,80(r11) |
| |
| - addi r4,r4,64 |
| - addi r11,r11,64 |
| + addi r5,r5,96 |
| + addi r11,r11,96 |
| |
| b L(loop) |
| |
| + .p2align 4 |
| L(tail1): |
| - vctzlsbb r8,v6 |
| - addi r9,r8,1 |
| + vctzlsbb r8,v6 /* Number of trailing zeroes */ |
| + addi r9,r8,1 /* Add null terminator */ |
| sldi r9,r9,56 /* stxvl wants size in top 8 bits */ |
| - stxvl 32+v0,r11,r9 |
| + stxvl 32+v0,r11,r9 /* Partial store */ |
| #ifdef USE_AS_STPCPY |
| /* stpcpy returns the dest address plus the size not counting the |
| final '\0'. */ |
| @@ -137,50 +124,81 @@ L(tail1): |
| #endif |
| blr |
| |
| + .p2align 4 |
| L(tail2): |
| stxv 32+v0,0(r11) |
| - vctzlsbb r8,v6 /* Number of trailing zeroes */ |
| - addi r9,r8,1 /* Add null terminator */ |
| - sldi r10,r9,56 /* stxvl wants size in top 8 bits */ |
| + vctzlsbb r8,v6 |
| + addi r9,r8,1 |
| + sldi r9,r9,56 |
| addi r11,r11,16 |
| - stxvl 32+v1,r11,r10 /* Partial store */ |
| + stxvl 32+v1,r11,r9 |
| #ifdef USE_AS_STPCPY |
| - /* stpcpy returns the dest address plus the size not counting the |
| - final '\0'. */ |
| add r3,r11,r8 |
| #endif |
| blr |
| |
| + .p2align 4 |
| L(tail3): |
| stxv 32+v0,0(r11) |
| stxv 32+v1,16(r11) |
| - vctzlsbb r8,v6 /* Number of trailing zeroes */ |
| - addi r9,r8,1 /* Add null terminator */ |
| - sldi r10,r9,56 /* stxvl wants size in top 8 bits */ |
| + vctzlsbb r8,v6 |
| + addi r9,r8,1 |
| + sldi r9,r9,56 |
| addi r11,r11,32 |
| - stxvl 32+v2,r11,r10 /* Partial store */ |
| + stxvl 32+v2,r11,r9 |
| #ifdef USE_AS_STPCPY |
| - /* stpcpy returns the dest address plus the size not counting the |
| - final '\0'. */ |
| add r3,r11,r8 |
| #endif |
| blr |
| |
| + .p2align 4 |
| L(tail4): |
| stxv 32+v0,0(r11) |
| stxv 32+v1,16(r11) |
| stxv 32+v2,32(r11) |
| - vctzlsbb r8,v6 /* Number of trailing zeroes */ |
| - addi r9,r8,1 /* Add null terminator */ |
| - sldi r10,r9,56 /* stxvl wants size in top 8 bits */ |
| + vctzlsbb r8,v6 |
| + addi r9,r8,1 |
| + sldi r9,r9,56 |
| addi r11,r11,48 |
| - stxvl 32+v3,r11,r10 /* Partial store */ |
| + stxvl 32+v3,r11,r9 |
| #ifdef USE_AS_STPCPY |
| - /* stpcpy returns the dest address plus the size not counting the |
| - final '\0'. */ |
| add r3,r11,r8 |
| #endif |
| blr |
| + |
| + .p2align 4 |
| +L(tail5): |
| + stxv 32+v0,0(r11) |
| + stxv 32+v1,16(r11) |
| + stxv 32+v2,32(r11) |
| + stxv 32+v3,48(r11) |
| + vctzlsbb r8,v6 |
| + addi r9,r8,1 |
| + sldi r9,r9,56 |
| + addi r11,r11,64 |
| + stxvl 32+v4,r11,r9 |
| +#ifdef USE_AS_STPCPY |
| + add r3,r11,r8 |
| +#endif |
| + blr |
| + |
| + .p2align 4 |
| +L(tail6): |
| + stxv 32+v0,0(r11) |
| + stxv 32+v1,16(r11) |
| + stxv 32+v2,32(r11) |
| + stxv 32+v3,48(r11) |
| + stxv 32+v4,64(r11) |
| + vctzlsbb r8,v6 |
| + addi r9,r8,1 |
| + sldi r9,r9,56 |
| + addi r11,r11,80 |
| + stxvl 32+v5,r11,r9 |
| +#ifdef USE_AS_STPCPY |
| + add r3,r11,r8 |
| +#endif |
| + blr |
| + |
| END (FUNC_NAME) |
| #ifndef USE_AS_STPCPY |
| libc_hidden_builtin_def (strcpy) |