diff --git a/SOURCES/glibc-rh1983203-1.patch b/SOURCES/glibc-rh1983203-1.patch
new file mode 100644
index 0000000..07e1431
--- /dev/null
+++ b/SOURCES/glibc-rh1983203-1.patch
@@ -0,0 +1,306 @@
+commit a55e2da2702e235fa0ae66a116d304d1bffc060a
+Author: Lucas A. M. Magalhaes
+Date:   Thu May 6 17:01:52 2021 -0300
+
+    powerpc: Optimized memcmp for power10
+
+    This patch was based on the __memcmp_power8 and the recent
+    __strlen_power10.
+
+    Improvements from __memcmp_power8:
+
+    1. Don't need alignment code.
+
+       On POWER10 lxvp and lxvl do not generate alignment interrupts, so
+    they are safe for use on caching-inhibited memory.  Notice that the
+    comparison on the main loop will wait for both VSR to be ready.
+    Therefore aligning one of the input address does not improve
+    performance.  In order to align both registers a vperm is necessary
+    which add too much overhead.
+
+    2. Uses new POWER10 instructions
+
+       This code uses lxvp to decrease contention on load by loading 32 bytes
+    per instruction.
+       The vextractbm is used to have a smaller tail code for calculating the
+    return value.
+
+    3. Performance improvement
+
+       This version has around 35% better performance on average. I saw no
+    performance regressions for any length or alignment.
+
+    Thanks Matheus for helping me out with some details.
+
+    Co-authored-by: Matheus Castanho
+    Reviewed-by: Raphael M Zinsly
+
+diff --git a/sysdeps/powerpc/powerpc64/le/power10/memcmp.S b/sysdeps/powerpc/powerpc64/le/power10/memcmp.S
+new file mode 100644
+index 0000000000000000..52f244e7e77cbdf9
+--- /dev/null
++++ b/sysdeps/powerpc/powerpc64/le/power10/memcmp.S
+@@ -0,0 +1,179 @@
++/* Optimized memcmp implementation for POWER10.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <sysdep.h>
++
++/* TODO: Replace macros by the actual instructions when minimum binutils becomes
++   >= 2.35.  This is used to keep compatibility with older versions.  */
++#define VEXTRACTBM(rt,vrb)	 \
++	.long(((4)<<(32-6))	 \
++	      | ((rt)<<(32-11))	 \
++	      | ((8)<<(32-16))	 \
++	      | ((vrb)<<(32-21)) \
++	      | 1602)
++
++#define LXVP(xtp,dq,ra)			   \
++	.long(((6)<<(32-6))		   \
++	      | ((((xtp)-32)>>1)<<(32-10)) \
++	      | ((1)<<(32-11))		   \
++	      | ((ra)<<(32-16))		   \
++	      | dq)
++
++/* Compare 32 bytes.  */
++#define COMPARE_32(vr1,vr2,offset,tail_1,tail_2)\
++	LXVP(32+vr1,offset,r3);		\
++	LXVP(32+vr2,offset,r4);		\
++	vcmpneb.	v5,vr1+1,vr2+1;	\
++	bne	cr6,L(tail_2);		\
++	vcmpneb.	v4,vr1,vr2;	\
++	bne	cr6,L(tail_1);		\
++
++#define TAIL(v_res,s1,s2)	\
++	vctzlsbb	r7,v_res;	\
++	vextubrx	r8,r7,s1;	\
++	vextubrx	r9,r7,s2;	\
++	subf	r3,r9,r8;	\
++	blr;	\
++
++/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4],
++		    size_t size [r5])  */
++
++#ifndef MEMCMP
++# define MEMCMP memcmp
++#endif
++	.machine  power9
++ENTRY_TOCLESS (MEMCMP, 4)
++	CALL_MCOUNT 3
++
++	cmpldi	cr6,r5,64
++	bgt	cr6,L(loop_head)
++
++/* Compare 64 bytes.  This section is used for lengths <= 64 and for the last
++   bytes for larger lengths.  */
++L(last_compare):
++	li	r8,16
++
++	sldi	r9,r5,56
++	sldi	r8,r8,56
++	addi	r6,r3,16
++	addi	r7,r4,16
++
++	/* Align up to 16 bytes.  */
++	lxvl	32+v0,r3,r9
++	lxvl	32+v2,r4,r9
++
++	/* The sub. and vcmpneb. results are concatenated by the crnand in order
++	   to do a single branch.  It's doing a NOT(CR0.GT AND CR6.EQ) then
++	   loading to CR0.LT.  That means r9 is not bigger than 0 and v4 is not
++	   all equal to 0.  */
++	sub.	r9,r9,r8
++	vcmpneb.	v4,v0,v2
++	crnand	4*cr0+lt,4*cr0+gt,4*cr6+eq
++	bt	4*cr0+lt,L(tail1)
++
++	addi	r3,r3,32
++	addi	r4,r4,32
++
++	lxvl	32+v1,r6,r9
++	lxvl	32+v3,r7,r9
++	sub.	r9,r9,r8
++	vcmpneb.	v5,v1,v3
++	crnand	4*cr0+lt,4*cr0+gt,4*cr6+eq
++	bt	4*cr0+lt,L(tail2)
++
++	addi	r6,r3,16
++	addi	r7,r4,16
++
++	lxvl	32+v6,r3,r9
++	lxvl	32+v8,r4,r9
++	sub.	r9,r9,r8
++	vcmpneb.	v4,v6,v8
++	crnand	4*cr0+lt,4*cr0+gt,4*cr6+eq
++	bt	4*cr0+lt,L(tail3)
++
++	lxvl	32+v7,r6,r9
++	lxvl	32+v9,r7,r9
++	vcmpneb.	v5,v7,v9
++	bne	cr6,L(tail4)
++
++L(finish):
++	/* The contents are equal.  */
++	li	r3,0
++	blr
++
++L(loop_head):
++	/* Calculate how many loops to run.  */
++	srdi.	r8,r5,7
++	beq	L(loop_tail)
++	mtctr	r8
++
++/* Main loop.  Compares 128 bytes each loop.  */
++	.p2align 5
++L(loop_128):
++	COMPARE_32(v0,v2,0,tail1,tail2)
++	COMPARE_32(v6,v8,32,tail3,tail4)
++	COMPARE_32(v10,v12,64,tail5,tail6)
++	COMPARE_32(v14,v16,96,tail7,tail8)
++
++	addi	r3,r3,128
++	addi	r4,r4,128
++	bdnz	L(loop_128)
++
++	/* Account loop comparisons.  */
++	clrldi.	r5,r5,57
++	beq	L(finish)
++
++/* Compares 64 bytes if length is still bigger than 64 bytes.  */
++	.p2align 5
++L(loop_tail):
++	cmpldi	r5,64
++	ble	L(last_compare)
++	COMPARE_32(v0,v2,0,tail1,tail2)
++	COMPARE_32(v6,v8,32,tail3,tail4)
++	addi	r3,r3,64
++	addi	r4,r4,64
++	subi	r5,r5,64
++	b	L(last_compare)
++
++L(tail1):
++	TAIL(v4,v0,v2)
++
++L(tail2):
++	TAIL(v5,v1,v3)
++
++L(tail3):
++	TAIL(v4,v6,v8)
++
++L(tail4):
++	TAIL(v5,v7,v9)
++
++L(tail5):
++	TAIL(v4,v10,v12)
++
++L(tail6):
++	TAIL(v5,v11,v13)
++
++L(tail7):
++	TAIL(v4,v14,v16)
++
++L(tail8):
++	TAIL(v5,v15,v17)
++
++END (MEMCMP)
++libc_hidden_builtin_def (memcmp)
++weak_alias (memcmp, bcmp)
+diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
+index ac2446aca62cc4ab..ee98417f4a383356 100644
+--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
++++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
+@@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
+		   strncase-power8
+ 
+ ifneq (,$(filter %le,$(config-machine)))
+-sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \
++sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \
+		   rawmemchr-power9 rawmemchr-power10 \
+		   strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
+		   strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
+diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+index 127af84b32a8196f..5213abdf87c79c88 100644
+--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+@@ -184,6 +184,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 
+   /* Support sysdeps/powerpc/powerpc64/multiarch/memcmp.c.  */
+   IFUNC_IMPL (i, name, memcmp,
++#ifdef __LITTLE_ENDIAN__
++	      IFUNC_IMPL_ADD (array, i, memcmp,
++			      hwcap2 & PPC_FEATURE2_ARCH_3_1
++			      && hwcap & PPC_FEATURE_HAS_VSX,
++			      __memcmp_power10)
++#endif
+	      IFUNC_IMPL_ADD (array, i, memcmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __memcmp_power8)
+	      IFUNC_IMPL_ADD (array, i, memcmp, hwcap & PPC_FEATURE_HAS_VSX,
+diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcmp-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memcmp-power10.S
+new file mode 100644
+index 0000000000000000..73a0debd4a811d8e
+--- /dev/null
++++ b/sysdeps/powerpc/powerpc64/multiarch/memcmp-power10.S
+@@ -0,0 +1,26 @@
++/* Optimized memcmp implementation for POWER10.
++   Copyright (C) 2017-2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#define MEMCMP __memcmp_power10
++
++#undef libc_hidden_builtin_def
++#define libc_hidden_builtin_def(name)
++#undef weak_alias
++#define weak_alias(name,alias)
++
++#include <sysdeps/powerpc/powerpc64/le/power10/memcmp.S>
+diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
+index 2c7a083a6560f920..0b8c0c1d8aa3f90a 100644
+--- a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
++++ b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
+@@ -27,11 +27,17 @@ extern __typeof (memcmp) __memcmp_ppc attribute_hidden;
+ extern __typeof (memcmp) __memcmp_power4 attribute_hidden;
+ extern __typeof (memcmp) __memcmp_power7 attribute_hidden;
+ extern __typeof (memcmp) __memcmp_power8 attribute_hidden;
++extern __typeof (memcmp) __memcmp_power10 attribute_hidden;
+ # undef memcmp
+ 
+ /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
+    ifunc symbol properly.  */
+ libc_ifunc_redirected (__redirect_memcmp, memcmp,
++#ifdef __LITTLE_ENDIAN__
++		       (hwcap2 & PPC_FEATURE2_ARCH_3_1
++			&& hwcap & PPC_FEATURE_HAS_VSX)
++		       ? __memcmp_power10 :
++#endif
+		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+		       ? __memcmp_power8 :
+		       (hwcap & PPC_FEATURE_HAS_VSX)
diff --git a/SOURCES/glibc-rh1983203-2.patch b/SOURCES/glibc-rh1983203-2.patch
new file mode 100644
index 0000000..eb06d39
--- /dev/null
+++ b/SOURCES/glibc-rh1983203-2.patch
@@ -0,0 +1,278 @@
+commit 813c6ec808556553be9d39e900a3fc97ceb32330
+Author: Pedro Franco de Carvalho
+Date:   Wed Jun 30 12:36:07 2021 -0300
+
+    powerpc: optimize strcpy/stpcpy for POWER9/10
+
+    This patch modifies the current POWER9 implementation of strcpy and
+    stpcpy to optimize it for POWER9/10.
+
+    Since no new POWER10 instructions are used, the original POWER9 strcpy is
+    modified instead of creating a new implementation for POWER10.  This
+    implementation is based on both the original POWER9 implementation of
+    strcpy and the preamble of the new POWER10 implementation of strlen.
+
+    The changes also affect stpcpy, which uses the same implementation with
+    some additional code before returning.
+
+    On POWER9, averaging improvements across the benchmark
+    inputs (length/source alignment/destination alignment), for an
+    experiment that ran the benchmark five times, bench-strcpy showed an
+    improvement of 5.23%, and bench-stpcpy showed an improvement of 6.59%.
+
+    On POWER10, bench-strcpy showed 13.16%, and bench-stpcpy showed 13.59%.
+
+    The changes are:
+
+    1. Removed the null string optimization.
+
+       Although this results in a few extra cycles for the null string, in
+       combination with the second change, this resulted in improvements for
+       for other cases.
+
+    2. Adapted the preamble from strlen for POWER10.
+
+       This is the part of the function that handles up to the first 16 bytes
+       of the string.
+
+    3. Increased number of unrolled iterations in the main loop to 6.
+
+    Reviewed-by: Matheus Castanho
+    Tested-by: Matheus Castanho
+
+diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
+index ce8f50329177fd06..9845a1d4cf0e1e5d 100644
+--- a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
++++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
+@@ -45,91 +45,78 @@
+    The implementation can load bytes past a null terminator, but only
+    up to the next 16B boundary, so it never crosses a page.  */
+ 
++/* Load quadword at addr+offset to vreg, check for null bytes,
++   and branch to label if any are found.  */
++#define CHECK16(vreg,offset,addr,label)	\
++	lxv	vreg+32,offset(addr);	\
++	vcmpequb.	v6,vreg,v18;	\
++	bne	cr6,L(label);
++
+ 	.machine power9
+ ENTRY_TOCLESS (FUNC_NAME, 4)
+ 	CALL_MCOUNT 2
+ 
+-	/* NULL string optimisation  */
+-	lbz	r0,0(r4)
+-	stb	r0,0(r3)
+-	cmpwi	r0,0
+-	beqlr
+-
+-	addi	r4,r4,1
+-	addi	r11,r3,1
+-
+ 	vspltisb	v18,0		/* Zeroes in v18  */
++	vspltisb	v19,-1		/* 0xFF bytes in v19  */
+ 
+-	neg	r5,r4
+-	rldicl	r9,r5,0,60	/* How many bytes to get source 16B aligned?  */
++	/* Next 16B-aligned address. Prepare address for L(loop).  */
++	addi	r5,r4,16
++	clrrdi	r5,r5,4
++	subf	r8,r4,r5
++	add	r11,r3,r8
+ 
+-	/* Get source 16B aligned  */
++	/* Align data and fill bytes not loaded with non matching char.  */
+ 	lvx	v0,0,r4
+ 	lvsr	v1,0,r4
+-	vperm	v0,v18,v0,v1
+-
+-	vcmpequb	v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
+-	vctzlsbb	r7,v6	/* Number of trailing zeroes  */
+-	addi	r8,r7,1	/* Add null terminator  */
++	vperm	v0,v19,v0,v1
+ 
+-	/* r8 = bytes including null
+-	   r9 = bytes to get source 16B aligned
+-	   if r8 > r9
+-	      no null, copy r9 bytes
+-	   else
+-	      there is a null, copy r8 bytes and return.  */
+-	cmpd	r8,r9
+-	bgt	L(no_null)
++	vcmpequb.	v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
++	beq	cr6,L(no_null)
+ 
+-	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
+-	stxvl	32+v0,r11,r10	/* Partial store  */
++	/* There's a null byte.  */
++	vctzlsbb	r8,v6	/* Number of trailing zeroes  */
++	addi	r9,r8,1		/* Add null byte.  */
++	sldi	r10,r9,56	/* stxvl wants size in top 8 bits.  */
++	stxvl	32+v0,r3,r10	/* Partial store  */
+ 
+ #ifdef USE_AS_STPCPY
+ 	/* stpcpy returns the dest address plus the size not counting the
+ 	   final '\0'.  */
+-	add	r3,r11,r7
++	add	r3,r3,r8
+ #endif
+ 	blr
+ 
+ L(no_null):
+-	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+-	stxvl	32+v0,r11,r10	/* Partial store  */
+-
+-	add	r4,r4,r9
+-	add	r11,r11,r9
++	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
++	stxvl	32+v0,r3,r10	/* Partial store  */
+ 
++	.p2align 4
+ L(loop):
+-	lxv	32+v0,0(r4)
+-	vcmpequb.	v6,v0,v18	/* Any zero bytes?  */
+-	bne	cr6,L(tail1)
+-
+-	lxv	32+v1,16(r4)
+-	vcmpequb.	v6,v1,v18	/* Any zero bytes?  */
+-	bne	cr6,L(tail2)
+-
+-	lxv	32+v2,32(r4)
+-	vcmpequb.	v6,v2,v18	/* Any zero bytes?  */
+-	bne	cr6,L(tail3)
+-
+-	lxv	32+v3,48(r4)
+-	vcmpequb.	v6,v3,v18	/* Any zero bytes?  */
+-	bne	cr6,L(tail4)
++	CHECK16(v0,0,r5,tail1)
++	CHECK16(v1,16,r5,tail2)
++	CHECK16(v2,32,r5,tail3)
++	CHECK16(v3,48,r5,tail4)
++	CHECK16(v4,64,r5,tail5)
++	CHECK16(v5,80,r5,tail6)
+ 
+ 	stxv	32+v0,0(r11)
+ 	stxv	32+v1,16(r11)
+ 	stxv	32+v2,32(r11)
+ 	stxv	32+v3,48(r11)
++	stxv	32+v4,64(r11)
++	stxv	32+v5,80(r11)
+ 
+-	addi	r4,r4,64
+-	addi	r11,r11,64
++	addi	r5,r5,96
++	addi	r11,r11,96
+ 
+ 	b	L(loop)
+ 
++	.p2align 4
+ L(tail1):
+-	vctzlsbb	r8,v6
+-	addi	r9,r8,1
++	vctzlsbb	r8,v6	/* Number of trailing zeroes  */
++	addi	r9,r8,1		/* Add null terminator  */
+ 	sldi	r9,r9,56	/* stxvl wants size in top 8 bits  */
+-	stxvl	32+v0,r11,r9
++	stxvl	32+v0,r11,r9	/* Partial store  */
+ #ifdef USE_AS_STPCPY
+ 	/* stpcpy returns the dest address plus the size not counting the
+ 	   final '\0'.  */
+@@ -137,50 +124,81 @@ L(tail1):
+ #endif
+ 	blr
+ 
++	.p2align 4
+ L(tail2):
+ 	stxv	32+v0,0(r11)
+-	vctzlsbb	r8,v6	/* Number of trailing zeroes  */
+-	addi	r9,r8,1		/* Add null terminator  */
+-	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
++	vctzlsbb	r8,v6
++	addi	r9,r8,1
++	sldi	r9,r9,56
+ 	addi	r11,r11,16
+-	stxvl	32+v1,r11,r10	/* Partial store  */
++	stxvl	32+v1,r11,r9
+ #ifdef USE_AS_STPCPY
+-	/* stpcpy returns the dest address plus the size not counting the
+-	   final '\0'.  */
+ 	add	r3,r11,r8
+ #endif
+ 	blr
+ 
++	.p2align 4
+ L(tail3):
+ 	stxv	32+v0,0(r11)
+ 	stxv	32+v1,16(r11)
+-	vctzlsbb	r8,v6	/* Number of trailing zeroes  */
+-	addi	r9,r8,1		/* Add null terminator  */
+-	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
++	vctzlsbb	r8,v6
++	addi	r9,r8,1
++	sldi	r9,r9,56
+ 	addi	r11,r11,32
+-	stxvl	32+v2,r11,r10	/* Partial store  */
++	stxvl	32+v2,r11,r9
+ #ifdef USE_AS_STPCPY
+-	/* stpcpy returns the dest address plus the size not counting the
+-	   final '\0'.  */
+ 	add	r3,r11,r8
+ #endif
+ 	blr
+ 
++	.p2align 4
+ L(tail4):
+ 	stxv	32+v0,0(r11)
+ 	stxv	32+v1,16(r11)
+ 	stxv	32+v2,32(r11)
+-	vctzlsbb	r8,v6	/* Number of trailing zeroes  */
+-	addi	r9,r8,1		/* Add null terminator  */
+-	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
++	vctzlsbb	r8,v6
++	addi	r9,r8,1
++	sldi	r9,r9,56
+ 	addi	r11,r11,48
+-	stxvl	32+v3,r11,r10	/* Partial store  */
++	stxvl	32+v3,r11,r9
+ #ifdef USE_AS_STPCPY
+-	/* stpcpy returns the dest address plus the size not counting the
+-	   final '\0'.  */
+ 	add	r3,r11,r8
+ #endif
+ 	blr
++
++	.p2align 4
++L(tail5):
++	stxv	32+v0,0(r11)
++	stxv	32+v1,16(r11)
++	stxv	32+v2,32(r11)
++	stxv	32+v3,48(r11)
++	vctzlsbb	r8,v6
++	addi	r9,r8,1
++	sldi	r9,r9,56
++	addi	r11,r11,64
++	stxvl	32+v4,r11,r9
++#ifdef USE_AS_STPCPY
++	add	r3,r11,r8
++#endif
++	blr
++
++	.p2align 4
++L(tail6):
++	stxv	32+v0,0(r11)
++	stxv	32+v1,16(r11)
++	stxv	32+v2,32(r11)
++	stxv	32+v3,48(r11)
++	stxv	32+v4,64(r11)
++	vctzlsbb	r8,v6
++	addi	r9,r8,1
++	sldi	r9,r9,56
++	addi	r11,r11,80
++	stxvl	32+v5,r11,r9
++#ifdef USE_AS_STPCPY
++	add	r3,r11,r8
++#endif
++	blr
++
+ END (FUNC_NAME)
+ #ifndef USE_AS_STPCPY
+ libc_hidden_builtin_def (strcpy)
diff --git a/SPECS/glibc.spec b/SPECS/glibc.spec
index c6b6ac5..2a20cdd 100644
--- a/SPECS/glibc.spec
+++ b/SPECS/glibc.spec
@@ -1,6 +1,6 @@
 %define glibcsrcdir glibc-2.28
 %define glibcversion 2.28
-%define glibcrelease 167%{?dist}
+%define glibcrelease 168%{?dist}
 # Pre-release tarballs are pulled in from git using a command that is
 # effectively:
 #
@@ -776,6 +776,8 @@ Patch598: glibc-rh1971664-13.patch
 Patch599: glibc-rh1971664-14.patch
 Patch600: glibc-rh1971664-15.patch
 Patch601: glibc-rh1977614.patch
+Patch602: glibc-rh1983203-1.patch
+Patch603: glibc-rh1983203-2.patch
 
 ##############################################################################
 # Continued list of core "glibc" package information:
@@ -2726,6 +2728,9 @@ fi
 %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
 
 %changelog
+* Fri Oct 29 2021 Arjun Shankar - 2.28-168
+- Optimize memcmp, strcpy, and stpcpy for IBM POWER10 (#1983203)
+
 * Wed Oct 13 2021 Arjun Shankar - 2.28-167
 - malloc: Initiate tcache shutdown even without allocations (#1977614)