65fa5e
From 2dfdc5b7d6943c0ac60eef63e361e2a50f9da610 Mon Sep 17 00:00:00 2001
65fa5e
From: Ilya Leoshkevich <iii@linux.ibm.com>
65fa5e
Date: Thu, 19 Mar 2020 11:52:03 +0100
65fa5e
Subject: [PATCH] s390x: vectorize crc32
65fa5e
65fa5e
Use vector extensions when compiling for s390x and binutils knows
65fa5e
about them. At runtime, check whether kernel supports vector
65fa5e
extensions (it has to be not just the CPU, but also the kernel) and
65fa5e
choose between the regular and the vectorized implementations.
65fa5e
---
65fa5e
 Makefile.in             |   9 ++
65fa5e
 configure               |  28 ++++++
65fa5e
 contrib/s390/crc32-vx.c | 195 ++++++++++++++++++++++++++++++++++++++++
65fa5e
 crc32.c                 |  55 +++++++++++-
65fa5e
 4 files changed, 285 insertions(+), 2 deletions(-)
65fa5e
 create mode 100644 contrib/s390/crc32-vx.c
65fa5e
65fa5e
diff --git a/Makefile.in b/Makefile.in
65fa5e
index 6070dcc..9e9743b 100644
65fa5e
--- a/Makefile.in
65fa5e
+++ b/Makefile.in
65fa5e
@@ -29,6 +29,7 @@ LDFLAGS=
65fa5e
 TEST_LDFLAGS=-L. libz.a
65fa5e
 LDSHARED=$(CC)
65fa5e
 CPP=$(CC) -E
65fa5e
+VGFMAFLAG=
65fa5e
 
65fa5e
 STATICLIB=libz.a
65fa5e
 SHAREDLIB=libz.so
65fa5e
@@ -179,6 +180,9 @@ crc32_power8.o: $(SRCDIR)contrib/power8-crc/vec_crc32.c
65fa5e
 crc32.o: $(SRCDIR)crc32.c
65fa5e
 	$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c
65fa5e
 
65fa5e
+crc32-vx.o: $(SRCDIR)contrib/s390/crc32-vx.c
65fa5e
+	$(CC) $(CFLAGS) $(VGFMAFLAG) $(ZINC) -c -o $@ $(SRCDIR)contrib/s390/crc32-vx.c
65fa5e
+
65fa5e
 deflate.o: $(SRCDIR)deflate.c
65fa5e
 	$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c
65fa5e
 
65fa5e
@@ -234,6 +238,11 @@ crc32.lo: $(SRCDIR)crc32.c
65fa5e
 	$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/crc32.o $(SRCDIR)crc32.c
65fa5e
 	-@mv objs/crc32.o $@
65fa5e
 
65fa5e
+crc32-vx.lo: $(SRCDIR)contrib/s390/crc32-vx.c
65fa5e
+	-@mkdir objs 2>/dev/null || test -d objs
65fa5e
+	$(CC) $(SFLAGS) $(VGFMAFLAG) $(ZINC) -DPIC -c -o objs/crc32-vx.o $(SRCDIR)contrib/s390/crc32-vx.c
65fa5e
+	-@mv objs/crc32-vx.o $@
65fa5e
+
65fa5e
 deflate.lo: $(SRCDIR)deflate.c
65fa5e
 	-@mkdir objs 2>/dev/null || test -d objs
65fa5e
 	$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c
65fa5e
diff --git a/configure b/configure
65fa5e
index 70ed86b..7941f75 100755
65fa5e
--- a/configure
65fa5e
+++ b/configure
65fa5e
@@ -923,6 +923,32 @@ EOF
65fa5e
   fi
65fa5e
 fi
65fa5e
 
65fa5e
+# check if we are compiling for s390 and binutils support vector extensions
65fa5e
+VGFMAFLAG=-march=z13
65fa5e
+cat > $test.c <<EOF
65fa5e
+#ifndef __s390__
65fa5e
+#error
65fa5e
+#endif
65fa5e
+EOF
65fa5e
+if try $CC -c $CFLAGS $VGFMAFLAG $test.c; then
65fa5e
+  CFLAGS="$CFLAGS -DHAVE_S390X_VX"
65fa5e
+  SFLAGS="$SFLAGS -DHAVE_S390X_VX"
65fa5e
+  OBJC="$OBJC crc32-vx.o"
65fa5e
+  PIC_OBJC="$PIC_OBJC crc32-vx.lo"
65fa5e
+  echo "Checking for s390 vector extensions... Yes." | tee -a configure.log
65fa5e
+
65fa5e
+  for flag in -mzarch -fzvector; do
65fa5e
+    if try $CC -c $CFLAGS $VGFMAFLAG $flag $test.c; then
65fa5e
+      VGFMAFLAG="$VGFMAFLAG $flag"
65fa5e
+      echo "Checking for $flag... Yes." | tee -a configure.log
65fa5e
+    else
65fa5e
+      echo "Checking for $flag... No." | tee -a configure.log
65fa5e
+    fi
65fa5e
+  done
65fa5e
+else
65fa5e
+  echo "Checking for s390 vector extensions... No." | tee -a configure.log
65fa5e
+fi
65fa5e
+
65fa5e
 # show the results in the log
65fa5e
 echo >> configure.log
65fa5e
 echo ALL = $ALL >> configure.log
65fa5e
@@ -955,6 +981,7 @@ echo mandir = $mandir >> configure.log
65fa5e
 echo prefix = $prefix >> configure.log
65fa5e
 echo sharedlibdir = $sharedlibdir >> configure.log
65fa5e
 echo uname = $uname >> configure.log
65fa5e
+echo VGFMAFLAG = $VGFMAFLAG >> configure.log
65fa5e
 
65fa5e
 # udpate Makefile with the configure results
65fa5e
 sed < ${SRCDIR}Makefile.in "
65fa5e
@@ -964,6 +991,7 @@ sed < ${SRCDIR}Makefile.in "
65fa5e
 /^LDFLAGS *=/s#=.*#=$LDFLAGS#
65fa5e
 /^LDSHARED *=/s#=.*#=$LDSHARED#
65fa5e
 /^CPP *=/s#=.*#=$CPP#
65fa5e
+/^VGFMAFLAG *=/s#=.*#=$VGFMAFLAG#
65fa5e
 /^STATICLIB *=/s#=.*#=$STATICLIB#
65fa5e
 /^SHAREDLIB *=/s#=.*#=$SHAREDLIB#
65fa5e
 /^SHAREDLIBV *=/s#=.*#=$SHAREDLIBV#
65fa5e
diff --git a/contrib/s390/crc32-vx.c b/contrib/s390/crc32-vx.c
65fa5e
new file mode 100644
65fa5e
index 0000000..fa5387c
65fa5e
--- /dev/null
65fa5e
+++ b/contrib/s390/crc32-vx.c
65fa5e
@@ -0,0 +1,195 @@
65fa5e
+/*
65fa5e
+ * Hardware-accelerated CRC-32 variants for Linux on z Systems
65fa5e
+ *
65fa5e
+ * Use the z/Architecture Vector Extension Facility to accelerate the
65fa5e
+ * computing of bitreflected CRC-32 checksums.
65fa5e
+ *
65fa5e
+ * This CRC-32 implementation algorithm is bitreflected and processes
65fa5e
+ * the least-significant bit first (Little-Endian).
65fa5e
+ *
65fa5e
+ * This code was originally written by Hendrik Brueckner
65fa5e
+ * <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been
65fa5e
+ * relicensed under the zlib license.
65fa5e
+ */
65fa5e
+
65fa5e
+#include "../../zutil.h"
65fa5e
+
65fa5e
+#include <stdint.h>
65fa5e
+#include <vecintrin.h>
65fa5e
+
65fa5e
+typedef unsigned char uv16qi __attribute__((vector_size(16)));
65fa5e
+typedef unsigned int uv4si __attribute__((vector_size(16)));
65fa5e
+typedef unsigned long long uv2di __attribute__((vector_size(16)));
65fa5e
+
65fa5e
+uint32_t crc32_le_vgfm_16(uint32_t crc, const unsigned char *buf, size_t len) {
65fa5e
+    /*
65fa5e
+     * The CRC-32 constant block contains reduction constants to fold and
65fa5e
+     * process particular chunks of the input data stream in parallel.
65fa5e
+     *
65fa5e
+     * For the CRC-32 variants, the constants are precomputed according to
65fa5e
+     * these definitions:
65fa5e
+     *
65fa5e
+     *      R1 = [(x^(4*128+32) mod P'(x) << 32)]' << 1
65fa5e
+     *      R2 = [(x^(4*128-32) mod P'(x) << 32)]' << 1
65fa5e
+     *      R3 = [(x^(128+32) mod P'(x) << 32)]'   << 1
65fa5e
+     *      R4 = [(x^(128-32) mod P'(x) << 32)]'   << 1
65fa5e
+     *      R5 = [(x^64 mod P'(x) << 32)]'         << 1
65fa5e
+     *      R6 = [(x^32 mod P'(x) << 32)]'         << 1
65fa5e
+     *
65fa5e
+     *      The bitreflected Barrett reduction constant, u', is defined as
65fa5e
+     *      the bit reversal of floor(x**64 / P(x)).
65fa5e
+     *
65fa5e
+     *      where P(x) is the polynomial in the normal domain and the P'(x) is the
65fa5e
+     *      polynomial in the reversed (bitreflected) domain.
65fa5e
+     *
65fa5e
+     * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
65fa5e
+     *
65fa5e
+     *      P(x)  = 0x04C11DB7
65fa5e
+     *      P'(x) = 0xEDB88320
65fa5e
+     */
65fa5e
+    const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};  /* BE->LE mask */
65fa5e
+    const uv2di r2r1 = {0x1C6E41596, 0x154442BD4};                                     /* R2, R1 */
65fa5e
+    const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0};                                     /* R4, R3 */
65fa5e
+    const uv2di r5 = {0, 0x163CD6124};                                                 /* R5 */
65fa5e
+    const uv2di ru_poly = {0, 0x1F7011641};                                            /* u' */
65fa5e
+    const uv2di crc_poly = {0, 0x1DB710641};                                           /* P'(x) << 1 */
65fa5e
+
65fa5e
+    /*
65fa5e
+     * Load the initial CRC value.
65fa5e
+     *
65fa5e
+     * The CRC value is loaded into the rightmost word of the
65fa5e
+     * vector register and is later XORed with the LSB portion
65fa5e
+     * of the loaded input data.
65fa5e
+     */
65fa5e
+    uv2di v0 = {0, 0};
65fa5e
+    v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
65fa5e
+
65fa5e
+    /* Load a 64-byte data chunk and XOR with CRC */
65fa5e
+    uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
65fa5e
+    uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
65fa5e
+    uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
65fa5e
+    uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);
65fa5e
+
65fa5e
+    v1 ^= v0;
65fa5e
+    buf += 64;
65fa5e
+    len -= 64;  /* NOTE: relies on len >= 64 on entry; the unsigned len would wrap otherwise */
65fa5e
+
65fa5e
+    while (len >= 64) {
65fa5e
+        /* Load the next 64-byte data chunk */
65fa5e
+        uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
65fa5e
+        uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
65fa5e
+        uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
65fa5e
+        uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);
65fa5e
+
65fa5e
+        /*
65fa5e
+         * Perform a GF(2) multiplication of the doublewords in V1 with
65fa5e
+         * the R1 and R2 reduction constants in V0.  The intermediate result
65fa5e
+         * is then folded (accumulated) with the next data chunk in PART1 and
65fa5e
+         * stored in V1. Repeat this step for the register contents
65fa5e
+         * in V2, V3, and V4 respectively.
65fa5e
+         */
65fa5e
+        v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
65fa5e
+        v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
65fa5e
+        v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
65fa5e
+        v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
65fa5e
+
65fa5e
+        buf += 64;
65fa5e
+        len -= 64;
65fa5e
+    }
65fa5e
+
65fa5e
+    /*
65fa5e
+     * Fold V1 to V4 into a single 128-bit value in V1.  Multiply V1 with R3
65fa5e
+     * and R4 and accumulate the next 128-bit chunk until a single 128-bit
65fa5e
+     * value remains.
65fa5e
+     */
65fa5e
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
65fa5e
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
65fa5e
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
65fa5e
+
65fa5e
+    while (len >= 16) {  /* any final len % 16 bytes are never read by this function */
65fa5e
+        /* Load next data chunk */
65fa5e
+        v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);
65fa5e
+
65fa5e
+        /* Fold next data chunk */
65fa5e
+        v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
65fa5e
+
65fa5e
+        buf += 16;
65fa5e
+        len -= 16;
65fa5e
+    }
65fa5e
+
65fa5e
+    /*
65fa5e
+     * Set up a vector register for byte shifts.  The shift value must
65fa5e
+     * be loaded in bits 1-4 in byte element 7 of a vector register.
65fa5e
+     * Shift by 8 bytes: 0x40
65fa5e
+     * Shift by 4 bytes: 0x20
65fa5e
+     */
65fa5e
+    uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
65fa5e
+    v9 = vec_insert((unsigned char)0x40, v9, 7);
65fa5e
+
65fa5e
+    /*
65fa5e
+     * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
65fa5e
+     * to move R4 into the rightmost doubleword and set the leftmost
65fa5e
+     * doubleword to 0x1.
65fa5e
+     */
65fa5e
+    v0 = vec_srb(r4r3, (uv2di)v9);
65fa5e
+    v0[0] = 1;
65fa5e
+
65fa5e
+    /*
65fa5e
+     * Compute GF(2) product of V1 and V0.  The rightmost doubleword
65fa5e
+     * of V1 is multiplied with R4.  The leftmost doubleword of V1 is
65fa5e
+     * multiplied by 0x1 and is then XORed with rightmost product.
65fa5e
+     * Implicitly, the intermediate leftmost product becomes padded.
65fa5e
+     */
65fa5e
+    v1 = (uv2di)vec_gfmsum_128(v0, v1);
65fa5e
+
65fa5e
+    /*
65fa5e
+     * Now do the final 32-bit fold by multiplying the rightmost word
65fa5e
+     * in V1 with R5 and XOR the result with the remaining bits in V1.
65fa5e
+     *
65fa5e
+     * To achieve this by a single VGFMAG, right shift V1 by a word
65fa5e
+     * and store the result in V2 which is then accumulated.  Use the
65fa5e
+     * vector unpack instruction to load the rightmost half of the
65fa5e
+     * doubleword into the rightmost doubleword element of V1; the other
65fa5e
+     * half is loaded in the leftmost doubleword.
65fa5e
+     * The vector register with CONST_R5 contains the R5 constant in the
65fa5e
+     * rightmost doubleword and the leftmost doubleword is zero to ignore
65fa5e
+     * the leftmost product of V1.
65fa5e
+     */
65fa5e
+    v9 = vec_insert((unsigned char)0x20, v9, 7);
65fa5e
+    v2 = vec_srb(v1, (uv2di)v9);
65fa5e
+    v1 = vec_unpackl((uv4si)v1);  /* Split rightmost doubleword */
65fa5e
+    v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
65fa5e
+
65fa5e
+    /*
65fa5e
+     * Apply a Barrett reduction to compute the final 32-bit CRC value.
65fa5e
+     *
65fa5e
+     * The input values to the Barrett reduction are the degree-63 polynomial
65fa5e
+     * in V1 (R(x)), degree-32 generator polynomial, and the reduction
65fa5e
+     * constant u.  The Barrett reduction result is the CRC value of R(x) mod
65fa5e
+     * P(x).
65fa5e
+     *
65fa5e
+     * The Barrett reduction algorithm is defined as:
65fa5e
+     *
65fa5e
+     *    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
65fa5e
+     *    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
65fa5e
+     *    3. C(x)  = R(x) XOR T2(x) mod x^32
65fa5e
+     *
65fa5e
+     *  Note: The leftmost doubleword of vector register containing
65fa5e
+     *  CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
65fa5e
+     *  is zero and does not contribute to the final result.
65fa5e
+     */
65fa5e
+
65fa5e
+    /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
65fa5e
+    v2 = vec_unpackl((uv4si)v1);
65fa5e
+    v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
65fa5e
+
65fa5e
+    /*
65fa5e
+     * Compute the GF(2) product of the CRC polynomial with T1(x) in
65fa5e
+     * V2 and XOR the intermediate result, T2(x), with the value in V1.
65fa5e
+     * The final result is stored in word element 2 of V2.
65fa5e
+     */
65fa5e
+    v2 = vec_unpackl((uv4si)v2);
65fa5e
+    v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
65fa5e
+
65fa5e
+    return ((uv4si)v2)[2];
65fa5e
+}
65fa5e
diff --git a/crc32.c b/crc32.c
65fa5e
index 34132ea..dfa33ef 100644
65fa5e
--- a/crc32.c
65fa5e
+++ b/crc32.c
65fa5e
@@ -252,12 +252,54 @@ unsigned long crc32_vpmsum(unsigned long, const unsigned char FAR *, z_size_t);
65fa5e
 #endif
65fa5e
 #endif
65fa5e
 
65fa5e
+#ifdef HAVE_S390X_VX
65fa5e
+#include <sys/auxv.h>
65fa5e
+
65fa5e
+#define VX_MIN_LEN 64
65fa5e
+#define VX_ALIGNMENT 16L
65fa5e
+#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
65fa5e
+
65fa5e
+unsigned int crc32_le_vgfm_16(unsigned int crc, const unsigned char FAR *buf, z_size_t len);
65fa5e
+
65fa5e
+local unsigned long s390_crc32_vx(unsigned long crc, const unsigned char FAR *buf, z_size_t len)
65fa5e
+{   /* Updates crc over buf[0..len): crc32_big for the unaligned head and the tail, crc32_le_vgfm_16 for the 16-byte-aligned middle. */
65fa5e
+    uint64_t prealign, aligned, remaining;  /* byte counts: unaligned head, vector-processed middle, tail */
65fa5e
+
65fa5e
+    if (buf == Z_NULL) return 0UL;  /* a null buffer yields 0 regardless of crc and len */
65fa5e
+
65fa5e
+    if (len < VX_MIN_LEN + VX_ALIGN_MASK)  /* too short to guarantee VX_MIN_LEN bytes remain after aligning buf */
65fa5e
+        return crc32_big(crc, buf, len);
65fa5e
+
65fa5e
+    if ((uintptr_t)buf & VX_ALIGN_MASK) {  /* buf is not on a VX_ALIGNMENT (16-byte) boundary */
65fa5e
+        prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);  /* bytes up to the next boundary */
65fa5e
+        len -= prealign;
65fa5e
+        crc = crc32_big(crc, buf, prealign);
65fa5e
+        buf += prealign;
65fa5e
+    }
65fa5e
+    aligned = len & ~VX_ALIGN_MASK;  /* len rounded down to a multiple of 16 */
65fa5e
+    remaining = len & VX_ALIGN_MASK;  /* leftover bytes, fewer than 16 */
65fa5e
+
65fa5e
+    crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, (size_t)aligned) ^ 0xffffffff;  /* crc is complemented going into and coming out of the vector routine */
65fa5e
+
65fa5e
+    if (remaining)
65fa5e
+        crc = crc32_big(crc, buf + aligned, remaining);  /* tail starts right after the vector-processed bytes */
65fa5e
+
65fa5e
+    return crc;
65fa5e
+}
65fa5e
+#endif
65fa5e
+
65fa5e
 /* due to a quirk of gnu_indirect_function - "local" (aka static) is applied to
65fa5e
  * crc32_z which is not desired. crc32_z_ifunc is implictly "local" */
65fa5e
 #ifndef Z_IFUNC_ASM
65fa5e
 local
65fa5e
 #endif
65fa5e
-unsigned long (*(crc32_z_ifunc(void)))(unsigned long, const unsigned char FAR *, z_size_t)
65fa5e
+unsigned long (*(crc32_z_ifunc(
65fa5e
+#ifdef __s390__
65fa5e
+unsigned long hwcap
65fa5e
+#else
65fa5e
+void
65fa5e
+#endif
65fa5e
+)))(unsigned long, const unsigned char FAR *, z_size_t)
65fa5e
 {
65fa5e
 #if _ARCH_PWR8==1
65fa5e
 #if defined(__BUILTIN_CPU_SUPPORTS__)
65fa5e
@@ -269,6 +311,11 @@ unsigned long (*(crc32_z_ifunc(void)))(unsigned long, const unsigned char FAR *,
65fa5e
 #endif
65fa5e
 #endif /* _ARCH_PWR8 */
65fa5e
 
65fa5e
+#ifdef HAVE_S390X_VX
65fa5e
+    if (hwcap & HWCAP_S390_VX)
65fa5e
+        return s390_crc32_vx;
65fa5e
+#endif
65fa5e
+
65fa5e
 /* return a function pointer for optimized arches here */
65fa5e
 
65fa5e
 #ifdef DYNAMIC_CRC_TABLE
65fa5e
@@ -301,7 +348,11 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
65fa5e
     static unsigned long ZEXPORT (*crc32_func)(unsigned long, const unsigned char FAR *, z_size_t) = NULL;
65fa5e
 
65fa5e
     if (!crc32_func)
65fa5e
-        crc32_func = crc32_z_ifunc();
65fa5e
+        crc32_func = crc32_z_ifunc(
65fa5e
+#ifdef __s390__
65fa5e
+            getauxval(AT_HWCAP)
65fa5e
+#endif
65fa5e
+        );
65fa5e
     return (*crc32_func)(crc, buf, len);
65fa5e
 }
65fa5e
 
65fa5e
-- 
65fa5e
2.25.1
65fa5e