From 02ff96944bd8f17bde0ac51cf0a0cf52a5dd8959 Mon Sep 17 00:00:00 2001 From: aekoroglu Date: Mar 01 2023 14:26:40 +0000 Subject: AVX2 optimization added to slide_hash function --- diff --git a/SOURCES/zlib-1.2.11-x86_64-accelrated-slide-hash.patch b/SOURCES/zlib-1.2.11-x86_64-accelrated-slide-hash.patch index c9c362a..e01636d 100644 --- a/SOURCES/zlib-1.2.11-x86_64-accelrated-slide-hash.patch +++ b/SOURCES/zlib-1.2.11-x86_64-accelrated-slide-hash.patch @@ -1,29 +1,36 @@ ---- a/configure 2022-04-19 17:46:39.589212290 +0300 -+++ b/configure 2022-04-19 17:48:26.737818784 +0300 -@@ -115,6 +115,7 @@ +--- a/configure 2023-03-01 08:14:25.954898388 +0200 ++++ b/configure 2023-03-01 08:24:45.239014676 +0200 +@@ -115,6 +115,8 @@ echo ' [--static] [--64] [--libdir=LIBDIR] [--sharedlibdir=LIBDIR]' | tee -a configure.log echo ' [--includedir=INCLUDEDIR] [--archs="-arch i386 -arch x86_64"]' | tee -a configure.log echo ' [--dfltcc]' | tee -a configure.log -+ echo ' [--enable-sse_slide]' | tee -a configure.log ++ echo ' [--enable-sse-slide]' | tee -a configure.log ++ echo ' [--enable-avx2-slide]' | tee -a configure.log exit 0 ;; -p*=* | --prefix=*) prefix=`echo $1 | sed 's/.*=//'`; shift ;; -e*=* | --eprefix=*) exec_prefix=`echo $1 | sed 's/.*=//'`; shift ;; -@@ -144,6 +145,12 @@ +@@ -144,6 +146,18 @@ PIC_OBJC="$PIC_OBJC dfltcc.lo" shift ;; -+ --enable-sse_slide) ++ --enable-sse-slide) + CFLAGS="$CFLAGS -DUSE_SSE_SLIDE" + OBJC="$OBJC slide_sse.o" + PIC_OBJC="$PIC_OBJC slide_sse.lo" + shift + ;; ++ --enable-avx2-slide) ++ CFLAGS="$CFLAGS -DUSE_AVX2_SLIDE" ++ OBJC="$OBJC slide_avx2.o" ++ PIC_OBJC="$PIC_OBJC slide_avx2.lo" ++ shift ++ ;; *) echo "unknown option: $1" | tee -a configure.log echo "$0 --help for help" | tee -a configure.log ---- a/Makefile.in 2022-04-11 18:00:47.184530801 +0300 -+++ b/Makefile.in 2022-04-11 18:02:47.815927655 +0300 -@@ -151,6 +151,14 @@ +--- a/Makefile.in 2023-03-01 08:14:25.950898032 +0200 ++++ b/Makefile.in 2023-03-01 08:28:07.734042879 +0200 +@@ 
-151,6 +151,22 @@ $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/dfltcc.o $(SRCDIR)contrib/s390/dfltcc.c -@mv objs/dfltcc.o $@ @@ -35,21 +42,34 @@ + $(CC) $(SFLAGS) $(ZINC) -DPIC -msse2 -c -o objs/slide_sse.o $(SRCDIR)slide_sse.c + -@mv objs/slide_sse.o $@ + ++slide_avx2.o: $(SRCDIR)slide_avx2.c ++ $(CC) $(CFLAGS) $(ZINC) -mavx2 -c -o $@ $(SRCDIR)slide_avx2.c ++ ++slide_avx2.lo: $(SRCDIR)slide_avx2.c ++ -@mkdir objs 2>/dev/null || test -d objs ++ $(CC) $(SFLAGS) $(ZINC) -DPIC -mavx2 -c -o objs/slide_avx2.o $(SRCDIR)slide_avx2.c ++ -@mv objs/slide_avx2.o $@ ++ example.o: $(SRCDIR)test/example.c $(SRCDIR)zlib.h zconf.h $(CC) $(CFLAGS) $(ZINCOUT) -c -o $@ $(SRCDIR)test/example.c ---- a/deflate.c 2022-04-19 11:43:42.333320519 +0300 -+++ b/deflate.c 2022-04-19 15:55:30.636531139 +0300 -@@ -90,6 +90,8 @@ +--- a/deflate.c 2023-03-01 14:04:13.871373364 +0200 ++++ b/deflate.c 2023-03-01 14:15:35.345276070 +0200 +@@ -90,6 +90,13 @@ local int deflateStateCheck OF((z_streamp strm)); local void slide_hash OF((deflate_state *s)); +local void slide_hash_c OF((deflate_state *s)); -+extern void slide_hash_sse (deflate_state *s); ++#ifdef USE_SSE_SLIDE ++extern void slide_hash_sse(deflate_state *s); ++#endif ++#ifdef USE_AVX2_SLIDE ++extern void slide_hash_avx2(deflate_state *s); ++#endif local void fill_window OF((deflate_state *s)); local block_state deflate_stored OF((deflate_state *s, int flush)); local block_state deflate_fast OF((deflate_state *s, int flush)); -@@ -212,7 +214,7 @@ +@@ -212,7 +219,7 @@ * bit values at the expense of memory usage). We slide even when level == 0 to * keep the hash table consistent if we switch back to level > 0 later. 
*/ @@ -58,7 +78,7 @@ deflate_state *s; { unsigned n, m; -@@ -238,6 +240,15 @@ +@@ -238,6 +245,23 @@ #endif } @@ -66,20 +86,25 @@ +{ ++ /* Run exactly one implementation: every pass subtracts w_size again. */ ++#ifdef USE_AVX2_SLIDE ++ /* slide_avx2.c is built with -mavx2; call it only when the CPU has AVX2. */ ++ if (__builtin_cpu_supports("avx2")) { ++ slide_hash_avx2(s); ++ return; ++ } ++#endif +#ifdef USE_SSE_SLIDE + slide_hash_sse(s); +#else + slide_hash_c(s); +#endif +} + /* ========================================================================= */ int ZEXPORT deflateInit_(strm, level, version, stream_size) z_streamp strm; -diff --git a/slide_sse.c b/slide_sse.c -new file mode 100644 -index 0000000..2ef2669 --- /dev/null +++ b/slide_sse.c -@@ -0,0 +1,49 @@ +@@ -0,0 +1,47 @@ +/* + * SSE optimized hash slide + * @@ -127,8 +152,50 @@ index 0000000..2ef2669 + } while (n > 0); +#endif +} +--- /dev/null ++++ b/slide_avx2.c +@@ -0,0 +1,44 @@ ++/* ++ * AVX2 optimized hash slide ++ * ++ * Copyright (C) 2020 Intel Corporation ++ * ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++#include "deflate.h" ++#include <immintrin.h> ++ ++void slide_hash_avx2(deflate_state *s) ++{ ++ unsigned n; ++ Posf *p; ++ uInt wsize = s->w_size; ++ z_const __m256i ymm_wsize = _mm256_set1_epi16(s->w_size); ++ ++ n = s->hash_size; ++ p = &s->head[n] - 16; ++ do { ++ __m256i value, result; + ++ value = _mm256_loadu_si256((__m256i *)p); ++ result= _mm256_subs_epu16(value, ymm_wsize); ++ _mm256_storeu_si256((__m256i *)p, result); ++ p -= 16; ++ n -= 16; ++ } while (n > 0); ++ ++#ifndef FASTEST ++ n = wsize; ++ p = &s->prev[n] - 16; ++ do { ++ __m256i value, result; + --- -2.27.0 - ++ value = _mm256_loadu_si256((__m256i *)p); ++ result= _mm256_subs_epu16(value, ymm_wsize); ++ _mm256_storeu_si256((__m256i *)p, result); ++ ++ p -= 16; ++ n -= 16; ++ } while (n > 0); ++#endif ++} diff --git a/SPECS/zlib.spec b/SPECS/zlib.spec index 436767a..8e0c158 100644 --- a/SPECS/zlib.spec +++ b/SPECS/zlib.spec @@ -3,7 +3,7 @@ Name: zlib Version: 1.2.11 -Release: 22%{?dist} +Release: 23%{?dist} Summary: The compression and decompression library # /contrib/dotzlib/ have Boost 
license License: zlib and Boost @@ -42,7 +42,7 @@ Patch13: zlib-1.2.11-cve-2022-37434_2.patch # Fix setting strm.adler on z15 Patch14: zlib-1.2.11-IBM-Z-hw-accelrated-deflate-strm-adler-fix.patch -# Intel slide hash optimization for x86_64 arch +# SSE+AVX2 slide_hash optimization for x86_64 arch Patch100: zlib-1.2.11-x86_64-accelrated-slide-hash.patch BuildRequires: automake, autoconf, libtool @@ -127,13 +127,14 @@ export LDFLAGS="$LDFLAGS -Wl,-z,relro -Wl,-z,now" # no-autotools, %%configure is not compatible +# Keep one nested arch chain so ./configure runs exactly once per build %ifarch s390 s390x ./configure --libdir=%{_libdir} --includedir=%{_includedir} --prefix=%{_prefix} --dfltcc %else %ifarch x86_64 -./configure --libdir=%{_libdir} --includedir=%{_includedir} --prefix=%{_prefix} --enable-sse_slide +./configure --libdir=%{_libdir} --includedir=%{_includedir} --prefix=%{_prefix} --enable-avx2-slide %else ./configure --libdir=%{_libdir} --includedir=%{_includedir} --prefix=%{_prefix} %endif %endif %make_build %if %{with minizip} @@ -195,6 +196,9 @@ find $RPM_BUILD_ROOT -name '*.la' -delete %changelog +* Wed Mar 01 2023 Ali Erdinc Koroglu 1.2.11-23 +- AVX2 optimization added to slide_hash function + * Wed Oct 12 2022 Ilya Leoshkevich - 1.2.11-22 - Fix for IBM strm.adler rhbz#2134074