Add optional SSE2 and AVX2 implementations of deflate's slide_hash().

configure gains --enable-sse-slide and --enable-avx2-slide. Each option
defines USE_SSE_SLIDE or USE_AVX2_SLIDE and appends the matching object
to OBJC and PIC_OBJC. Makefile.in builds those objects with -msse2 or
-mavx2. deflate.c keeps the portable loop as slide_hash_c(), and
slide_hash() runs exactly one implementation per call: AVX2 if enabled,
otherwise SSE2 if enabled, otherwise the C loop. The tables must be slid
only once, because every pass subtracts w_size from each entry.

NOTE(review): whitespace on context lines was reconstructed; if a hunk
is rejected, compare against the target tree or apply with "patch -l".

--- a/configure	2023-03-01 08:14:25.954898388 +0200
+++ b/configure	2023-03-01 08:24:45.239014676 +0200
@@ -115,6 +115,8 @@
       echo '    [--static] [--64] [--libdir=LIBDIR] [--sharedlibdir=LIBDIR]' | tee -a configure.log
       echo '    [--includedir=INCLUDEDIR] [--archs="-arch i386 -arch x86_64"]' | tee -a configure.log
       echo '    [--dfltcc]' | tee -a configure.log
+      echo '    [--enable-sse-slide]' | tee -a configure.log
+      echo '    [--enable-avx2-slide]' | tee -a configure.log
         exit 0 ;;
     -p*=* | --prefix=*) prefix=`echo $1 | sed 's/.*=//'`; shift ;;
     -e*=* | --eprefix=*) exec_prefix=`echo $1 | sed 's/.*=//'`; shift ;;
@@ -144,6 +146,18 @@
         PIC_OBJC="$PIC_OBJC dfltcc.lo"
         shift
         ;;
+    --enable-sse-slide)
+        CFLAGS="$CFLAGS -DUSE_SSE_SLIDE"
+        OBJC="$OBJC slide_sse.o"
+        PIC_OBJC="$PIC_OBJC slide_sse.lo"
+        shift
+        ;;
+    --enable-avx2-slide)
+        CFLAGS="$CFLAGS -DUSE_AVX2_SLIDE"
+        OBJC="$OBJC slide_avx2.o"
+        PIC_OBJC="$PIC_OBJC slide_avx2.lo"
+        shift
+        ;;
     *)
       echo "unknown option: $1" | tee -a configure.log
       echo "$0 --help for help" | tee -a configure.log
--- a/Makefile.in	2023-03-01 08:14:25.950898032 +0200
+++ b/Makefile.in	2023-03-01 08:28:07.734042879 +0200
@@ -151,6 +151,22 @@
 	$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/dfltcc.o $(SRCDIR)contrib/s390/dfltcc.c
 	-@mv objs/dfltcc.o $@
 
+slide_sse.o: $(SRCDIR)slide_sse.c
+	$(CC) $(CFLAGS) $(ZINC) -msse2 -c -o $@ $(SRCDIR)slide_sse.c
+
+slide_sse.lo: $(SRCDIR)slide_sse.c
+	-@mkdir objs 2>/dev/null || test -d objs
+	$(CC) $(SFLAGS) $(ZINC) -DPIC -msse2 -c -o objs/slide_sse.o $(SRCDIR)slide_sse.c
+	-@mv objs/slide_sse.o $@
+
+slide_avx2.o: $(SRCDIR)slide_avx2.c
+	$(CC) $(CFLAGS) $(ZINC) -mavx2 -c -o $@ $(SRCDIR)slide_avx2.c
+
+slide_avx2.lo: $(SRCDIR)slide_avx2.c
+	-@mkdir objs 2>/dev/null || test -d objs
+	$(CC) $(SFLAGS) $(ZINC) -DPIC -mavx2 -c -o objs/slide_avx2.o $(SRCDIR)slide_avx2.c
+	-@mv objs/slide_avx2.o $@
+
 example.o: $(SRCDIR)test/example.c $(SRCDIR)zlib.h zconf.h
 	$(CC) $(CFLAGS) $(ZINCOUT) -c -o $@ $(SRCDIR)test/example.c
 
--- a/deflate.c	2023-03-01 14:04:13.871373364 +0200
+++ b/deflate.c	2023-03-01 14:15:35.345276070 +0200
@@ -90,6 +90,13 @@
 
 local int deflateStateCheck      OF((z_streamp strm));
 local void slide_hash     OF((deflate_state *s));
+local void slide_hash_c   OF((deflate_state *s));
+#ifdef USE_SSE_SLIDE
+extern void slide_hash_sse(deflate_state *s);
+#endif
+#ifdef USE_AVX2_SLIDE
+extern void slide_hash_avx2(deflate_state *s);
+#endif
 local void fill_window    OF((deflate_state *s));
 local block_state deflate_stored OF((deflate_state *s, int flush));
 local block_state deflate_fast   OF((deflate_state *s, int flush));
@@ -212,7 +219,7 @@
  * bit values at the expense of memory usage). We slide even when level == 0 to
  * keep the hash table consistent if we switch back to level > 0 later.
  */
-local void slide_hash(s)
+local void slide_hash_c(s)
     deflate_state *s;
 {
     unsigned n, m;
@@ -238,6 +245,17 @@
 #endif
 }
 
+local void slide_hash(deflate_state *s)
+{
+#if defined(USE_AVX2_SLIDE)
+    slide_hash_avx2(s);
+#elif defined(USE_SSE_SLIDE)
+    slide_hash_sse(s);
+#else
+    slide_hash_c(s);
+#endif
+}
+
 /* ========================================================================= */
 int ZEXPORT deflateInit_(strm, level, version, stream_size)
     z_streamp strm;
--- /dev/null
+++ b/slide_sse.c
@@ -0,0 +1,47 @@
+/*
+ * SSE optimized hash slide
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ *   Arjan van de Ven
+ *   Jim Kukunas
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "deflate.h"
+#include <immintrin.h>
+
+void slide_hash_sse(deflate_state *s)
+{
+    unsigned n;
+    Posf *p;
+    uInt wsize = s->w_size;
+    z_const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
+
+    n = s->hash_size;
+    p = &s->head[n] - 8;
+    do {
+        __m128i value, result;
+
+        value = _mm_loadu_si128((__m128i *)p);
+        result= _mm_subs_epu16(value, xmm_wsize);
+        _mm_storeu_si128((__m128i *)p, result);
+        p -= 8;
+        n -= 8;
+    } while (n > 0);
+
+#ifndef FASTEST
+    n = wsize;
+    p = &s->prev[n] - 8;
+    do {
+        __m128i value, result;
+
+        value = _mm_loadu_si128((__m128i *)p);
+        result= _mm_subs_epu16(value, xmm_wsize);
+        _mm_storeu_si128((__m128i *)p, result);
+
+        p -= 8;
+        n -= 8;
+    } while (n > 0);
+#endif
+}
--- /dev/null
+++ b/slide_avx2.c
@@ -0,0 +1,44 @@
+/*
+ * AVX2 optimized hash slide
+ *
+ * Copyright (C) 2020 Intel Corporation
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "deflate.h"
+#include <immintrin.h>
+
+void slide_hash_avx2(deflate_state *s)
+{
+    unsigned n;
+    Posf *p;
+    uInt wsize = s->w_size;
+    z_const __m256i ymm_wsize = _mm256_set1_epi16(s->w_size);
+
+    n = s->hash_size;
+    p = &s->head[n] - 16;
+    do {
+        __m256i value, result;
+
+        value = _mm256_loadu_si256((__m256i *)p);
+        result= _mm256_subs_epu16(value, ymm_wsize);
+        _mm256_storeu_si256((__m256i *)p, result);
+        p -= 16;
+        n -= 16;
+    } while (n > 0);
+
+#ifndef FASTEST
+    n = wsize;
+    p = &s->prev[n] - 16;
+    do {
+        __m256i value, result;
+
+        value = _mm256_loadu_si256((__m256i *)p);
+        result= _mm256_subs_epu16(value, ymm_wsize);
+        _mm256_storeu_si256((__m256i *)p, result);
+
+        p -= 16;
+        n -= 16;
+    } while (n > 0);
+#endif
+}