|
|
02ff96 |
--- a/configure 2023-03-01 08:14:25.954898388 +0200
|
|
|
02ff96 |
+++ b/configure 2023-03-01 08:24:45.239014676 +0200
|
|
|
02ff96 |
@@ -115,6 +115,8 @@
|
|
|
e6944d |
echo ' [--static] [--64] [--libdir=LIBDIR] [--sharedlibdir=LIBDIR]' | tee -a configure.log
|
|
|
e6944d |
echo ' [--includedir=INCLUDEDIR] [--archs="-arch i386 -arch x86_64"]' | tee -a configure.log
|
|
|
e6944d |
echo ' [--dfltcc]' | tee -a configure.log
|
|
|
02ff96 |
+ echo ' [--enable-sse-slide]' | tee -a configure.log
|
|
|
02ff96 |
+ echo ' [--enable-avx2-slide]' | tee -a configure.log
|
|
|
e6944d |
exit 0 ;;
|
|
|
e6944d |
-p*=* | --prefix=*) prefix=`echo $1 | sed 's/.*=//'`; shift ;;
|
|
|
e6944d |
-e*=* | --eprefix=*) exec_prefix=`echo $1 | sed 's/.*=//'`; shift ;;
|
|
|
02ff96 |
@@ -144,6 +146,18 @@
|
|
|
e6944d |
PIC_OBJC="$PIC_OBJC dfltcc.lo"
|
|
|
e6944d |
shift
|
|
|
e6944d |
;;
|
|
|
02ff96 |
+ --enable-sse-slide)
|
|
|
e6944d |
+ CFLAGS="$CFLAGS -DUSE_SSE_SLIDE"
|
|
|
e6944d |
+ OBJC="$OBJC slide_sse.o"
|
|
|
e6944d |
+ PIC_OBJC="$PIC_OBJC slide_sse.lo"
|
|
|
e6944d |
+ shift
|
|
|
e6944d |
+ ;;
|
|
|
02ff96 |
+ --enable-avx2-slide)
|
|
|
02ff96 |
+ CFLAGS="$CFLAGS -DUSE_AVX2_SLIDE"
|
|
|
02ff96 |
+ OBJC="$OBJC slide_avx2.o"
|
|
|
02ff96 |
+ PIC_OBJC="$PIC_OBJC slide_avx2.lo"
|
|
|
02ff96 |
+ shift
|
|
|
02ff96 |
+ ;;
|
|
|
e6944d |
*)
|
|
|
e6944d |
echo "unknown option: $1" | tee -a configure.log
|
|
|
e6944d |
echo "$0 --help for help" | tee -a configure.log
|
|
|
02ff96 |
--- a/Makefile.in 2023-03-01 08:14:25.950898032 +0200
|
|
|
02ff96 |
+++ b/Makefile.in 2023-03-01 08:28:07.734042879 +0200
|
|
|
02ff96 |
@@ -151,6 +151,22 @@
|
|
|
e6944d |
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/dfltcc.o $(SRCDIR)contrib/s390/dfltcc.c
|
|
|
e6944d |
-@mv objs/dfltcc.o $@
|
|
|
e6944d |
|
|
|
e6944d |
+slide_sse.o: $(SRCDIR)slide_sse.c $(SRCDIR)deflate.h
+	$(CC) $(CFLAGS) $(ZINC) -msse2 -c -o $@ $(SRCDIR)slide_sse.c
+
+slide_sse.lo: $(SRCDIR)slide_sse.c $(SRCDIR)deflate.h
+	-@mkdir objs 2>/dev/null || test -d objs
+	$(CC) $(SFLAGS) $(ZINC) -DPIC -msse2 -c -o objs/slide_sse.o $(SRCDIR)slide_sse.c
+	-@mv objs/slide_sse.o $@
+
+slide_avx2.o: $(SRCDIR)slide_avx2.c $(SRCDIR)deflate.h
+	$(CC) $(CFLAGS) $(ZINC) -mavx2 -c -o $@ $(SRCDIR)slide_avx2.c
+
+slide_avx2.lo: $(SRCDIR)slide_avx2.c $(SRCDIR)deflate.h
+	-@mkdir objs 2>/dev/null || test -d objs
+	$(CC) $(SFLAGS) $(ZINC) -DPIC -mavx2 -c -o objs/slide_avx2.o $(SRCDIR)slide_avx2.c
+	-@mv objs/slide_avx2.o $@
|
|
|
02ff96 |
+
|
|
|
e6944d |
example.o: $(SRCDIR)test/example.c $(SRCDIR)zlib.h zconf.h
|
|
|
e6944d |
$(CC) $(CFLAGS) $(ZINCOUT) -c -o $@ $(SRCDIR)test/example.c
|
|
|
e6944d |
|
|
|
02ff96 |
--- a/deflate.c 2023-03-01 14:04:13.871373364 +0200
|
|
|
02ff96 |
+++ b/deflate.c 2023-03-01 14:15:35.345276070 +0200
|
|
|
02ff96 |
@@ -90,6 +90,13 @@
|
|
|
e6944d |
|
|
|
e6944d |
local int deflateStateCheck OF((z_streamp strm));
|
|
|
e6944d |
local void slide_hash OF((deflate_state *s));
|
|
|
e6944d |
+local void slide_hash_c OF((deflate_state *s));
|
|
|
02ff96 |
+#ifdef USE_SSE_SLIDE
|
|
|
02ff96 |
+extern void slide_hash_sse(deflate_state *s);
|
|
|
02ff96 |
+#endif
|
|
|
02ff96 |
+#ifdef USE_AVX2_SLIDE
|
|
|
02ff96 |
+extern void slide_hash_avx2(deflate_state *s);
|
|
|
02ff96 |
+#endif
|
|
|
e6944d |
local void fill_window OF((deflate_state *s));
|
|
|
e6944d |
local block_state deflate_stored OF((deflate_state *s, int flush));
|
|
|
e6944d |
local block_state deflate_fast OF((deflate_state *s, int flush));
|
|
|
02ff96 |
@@ -212,7 +219,7 @@
|
|
|
e6944d |
* bit values at the expense of memory usage). We slide even when level == 0 to
|
|
|
e6944d |
* keep the hash table consistent if we switch back to level > 0 later.
|
|
|
e6944d |
*/
|
|
|
e6944d |
-local void slide_hash(s)
|
|
|
e6944d |
+local void slide_hash_c(s)
|
|
|
e6944d |
deflate_state *s;
|
|
|
e6944d |
{
|
|
|
e6944d |
unsigned n, m;
|
|
|
02ff96 |
@@ -238,6 +245,17 @@
|
|
|
e6944d |
#endif
|
|
|
e6944d |
}
|
|
|
e6944d |
|
|
|
e6944d |
+local void slide_hash(deflate_state *s)
+{
+    /* Dispatch to the SIMD slide when one was compiled in; otherwise use the
+     * portable C version. These must be mutually exclusive: running a SIMD
+     * slide AND slide_hash_c() would subtract w_size twice and corrupt the
+     * head/prev hash chains. */
+#if defined(USE_SSE_SLIDE)
+    slide_hash_sse(s);
+#elif defined(USE_AVX2_SLIDE)
+    slide_hash_avx2(s);
+#else
+    slide_hash_c(s);
+#endif
+}
|
|
|
e6944d |
+
|
|
|
e6944d |
/* ========================================================================= */
|
|
|
e6944d |
int ZEXPORT deflateInit_(strm, level, version, stream_size)
|
|
|
e6944d |
z_streamp strm;
|
|
|
e6944d |
--- /dev/null
+++ b/slide_sse.c
@@ -0,0 +1,46 @@
+/*
+ * SSE optimized hash slide
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ *   Arjan van de Ven <arjan@linux.intel.com>
+ *   Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "deflate.h"
+#include <immintrin.h>
+
+/* Subtract w_size from every head[] (and prev[]) entry, 8 Pos values per
+ * iteration, saturating at zero; assumes hash_size and w_size are multiples
+ * of 8 (both are powers of two >= 256 in zlib). */
+void slide_hash_sse(deflate_state *s)
+{
+    unsigned n;
+    Posf *p;
+    z_const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
+
+    n = s->hash_size;
+    p = &s->head[n] - 8;
+    do {
+        __m128i value, result;
+
+        value = _mm_loadu_si128((__m128i *)p);
+        result = _mm_subs_epu16(value, xmm_wsize);
+        _mm_storeu_si128((__m128i *)p, result);
+        p -= 8;
+        n -= 8;
+    } while (n > 0);
+
+#ifndef FASTEST
+    n = s->w_size;
+    p = &s->prev[n] - 8;
+    do {
+        __m128i value, result;
+
+        value = _mm_loadu_si128((__m128i *)p);
+        result = _mm_subs_epu16(value, xmm_wsize);
+        _mm_storeu_si128((__m128i *)p, result);
+        p -= 8;
+        n -= 8;
+    } while (n > 0);
+#endif
+}
|
|
|
02ff96 |
--- /dev/null
+++ b/slide_avx2.c
@@ -0,0 +1,43 @@
+/*
+ * AVX2 optimized hash slide
+ *
+ * Copyright (C) 2020 Intel Corporation
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "deflate.h"
+#include <immintrin.h>
+
+/* Subtract w_size from every head[] (and prev[]) entry, 16 Pos values per
+ * iteration, saturating at zero; assumes hash_size and w_size are multiples
+ * of 16 (both are powers of two >= 256 in zlib). */
+void slide_hash_avx2(deflate_state *s)
+{
+    unsigned n;
+    Posf *p;
+    z_const __m256i ymm_wsize = _mm256_set1_epi16(s->w_size);
+
+    n = s->hash_size;
+    p = &s->head[n] - 16;
+    do {
+        __m256i value, result;
+
+        value = _mm256_loadu_si256((__m256i *)p);
+        result = _mm256_subs_epu16(value, ymm_wsize);
+        _mm256_storeu_si256((__m256i *)p, result);
+        p -= 16;
+        n -= 16;
+    } while (n > 0);
+
+#ifndef FASTEST
+    n = s->w_size;
+    p = &s->prev[n] - 16;
+    do {
+        __m256i value, result;
+
+        value = _mm256_loadu_si256((__m256i *)p);
+        result = _mm256_subs_epu16(value, ymm_wsize);
+        _mm256_storeu_si256((__m256i *)p, result);
+        p -= 16;
+        n -= 16;
+    } while (n > 0);
+#endif
+}
|