Add an optional SSE2 implementation of deflate's slide_hash(), enabled with configure --enable-sse_slide.
--- a/configure	2022-04-19 17:46:39.589212290 +0300
+++ b/configure	2022-04-19 17:48:26.737818784 +0300
@@ -115,6 +115,7 @@
       echo '    [--static] [--64] [--libdir=LIBDIR] [--sharedlibdir=LIBDIR]' | tee -a configure.log
       echo '    [--includedir=INCLUDEDIR] [--archs="-arch i386 -arch x86_64"]' | tee -a configure.log
       echo '    [--dfltcc]' | tee -a configure.log
+      echo '    [--enable-sse_slide]' | tee -a configure.log  # usage line for the --enable-sse_slide switch
         exit 0 ;;
     -p*=* | --prefix=*) prefix=`echo $1 | sed 's/.*=//'`; shift ;;
     -e*=* | --eprefix=*) exec_prefix=`echo $1 | sed 's/.*=//'`; shift ;;
@@ -144,6 +145,12 @@
 	    PIC_OBJC="$PIC_OBJC dfltcc.lo"
       shift
       ;; 
+    --enable-sse_slide)
+	    CFLAGS="${CFLAGS} -DUSE_SSE_SLIDE"   # makes slide_hash() in deflate.c call slide_hash_sse()
+	    OBJC="${OBJC} slide_sse.o"           # object the Makefile builds with CFLAGS
+	    PIC_OBJC="${PIC_OBJC} slide_sse.lo"  # object the Makefile builds with SFLAGS -DPIC
+      shift
+      ;;
     *)
       echo "unknown option: $1" | tee -a configure.log
       echo "$0 --help for help" | tee -a configure.log
--- a/Makefile.in	2023-02-18 10:35:58.873281584 +0200
+++ b/Makefile.in	2023-02-18 11:48:00.796154526 +0200
@@ -144,6 +144,17 @@
 	mv _match.o match.lo
 	rm -f _match.s
 
+# SSE2 hash slide, listed in OBJC/PIC_OBJC by configure --enable-sse_slide.
+# -msse2 is given on the compile line itself, not through CFLAGS/SFLAGS.
+slide_sse.o: $(SRCDIR)slide_sse.c $(SRCDIR)deflate.h $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h
+	$(CC) $(CFLAGS) $(ZINC) -msse2 -c -o $@ $(SRCDIR)slide_sse.c
+
+# Same source built with SFLAGS and -DPIC into objs/, then moved to the .lo name.
+slide_sse.lo: $(SRCDIR)slide_sse.c $(SRCDIR)deflate.h $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h
+	-@mkdir objs 2>/dev/null || test -d objs
+	$(CC) $(SFLAGS) $(ZINC) -DPIC -msse2 -c -o objs/slide_sse.o $(SRCDIR)slide_sse.c
+	-@mv objs/slide_sse.o $@
+
 dfltcc.o: $(SRCDIR)contrib/s390/dfltcc.c $(SRCDIR)zlib.h zconf.h
 	$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)contrib/s390/dfltcc.c
 
--- a/deflate.c	2022-04-19 11:43:42.333320519 +0300
+++ b/deflate.c	2022-04-19 15:55:30.636531139 +0300
@@ -90,6 +90,8 @@
 
 local int deflateStateCheck      OF((z_streamp strm));
 local void slide_hash     OF((deflate_state *s));
+local void slide_hash_c     OF((deflate_state *s));  /* C implementation, this file */
+extern void ZLIB_INTERNAL slide_hash_sse OF((deflate_state *s)); /* slide_sse.c */
 local void fill_window    OF((deflate_state *s));
 local block_state deflate_stored OF((deflate_state *s, int flush));
 local block_state deflate_fast   OF((deflate_state *s, int flush));
@@ -212,7 +214,7 @@
  * bit values at the expense of memory usage). We slide even when level == 0 to
  * keep the hash table consistent if we switch back to level > 0 later.
  */
-local void slide_hash(s)
+local void slide_hash_c(s)
     deflate_state *s;
 {
     unsigned n, m;
@@ -238,6 +240,15 @@
 #endif
 }
 
+local void slide_hash(deflate_state *s)
+{
+#ifndef USE_SSE_SLIDE
+    slide_hash_c(s);        /* default: the C implementation in this file */
+#else
+    slide_hash_sse(s);      /* configure --enable-sse_slide: see slide_sse.c */
+#endif /* USE_SSE_SLIDE */
+}
+
 /* ========================================================================= */
 int ZEXPORT deflateInit_(strm, level, version, stream_size)
     z_streamp strm;
diff --git a/slide_sse.c b/slide_sse.c
new file mode 100644
index 0000000..2ef2669
--- /dev/null
+++ b/slide_sse.c
@@ -0,0 +1,49 @@
+/*
+ * SSE optimized hash slide
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ *   Arjan van de Ven	<arjan@linux.intel.com>
+ *   Jim Kukunas	<james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "deflate.h"
+#include <immintrin.h>
+
+/*
+ * Subtract wsize from each of the n entries of table, eight entries per
+ * step, using unsigned 16-bit saturating subtraction (results stop at 0).
+ * n must be a non-zero multiple of 8.
+ */
+static void slide_table_sse(Posf *table, unsigned n, __m128i wsize)
+{
+    Posf *p = table;
+    Posf *end = table + n;
+
+    do {
+        __m128i v = _mm_loadu_si128((__m128i *)p);
+
+        _mm_storeu_si128((__m128i *)p, _mm_subs_epu16(v, wsize));
+        p += 8;
+    } while (p < end);
+}
+
+/*
+ * Slide the hash tables of s down by s->w_size: every entry of s->head and,
+ * unless FASTEST is defined, of s->prev is reduced by w_size, with entries
+ * smaller than w_size becoming 0.  deflate.c's slide_hash() calls this when
+ * USE_SSE_SLIDE is defined.  s->hash_size and s->w_size must be non-zero
+ * multiples of 8.
+ */
+void ZLIB_INTERNAL slide_hash_sse(deflate_state *s)
+{
+    uInt wsize = s->w_size;
+    /* explicit narrowing: each lane of the vector is 16 bits wide */
+    z_const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
+
+    slide_table_sse(s->head, s->hash_size, xmm_wsize);
+#ifndef FASTEST
+    slide_table_sse(s->prev, wsize, xmm_wsize);
+#endif
+}
-- 
2.27.0