Blame SOURCES/nettle-3.4.1-powerpc64-ghash-asm.patch

007cfe
diff -up ./configure.ac.ghash ./configure.ac
007cfe
--- ./configure.ac.ghash	2021-07-14 14:11:58.126891572 +0200
007cfe
+++ ./configure.ac	2021-07-14 14:11:58.130891552 +0200
007cfe
@@ -211,6 +211,22 @@ AC_C_BIGENDIAN([AC_DEFINE([WORDS_BIGENDI
007cfe
 		ASM_WORDS_BIGENDIAN=yes],
007cfe
 	[ASM_WORDS_BIGENDIAN=no])
007cfe
 
007cfe
+AC_CACHE_CHECK([for __builtin_bswap64],
007cfe
+		nettle_cv_c_builtin_bswap64,
007cfe
+[AC_TRY_LINK([
007cfe
+#include <stdint.h>
007cfe
+],[
007cfe
+uint64_t x = 17;
007cfe
+uint64_t y = __builtin_bswap64(x);
007cfe
+],
007cfe
+nettle_cv_c_builtin_bswap64=yes,
007cfe
+nettle_cv_c_builtin_bswap64=no)])
007cfe
+
007cfe
+AH_TEMPLATE([HAVE_BUILTIN_BSWAP64], [Define if __builtin_bswap64 is available])
007cfe
+if test "x$nettle_cv_c_builtin_bswap64" = "xyes" ; then
007cfe
+  AC_DEFINE(HAVE_BUILTIN_BSWAP64)
007cfe
+fi
007cfe
+
007cfe
 LSH_GCC_ATTRIBUTES
007cfe
 
007cfe
 # According to Simon Josefsson, looking for uint32_t and friends in
007cfe
@@ -472,7 +488,7 @@ asm_replace_list="aes-encrypt-internal.a
e986cc
 		sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
e986cc
 
e986cc
 # Assembler files which generate additional object files if they are used.
e986cc
-asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \
e986cc
+asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \
e986cc
   aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \
e986cc
   salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \
e986cc
   sha3-permute-2.asm sha512-compress-2.asm \
007cfe
@@ -588,6 +604,10 @@ AH_VERBATIM([HAVE_NATIVE],
e986cc
 #undef HAVE_NATIVE_ecc_384_redc
e986cc
 #undef HAVE_NATIVE_ecc_521_modp
e986cc
 #undef HAVE_NATIVE_ecc_521_redc
e986cc
+#undef HAVE_NATIVE_gcm_init_key
e986cc
+#undef HAVE_NATIVE_fat_gcm_init_key
e986cc
+#undef HAVE_NATIVE_gcm_hash
e986cc
+#undef HAVE_NATIVE_fat_gcm_hash
e986cc
 #undef HAVE_NATIVE_gcm_hash8
e986cc
 #undef HAVE_NATIVE_salsa20_core
e986cc
 #undef HAVE_NATIVE_sha1_compress
007cfe
diff -up ./ctr16.c.ghash ./ctr16.c
007cfe
--- ./ctr16.c.ghash	2021-07-14 14:11:58.130891552 +0200
007cfe
+++ ./ctr16.c	2021-07-14 14:11:58.130891552 +0200
e986cc
@@ -0,0 +1,106 @@
e986cc
+/* ctr16.c
e986cc
+
e986cc
+   Cipher counter mode, optimized for 16-byte blocks.
e986cc
+
e986cc
+   Copyright (C) 2005-2018 Niels Möller
e986cc
+   Copyright (C) 2018 Red Hat, Inc.
e986cc
+
e986cc
+   This file is part of GNU Nettle.
e986cc
+
e986cc
+   GNU Nettle is free software: you can redistribute it and/or
e986cc
+   modify it under the terms of either:
e986cc
+
e986cc
+     * the GNU Lesser General Public License as published by the Free
e986cc
+       Software Foundation; either version 3 of the License, or (at your
e986cc
+       option) any later version.
e986cc
+
e986cc
+   or
e986cc
+
e986cc
+     * the GNU General Public License as published by the Free
e986cc
+       Software Foundation; either version 2 of the License, or (at your
e986cc
+       option) any later version.
e986cc
+
e986cc
+   or both in parallel, as here.
e986cc
+
e986cc
+   GNU Nettle is distributed in the hope that it will be useful,
e986cc
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
e986cc
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
e986cc
+   General Public License for more details.
e986cc
+
e986cc
+   You should have received copies of the GNU General Public License and
e986cc
+   the GNU Lesser General Public License along with this program.  If
e986cc
+   not, see http://www.gnu.org/licenses/.
e986cc
+*/
e986cc
+
e986cc
+#if HAVE_CONFIG_H
e986cc
+# include "config.h"
e986cc
+#endif
e986cc
+
e986cc
+#include <assert.h>
e986cc
+
e986cc
+#include "ctr.h"
e986cc
+
e986cc
+#include "ctr-internal.h"
e986cc
+#include "memxor.h"
e986cc
+#include "nettle-internal.h"
e986cc
+
e986cc
+#define MIN(a,b) (((a) < (b)) ? (a) : (b))
e986cc
+
e986cc
+void
e986cc
+_ctr_crypt16(const void *ctx, nettle_cipher_func *f,
e986cc
+	     nettle_fill16_func *fill, uint8_t *ctr,
e986cc
+	     size_t length, uint8_t *dst,
e986cc
+	     const uint8_t *src)
e986cc
+{
e986cc
+  if (dst != src && !((uintptr_t) dst % sizeof(uint64_t)))
e986cc
+    {
e986cc
+      size_t blocks = length / 16u;
e986cc
+      size_t done;
e986cc
+      fill (ctr, blocks, (union nettle_block16 *) dst);
e986cc
+
e986cc
+      done = blocks * 16;
e986cc
+      f(ctx, done, dst, dst);
e986cc
+      memxor (dst, src, done);
e986cc
+
e986cc
+      length -= done;
e986cc
+      if (length > 0)
e986cc
+	{ /* Left-over partial block */
e986cc
+	  union nettle_block16 block;
e986cc
+	  dst += done;
e986cc
+	  src += done;
e986cc
+	  assert (length < 16);
e986cc
+	  /* Use fill, to update ctr value in the same way in all cases. */
e986cc
+	  fill (ctr, 1, &block);
e986cc
+	  f (ctx, 16, block.b, block.b);
e986cc
+	  memxor3 (dst, src, block.b, length);
e986cc
+	}
e986cc
+    }
e986cc
+  else
e986cc
+    {
e986cc
+      /* Construct an aligned buffer of consecutive counter values, of
e986cc
+	 size at most CTR_BUFFER_LIMIT. */
e986cc
+      TMP_DECL(buffer, union nettle_block16, CTR_BUFFER_LIMIT / 16);
e986cc
+      size_t blocks = (length + 15) / 16u;
e986cc
+      size_t i;
e986cc
+      TMP_ALLOC(buffer, MIN(blocks, CTR_BUFFER_LIMIT / 16));
e986cc
+
e986cc
+      for (i = 0; blocks >= CTR_BUFFER_LIMIT / 16;
e986cc
+	   i += CTR_BUFFER_LIMIT, blocks -= CTR_BUFFER_LIMIT / 16)
e986cc
+	{
e986cc
+	  fill (ctr, CTR_BUFFER_LIMIT / 16, buffer);
e986cc
+	  f(ctx, CTR_BUFFER_LIMIT, buffer->b, buffer->b);
e986cc
+	  if (length - i < CTR_BUFFER_LIMIT)
e986cc
+	    goto done;
e986cc
+	  memxor3 (dst + i, src + i, buffer->b, CTR_BUFFER_LIMIT);
e986cc
+	}
e986cc
+
e986cc
+      if (blocks > 0)
e986cc
+	{
e986cc
+	  assert (length - i < CTR_BUFFER_LIMIT);
e986cc
+	  fill (ctr, blocks, buffer);
e986cc
+	  f(ctx, blocks * 16, buffer->b, buffer->b);
e986cc
+	done:
e986cc
+	  memxor3 (dst + i, src + i, buffer->b, length - i);
e986cc
+	}
e986cc
+    }
e986cc
+}
007cfe
diff -up ./ctr.c.ghash ./ctr.c
007cfe
--- ./ctr.c.ghash	2018-12-04 21:56:05.000000000 +0100
007cfe
+++ ./ctr.c	2021-07-14 14:13:07.714539484 +0200
007cfe
@@ -41,11 +41,83 @@
e986cc
 
e986cc
 #include "ctr.h"
e986cc
 
e986cc
+#include "ctr-internal.h"
e986cc
 #include "macros.h"
e986cc
 #include "memxor.h"
e986cc
 #include "nettle-internal.h"
e986cc
 
e986cc
-#define NBLOCKS 4
e986cc
+#define MIN(a,b) (((a) < (b)) ? (a) : (b))
e986cc
+
007cfe
+/* The 'u64' member has been added in the public header
007cfe
+   (nettle-types.h).  Check that the alignment is not affected with
007cfe
+   it using _Static_assert. */
007cfe
+union nettle_block16_
007cfe
+{
007cfe
+  uint8_t b[16];
007cfe
+  unsigned long w[16 / sizeof(unsigned long)];
007cfe
+};
007cfe
+_Static_assert(__alignof(union nettle_block16_) == __alignof(union nettle_block16),
007cfe
+	       "nettle_block16 alignment should be preserved");
007cfe
+
e986cc
+static size_t
e986cc
+ctr_fill (size_t block_size, uint8_t *ctr, size_t length, uint8_t *buffer)
e986cc
+{
e986cc
+  size_t i;
e986cc
+  for (i = 0; i + block_size <= length; i += block_size)
e986cc
+    {
e986cc
+      memcpy (buffer + i, ctr, block_size);
e986cc
+      INCREMENT(block_size, ctr);
e986cc
+    }
e986cc
+  return i;
e986cc
+}
e986cc
+
e986cc
+#if WORDS_BIGENDIAN
e986cc
+# define USE_CTR_CRYPT16 1
e986cc
+static nettle_fill16_func ctr_fill16;
e986cc
+static void
e986cc
+ctr_fill16(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
e986cc
+{
e986cc
+  uint64_t hi, lo;
e986cc
+  size_t i;
e986cc
+  hi = READ_UINT64(ctr);
e986cc
+  lo = READ_UINT64(ctr + 8);
e986cc
+
e986cc
+  for (i = 0; i < blocks; i++)
e986cc
+    {
e986cc
+      buffer[i].u64[0] = hi;
e986cc
+      buffer[i].u64[1] = lo;
e986cc
+      hi += !(++lo);
e986cc
+    }
e986cc
+  WRITE_UINT64(ctr, hi);
e986cc
+  WRITE_UINT64(ctr + 8, lo);
e986cc
+}
e986cc
+#else /* !WORDS_BIGENDIAN */
e986cc
+# if HAVE_BUILTIN_BSWAP64
e986cc
+#  define USE_CTR_CRYPT16 1
e986cc
+static nettle_fill16_func ctr_fill16;
e986cc
+static void
e986cc
+ctr_fill16(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
e986cc
+{
e986cc
+  uint64_t hi, lo;
e986cc
+  size_t i;
e986cc
+  /* Read hi in native endianness */
e986cc
+  hi = LE_READ_UINT64(ctr);
e986cc
+  lo = READ_UINT64(ctr + 8);
e986cc
+
e986cc
+  for (i = 0; i < blocks; i++)
e986cc
+    {
e986cc
+      buffer[i].u64[0] = hi;
e986cc
+      buffer[i].u64[1] = __builtin_bswap64(lo);
e986cc
+      if (!++lo)
e986cc
+	hi = __builtin_bswap64(__builtin_bswap64(hi) + 1);
e986cc
+    }
e986cc
+  LE_WRITE_UINT64(ctr, hi);
e986cc
+  WRITE_UINT64(ctr + 8, lo);
e986cc
+}
e986cc
+# else /* ! HAVE_BUILTIN_BSWAP64 */
e986cc
+#  define USE_CTR_CRYPT16 0
e986cc
+# endif
e986cc
+#endif /* !WORDS_BIGENDIAN */
e986cc
 
e986cc
 void
e986cc
 ctr_crypt(const void *ctx, nettle_cipher_func *f,
007cfe
@@ -53,84 +125,64 @@ ctr_crypt(const void *ctx, nettle_cipher
e986cc
 	  size_t length, uint8_t *dst,
e986cc
 	  const uint8_t *src)
e986cc
 {
e986cc
-  if (src != dst)
e986cc
+#if USE_CTR_CRYPT16
e986cc
+  if (block_size == 16)
e986cc
     {
e986cc
-      if (length == block_size)
e986cc
-	{
e986cc
-	  f(ctx, block_size, dst, ctr);
e986cc
-	  INCREMENT(block_size, ctr);
e986cc
-	  memxor(dst, src, block_size);
e986cc
-	}
e986cc
-      else
e986cc
+      _ctr_crypt16(ctx, f, ctr_fill16, ctr, length, dst, src);
e986cc
+      return;
e986cc
+    }
e986cc
+#endif
e986cc
+
e986cc
+  if(src != dst)
e986cc
+    {
e986cc
+      size_t filled = ctr_fill (block_size, ctr, length, dst);
e986cc
+
e986cc
+      f(ctx, filled, dst, dst);
e986cc
+      memxor(dst, src, filled);
e986cc
+
e986cc
+      if (filled < length)
e986cc
 	{
e986cc
-	  size_t left;
e986cc
-	  uint8_t *p;	  
e986cc
+	  TMP_DECL(block, uint8_t, NETTLE_MAX_CIPHER_BLOCK_SIZE);
e986cc
+	  TMP_ALLOC(block, block_size);
e986cc
 
e986cc
-	  for (p = dst, left = length;
e986cc
-	       left >= block_size;
e986cc
-	       left -= block_size, p += block_size)
e986cc
-	    {
e986cc
-	      memcpy (p, ctr, block_size);
e986cc
-	      INCREMENT(block_size, ctr);
e986cc
-	    }
e986cc
-
e986cc
-	  f(ctx, length - left, dst, dst);
e986cc
-	  memxor(dst, src, length - left);
e986cc
-
e986cc
-	  if (left)
e986cc
-	    {
e986cc
-	      TMP_DECL(buffer, uint8_t, NETTLE_MAX_CIPHER_BLOCK_SIZE);
e986cc
-	      TMP_ALLOC(buffer, block_size);
e986cc
-
e986cc
-	      f(ctx, block_size, buffer, ctr);
e986cc
-	      INCREMENT(block_size, ctr);
e986cc
-	      memxor3(dst + length - left, src + length - left, buffer, left);
e986cc
-	    }
e986cc
+	  f(ctx, block_size, block, ctr);
e986cc
+	  INCREMENT(block_size, ctr);
e986cc
+	  memxor3(dst + filled, src + filled, block, length - filled);
e986cc
 	}
e986cc
     }
e986cc
   else
e986cc
     {
e986cc
-      if (length > block_size)
e986cc
-	{
e986cc
-	  TMP_DECL(buffer, uint8_t, NBLOCKS * NETTLE_MAX_CIPHER_BLOCK_SIZE);
e986cc
-	  size_t chunk = NBLOCKS * block_size;
e986cc
+      /* For in-place CTR, construct a buffer of consecutive counter
e986cc
+	 values, of size at most CTR_BUFFER_LIMIT. */
e986cc
+      TMP_DECL(buffer, uint8_t, CTR_BUFFER_LIMIT);
e986cc
+
e986cc
+      size_t buffer_size;
e986cc
+      if (length < block_size)
e986cc
+	buffer_size = block_size;
e986cc
+      else if (length <= CTR_BUFFER_LIMIT)
e986cc
+	buffer_size = length;
e986cc
+      else
e986cc
+	buffer_size = CTR_BUFFER_LIMIT;
e986cc
 
e986cc
-	  TMP_ALLOC(buffer, chunk);
e986cc
+      TMP_ALLOC(buffer, buffer_size);
e986cc
 
e986cc
-	  for (; length >= chunk;
e986cc
-	       length -= chunk, src += chunk, dst += chunk)
e986cc
-	    {
e986cc
-	      unsigned n;
e986cc
-	      uint8_t *p;	  
e986cc
-	      for (n = 0, p = buffer; n < NBLOCKS; n++, p += block_size)
e986cc
-		{
e986cc
-		  memcpy (p, ctr, block_size);
e986cc
-		  INCREMENT(block_size, ctr);
e986cc
-		}
e986cc
-	      f(ctx, chunk, buffer, buffer);
e986cc
-	      memxor(dst, buffer, chunk);
e986cc
-	    }
e986cc
-
e986cc
-	  if (length > 0)
e986cc
-	    {
e986cc
-	      /* Final, possibly partial, blocks */
e986cc
-	      for (chunk = 0; chunk < length; chunk += block_size)
e986cc
-		{
e986cc
-		  memcpy (buffer + chunk, ctr, block_size);
e986cc
-		  INCREMENT(block_size, ctr);
e986cc
-		}
e986cc
-	      f(ctx, chunk, buffer, buffer);
e986cc
-	      memxor3(dst, src, buffer, length);
e986cc
-	    }
e986cc
+      while (length >= block_size)
e986cc
+	{
e986cc
+	  size_t filled
e986cc
+	    = ctr_fill (block_size, ctr, MIN(buffer_size, length), buffer);
e986cc
+	  assert (filled > 0);
e986cc
+	  f(ctx, filled, buffer, buffer);
e986cc
+	  memxor(dst, buffer, filled);
e986cc
+	  length -= filled;
e986cc
+	  dst += filled;
e986cc
 	}
e986cc
-      else if (length > 0)
e986cc
-      	{
e986cc
-	  TMP_DECL(buffer, uint8_t, NETTLE_MAX_CIPHER_BLOCK_SIZE);
e986cc
-	  TMP_ALLOC(buffer, block_size);
e986cc
 
e986cc
+      /* Final, possibly partial, block. */
e986cc
+      if (length > 0)
e986cc
+	{
e986cc
 	  f(ctx, block_size, buffer, ctr);
e986cc
 	  INCREMENT(block_size, ctr);
e986cc
-	  memxor3(dst, src, buffer, length);
e986cc
+	  memxor(dst, buffer, length);
e986cc
 	}
e986cc
     }
e986cc
 }
007cfe
diff -up ./ctr-internal.h.ghash ./ctr-internal.h
007cfe
--- ./ctr-internal.h.ghash	2021-07-14 14:11:58.130891552 +0200
007cfe
+++ ./ctr-internal.h	2021-07-14 14:11:58.130891552 +0200
e986cc
@@ -0,0 +1,56 @@
e986cc
+/* ctr-internal.h
e986cc
+
e986cc
+   Copyright (C) 2018 Niels Möller
e986cc
+
e986cc
+   This file is part of GNU Nettle.
e986cc
+
e986cc
+   GNU Nettle is free software: you can redistribute it and/or
e986cc
+   modify it under the terms of either:
e986cc
+
e986cc
+     * the GNU Lesser General Public License as published by the Free
e986cc
+       Software Foundation; either version 3 of the License, or (at your
e986cc
+       option) any later version.
e986cc
+
e986cc
+   or
e986cc
+
e986cc
+     * the GNU General Public License as published by the Free
e986cc
+       Software Foundation; either version 2 of the License, or (at your
e986cc
+       option) any later version.
e986cc
+
e986cc
+   or both in parallel, as here.
e986cc
+
e986cc
+   GNU Nettle is distributed in the hope that it will be useful,
e986cc
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
e986cc
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
e986cc
+   General Public License for more details.
e986cc
+
e986cc
+   You should have received copies of the GNU General Public License and
e986cc
+   the GNU Lesser General Public License along with this program.  If
e986cc
+   not, see http://www.gnu.org/licenses/.
e986cc
+*/
e986cc
+
e986cc
+#ifndef NETTLE_CTR_INTERNAL_H_INCLUDED
e986cc
+#define NETTLE_CTR_INTERNAL_H_INCLUDED
e986cc
+
e986cc
+#include "nettle-types.h"
e986cc
+
e986cc
+/* Name mangling */
e986cc
+#define _ctr_crypt16 _nettle_ctr_crypt16
e986cc
+
e986cc
+/* Size limit for temporary stack buffers. */
e986cc
+#define CTR_BUFFER_LIMIT 512
e986cc
+
e986cc
+/* Fill BUFFER (n blocks) with incrementing CTR values. It would be
e986cc
+   nice if CTR was always 64-bit aligned, but it isn't when called
e986cc
+   from ctr_crypt. */
e986cc
+typedef void
e986cc
+nettle_fill16_func(uint8_t *ctr, size_t n, union nettle_block16 *buffer);
e986cc
+
e986cc
+void
e986cc
+_ctr_crypt16(const void *ctx, nettle_cipher_func *f,
e986cc
+	     nettle_fill16_func *fill, uint8_t *ctr,
e986cc
+	     size_t length, uint8_t *dst,
e986cc
+	     const uint8_t *src);
e986cc
+
e986cc
+
e986cc
+#endif /* NETTLE_CTR_INTERNAL_H_INCLUDED */
007cfe
diff -up ./fat-ppc.c.ghash ./fat-ppc.c
007cfe
--- ./fat-ppc.c.ghash	2021-07-14 14:11:58.126891572 +0200
007cfe
+++ ./fat-ppc.c	2021-07-14 14:11:58.130891552 +0200
e986cc
@@ -49,6 +49,7 @@
e986cc
 
e986cc
 #include "aes-internal.h"
e986cc
 #include "gcm.h"
e986cc
+#include "gcm-internal.h"
e986cc
 #include "fat-setup.h"
e986cc
 
e986cc
 /* Define from arch/powerpc/include/uapi/asm/cputable.h in Linux kernel */
e986cc
@@ -87,6 +88,16 @@ DECLARE_FAT_FUNC(_nettle_aes_decrypt, ae
e986cc
 DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, c)
e986cc
 DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, ppc64)
e986cc
 
e986cc
+#if GCM_TABLE_BITS == 8
e986cc
+DECLARE_FAT_FUNC(_nettle_gcm_init_key, gcm_init_key_func)
e986cc
+DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, c)
e986cc
+DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, ppc64)
e986cc
+
e986cc
+DECLARE_FAT_FUNC(_nettle_gcm_hash, gcm_hash_func)
e986cc
+DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, c)
e986cc
+DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, ppc64)
e986cc
+#endif /* GCM_TABLE_BITS == 8 */
e986cc
+
e986cc
 static void CONSTRUCTOR
e986cc
 fat_init (void)
e986cc
 {
e986cc
@@ -101,17 +112,29 @@ fat_init (void)
e986cc
      features.have_crypto_ext ? "crypto extensions" : "");
e986cc
 
e986cc
   if (features.have_crypto_ext)
e986cc
-  {
e986cc
-     if (verbose)
e986cc
-        fprintf (stderr, "libnettle: enabling arch 2.07 code.\n");
e986cc
-     _nettle_aes_encrypt_vec = _nettle_aes_encrypt_ppc64;
e986cc
-     _nettle_aes_decrypt_vec = _nettle_aes_decrypt_ppc64;
e986cc
-  }
e986cc
+    {
e986cc
+      if (verbose)
e986cc
+	fprintf (stderr, "libnettle: enabling arch 2.07 code.\n");
e986cc
+      _nettle_aes_encrypt_vec = _nettle_aes_encrypt_ppc64;
e986cc
+      _nettle_aes_decrypt_vec = _nettle_aes_decrypt_ppc64;
e986cc
+#if GCM_TABLE_BITS == 8
e986cc
+      /* Make sure _nettle_gcm_init_key_vec function is compatible
e986cc
+         with _nettle_gcm_hash_vec function e.g. _nettle_gcm_init_key_c()
e986cc
+         fills gcm_key table with values that are incompatible with
e986cc
+         _nettle_gcm_hash_ppc64() */
e986cc
+      _nettle_gcm_init_key_vec = _nettle_gcm_init_key_ppc64;
e986cc
+      _nettle_gcm_hash_vec = _nettle_gcm_hash_ppc64;
e986cc
+#endif /* GCM_TABLE_BITS == 8 */
e986cc
+    }
e986cc
   else
e986cc
-  {
e986cc
-     _nettle_aes_encrypt_vec = _nettle_aes_encrypt_c;
e986cc
-     _nettle_aes_decrypt_vec = _nettle_aes_decrypt_c;
e986cc
-  }
e986cc
+    {
e986cc
+      _nettle_aes_encrypt_vec = _nettle_aes_encrypt_c;
e986cc
+      _nettle_aes_decrypt_vec = _nettle_aes_decrypt_c;
e986cc
+#if GCM_TABLE_BITS == 8
e986cc
+      _nettle_gcm_init_key_vec = _nettle_gcm_init_key_c;
e986cc
+      _nettle_gcm_hash_vec = _nettle_gcm_hash_c;
e986cc
+#endif /* GCM_TABLE_BITS == 8 */
e986cc
+    }
e986cc
 }
e986cc
 
e986cc
 DEFINE_FAT_FUNC(_nettle_aes_encrypt, void,
e986cc
@@ -127,3 +150,14 @@ DEFINE_FAT_FUNC(_nettle_aes_decrypt, voi
e986cc
  size_t length, uint8_t *dst,
e986cc
  const uint8_t *src),
e986cc
  (rounds, keys, T, length, dst, src))
e986cc
+
e986cc
+#if GCM_TABLE_BITS == 8
e986cc
+DEFINE_FAT_FUNC(_nettle_gcm_init_key, void,
e986cc
+		(union nettle_block16 *table),
e986cc
+		(table))
e986cc
+
e986cc
+DEFINE_FAT_FUNC(_nettle_gcm_hash, void,
e986cc
+		(const struct gcm_key *key, union nettle_block16 *x,
e986cc
+		 size_t length, const uint8_t *data),
e986cc
+		(key, x, length, data))
e986cc
+#endif /* GCM_TABLE_BITS == 8 */
007cfe
diff -up ./fat-setup.h.ghash ./fat-setup.h
007cfe
--- ./fat-setup.h.ghash	2018-12-04 21:56:06.000000000 +0100
007cfe
+++ ./fat-setup.h	2021-07-14 14:11:58.130891552 +0200
e986cc
@@ -159,6 +159,11 @@ typedef void aes_crypt_internal_func (un
e986cc
 				      size_t length, uint8_t *dst,
e986cc
 				      const uint8_t *src);
e986cc
 
e986cc
+typedef void gcm_init_key_func (union nettle_block16 *table);
e986cc
+
e986cc
+typedef void gcm_hash_func (const struct gcm_key *key, union nettle_block16 *x,
e986cc
+			    size_t length, const uint8_t *data);
e986cc
+
e986cc
 typedef void *(memxor_func)(void *dst, const void *src, size_t n);
e986cc
 
e986cc
 typedef void salsa20_core_func (uint32_t *dst, const uint32_t *src, unsigned rounds);
007cfe
diff -up ./gcm.c.ghash ./gcm.c
007cfe
--- ./gcm.c.ghash	2018-12-04 21:56:05.000000000 +0100
007cfe
+++ ./gcm.c	2021-07-14 14:11:58.131891547 +0200
e986cc
@@ -6,8 +6,9 @@
e986cc
    See also the gcm paper at
e986cc
    http://www.cryptobarn.com/papers/gcm-spec.pdf.
e986cc
 
e986cc
-   Copyright (C) 2011, 2013 Niels Möller
e986cc
    Copyright (C) 2011 Katholieke Universiteit Leuven
e986cc
+   Copyright (C) 2011, 2013, 2018 Niels Möller
e986cc
+   Copyright (C) 2018 Red Hat, Inc.
e986cc
    
e986cc
    Contributed by Nikos Mavrogiannopoulos
e986cc
 
e986cc
@@ -48,9 +49,11 @@
e986cc
 
e986cc
 #include "gcm.h"
e986cc
 
e986cc
+#include "gcm-internal.h"
e986cc
 #include "memxor.h"
e986cc
 #include "nettle-internal.h"
e986cc
 #include "macros.h"
e986cc
+#include "ctr-internal.h"
e986cc
 
e986cc
 #define GHASH_POLYNOMIAL 0xE1UL
e986cc
 
e986cc
@@ -112,7 +115,17 @@ gcm_gf_shift (union nettle_block16 *r, c
e986cc
 #endif /* ! WORDS_BIGENDIAN */
e986cc
 }
e986cc
 
e986cc
-#if GCM_TABLE_BITS == 0
e986cc
+#if GCM_TABLE_BITS != 8
e986cc
+/* The native implementations (currently ppc64 only) depend on the
e986cc
+   GCM_TABLE_BITS == 8 layout */
e986cc
+#undef HAVE_NATIVE_gcm_hash
e986cc
+#undef HAVE_NATIVE_gcm_init_key
e986cc
+#undef HAVE_NATIVE_fat_gcm_hash
e986cc
+#undef HAVE_NATIVE_fat_gcm_init_key
e986cc
+#endif
e986cc
+
e986cc
+#if !HAVE_NATIVE_gcm_hash
e986cc
+# if GCM_TABLE_BITS == 0
e986cc
 /* Sets x <- x * y mod r, using the plain bitwise algorithm from the
e986cc
    specification. y may be shorter than a full block, missing bytes
e986cc
    are assumed zero. */
e986cc
@@ -140,15 +153,15 @@ gcm_gf_mul (union nettle_block16 *x, con
e986cc
     }
e986cc
   memcpy (x->b, Z.b, sizeof(Z));
e986cc
 }
e986cc
-#else /* GCM_TABLE_BITS != 0 */
e986cc
+# else /* GCM_TABLE_BITS != 0 */
e986cc
 
e986cc
-# if WORDS_BIGENDIAN
e986cc
-#  define W(left,right) (0x##left##right)
e986cc
-# else
e986cc
-#  define W(left,right) (0x##right##left)
e986cc
-# endif
e986cc
+#  if WORDS_BIGENDIAN
e986cc
+#   define W(left,right) (0x##left##right)
e986cc
+#  else
e986cc
+#   define W(left,right) (0x##right##left)
e986cc
+#  endif
e986cc
 
e986cc
-# if GCM_TABLE_BITS == 4
e986cc
+#  if GCM_TABLE_BITS == 4
e986cc
 static const uint16_t
e986cc
 shift_table[0x10] = {
e986cc
   W(00,00),W(1c,20),W(38,40),W(24,60),W(70,80),W(6c,a0),W(48,c0),W(54,e0),
e986cc
@@ -177,26 +190,13 @@ gcm_gf_shift_4(union nettle_block16 *x)
e986cc
 #  error Unsupported word size. */
e986cc
 #endif
e986cc
 #else /* ! WORDS_BIGENDIAN */
e986cc
-# if SIZEOF_LONG == 4
e986cc
-#define RSHIFT_WORD(x) \
e986cc
-  ((((x) & 0xf0f0f0f0UL) >> 4)			\
e986cc
-   | (((x) & 0x000f0f0f) << 12))
e986cc
-  reduce = shift_table[(w[3] >> 24) & 0xf];
e986cc
-  w[3] = RSHIFT_WORD(w[3]) | ((w[2] >> 20) & 0xf0);
e986cc
-  w[2] = RSHIFT_WORD(w[2]) | ((w[1] >> 20) & 0xf0);
e986cc
-  w[1] = RSHIFT_WORD(w[1]) | ((w[0] >> 20) & 0xf0);
e986cc
-  w[0] = RSHIFT_WORD(w[0]) ^ reduce;
e986cc
-# elif SIZEOF_LONG == 8
e986cc
-#define RSHIFT_WORD(x) \
e986cc
-  ((((x) & 0xf0f0f0f0f0f0f0f0UL) >> 4) \
e986cc
-   | (((x) & 0x000f0f0f0f0f0f0fUL) << 12))
e986cc
-  reduce = shift_table[(w[1] >> 56) & 0xf];
e986cc
-  w[1] = RSHIFT_WORD(w[1]) | ((w[0] >> 52) & 0xf0);
e986cc
-  w[0] = RSHIFT_WORD(w[0]) ^ reduce;
e986cc
-# else
e986cc
-#  error Unsupported word size. */
e986cc
-# endif
e986cc
-# undef RSHIFT_WORD
e986cc
+# define RSHIFT_WORD_4(x) \
e986cc
+  ((((x) & UINT64_C(0xf0f0f0f0f0f0f0f0)) >> 4) \
e986cc
+   | (((x) & UINT64_C(0x000f0f0f0f0f0f0f)) << 12))
e986cc
+  reduce = shift_table[(u64[1] >> 56) & 0xf];
e986cc
+  u64[1] = RSHIFT_WORD_4(u64[1]) | ((u64[0] >> 52) & 0xf0);
e986cc
+  u64[0] = RSHIFT_WORD_4(u64[0]) ^ reduce;
e986cc
+# undef RSHIFT_WORD_4
e986cc
 #endif /* ! WORDS_BIGENDIAN */
e986cc
 }
e986cc
 
e986cc
@@ -219,10 +219,10 @@ gcm_gf_mul (union nettle_block16 *x, con
e986cc
     }
e986cc
   memcpy (x->b, Z.b, sizeof(Z));
e986cc
 }
e986cc
-# elif GCM_TABLE_BITS == 8
e986cc
-#  if HAVE_NATIVE_gcm_hash8
e986cc
+#  elif GCM_TABLE_BITS == 8
e986cc
+#   if HAVE_NATIVE_gcm_hash8
e986cc
 
e986cc
-#define gcm_hash _nettle_gcm_hash8
e986cc
+#define _nettle_gcm_hash _nettle_gcm_hash8
e986cc
 void
e986cc
 _nettle_gcm_hash8 (const struct gcm_key *key, union nettle_block16 *x,
e986cc
 		   size_t length, const uint8_t *data);
e986cc
@@ -317,18 +317,46 @@ gcm_gf_mul (union nettle_block16 *x, con
e986cc
   gcm_gf_shift_8(&Z);
e986cc
   gcm_gf_add(x, &Z, &table[x->b[0]]);
e986cc
 }
e986cc
-#  endif /* ! HAVE_NATIVE_gcm_hash8 */
e986cc
-# else /* GCM_TABLE_BITS != 8 */
e986cc
-#  error Unsupported table size. 
e986cc
-# endif /* GCM_TABLE_BITS != 8 */
e986cc
+#   endif /* ! HAVE_NATIVE_gcm_hash8 */
e986cc
+#  else /* GCM_TABLE_BITS != 8 */
e986cc
+#   error Unsupported table size.
e986cc
+#  endif /* GCM_TABLE_BITS != 8 */
e986cc
+
e986cc
+#  undef W
e986cc
+# endif /* GCM_TABLE_BITS != 0 */
e986cc
+#endif /* !HAVE_NATIVE_gcm_hash */
e986cc
 
e986cc
-#undef W
e986cc
-
e986cc
-#endif /* GCM_TABLE_BITS */
e986cc
 
e986cc
 /* Increment the rightmost 32 bits. */
e986cc
 #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4)
e986cc
 
e986cc
+#if !HAVE_NATIVE_gcm_init_key
e986cc
+# if !HAVE_NATIVE_fat_gcm_hash
e986cc
+#  define _nettle_gcm_init_key _nettle_gcm_init_key_c
e986cc
+static
e986cc
+# endif
e986cc
+void
e986cc
+_nettle_gcm_init_key_c(union nettle_block16 *table)
e986cc
+{
e986cc
+#if GCM_TABLE_BITS
e986cc
+  /* Middle element if GCM_TABLE_BITS > 0, otherwise the first
e986cc
+     element */
e986cc
+  unsigned i = (1<<GCM_TABLE_BITS)/2;
e986cc
+
e986cc
+  /* Algorithm 3 from the gcm paper. First do powers of two, then do
e986cc
+     the rest by adding. */
e986cc
+  while (i /= 2)
e986cc
+    gcm_gf_shift(&table[i], &table[2*i]);
e986cc
+  for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2)
e986cc
+    {
e986cc
+      unsigned j;
e986cc
+      for (j = 1; j < i; j++)
e986cc
+        gcm_gf_add(&table[i+j], &table[i], &table[j]);
e986cc
+    }
e986cc
+#endif
e986cc
+}
e986cc
+#endif /* !HAVE_NATIVE_gcm_init_key */
e986cc
+
e986cc
 /* Initialization of GCM.
e986cc
  * @ctx: The context of GCM
e986cc
  * @cipher: The context of the underlying block cipher
e986cc
@@ -345,25 +373,18 @@ gcm_set_key(struct gcm_key *key,
e986cc
   /* H */  
e986cc
   memset(key->h[0].b, 0, GCM_BLOCK_SIZE);
e986cc
   f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b);
e986cc
-  
e986cc
-#if GCM_TABLE_BITS
e986cc
-  /* Algorithm 3 from the gcm paper. First do powers of two, then do
e986cc
-     the rest by adding. */
e986cc
-  while (i /= 2)
e986cc
-    gcm_gf_shift(&key->h[i], &key->h[2*i]);
e986cc
-  for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2)
e986cc
-    {
e986cc
-      unsigned j;
e986cc
-      for (j = 1; j < i; j++)
e986cc
-	gcm_gf_add(&key->h[i+j], &key->h[i],&key->h[j]);
e986cc
-    }
e986cc
-#endif
e986cc
+
e986cc
+  _nettle_gcm_init_key(key->h);
e986cc
 }
e986cc
 
e986cc
-#ifndef gcm_hash
e986cc
-static void
e986cc
-gcm_hash(const struct gcm_key *key, union nettle_block16 *x,
e986cc
-	 size_t length, const uint8_t *data)
e986cc
+#if !(HAVE_NATIVE_gcm_hash || HAVE_NATIVE_gcm_hash8)
e986cc
+# if !HAVE_NATIVE_fat_gcm_hash
e986cc
+#  define _nettle_gcm_hash _nettle_gcm_hash_c
e986cc
+static
e986cc
+# endif
e986cc
+void
e986cc
+_nettle_gcm_hash_c(const struct gcm_key *key, union nettle_block16 *x,
e986cc
+		   size_t length, const uint8_t *data)
e986cc
 {
e986cc
   for (; length >= GCM_BLOCK_SIZE;
e986cc
        length -= GCM_BLOCK_SIZE, data += GCM_BLOCK_SIZE)
e986cc
@@ -377,7 +398,7 @@ gcm_hash(const struct gcm_key *key, unio
e986cc
       gcm_gf_mul (x, key->h);
e986cc
     }
e986cc
 }
e986cc
-#endif /* !gcm_hash */
e986cc
+#endif /* !(HAVE_NATIVE_gcm_hash || HAVE_NATIVE_gcm_hash8) */
e986cc
 
e986cc
 static void
e986cc
 gcm_hash_sizes(const struct gcm_key *key, union nettle_block16 *x,
e986cc
@@ -391,7 +412,7 @@ gcm_hash_sizes(const struct gcm_key *key
e986cc
   WRITE_UINT64 (buffer, auth_size);
e986cc
   WRITE_UINT64 (buffer + 8, data_size);
e986cc
 
e986cc
-  gcm_hash(key, x, GCM_BLOCK_SIZE, buffer);
e986cc
+  _nettle_gcm_hash(key, x, GCM_BLOCK_SIZE, buffer);
e986cc
 }
e986cc
 
e986cc
 /* NOTE: The key is needed only if length != GCM_IV_SIZE */
e986cc
@@ -410,7 +431,7 @@ gcm_set_iv(struct gcm_ctx *ctx, const st
e986cc
   else
e986cc
     {
e986cc
       memset(ctx->iv.b, 0, GCM_BLOCK_SIZE);
e986cc
-      gcm_hash(key, &ctx->iv, length, iv);
e986cc
+      _nettle_gcm_hash(key, &ctx->iv, length, iv);
e986cc
       gcm_hash_sizes(key, &ctx->iv, 0, length);
e986cc
     }
e986cc
 
e986cc
@@ -429,47 +450,68 @@ gcm_update(struct gcm_ctx *ctx, const st
e986cc
   assert(ctx->auth_size % GCM_BLOCK_SIZE == 0);
e986cc
   assert(ctx->data_size == 0);
e986cc
 
e986cc
-  gcm_hash(key, &ctx->x, length, data);
e986cc
+  _nettle_gcm_hash(key, &ctx->x, length, data);
e986cc
 
e986cc
   ctx->auth_size += length;
e986cc
 }
e986cc
 
e986cc
+static nettle_fill16_func gcm_fill;
e986cc
+#if WORDS_BIGENDIAN
e986cc
 static void
e986cc
-gcm_crypt(struct gcm_ctx *ctx, const void *cipher, nettle_cipher_func *f,
e986cc
-	  size_t length, uint8_t *dst, const uint8_t *src)
e986cc
+gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
e986cc
 {
e986cc
-  uint8_t buffer[GCM_BLOCK_SIZE];
e986cc
+  uint64_t hi, mid;
e986cc
+  uint32_t lo;
e986cc
+  size_t i;
e986cc
+  hi = READ_UINT64(ctr);
e986cc
+  mid = (uint64_t) READ_UINT32(ctr + 8) << 32;
e986cc
+  lo = READ_UINT32(ctr + 12);
e986cc
 
e986cc
-  if (src != dst)
e986cc
+  for (i = 0; i < blocks; i++)
e986cc
     {
e986cc
-      for (; length >= GCM_BLOCK_SIZE;
e986cc
-           (length -= GCM_BLOCK_SIZE,
e986cc
-	    src += GCM_BLOCK_SIZE, dst += GCM_BLOCK_SIZE))
e986cc
-        {
e986cc
-          f (cipher, GCM_BLOCK_SIZE, dst, ctx->ctr.b);
e986cc
-          memxor (dst, src, GCM_BLOCK_SIZE);
e986cc
-          INC32 (ctx->ctr);
e986cc
-        }
e986cc
+      buffer[i].u64[0] = hi;
e986cc
+      buffer[i].u64[1] = mid + lo++;
e986cc
     }
e986cc
-  else
e986cc
+  WRITE_UINT32(ctr + 12, lo);
e986cc
+
e986cc
+}
e986cc
+#elif HAVE_BUILTIN_BSWAP64
e986cc
+/* Assume __builtin_bswap32 is also available */
e986cc
+static void
e986cc
+gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
e986cc
+{
e986cc
+  uint64_t hi, mid;
e986cc
+  uint32_t lo;
e986cc
+  size_t i;
e986cc
+  hi = LE_READ_UINT64(ctr);
e986cc
+  mid = LE_READ_UINT32(ctr + 8);
e986cc
+  lo = READ_UINT32(ctr + 12);
e986cc
+
e986cc
+  for (i = 0; i < blocks; i++)
e986cc
     {
e986cc
-      for (; length >= GCM_BLOCK_SIZE;
e986cc
-           (length -= GCM_BLOCK_SIZE,
e986cc
-	    src += GCM_BLOCK_SIZE, dst += GCM_BLOCK_SIZE))
e986cc
-        {
e986cc
-          f (cipher, GCM_BLOCK_SIZE, buffer, ctx->ctr.b);
e986cc
-          memxor3 (dst, src, buffer, GCM_BLOCK_SIZE);
e986cc
-          INC32 (ctx->ctr);
e986cc
-        }
e986cc
+      buffer[i].u64[0] = hi;
e986cc
+      buffer[i].u64[1] = mid + ((uint64_t)__builtin_bswap32(lo) << 32);
e986cc
+      lo++;
e986cc
     }
e986cc
-  if (length > 0)
e986cc
+  WRITE_UINT32(ctr + 12, lo);
e986cc
+}
e986cc
+#else
e986cc
+static void
e986cc
+gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
e986cc
+{
e986cc
+  uint32_t c;
e986cc
+
e986cc
+  c = READ_UINT32(ctr + GCM_BLOCK_SIZE - 4);
e986cc
+
e986cc
+  for (; blocks-- > 0; buffer++, c++)
e986cc
     {
e986cc
-      /* A final partial block */
e986cc
-      f (cipher, GCM_BLOCK_SIZE, buffer, ctx->ctr.b);
e986cc
-      memxor3 (dst, src, buffer, length);
e986cc
-      INC32 (ctx->ctr);
e986cc
+      memcpy(buffer->b, ctr, GCM_BLOCK_SIZE - 4);
e986cc
+      WRITE_UINT32(buffer->b + GCM_BLOCK_SIZE - 4, c);
e986cc
     }
e986cc
+
e986cc
+  WRITE_UINT32(ctr + GCM_BLOCK_SIZE - 4, c);
e986cc
 }
e986cc
+#endif
e986cc
 
e986cc
 void
e986cc
 gcm_encrypt (struct gcm_ctx *ctx, const struct gcm_key *key,
e986cc
@@ -478,8 +520,8 @@ gcm_encrypt (struct gcm_ctx *ctx, const
e986cc
 {
e986cc
   assert(ctx->data_size % GCM_BLOCK_SIZE == 0);
e986cc
 
e986cc
-  gcm_crypt(ctx, cipher, f, length, dst, src);
e986cc
-  gcm_hash(key, &ctx->x, length, dst);
e986cc
+  _ctr_crypt16(cipher, f, gcm_fill, ctx->ctr.b, length, dst, src);
e986cc
+  _nettle_gcm_hash(key, &ctx->x, length, dst);
e986cc
 
e986cc
   ctx->data_size += length;
e986cc
 }
e986cc
@@ -491,8 +533,8 @@ gcm_decrypt(struct gcm_ctx *ctx, const s
e986cc
 {
e986cc
   assert(ctx->data_size % GCM_BLOCK_SIZE == 0);
e986cc
 
e986cc
-  gcm_hash(key, &ctx->x, length, src);
e986cc
-  gcm_crypt(ctx, cipher, f, length, dst, src);
e986cc
+  _nettle_gcm_hash(key, &ctx->x, length, src);
e986cc
+  _ctr_crypt16(cipher, f, gcm_fill, ctx->ctr.b, length, dst, src);
e986cc
 
e986cc
   ctx->data_size += length;
e986cc
 }
007cfe
diff -up ./gcm-internal.h.ghash ./gcm-internal.h
007cfe
--- ./gcm-internal.h.ghash	2021-07-14 14:11:58.131891547 +0200
007cfe
+++ ./gcm-internal.h	2021-07-14 14:11:58.131891547 +0200
e986cc
@@ -0,0 +1,54 @@
e986cc
+/* gcm-internal.h
e986cc
+
e986cc
+   Copyright (C) 2020 Niels Möller
e986cc
+
e986cc
+   This file is part of GNU Nettle.
e986cc
+
e986cc
+   GNU Nettle is free software: you can redistribute it and/or
e986cc
+   modify it under the terms of either:
e986cc
+
e986cc
+     * the GNU Lesser General Public License as published by the Free
e986cc
+       Software Foundation; either version 3 of the License, or (at your
e986cc
+       option) any later version.
e986cc
+
e986cc
+   or
e986cc
+
e986cc
+     * the GNU General Public License as published by the Free
e986cc
+       Software Foundation; either version 2 of the License, or (at your
e986cc
+       option) any later version.
e986cc
+
e986cc
+   or both in parallel, as here.
e986cc
+
e986cc
+   GNU Nettle is distributed in the hope that it will be useful,
e986cc
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
e986cc
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
e986cc
+   General Public License for more details.
e986cc
+
e986cc
+   You should have received copies of the GNU General Public License and
e986cc
+   the GNU Lesser General Public License along with this program.  If
e986cc
+   not, see http://www.gnu.org/licenses/.
e986cc
+*/
e986cc
+
e986cc
+#ifndef NETTLE_GCM_INTERNAL_H_INCLUDED
e986cc
+#define NETTLE_GCM_INTERNAL_H_INCLUDED
e986cc
+
e986cc
+/* Functions available only in some configurations */
e986cc
+void
e986cc
+_nettle_gcm_init_key (union nettle_block16 *table);
e986cc
+
e986cc
+void
e986cc
+_nettle_gcm_hash(const struct gcm_key *key, union nettle_block16 *x,
e986cc
+		 size_t length, const uint8_t *data);
e986cc
+
e986cc
+#if HAVE_NATIVE_fat_gcm_init_key
e986cc
+void
e986cc
+_nettle_gcm_init_key_c (union nettle_block16 *table);
e986cc
+#endif
e986cc
+
e986cc
+#if HAVE_NATIVE_fat_gcm_hash
e986cc
+void
e986cc
+_nettle_gcm_hash_c (const struct gcm_key *key, union nettle_block16 *x,
e986cc
+		    size_t length, const uint8_t *data);
e986cc
+#endif
e986cc
+
e986cc
+#endif /* NETTLE_GCM_INTERNAL_H_INCLUDED */
007cfe
diff -up ./Makefile.in.ghash ./Makefile.in
007cfe
--- ./Makefile.in.ghash	2021-07-14 14:11:58.124891582 +0200
007cfe
+++ ./Makefile.in	2021-07-14 14:11:58.131891547 +0200
e986cc
@@ -96,7 +96,7 @@ nettle_SOURCES = aes-decrypt-internal.c
e986cc
 		 chacha-crypt.c chacha-core-internal.c \
e986cc
 		 chacha-poly1305.c chacha-poly1305-meta.c \
e986cc
 		 chacha-set-key.c chacha-set-nonce.c \
e986cc
-		 ctr.c des.c des3.c des-compat.c \
e986cc
+		 ctr.c ctr16.c des.c des3.c des-compat.c \
e986cc
 		 eax.c eax-aes128.c eax-aes128-meta.c \
e986cc
 		 gcm.c gcm-aes.c \
e986cc
 		 gcm-aes128.c gcm-aes128-meta.c \
e986cc
@@ -233,6 +233,8 @@ DISTFILES = $(SOURCES) $(HEADERS) getopt
e986cc
 	cast128_sboxes.h desinfo.h desCode.h \
e986cc
 	memxor-internal.h nettle-internal.h nettle-write.h \
e986cc
 	rsa-internal.h \
e986cc
+	ctr-internal.h \
e986cc
+	gcm-internal.h \
e986cc
 	gmp-glue.h ecc-internal.h fat-setup.h \
e986cc
 	mini-gmp.h asm.m4 \
e986cc
 	nettle.texinfo nettle.info nettle.html nettle.pdf sha-example.c
007cfe
diff -up ./nettle-types.h.ghash ./nettle-types.h
007cfe
--- ./nettle-types.h.ghash	2018-12-04 21:56:06.000000000 +0100
007cfe
+++ ./nettle-types.h	2021-07-14 14:11:58.131891547 +0200
e986cc
@@ -48,6 +48,7 @@ union nettle_block16
e986cc
 {
e986cc
   uint8_t b[16];
e986cc
   unsigned long w[16 / sizeof(unsigned long)];
e986cc
+  uint64_t u64[2];
e986cc
 };
e986cc
 
e986cc
 /* Randomness. Used by key generation and dsa signature creation. */
007cfe
diff -up ./powerpc64/fat/gcm-hash.asm.ghash ./powerpc64/fat/gcm-hash.asm
007cfe
--- ./powerpc64/fat/gcm-hash.asm.ghash	2021-07-14 14:11:58.131891547 +0200
007cfe
+++ ./powerpc64/fat/gcm-hash.asm	2021-07-14 14:11:58.131891547 +0200
e986cc
@@ -0,0 +1,39 @@
e986cc
+C powerpc64/fat/gcm-hash.asm
e986cc
+
e986cc
+
e986cc
+ifelse(<
e986cc
+   Copyright (C) 2020 Mamone Tarsha
e986cc
+
e986cc
+   This file is part of GNU Nettle.
e986cc
+
e986cc
+   GNU Nettle is free software: you can redistribute it and/or
e986cc
+   modify it under the terms of either:
e986cc
+
e986cc
+     * the GNU Lesser General Public License as published by the Free
e986cc
+       Software Foundation; either version 3 of the License, or (at your
e986cc
+       option) any later version.
e986cc
+
e986cc
+   or
e986cc
+
e986cc
+     * the GNU General Public License as published by the Free
e986cc
+       Software Foundation; either version 2 of the License, or (at your
e986cc
+       option) any later version.
e986cc
+
e986cc
+   or both in parallel, as here.
e986cc
+
e986cc
+   GNU Nettle is distributed in the hope that it will be useful,
e986cc
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
e986cc
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
e986cc
+   General Public License for more details.
e986cc
+
e986cc
+   You should have received copies of the GNU General Public License and
e986cc
+   the GNU Lesser General Public License along with this program.  If
e986cc
+   not, see http://www.gnu.org/licenses/.
e986cc
+>)
e986cc
+
e986cc
+dnl picked up by configure
e986cc
+dnl PROLOGUE(_nettle_fat_gcm_init_key)
e986cc
+dnl PROLOGUE(_nettle_fat_gcm_hash)
e986cc
+
e986cc
+define(<fat_transform>, <$1_ppc64>)
e986cc
+include_src(<powerpc64/p8/gcm-hash.asm>)
007cfe
diff -up ./powerpc64/p8/gcm-hash.asm.ghash ./powerpc64/p8/gcm-hash.asm
007cfe
--- ./powerpc64/p8/gcm-hash.asm.ghash	2021-07-14 14:11:58.131891547 +0200
007cfe
+++ ./powerpc64/p8/gcm-hash.asm	2021-07-14 14:11:58.131891547 +0200
e986cc
@@ -0,0 +1,499 @@
e986cc
+C powerpc64/p8/gcm-hash.asm
e986cc
+
e986cc
+ifelse(<
e986cc
+   Copyright (C) 2020 Niels Möller and Mamone Tarsha
e986cc
+   This file is part of GNU Nettle.
e986cc
+
e986cc
+   GNU Nettle is free software: you can redistribute it and/or
e986cc
+   modify it under the terms of either:
e986cc
+
e986cc
+     * the GNU Lesser General Public License as published by the Free
e986cc
+       Software Foundation; either version 3 of the License, or (at your
e986cc
+       option) any later version.
e986cc
+
e986cc
+   or
e986cc
+
e986cc
+     * the GNU General Public License as published by the Free
e986cc
+       Software Foundation; either version 2 of the License, or (at your
e986cc
+       option) any later version.
e986cc
+
e986cc
+   or both in parallel, as here.
e986cc
+
e986cc
+   GNU Nettle is distributed in the hope that it will be useful,
e986cc
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
e986cc
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
e986cc
+   General Public License for more details.
e986cc
+
e986cc
+   You should have received copies of the GNU General Public License and
e986cc
+   the GNU Lesser General Public License along with this program.  If
e986cc
+   not, see http://www.gnu.org/licenses/.
e986cc
+>)
e986cc
+
e986cc
+C gcm_set_key() assigns H value in the middle element of the table
e986cc
+define(<H_Idx>, <128>)
e986cc
+
e986cc
+C Register usage:
e986cc
+
e986cc
+define(<SP>, <1>)
e986cc
+define(<TOCP>, <2>)
e986cc
+
e986cc
+define(<TABLE>, <3>)
e986cc
+
e986cc
+define(<ZERO>, <0>)
e986cc
+define(<B1>, <1>)
e986cc
+define(<EMSB>, <16>)
e986cc
+define(<POLY>, <17>)
e986cc
+define(<POLY_L>, <1>)
e986cc
+
e986cc
+define(<H>, <2>)
e986cc
+define(<H2>, <3>)
e986cc
+define(<H3>, <4>)
e986cc
+define(<H4>, <5>)
e986cc
+define(<H1M>, <6>)
e986cc
+define(<H1L>, <7>)
e986cc
+define(<H2M>, <8>)
e986cc
+define(<H2L>, <9>)
e986cc
+define(<Hl>, <10>)
e986cc
+define(<Hm>, <11>)
e986cc
+define(<Hp>, <12>)
e986cc
+define(<Hl2>, <13>)
e986cc
+define(<Hm2>, <14>)
e986cc
+define(<Hp2>, <15>)
e986cc
+define(<R>, <13>)
e986cc
+define(<F>, <14>)
e986cc
+define(<T>, <15>)
e986cc
+define(<R2>, <16>)
e986cc
+define(<F2>, <17>)
e986cc
+define(<T2>, <18>)
e986cc
+
e986cc
+define(<LE_TEMP>, <18>)
e986cc
+define(<LE_MASK>, <19>)
e986cc
+
e986cc
+.file "gcm-hash.asm"
e986cc
+
e986cc
+.text
e986cc
+
e986cc
+    C void _nettle_gcm_init_key (union nettle_block16 *table)
e986cc
+
e986cc
+C This function populates the gcm table as the following layout
e986cc
+C *******************************************************************************
e986cc
+C | H1M = (H1 div x⁶⁴)||((H1 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
e986cc
+C | H1L = (H1 mod x⁶⁴)||(((H1 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H1 div x⁶⁴) |
e986cc
+C |                                                                             |
e986cc
+C | H2M = (H2 div x⁶⁴)||((H2 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
e986cc
+C | H2L = (H2 mod x⁶⁴)||(((H2 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H2 div x⁶⁴) |
e986cc
+C |                                                                             |
e986cc
+C | H3M = (H3 div x⁶⁴)||((H3 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
e986cc
+C | H3L = (H3 mod x⁶⁴)||(((H3 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H3 div x⁶⁴) |
e986cc
+C |                                                                             |
e986cc
+C | H4M = (H4 div x⁶⁴)||((H4 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
e986cc
+C | H4L = (H4 mod x⁶⁴)||(((H4 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H4 div x⁶⁴) |
e986cc
+C *******************************************************************************
e986cc
+
e986cc
+define(<FUNC_ALIGN>, <5>)
e986cc
+PROLOGUE(_nettle_gcm_init_key)
e986cc
+    DATA_LOAD_VEC(POLY,.polynomial,7)           C 0xC2000000000000000000000000000001
e986cc
+IF_LE(<
e986cc
+    li             8,0
e986cc
+    lvsl           LE_MASK,0,8                  C 0x000102030405060708090A0B0C0D0E0F
e986cc
+    vspltisb       LE_TEMP,0x07                  C 0x07070707070707070707070707070707
e986cc
+    vxor           LE_MASK,LE_MASK,LE_TEMP       C 0x07060504030201000F0E0D0C0B0A0908
e986cc
+>)
e986cc
+
e986cc
+    C 'H' is assigned by gcm_set_key() to the middle element of the table
e986cc
+    li             10,H_Idx*16
e986cc
+    lxvd2x         VSR(H),10,TABLE              C load 'H'
e986cc
+    C byte-reverse of each doubleword permuting on little-endian mode
e986cc
+IF_LE(<
e986cc
+    vperm          H,H,H,LE_MASK
e986cc
+>)
e986cc
+
e986cc
+    C --- calculate H = H << 1 mod P(X), P(X) = (x¹²⁸+x¹²⁷+x¹²⁶+x¹²¹+1) ---
e986cc
+
e986cc
+    vupkhsb        EMSB,H                        C extend most significant bit to first byte
e986cc
+    vspltisb       B1,1                          C 0x01010101010101010101010101010101
e986cc
+    vspltb         EMSB,EMSB,0                   C first byte quadword-extend
e986cc
+    vsl            H,H,B1                        C H = H << 1
e986cc
+    vand           EMSB,EMSB,POLY                C EMSB &= 0xC2000000000000000000000000000001
e986cc
+    vxor           ZERO,ZERO,ZERO                C 0x00000000000000000000000000000000
e986cc
+    vxor           H,H,EMSB                      C H ^= EMSB
e986cc
+
e986cc
+    C --- calculate H^2 = H*H ---
e986cc
+
e986cc
+    xxmrghd        VSR(POLY_L),VSR(ZERO),VSR(POLY) C 0x0000000000000000C200000000000000
e986cc
+
e986cc
+    C --- Hp = (H mod x⁶⁴) / x⁶⁴ mod P(X) ---
e986cc
+    C --- Hp = (H mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷) mod P(X), deg(Hp) ≤ 127 ---
e986cc
+    C --- Hp = (H mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷) ---
e986cc
+    vpmsumd        Hp,H,POLY_L                   C Hp = (H mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)
e986cc
+    xxswapd        VSR(Hm),VSR(H)
e986cc
+    xxmrgld        VSR(Hl),VSR(H),VSR(ZERO)      C Hl = (H mod x⁶⁴) × x⁶⁴
e986cc
+    vxor           Hm,Hm,Hp                      C Hm = Hm + Hp
e986cc
+    vxor           Hl,Hl,Hp                      C Hl = Hl + Hp
e986cc
+    xxmrgld        VSR(H1L),VSR(H),VSR(Hm)       C H1L = (H mod x⁶⁴)||(Hl mod x⁶⁴)
e986cc
+    xxmrghd        VSR(H1M),VSR(H),VSR(Hl)       C H1M = (H div x⁶⁴)||(Hl div x⁶⁴)
e986cc
+
e986cc
+    vpmsumd        F,H1L,H                       C F = (H1Lh × Hh) + (H1Ll × Hl)
e986cc
+    vpmsumd        R,H1M,H                       C R = (H1Mh × Hh) + (H1Ml × Hl)
e986cc
+
e986cc
+    C --- reduction ---
e986cc
+    vpmsumd        T,F,POLY_L                    C T = (F mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)
e986cc
+    xxswapd        VSR(H2),VSR(F)
e986cc
+    vxor           R,R,T                         C R = R + T
e986cc
+    vxor           H2,R,H2
e986cc
+
e986cc
+    xxmrgld        VSR(Hl),VSR(H2),VSR(ZERO)
e986cc
+    xxswapd        VSR(Hm),VSR(H2)
e986cc
+    vpmsumd        Hp,H2,POLY_L
e986cc
+    vxor           Hl,Hl,Hp
e986cc
+    vxor           Hm,Hm,Hp
e986cc
+    xxmrghd        VSR(H2M),VSR(H2),VSR(Hl)
e986cc
+    xxmrgld        VSR(H2L),VSR(H2),VSR(Hm)
e986cc
+
e986cc
+    C store H1M, H1L, H2M, H2L
e986cc
+    li             8,1*16
e986cc
+    li             9,2*16
e986cc
+    li             10,3*16
e986cc
+    stxvd2x        VSR(H1M),0,TABLE
e986cc
+    stxvd2x        VSR(H1L),8,TABLE
e986cc
+    stxvd2x        VSR(H2M),9,TABLE
e986cc
+    stxvd2x        VSR(H2L),10,TABLE
e986cc
+
e986cc
+    C --- calculate H^3 = H^1*H^2, H^4 = H^2*H^2 ---
e986cc
+
e986cc
+    vpmsumd        F,H1L,H2
e986cc
+    vpmsumd        F2,H2L,H2
e986cc
+    vpmsumd        R,H1M,H2
e986cc
+    vpmsumd        R2,H2M,H2
e986cc
+
e986cc
+    vpmsumd        T,F,POLY_L
e986cc
+    vpmsumd        T2,F2,POLY_L
e986cc
+    xxswapd        VSR(H3),VSR(F)
e986cc
+    xxswapd        VSR(H4),VSR(F2)
e986cc
+    vxor           R,R,T
e986cc
+    vxor           R2,R2,T2
e986cc
+    vxor           H3,R,H3
e986cc
+    vxor           H4,R2,H4
e986cc
+
e986cc
+    xxmrgld        VSR(Hl),VSR(H3),VSR(ZERO)
e986cc
+    xxmrgld        VSR(Hl2),VSR(H4),VSR(ZERO)
e986cc
+    xxswapd        VSR(Hm),VSR(H3)
e986cc
+    xxswapd        VSR(Hm2),VSR(H4)
e986cc
+    vpmsumd        Hp,H3,POLY_L
e986cc
+    vpmsumd        Hp2,H4,POLY_L
e986cc
+    vxor           Hl,Hl,Hp
e986cc
+    vxor           Hl2,Hl2,Hp2
e986cc
+    vxor           Hm,Hm,Hp
e986cc
+    vxor           Hm2,Hm2,Hp2
e986cc
+    xxmrghd        VSR(H1M),VSR(H3),VSR(Hl)
e986cc
+    xxmrghd        VSR(H2M),VSR(H4),VSR(Hl2)
e986cc
+    xxmrgld        VSR(H1L),VSR(H3),VSR(Hm)
e986cc
+    xxmrgld        VSR(H2L),VSR(H4),VSR(Hm2)
e986cc
+
e986cc
+    C store H3M, H3L, H4M, H4L
e986cc
+    li             7,4*16
e986cc
+    li             8,5*16
e986cc
+    li             9,6*16
e986cc
+    li             10,7*16
e986cc
+    stxvd2x        VSR(H1M),7,TABLE
e986cc
+    stxvd2x        VSR(H1L),8,TABLE
e986cc
+    stxvd2x        VSR(H2M),9,TABLE
e986cc
+    stxvd2x        VSR(H2L),10,TABLE
e986cc
+
e986cc
+    blr
e986cc
+EPILOGUE(_nettle_gcm_init_key)
e986cc
+
e986cc
+define(<TABLE>, <3>)
e986cc
+define(<X>, <4>)
e986cc
+define(<LENGTH>, <5>)
e986cc
+define(<DATA>, <6>)
e986cc
+
e986cc
+define(<ZERO>, <16>)
e986cc
+define(<POLY>, <17>)
e986cc
+define(<POLY_L>, <0>)
e986cc
+
e986cc
+define(<D>, <1>)
e986cc
+define(<C0>, <2>)
e986cc
+define(<C1>, <3>)
e986cc
+define(<C2>, <4>)
e986cc
+define(<C3>, <5>)
e986cc
+define(<H1M>, <6>)
e986cc
+define(<H1L>, <7>)
e986cc
+define(<H2M>, <8>)
e986cc
+define(<H2L>, <9>)
e986cc
+define(<H3M>, <10>)
e986cc
+define(<H3L>, <11>)
e986cc
+define(<H4M>, <12>)
e986cc
+define(<H4L>, <13>)
e986cc
+define(<R>, <14>)
e986cc
+define(<F>, <15>)
e986cc
+define(<R2>, <16>)
e986cc
+define(<F2>, <17>)
e986cc
+define(<T>, <18>)
e986cc
+define(<R3>, <20>)
e986cc
+define(<F3>, <21>)
e986cc
+define(<R4>, <22>)
e986cc
+define(<F4>, <23>)
e986cc
+
e986cc
+define(<LE_TEMP>, <18>)
e986cc
+define(<LE_MASK>, <19>)
e986cc
+
e986cc
+    C void _nettle_gcm_hash (const struct gcm_key *key, union nettle_block16 *x,
e986cc
+    C                        size_t length, const uint8_t *data)
e986cc
+
e986cc
+define(<FUNC_ALIGN>, <5>)
e986cc
+PROLOGUE(_nettle_gcm_hash)
e986cc
+    vxor           ZERO,ZERO,ZERO
e986cc
+    DATA_LOAD_VEC(POLY,.polynomial,7)
e986cc
+IF_LE(<
e986cc
+    li             8,0
e986cc
+    lvsl           LE_MASK,0,8
e986cc
+    vspltisb       LE_TEMP,0x07
e986cc
+    vxor           LE_MASK,LE_MASK,LE_TEMP
e986cc
+>)
e986cc
+    xxmrghd        VSR(POLY_L),VSR(ZERO),VSR(POLY)
e986cc
+
e986cc
+    lxvd2x         VSR(D),0,X                    C load 'X' pointer
e986cc
+    C byte-reverse of each doubleword permuting on little-endian mode
e986cc
+IF_LE(<
e986cc
+    vperm          D,D,D,LE_MASK
e986cc
+>)
e986cc
+
e986cc
+    C --- process 4 blocks '128-bit each' per one loop ---
e986cc
+
e986cc
+    srdi.          7,LENGTH,6                   C 4-blocks loop count 'LENGTH / (4 * 16)'
e986cc
+    beq            L2x
e986cc
+
e986cc
+    mtctr          7                            C assign counter register to loop count
e986cc
+
e986cc
+    C store non-volatile vector registers
e986cc
+    addi           8,SP,-64
e986cc
+    stvx           20,0,8
e986cc
+    addi           8,8,16
e986cc
+    stvx           21,0,8
e986cc
+    addi           8,8,16
e986cc
+    stvx           22,0,8
e986cc
+    addi           8,8,16
e986cc
+    stvx           23,0,8
e986cc
+
e986cc
+    C load table elements
e986cc
+    li             8,1*16
e986cc
+    li             9,2*16
e986cc
+    li             10,3*16
e986cc
+    lxvd2x         VSR(H1M),0,TABLE
e986cc
+    lxvd2x         VSR(H1L),8,TABLE
e986cc
+    lxvd2x         VSR(H2M),9,TABLE
e986cc
+    lxvd2x         VSR(H2L),10,TABLE
e986cc
+    li             7,4*16
e986cc
+    li             8,5*16
e986cc
+    li             9,6*16
e986cc
+    li             10,7*16
e986cc
+    lxvd2x         VSR(H3M),7,TABLE
e986cc
+    lxvd2x         VSR(H3L),8,TABLE
e986cc
+    lxvd2x         VSR(H4M),9,TABLE
e986cc
+    lxvd2x         VSR(H4L),10,TABLE
e986cc
+
e986cc
+    li             8,0x10
e986cc
+    li             9,0x20
e986cc
+    li             10,0x30
e986cc
+.align 5
e986cc
+L4x_loop:
e986cc
+    C input loading
e986cc
+    lxvd2x         VSR(C0),0,DATA                C load C0
e986cc
+    lxvd2x         VSR(C1),8,DATA               C load C1
e986cc
+    lxvd2x         VSR(C2),9,DATA               C load C2
e986cc
+    lxvd2x         VSR(C3),10,DATA              C load C3
e986cc
+
e986cc
+IF_LE(<
e986cc
+    vperm          C0,C0,C0,LE_MASK
e986cc
+    vperm          C1,C1,C1,LE_MASK
e986cc
+    vperm          C2,C2,C2,LE_MASK
e986cc
+    vperm          C3,C3,C3,LE_MASK
e986cc
+>)
e986cc
+
e986cc
+    C previous digest combining
e986cc
+    vxor           C0,C0,D
e986cc
+
e986cc
+    C polynomial multiplication
e986cc
+    vpmsumd        F2,H3L,C1
e986cc
+    vpmsumd        R2,H3M,C1
e986cc
+    vpmsumd        F3,H2L,C2
e986cc
+    vpmsumd        R3,H2M,C2
e986cc
+    vpmsumd        F4,H1L,C3
e986cc
+    vpmsumd        R4,H1M,C3
e986cc
+    vpmsumd        F,H4L,C0
e986cc
+    vpmsumd        R,H4M,C0
e986cc
+
e986cc
+    C deferred recombination of partial products
e986cc
+    vxor           F3,F3,F4
e986cc
+    vxor           R3,R3,R4
e986cc
+    vxor           F,F,F2
e986cc
+    vxor           R,R,R2
e986cc
+    vxor           F,F,F3
e986cc
+    vxor           R,R,R3
e986cc
+
e986cc
+    C reduction
e986cc
+    vpmsumd        T,F,POLY_L
e986cc
+    xxswapd        VSR(D),VSR(F)
e986cc
+    vxor           R,R,T
e986cc
+    vxor           D,R,D
e986cc
+
e986cc
+    addi           DATA,DATA,0x40
e986cc
+    bdnz           L4x_loop
e986cc
+
e986cc
+    C restore non-volatile vector registers
e986cc
+    addi           8,SP,-64
e986cc
+    lvx            20,0,8
e986cc
+    addi           8,8,16
e986cc
+    lvx            21,0,8
e986cc
+    addi           8,8,16
e986cc
+    lvx            22,0,8
e986cc
+    addi           8,8,16
e986cc
+    lvx            23,0,8
e986cc
+
e986cc
+    clrldi         LENGTH,LENGTH,58              C 'set the high-order 58 bits to zeros'
e986cc
+L2x:
e986cc
+    C --- process 2 blocks ---
e986cc
+
e986cc
+    srdi.          7,LENGTH,5                   C 'LENGTH / (2 * 16)'
e986cc
+    beq            L1x
e986cc
+
e986cc
+    C load table elements
e986cc
+    li             8,1*16
e986cc
+    li             9,2*16
e986cc
+    li             10,3*16
e986cc
+    lxvd2x         VSR(H1M),0,TABLE
e986cc
+    lxvd2x         VSR(H1L),8,TABLE
e986cc
+    lxvd2x         VSR(H2M),9,TABLE
e986cc
+    lxvd2x         VSR(H2L),10,TABLE
e986cc
+
e986cc
+    C input loading
e986cc
+    li             10,0x10
e986cc
+    lxvd2x         VSR(C0),0,DATA                C load C0
e986cc
+    lxvd2x         VSR(C1),10,DATA              C load C1
e986cc
+
e986cc
+IF_LE(<
e986cc
+    vperm          C0,C0,C0,LE_MASK
e986cc
+    vperm          C1,C1,C1,LE_MASK
e986cc
+>)
e986cc
+
e986cc
+    C previous digest combining
e986cc
+    vxor           C0,C0,D
e986cc
+
e986cc
+    C polynomial multiplication
e986cc
+    vpmsumd        F2,H1L,C1
e986cc
+    vpmsumd        R2,H1M,C1
e986cc
+    vpmsumd        F,H2L,C0
e986cc
+    vpmsumd        R,H2M,C0
e986cc
+
e986cc
+    C deferred recombination of partial products
e986cc
+    vxor           F,F,F2
e986cc
+    vxor           R,R,R2
e986cc
+
e986cc
+    C reduction
e986cc
+    vpmsumd        T,F,POLY_L
e986cc
+    xxswapd        VSR(D),VSR(F)
e986cc
+    vxor           R,R,T
e986cc
+    vxor           D,R,D
e986cc
+
e986cc
+    addi           DATA,DATA,0x20
e986cc
+    clrldi         LENGTH,LENGTH,59              C 'set the high-order 59 bits to zeros'
e986cc
+L1x:
e986cc
+    C --- process 1 block ---
e986cc
+
e986cc
+    srdi.          7,LENGTH,4                   C 'LENGTH / (1 * 16)'
e986cc
+    beq            Lmod
e986cc
+
e986cc
+    C load table elements
e986cc
+    li             8,1*16
e986cc
+    lxvd2x         VSR(H1M),0,TABLE
e986cc
+    lxvd2x         VSR(H1L),8,TABLE
e986cc
+
e986cc
+    C input loading
e986cc
+    lxvd2x         VSR(C0),0,DATA                C load C0
e986cc
+
e986cc
+IF_LE(<
e986cc
+    vperm          C0,C0,C0,LE_MASK
e986cc
+>)
e986cc
+
e986cc
+    C previous digest combining
e986cc
+    vxor           C0,C0,D
e986cc
+
e986cc
+    C polynomial multiplication
e986cc
+    vpmsumd        F,H1L,C0
e986cc
+    vpmsumd        R,H1M,C0
e986cc
+
e986cc
+    C reduction
e986cc
+    vpmsumd        T,F,POLY_L
e986cc
+    xxswapd        VSR(D),VSR(F)
e986cc
+    vxor           R,R,T
e986cc
+    vxor           D,R,D
e986cc
+
e986cc
+    addi           DATA,DATA,0x10
e986cc
+    clrldi         LENGTH,LENGTH,60              C 'set the high-order 60 bits to zeros'
e986cc
+Lmod:
e986cc
+    C --- process the modulo bytes, padding the low-order bytes with zeros ---
e986cc
+
e986cc
+    cmpldi         LENGTH,0
e986cc
+    beq            Ldone
e986cc
+
e986cc
+    C load table elements
e986cc
+    li             8,1*16
e986cc
+    lxvd2x         VSR(H1M),0,TABLE
e986cc
+    lxvd2x         VSR(H1L),8,TABLE
e986cc
+
e986cc
+    C push every modulo byte to the stack and load them with padding into vector register
e986cc
+    vxor           ZERO,ZERO,ZERO
e986cc
+    addi           8,SP,-16
e986cc
+    stvx           ZERO,0,8
e986cc
+Lstb_loop:
e986cc
+    subic.         LENGTH,LENGTH,1
e986cc
+    lbzx           7,LENGTH,DATA
e986cc
+    stbx           7,LENGTH,8
e986cc
+    bne            Lstb_loop
e986cc
+    lxvd2x         VSR(C0),0,8
e986cc
+
e986cc
+IF_LE(<
e986cc
+    vperm          C0,C0,C0,LE_MASK
e986cc
+>)
e986cc
+
e986cc
+    C previous digest combining
e986cc
+    vxor           C0,C0,D
e986cc
+
e986cc
+    C polynomial multiplication
e986cc
+    vpmsumd        F,H1L,C0
e986cc
+    vpmsumd        R,H1M,C0
e986cc
+
e986cc
+    C reduction
e986cc
+    vpmsumd        T,F,POLY_L
e986cc
+    xxswapd        VSR(D),VSR(F)
e986cc
+    vxor           R,R,T
e986cc
+    vxor           D,R,D
e986cc
+
e986cc
+Ldone:
e986cc
+    C byte-reverse of each doubleword permuting on little-endian mode
e986cc
+IF_LE(<
e986cc
+    vperm          D,D,D,LE_MASK
e986cc
+>)
e986cc
+    stxvd2x        VSR(D),0,X                    C store digest 'D'
e986cc
+
e986cc
+    blr
e986cc
+EPILOGUE(_nettle_gcm_hash)
e986cc
+
e986cc
+.data
e986cc
+    C 0xC2000000000000000000000000000001
e986cc
+.polynomial:
e986cc
+.align 4
e986cc
+IF_BE(<
e986cc
+.byte 0xC2
e986cc
+.rept 14
e986cc
+.byte 0x00
e986cc
+.endr
e986cc
+.byte 0x01
e986cc
+>,<
e986cc
+.byte 0x01
e986cc
+.rept 14
e986cc
+.byte 0x00
e986cc
+.endr
e986cc
+.byte 0xC2
e986cc
+>)