Blame SOURCES/0071-AES-GCM-performance-optimization.patch

727bdf
Upstream-Status: Backport [https://github.com/openssl/openssl/commit/44a563dde1584cd9284e80b6e45ee5019be8d36c, https://github.com/openssl/openssl/commit/345c99b6654b8313c792d54f829943068911ddbd]
727bdf
diff --git a/crypto/modes/asm/aes-gcm-ppc.pl b/crypto/modes/asm/aes-gcm-ppc.pl
727bdf
new file mode 100644
727bdf
index 0000000..6624e6c
727bdf
--- /dev/null
727bdf
+++ b/crypto/modes/asm/aes-gcm-ppc.pl
727bdf
@@ -0,0 +1,1438 @@
727bdf
+#! /usr/bin/env perl
727bdf
+# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
727bdf
+# Copyright 2021- IBM Inc. All rights reserved
727bdf
+#
727bdf
+# Licensed under the Apache License 2.0 (the "License").  You may not use
727bdf
+# this file except in compliance with the License.  You can obtain a copy
727bdf
+# in the file LICENSE in the source distribution or at
727bdf
+# https://www.openssl.org/source/license.html
727bdf
+#
727bdf
+#===================================================================================
727bdf
+# Written by Danny Tsen <dtsen@us.ibm.com> for OpenSSL Project,
727bdf
+#
727bdf
+# GHASH is based on the Karatsuba multiplication method.
727bdf
+#
727bdf
+#    Xi xor X1
727bdf
+#
727bdf
+#    X1 * H^4 + X2 * H^3 + X3 * H^2 + X4 * H =
727bdf
+#      (X1.h * H4.h + X1.l * H4.l + X1 * H4) +
727bdf
+#      (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
727bdf
+#      (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
727bdf
+#      (X4.h * H.h + X4.l * H.l + X4 * H)
727bdf
+#
727bdf
+# Xi = v0
727bdf
+# H Poly = v2
727bdf
+# Hash keys = v3 - v14
727bdf
+#     ( H.l, H, H.h)
727bdf
+#     ( H^2.l, H^2, H^2.h)
727bdf
+#     ( H^3.l, H^3, H^3.h)
727bdf
+#     ( H^4.l, H^4, H^4.h)
727bdf
+#
727bdf
+# v30 is IV
727bdf
+# v31 - counter 1
727bdf
+#
727bdf
+# AES used,
727bdf
+#     vs0 - vs14 for round keys
727bdf
+#     v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
727bdf
+#
727bdf
+# This implementation uses stitched AES-GCM approach to improve overall performance.
727bdf
+# AES is implemented with 8x blocks and GHASH is using 2 4x blocks.
727bdf
+#
727bdf
+# Current large block (16384 bytes) performance per second with 128 bit key --
727bdf
+#
727bdf
+#                        Encrypt  Decrypt
727bdf
+# Power10[le] (3.5GHz)   5.32G    5.26G
727bdf
+#
727bdf
+# ===================================================================================
727bdf
+#
727bdf
+# $output is the last argument if it looks like a file (it has an extension)
727bdf
+# $flavour is the first argument if it doesn't look like a file
727bdf
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
727bdf
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
727bdf
+# Select word size and load/store/compare mnemonics from the flavour; any flavour without 64 or 32 in it aborts.
727bdf
+if ($flavour =~ /64/) {
727bdf
+	$SIZE_T=8;
727bdf
+	$LRSAVE=2*$SIZE_T;
727bdf
+	$STU="stdu";
727bdf
+	$POP="ld";
727bdf
+	$PUSH="std";
727bdf
+	$UCMP="cmpld";
727bdf
+	$SHRI="srdi";
727bdf
+} elsif ($flavour =~ /32/) {
727bdf
+	$SIZE_T=4;
727bdf
+	$LRSAVE=$SIZE_T;
727bdf
+	$STU="stwu";
727bdf
+	$POP="lwz";
727bdf
+	$PUSH="stw";
727bdf
+	$UCMP="cmplw";
727bdf
+	$SHRI="srwi";
727bdf
+} else { die "nonsense $flavour"; }
727bdf
+# NOTE(review): the assembly visible in this chunk hard-codes r1 and a 512-byte frame; these variables are not referenced there.
727bdf
+$sp="r1";
727bdf
+$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload (NOTE(review): v20-v31 is 12 registers - confirm the 13)
727bdf
+# Locate ppc-xlate.pl next to this script or under ../../perlasm/.
727bdf
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
727bdf
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
727bdf
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
727bdf
+die "can't locate ppc-xlate.pl";
727bdf
+# Everything printed to STDOUT from here on is piped through the translator into the output file.
727bdf
+open STDOUT,"| $^X $xlate $flavour \"$output\""
727bdf
+    or die "can't call $xlate: $!";
727bdf
+
727bdf
+$code=<<___;
727bdf
+.machine        "any"
727bdf
+.text
727bdf
+
727bdf
+# 4x loops: nine vcipher rounds (round keys vs1 - vs9) applied to four states
727bdf
+# v15 - v18 - input states, updated in place
727bdf
+# vs1 - vs9 - round keys, copied into v19 - v23 (those five registers are overwritten)
727bdf
+#
727bdf
+.macro Loop_aes_middle4x
727bdf
+	xxlor	19+32, 1, 1		# v19 - v22 = vs1 - vs4 (VSR 32+n is vector register n)
727bdf
+	xxlor	20+32, 2, 2
727bdf
+	xxlor	21+32, 3, 3
727bdf
+	xxlor	22+32, 4, 4
727bdf
+
727bdf
+	vcipher	15, 15, 19		# round key vs1
727bdf
+	vcipher	16, 16, 19
727bdf
+	vcipher	17, 17, 19
727bdf
+	vcipher	18, 18, 19
727bdf
+
727bdf
+	vcipher	15, 15, 20		# round key vs2
727bdf
+	vcipher	16, 16, 20
727bdf
+	vcipher	17, 17, 20
727bdf
+	vcipher	18, 18, 20
727bdf
+
727bdf
+	vcipher	15, 15, 21		# round key vs3
727bdf
+	vcipher	16, 16, 21
727bdf
+	vcipher	17, 17, 21
727bdf
+	vcipher	18, 18, 21
727bdf
+
727bdf
+	vcipher	15, 15, 22		# round key vs4
727bdf
+	vcipher	16, 16, 22
727bdf
+	vcipher	17, 17, 22
727bdf
+	vcipher	18, 18, 22
727bdf
+
727bdf
+	xxlor	19+32, 5, 5		# v19 - v22 = vs5 - vs8
727bdf
+	xxlor	20+32, 6, 6
727bdf
+	xxlor	21+32, 7, 7
727bdf
+	xxlor	22+32, 8, 8
727bdf
+
727bdf
+	vcipher	15, 15, 19		# round key vs5
727bdf
+	vcipher	16, 16, 19
727bdf
+	vcipher	17, 17, 19
727bdf
+	vcipher	18, 18, 19
727bdf
+
727bdf
+	vcipher	15, 15, 20		# round key vs6
727bdf
+	vcipher	16, 16, 20
727bdf
+	vcipher	17, 17, 20
727bdf
+	vcipher	18, 18, 20
727bdf
+
727bdf
+	vcipher	15, 15, 21		# round key vs7
727bdf
+	vcipher	16, 16, 21
727bdf
+	vcipher	17, 17, 21
727bdf
+	vcipher	18, 18, 21
727bdf
+
727bdf
+	vcipher	15, 15, 22		# round key vs8
727bdf
+	vcipher	16, 16, 22
727bdf
+	vcipher	17, 17, 22
727bdf
+	vcipher	18, 18, 22
727bdf
+
727bdf
+	xxlor	23+32, 9, 9		# v23 = vs9
727bdf
+	vcipher	15, 15, 23		# round key vs9
727bdf
+	vcipher	16, 16, 23
727bdf
+	vcipher	17, 17, 23
727bdf
+	vcipher	18, 18, 23
727bdf
+.endm
727bdf
+
727bdf
+# 8x loops: nine vcipher rounds (round keys vs1 - vs9) applied to eight states
727bdf
+# v15 - v22 - input states, updated in place
727bdf
+# vs1 - vs9 - round keys, copied into v23 - v26 (those four registers are overwritten)
727bdf
+#
727bdf
+.macro Loop_aes_middle8x
727bdf
+	xxlor	23+32, 1, 1		# v23 - v26 = vs1 - vs4 (VSR 32+n is vector register n)
727bdf
+	xxlor	24+32, 2, 2
727bdf
+	xxlor	25+32, 3, 3
727bdf
+	xxlor	26+32, 4, 4
727bdf
+
727bdf
+	vcipher	15, 15, 23		# round key vs1
727bdf
+	vcipher	16, 16, 23
727bdf
+	vcipher	17, 17, 23
727bdf
+	vcipher	18, 18, 23
727bdf
+	vcipher	19, 19, 23
727bdf
+	vcipher	20, 20, 23
727bdf
+	vcipher	21, 21, 23
727bdf
+	vcipher	22, 22, 23
727bdf
+
727bdf
+	vcipher	15, 15, 24		# round key vs2
727bdf
+	vcipher	16, 16, 24
727bdf
+	vcipher	17, 17, 24
727bdf
+	vcipher	18, 18, 24
727bdf
+	vcipher	19, 19, 24
727bdf
+	vcipher	20, 20, 24
727bdf
+	vcipher	21, 21, 24
727bdf
+	vcipher	22, 22, 24
727bdf
+
727bdf
+	vcipher	15, 15, 25		# round key vs3
727bdf
+	vcipher	16, 16, 25
727bdf
+	vcipher	17, 17, 25
727bdf
+	vcipher	18, 18, 25
727bdf
+	vcipher	19, 19, 25
727bdf
+	vcipher	20, 20, 25
727bdf
+	vcipher	21, 21, 25
727bdf
+	vcipher	22, 22, 25
727bdf
+
727bdf
+	vcipher	15, 15, 26		# round key vs4
727bdf
+	vcipher	16, 16, 26
727bdf
+	vcipher	17, 17, 26
727bdf
+	vcipher	18, 18, 26
727bdf
+	vcipher	19, 19, 26
727bdf
+	vcipher	20, 20, 26
727bdf
+	vcipher	21, 21, 26
727bdf
+	vcipher	22, 22, 26
727bdf
+
727bdf
+	xxlor	23+32, 5, 5		# v23 - v26 = vs5 - vs8
727bdf
+	xxlor	24+32, 6, 6
727bdf
+	xxlor	25+32, 7, 7
727bdf
+	xxlor	26+32, 8, 8
727bdf
+
727bdf
+	vcipher	15, 15, 23		# round key vs5
727bdf
+	vcipher	16, 16, 23
727bdf
+	vcipher	17, 17, 23
727bdf
+	vcipher	18, 18, 23
727bdf
+	vcipher	19, 19, 23
727bdf
+	vcipher	20, 20, 23
727bdf
+	vcipher	21, 21, 23
727bdf
+	vcipher	22, 22, 23
727bdf
+
727bdf
+	vcipher	15, 15, 24		# round key vs6
727bdf
+	vcipher	16, 16, 24
727bdf
+	vcipher	17, 17, 24
727bdf
+	vcipher	18, 18, 24
727bdf
+	vcipher	19, 19, 24
727bdf
+	vcipher	20, 20, 24
727bdf
+	vcipher	21, 21, 24
727bdf
+	vcipher	22, 22, 24
727bdf
+
727bdf
+	vcipher	15, 15, 25		# round key vs7
727bdf
+	vcipher	16, 16, 25
727bdf
+	vcipher	17, 17, 25
727bdf
+	vcipher	18, 18, 25
727bdf
+	vcipher	19, 19, 25
727bdf
+	vcipher	20, 20, 25
727bdf
+	vcipher	21, 21, 25
727bdf
+	vcipher	22, 22, 25
727bdf
+
727bdf
+	vcipher	15, 15, 26		# round key vs8
727bdf
+	vcipher	16, 16, 26
727bdf
+	vcipher	17, 17, 26
727bdf
+	vcipher	18, 18, 26
727bdf
+	vcipher	19, 19, 26
727bdf
+	vcipher	20, 20, 26
727bdf
+	vcipher	21, 21, 26
727bdf
+	vcipher	22, 22, 26
727bdf
+
727bdf
+	xxlor	23+32, 9, 9		# v23 = vs9
727bdf
+	vcipher	15, 15, 23		# round key vs9
727bdf
+	vcipher	16, 16, 23
727bdf
+	vcipher	17, 17, 23
727bdf
+	vcipher	18, 18, 23
727bdf
+	vcipher	19, 19, 23
727bdf
+	vcipher	20, 20, 23
727bdf
+	vcipher	21, 21, 23
727bdf
+	vcipher	22, 22, 23
727bdf
+.endm
727bdf
+
727bdf
+# ppc_aes_gcm_ghash: fold four blocks v15 - v18 into Xi (v0) and return with blr.
727bdf
+# Compute 4x hash values based on Karatsuba method, using hash keys v3 - v14 and H Poly v2.
727bdf
+# Overwrites v15, v23 - v29 and vs29. No call to this label appears in the visible part of the file.
727bdf
+ppc_aes_gcm_ghash:
727bdf
+	vxor		15, 15, 0		# first block xor current Xi
727bdf
+
727bdf
+	xxlxor		29, 29, 29		# vs29 = 0 (VSR 29, not v29)
727bdf
+
727bdf
+	vpmsumd		23, 12, 15		# H4.L * X.L
727bdf
+	vpmsumd		24, 9, 16		# H^3l with v16
727bdf
+	vpmsumd		25, 6, 17		# H^2l with v17
727bdf
+	vpmsumd		26, 3, 18		# Hl with v18
727bdf
+
727bdf
+	vxor		23, 23, 24
727bdf
+	vxor		23, 23, 25
727bdf
+	vxor		23, 23, 26		# L
727bdf
+
727bdf
+	vpmsumd		24, 13, 15		# H4.L * X.H + H4.H * X.L
727bdf
+	vpmsumd		25, 10, 16		# H3.L * X1.H + H3.H * X1.L
727bdf
+	vpmsumd		26, 7, 17		# H^2 with v17
727bdf
+	vpmsumd		27, 4, 18		# H with v18
727bdf
+
727bdf
+	vxor		24, 24, 25
727bdf
+	vxor		24, 24, 26
727bdf
+	vxor		24, 24, 27		# M
727bdf
+
727bdf
+	# sum hash and reduction with H Poly
727bdf
+	vpmsumd		28, 23, 2		# reduction
727bdf
+
727bdf
+	xxlor		29+32, 29, 29		# v29 = vs29 = 0
727bdf
+	vsldoi		26, 24, 29, 8		# mL
727bdf
+	vsldoi		29, 29, 24, 8		# mH
727bdf
+	vxor		23, 23, 26		# mL + L
727bdf
+
727bdf
+	vsldoi		23, 23, 23, 8		# swap
727bdf
+	vxor		23, 23, 28
727bdf
+
727bdf
+	vpmsumd		24, 14, 15		# H4.H * X.H
727bdf
+	vpmsumd		25, 11, 16		# H^3h with v16
727bdf
+	vpmsumd		26, 8, 17		# H^2h with v17
727bdf
+	vpmsumd		27, 5, 18		# Hh with v18
727bdf
+
727bdf
+	vxor		24, 24, 25
727bdf
+	vxor		24, 24, 26
727bdf
+	vxor		24, 24, 27		# H
727bdf
+
727bdf
+	vxor		24, 24, 29		# H + mH
727bdf
+
727bdf
+	# sum hash and reduction with H Poly
727bdf
+	vsldoi		27, 23, 23, 8		# swap
727bdf
+	vpmsumd		23, 23, 2		# second reduction step
727bdf
+	vxor		27, 27, 24
727bdf
+	vxor		23, 23, 27
727bdf
+
727bdf
+	xxlor		32, 23+32, 23+32		# update hash
727bdf
+
727bdf
+	blr
727bdf
+
727bdf
+#
727bdf
+# Combine two 4x ghash: hash v15 - v18 into Xi, then v19 - v22 into that result, and store it in v0
727bdf
+# v15 - v22 - input blocks (v15 and v19 are modified). Overwrites v23 - v29 and vs29.
727bdf
+#
727bdf
+.macro ppc_aes_gcm_ghash2_4x
727bdf
+	# first 4x hash
727bdf
+	vxor		15, 15, 0		# Xi + X
727bdf
+
727bdf
+	xxlxor		29, 29, 29		# vs29 = 0 (VSR 29, not v29)
727bdf
+
727bdf
+	vpmsumd		23, 12, 15		# H4.L * X.L
727bdf
+	vpmsumd		24, 9, 16		# H^3l with v16
727bdf
+	vpmsumd		25, 6, 17		# H^2l with v17
727bdf
+	vpmsumd		26, 3, 18		# Hl with v18
727bdf
+
727bdf
+	vxor		23, 23, 24
727bdf
+	vxor		23, 23, 25
727bdf
+	vxor		23, 23, 26		# L
727bdf
+
727bdf
+	vpmsumd		24, 13, 15		# H4.L * X.H + H4.H * X.L
727bdf
+	vpmsumd		25, 10, 16		# H3.L * X1.H + H3.H * X1.L
727bdf
+	vpmsumd		26, 7, 17		# H^2 with v17
727bdf
+	vpmsumd		27, 4, 18		# H with v18
727bdf
+
727bdf
+	vxor		24, 24, 25
727bdf
+	vxor		24, 24, 26
727bdf
+
727bdf
+	# sum hash and reduction with H Poly
727bdf
+	vpmsumd		28, 23, 2		# reduction
727bdf
+
727bdf
+	xxlor		29+32, 29, 29		# v29 = vs29 = 0
727bdf
+
727bdf
+	vxor		24, 24, 27		# M
727bdf
+	vsldoi		26, 24, 29, 8		# mL
727bdf
+	vsldoi		29, 29, 24, 8		# mH
727bdf
+	vxor		23, 23, 26		# mL + L
727bdf
+
727bdf
+	vsldoi		23, 23, 23, 8		# swap
727bdf
+	vxor		23, 23, 28
727bdf
+
727bdf
+	vpmsumd		24, 14, 15		# H4.H * X.H
727bdf
+	vpmsumd		25, 11, 16		# H^3h with v16
727bdf
+	vpmsumd		26, 8, 17		# H^2h with v17
727bdf
+	vpmsumd		27, 5, 18		# Hh with v18
727bdf
+
727bdf
+	vxor		24, 24, 25
727bdf
+	vxor		24, 24, 26
727bdf
+	vxor		24, 24, 27		# H
727bdf
+
727bdf
+	vxor		24, 24, 29		# H + mH
727bdf
+
727bdf
+	# sum hash and reduction with H Poly
727bdf
+	vsldoi		27, 23, 23, 8		# swap
727bdf
+	vpmsumd		23, 23, 2
727bdf
+	vxor		27, 27, 24
727bdf
+	vxor		27, 23, 27		# 1st Xi
727bdf
+
727bdf
+	# 2nd 4x hash
727bdf
+	vpmsumd		24, 9, 20		# H^3l with v20 (issued before the Xi-dependent product)
727bdf
+	vpmsumd		25, 6, 21		# H^2l with v21
727bdf
+	vpmsumd		26, 3, 22		# Hl with v22
727bdf
+	vxor		19, 19, 27		# Xi + X
727bdf
+	vpmsumd		23, 12, 19		# H4.L * X.L
727bdf
+
727bdf
+	vxor		23, 23, 24
727bdf
+	vxor		23, 23, 25
727bdf
+	vxor		23, 23, 26		# L
727bdf
+
727bdf
+	vpmsumd		24, 13, 19		# H4.L * X.H + H4.H * X.L
727bdf
+	vpmsumd		25, 10, 20		# H3.L * X1.H + H3.H * X1.L
727bdf
+	vpmsumd		26, 7, 21		# H^2 with v21
727bdf
+	vpmsumd		27, 4, 22		# H with v22
727bdf
+
727bdf
+	vxor		24, 24, 25
727bdf
+	vxor		24, 24, 26
727bdf
+
727bdf
+	# sum hash and reduction with H Poly
727bdf
+	vpmsumd		28, 23, 2		# reduction
727bdf
+
727bdf
+	xxlor		29+32, 29, 29		# v29 = 0 again (vs29 is still zero from the xxlxor above)
727bdf
+
727bdf
+	vxor		24, 24, 27		# M
727bdf
+	vsldoi		26, 24, 29, 8		# mL
727bdf
+	vsldoi		29, 29, 24, 8		# mH
727bdf
+	vxor		23, 23, 26		# mL + L
727bdf
+
727bdf
+	vsldoi		23, 23, 23, 8		# swap
727bdf
+	vxor		23, 23, 28
727bdf
+
727bdf
+	vpmsumd		24, 14, 19		# H4.H * X.H
727bdf
+	vpmsumd		25, 11, 20		# H^3h with v20
727bdf
+	vpmsumd		26, 8, 21		# H^2h with v21
727bdf
+	vpmsumd		27, 5, 22		# Hh with v22
727bdf
+
727bdf
+	vxor		24, 24, 25
727bdf
+	vxor		24, 24, 26
727bdf
+	vxor		24, 24, 27		# H
727bdf
+
727bdf
+	vxor		24, 24, 29		# H + mH
727bdf
+
727bdf
+	# sum hash and reduction with H Poly
727bdf
+	vsldoi		27, 23, 23, 8		# swap
727bdf
+	vpmsumd		23, 23, 2
727bdf
+	vxor		27, 27, 24
727bdf
+	vxor		23, 23, 27
727bdf
+
727bdf
+	xxlor		32, 23+32, 23+32		# update hash
727bdf
+
727bdf
+.endm
727bdf
+
727bdf
+# ppc_update_hash_1x: fold the single block in v28 into Xi (v0) using Hl, H, Hh (v3 - v5) and H Poly (v2)
727bdf
+# Compute update single hash
727bdf
+# Overwrites v19, v20, v22 - v28.
727bdf
+.macro ppc_update_hash_1x
727bdf
+	vxor		28, 28, 0		# block xor current Xi
727bdf
+
727bdf
+	vxor		19, 19, 19		# v19 = 0
727bdf
+
727bdf
+	vpmsumd		22, 3, 28		# L
727bdf
+	vpmsumd		23, 4, 28		# M
727bdf
+	vpmsumd		24, 5, 28		# H
727bdf
+
727bdf
+	vpmsumd		27, 22, 2		# reduction
727bdf
+
727bdf
+	vsldoi		25, 23, 19, 8		# mL
727bdf
+	vsldoi		26, 19, 23, 8		# mH
727bdf
+	vxor		22, 22, 25		# L + mL
727bdf
+	vxor		24, 24, 26		# H + mH
727bdf
+
727bdf
+	vsldoi		22, 22, 22, 8		# swap
727bdf
+	vxor		22, 22, 27
727bdf
+
727bdf
+	vsldoi		20, 22, 22, 8		# swap
727bdf
+	vpmsumd		22, 22, 2		# reduction
727bdf
+	vxor		20, 20, 24
727bdf
+	vxor		22, 22, 20
727bdf
+
727bdf
+	vmr		0, 22			# update hash
727bdf
+
727bdf
+.endm
727bdf
+
727bdf
+#
727bdf
+# ppc_aes_gcm_encrypt (const void *inp, void *out, size_t len,
727bdf
+#               const AES_KEY *key, unsigned char iv[16],
727bdf
+#               void *Xip);
727bdf
+#
727bdf
+#    r3 - inp
727bdf
+#    r4 - out
727bdf
+#    r5 - len
727bdf
+#    r6 - AES round keys
727bdf
+#    r7 - iv
727bdf
+#    r8 - Xi, H Poly, hash keys
727bdf
+#
727bdf
+.global ppc_aes_gcm_encrypt
727bdf
+.align 5
727bdf
+ppc_aes_gcm_encrypt:
727bdf
+_ppc_aes_gcm_encrypt:
727bdf
+	# Prologue: allocate a 512-byte frame and save r14 - r21, v20 - v31 and LR.
727bdf
+	stdu 1,-512(1)
727bdf
+	mflr 0
727bdf
+
727bdf
+	std	14,112(1)
727bdf
+	std	15,120(1)
727bdf
+	std	16,128(1)
727bdf
+	std	17,136(1)
727bdf
+	std	18,144(1)
727bdf
+	std	19,152(1)
727bdf
+	std	20,160(1)
727bdf
+	std	21,168(1)
727bdf
+	li	9, 256
727bdf
+	stvx	20, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	21, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	22, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	23, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	24, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	25, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	26, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	27, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	28, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	29, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	30, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	31, 9, 1
727bdf
+	std	0, 528(1)		# LR goes to 16 bytes above the old stack pointer (512 + 16)
727bdf
+	# NOTE(review): vs14 - vs22 and vs29 are written later in this file but are not saved here; they overlay f14 - f22 and f29 - confirm against the ELFv2 ABI non-volatile list.
727bdf
+	# Load Xi
727bdf
+	lxvb16x	32, 0, 8	# load Xi
727bdf
+
727bdf
+	# load Hash - h^4, h^3, h^2, h (into v2 - v14, from offsets 32 - 224 of r8)
727bdf
+	li	10, 32
727bdf
+	lxvd2x	2+32, 10, 8	# H Poly
727bdf
+	li	10, 48
727bdf
+	lxvd2x	3+32, 10, 8	# Hl
727bdf
+	li	10, 64
727bdf
+	lxvd2x	4+32, 10, 8	# H
727bdf
+	li	10, 80
727bdf
+	lxvd2x	5+32, 10, 8	# Hh
727bdf
+
727bdf
+	li	10, 96
727bdf
+	lxvd2x	6+32, 10, 8	# H^2l
727bdf
+	li	10, 112
727bdf
+	lxvd2x	7+32, 10, 8	# H^2
727bdf
+	li	10, 128
727bdf
+	lxvd2x	8+32, 10, 8	# H^2h
727bdf
+
727bdf
+	li	10, 144
727bdf
+	lxvd2x	9+32, 10, 8	# H^3l
727bdf
+	li	10, 160
727bdf
+	lxvd2x	10+32, 10, 8	# H^3
727bdf
+	li	10, 176
727bdf
+	lxvd2x	11+32, 10, 8	# H^3h
727bdf
+
727bdf
+	li	10, 192
727bdf
+	lxvd2x	12+32, 10, 8	# H^4l
727bdf
+	li	10, 208
727bdf
+	lxvd2x	13+32, 10, 8	# H^4
727bdf
+	li	10, 224
727bdf
+	lxvd2x	14+32, 10, 8	# H^4h
727bdf
+
727bdf
+	# load the counter block supplied in r7 (it is only loaded here, nothing is hashed)
727bdf
+	lxvb16x	30+32, 0, 7	# load IV  - v30
727bdf
+
727bdf
+	mr	12, 5		# length still to process
727bdf
+	li	11, 0		# bytes processed so far
727bdf
+
727bdf
+	# counter 1: v31 = fifteen zero bytes followed by 0x01
727bdf
+	vxor	31, 31, 31
727bdf
+	vspltisb 22, 1
727bdf
+	vsldoi	31, 31, 22,1	# counter 1
727bdf
+
727bdf
+	# load round key to VSR (vs0 - vs10 are needed for every key size)
727bdf
+	lxv	0, 0(6)
727bdf
+	lxv	1, 0x10(6)
727bdf
+	lxv	2, 0x20(6)
727bdf
+	lxv	3, 0x30(6)
727bdf
+	lxv	4, 0x40(6)
727bdf
+	lxv	5, 0x50(6)
727bdf
+	lxv	6, 0x60(6)
727bdf
+	lxv	7, 0x70(6)
727bdf
+	lxv	8, 0x80(6)
727bdf
+	lxv	9, 0x90(6)
727bdf
+	lxv	10, 0xa0(6)
727bdf
+
727bdf
+	# load rounds - 10 (128), 12 (192), 14 (256)
727bdf
+	lwz	9,240(6)
727bdf
+
727bdf
+	#
727bdf
+	# vxor	state, state, w # addroundkey
727bdf
+	xxlor	32+29, 0, 0	# v29 = round key 0 (vs0)
727bdf
+	vxor	15, 30, 29	# IV + round key - add round key 0
727bdf
+
727bdf
+	cmpdi	9, 10
727bdf
+	beq	Loop_aes_gcm_8x
727bdf
+
727bdf
+	# load 2 more round keys (vs11, vs12)
727bdf
+	lxv	11, 0xb0(6)
727bdf
+	lxv	12, 0xc0(6)
727bdf
+
727bdf
+	cmpdi	9, 12
727bdf
+	beq	Loop_aes_gcm_8x
727bdf
+
727bdf
+	# load 2 more round keys (vs13, vs14)
727bdf
+	lxv	13, 0xd0(6)
727bdf
+	lxv	14, 0xe0(6)
727bdf
+	cmpdi	9, 14
727bdf
+	beq	Loop_aes_gcm_8x
727bdf
+	# NOTE(review): any other round count exits with r11 = 0 and r12 = len, so aes_gcm_out returns len although nothing was processed - confirm intended.
727bdf
+	b	aes_gcm_out
727bdf
+
727bdf
+.align 5
727bdf
+Loop_aes_gcm_8x:
727bdf
+	mr	14, 3		# r14 = input pointer
727bdf
+	mr	9, 4		# r9 = output pointer
727bdf
+
727bdf
+	# n blocks
727bdf
+	li	10, 128
727bdf
+	divdu	10, 5, 10	# n 128 bytes-blocks
727bdf
+	cmpdi	10, 0
727bdf
+	beq	Loop_last_block
727bdf
+	# v15 already holds IV xor round key 0; build counter xor round key 0 for the next seven blocks in v16 - v22.
727bdf
+	vaddudm	30, 30, 31	# IV + counter
727bdf
+	vxor	16, 30, 29
727bdf
+	vaddudm	30, 30, 31
727bdf
+	vxor	17, 30, 29
727bdf
+	vaddudm	30, 30, 31
727bdf
+	vxor	18, 30, 29
727bdf
+	vaddudm	30, 30, 31
727bdf
+	vxor	19, 30, 29
727bdf
+	vaddudm	30, 30, 31
727bdf
+	vxor	20, 30, 29
727bdf
+	vaddudm	30, 30, 31
727bdf
+	vxor	21, 30, 29
727bdf
+	vaddudm	30, 30, 31
727bdf
+	vxor	22, 30, 29
727bdf
+	# NOTE(review): vaddudm adds 64-bit doublewords while GCM increments only the low 32 bits of the counter block - confirm callers prevent a 32-bit wrap.
727bdf
+	mtctr	10		# loop count = number of 128-byte chunks
727bdf
+
727bdf
+	li	15, 16		# r15 - r21 = byte offsets of blocks 2 - 8 within a chunk
727bdf
+	li	16, 32
727bdf
+	li	17, 48
727bdf
+	li	18, 64
727bdf
+	li	19, 80
727bdf
+	li	20, 96
727bdf
+	li	21, 112
727bdf
+
727bdf
+	lwz	10, 240(6)	# r10 = number of rounds
727bdf
+
727bdf
+Loop_8x_block:
727bdf
+	# Load eight 16-byte input blocks into vs15 - vs22 (VSRs 15 - 22, not v15 - v22).
727bdf
+	lxvb16x		15, 0, 14	# load block
727bdf
+	lxvb16x		16, 15, 14	# load block
727bdf
+	lxvb16x		17, 16, 14	# load block
727bdf
+	lxvb16x		18, 17, 14	# load block
727bdf
+	lxvb16x		19, 18, 14	# load block
727bdf
+	lxvb16x		20, 19, 14	# load block
727bdf
+	lxvb16x		21, 20, 14	# load block
727bdf
+	lxvb16x		22, 21, 14	# load block
727bdf
+	addi		14, 14, 128
727bdf
+
727bdf
+	Loop_aes_middle8x
727bdf
+
727bdf
+	xxlor	23+32, 10, 10	# v23 = vs10, the last round key when rounds is 10
727bdf
+
727bdf
+	cmpdi	10, 10
727bdf
+	beq	Do_next_ghash
727bdf
+
727bdf
+	# 192 bits (also executed for 256 bits): rounds with vs10 and vs11
727bdf
+	xxlor	24+32, 11, 11
727bdf
+
727bdf
+	vcipher	15, 15, 23
727bdf
+	vcipher	16, 16, 23
727bdf
+	vcipher	17, 17, 23
727bdf
+	vcipher	18, 18, 23
727bdf
+	vcipher	19, 19, 23
727bdf
+	vcipher	20, 20, 23
727bdf
+	vcipher	21, 21, 23
727bdf
+	vcipher	22, 22, 23
727bdf
+
727bdf
+	vcipher	15, 15, 24
727bdf
+	vcipher	16, 16, 24
727bdf
+	vcipher	17, 17, 24
727bdf
+	vcipher	18, 18, 24
727bdf
+	vcipher	19, 19, 24
727bdf
+	vcipher	20, 20, 24
727bdf
+	vcipher	21, 21, 24
727bdf
+	vcipher	22, 22, 24
727bdf
+
727bdf
+	xxlor	23+32, 12, 12	# v23 = vs12, the last round key when rounds is 12
727bdf
+
727bdf
+	cmpdi	10, 12
727bdf
+	beq	Do_next_ghash
727bdf
+
727bdf
+	# 256 bits: rounds with vs12 and vs13
727bdf
+	xxlor	24+32, 13, 13
727bdf
+
727bdf
+	vcipher	15, 15, 23
727bdf
+	vcipher	16, 16, 23
727bdf
+	vcipher	17, 17, 23
727bdf
+	vcipher	18, 18, 23
727bdf
+	vcipher	19, 19, 23
727bdf
+	vcipher	20, 20, 23
727bdf
+	vcipher	21, 21, 23
727bdf
+	vcipher	22, 22, 23
727bdf
+
727bdf
+	vcipher	15, 15, 24
727bdf
+	vcipher	16, 16, 24
727bdf
+	vcipher	17, 17, 24
727bdf
+	vcipher	18, 18, 24
727bdf
+	vcipher	19, 19, 24
727bdf
+	vcipher	20, 20, 24
727bdf
+	vcipher	21, 21, 24
727bdf
+	vcipher	22, 22, 24
727bdf
+
727bdf
+	xxlor	23+32, 14, 14	# v23 = vs14, the last round key when rounds is 14
727bdf
+
727bdf
+	cmpdi	10, 14
727bdf
+	beq	Do_next_ghash
727bdf
+	b	aes_gcm_out	# unsupported round count
727bdf
+
727bdf
+Do_next_ghash:
727bdf
+
727bdf
+	#
727bdf
+	# last round, then xor each keystream block with its input block (vs15 - vs22) and store it
727bdf
+	vcipherlast     15, 15, 23
727bdf
+	vcipherlast     16, 16, 23
727bdf
+
727bdf
+	xxlxor		47, 47, 15	# VSR 47 is v15, VSR 15 is the loaded input block
727bdf
+	stxvb16x        47, 0, 9	# store output
727bdf
+	xxlxor		48, 48, 16
727bdf
+	stxvb16x        48, 15, 9	# store output
727bdf
+
727bdf
+	vcipherlast     17, 17, 23
727bdf
+	vcipherlast     18, 18, 23
727bdf
+
727bdf
+	xxlxor		49, 49, 17
727bdf
+	stxvb16x        49, 16, 9	# store output
727bdf
+	xxlxor		50, 50, 18
727bdf
+	stxvb16x        50, 17, 9	# store output
727bdf
+
727bdf
+	vcipherlast     19, 19, 23
727bdf
+	vcipherlast     20, 20, 23
727bdf
+
727bdf
+	xxlxor		51, 51, 19
727bdf
+	stxvb16x        51, 18, 9	# store output
727bdf
+	xxlxor		52, 52, 20
727bdf
+	stxvb16x        52, 19, 9	# store output
727bdf
+
727bdf
+	vcipherlast     21, 21, 23
727bdf
+	vcipherlast     22, 22, 23
727bdf
+
727bdf
+	xxlxor		53, 53, 21
727bdf
+	stxvb16x        53, 20, 9	# store output
727bdf
+	xxlxor		54, 54, 22
727bdf
+	stxvb16x        54, 21, 9	# store output
727bdf
+
727bdf
+	addi		9, 9, 128
727bdf
+
727bdf
+	# ghash here: v15 - v22 now hold the eight output blocks that were just stored
727bdf
+	ppc_aes_gcm_ghash2_4x
727bdf
+	# Prepare counter xor round key 0 for the next eight blocks; v29 keeps the counter of the first one.
727bdf
+	xxlor	27+32, 0, 0		# v27 = round key 0 (vs0)
727bdf
+	vaddudm 30, 30, 31		# IV + counter
727bdf
+	vmr	29, 30
727bdf
+	vxor    15, 30, 27		# add round key
727bdf
+	vaddudm 30, 30, 31
727bdf
+	vxor    16, 30, 27
727bdf
+	vaddudm 30, 30, 31
727bdf
+	vxor    17, 30, 27
727bdf
+	vaddudm 30, 30, 31
727bdf
+	vxor    18, 30, 27
727bdf
+	vaddudm 30, 30, 31
727bdf
+	vxor    19, 30, 27
727bdf
+	vaddudm 30, 30, 31
727bdf
+	vxor    20, 30, 27
727bdf
+	vaddudm 30, 30, 31
727bdf
+	vxor    21, 30, 27
727bdf
+	vaddudm 30, 30, 31
727bdf
+	vxor    22, 30, 27
727bdf
+
727bdf
+	addi    12, 12, -128		# remaining length
727bdf
+	addi    11, 11, 128		# processed length
727bdf
+
727bdf
+	bdnz	Loop_8x_block
727bdf
+	# The counters prepared above were not consumed; rewind v30 to the counter that v15 was built from.
727bdf
+	vmr	30, 29
727bdf
+
727bdf
+Loop_last_block:
727bdf
+	cmpdi   12, 0			# nothing left after the 8x loop
727bdf
+	beq     aes_gcm_out
727bdf
+
727bdf
+	# loop last few blocks: ctr = number of whole 16-byte blocks in the remaining r12 bytes
727bdf
+	li      10, 16
727bdf
+	divdu   10, 12, 10
727bdf
+
727bdf
+	mtctr   10
727bdf
+
727bdf
+	lwz	10, 240(6)		# r10 = number of rounds
727bdf
+
727bdf
+	cmpdi   12, 16			# fewer than 16 bytes left: only a partial block
727bdf
+	blt     Final_block
727bdf
+# Single-block variant: nine vcipher rounds (round keys vs1 - vs9) on v15, using v19 - v22 as scratch.
727bdf
+.macro Loop_aes_middle_1x
727bdf
+	xxlor	19+32, 1, 1		# v19 - v22 = vs1 - vs4
727bdf
+	xxlor	20+32, 2, 2
727bdf
+	xxlor	21+32, 3, 3
727bdf
+	xxlor	22+32, 4, 4
727bdf
+
727bdf
+	vcipher 15, 15, 19
727bdf
+	vcipher 15, 15, 20
727bdf
+	vcipher 15, 15, 21
727bdf
+	vcipher 15, 15, 22
727bdf
+
727bdf
+	xxlor	19+32, 5, 5		# v19 - v22 = vs5 - vs8
727bdf
+	xxlor	20+32, 6, 6
727bdf
+	xxlor	21+32, 7, 7
727bdf
+	xxlor	22+32, 8, 8
727bdf
+
727bdf
+	vcipher 15, 15, 19
727bdf
+	vcipher 15, 15, 20
727bdf
+	vcipher 15, 15, 21
727bdf
+	vcipher 15, 15, 22
727bdf
+
727bdf
+	xxlor	19+32, 9, 9		# v19 = vs9
727bdf
+	vcipher 15, 15, 19
727bdf
+.endm
727bdf
+
727bdf
+Next_rem_block:
727bdf
+	lxvb16x 15, 0, 14		# load block
727bdf
+	# The block lands in vs15 (VSR 15); v15 holds counter xor round key 0 on entry.
727bdf
+	Loop_aes_middle_1x
727bdf
+
727bdf
+	xxlor	23+32, 10, 10		# v23 = vs10, the last round key when rounds is 10
727bdf
+
727bdf
+	cmpdi	10, 10
727bdf
+	beq	Do_next_1x
727bdf
+
727bdf
+	# 192 bits (also executed for 256 bits): rounds with vs10 and vs11
727bdf
+	xxlor	24+32, 11, 11
727bdf
+
727bdf
+	vcipher	15, 15, 23
727bdf
+	vcipher	15, 15, 24
727bdf
+
727bdf
+	xxlor	23+32, 12, 12		# v23 = vs12, the last round key when rounds is 12
727bdf
+
727bdf
+	cmpdi	10, 12
727bdf
+	beq	Do_next_1x
727bdf
+
727bdf
+	# 256 bits: rounds with vs12 and vs13
727bdf
+	xxlor	24+32, 13, 13
727bdf
+
727bdf
+	vcipher	15, 15, 23
727bdf
+	vcipher	15, 15, 24
727bdf
+
727bdf
+	xxlor	23+32, 14, 14		# v23 = vs14, the last round key when rounds is 14
727bdf
+
727bdf
+	cmpdi	10, 14
727bdf
+	beq	Do_next_1x
727bdf
+	# Both outcomes of the branch above reach Do_next_1x; there is no separate exit for other round counts here.
727bdf
+Do_next_1x:
727bdf
+	vcipherlast     15, 15, 23
727bdf
+
727bdf
+	xxlxor		47, 47, 15	# v15 (VSR 47) = keystream xor input block (vs15)
727bdf
+	stxvb16x	47, 0, 9	# store output
727bdf
+	addi		14, 14, 16
727bdf
+	addi		9, 9, 16
727bdf
+
727bdf
+	vmr		28, 15		# hash the block that was just stored
727bdf
+	ppc_update_hash_1x
727bdf
+
727bdf
+	addi		12, 12, -16	# remaining length
727bdf
+	addi		11, 11, 16	# processed length
727bdf
+	xxlor		19+32, 0, 0	# v19 = round key 0 (vs0)
727bdf
+	vaddudm		30, 30, 31		# IV + counter
727bdf
+	vxor		15, 30, 19		# add round key
727bdf
+
727bdf
+	bdnz	Next_rem_block
727bdf
+
727bdf
+	cmpdi	12, 0			# done unless a partial block remains
727bdf
+	beq	aes_gcm_out
727bdf
+
727bdf
+Final_block:
727bdf
+	Loop_aes_middle_1x
727bdf
+	# Partial block of r12 bytes (r12 is below 16 here); v15 held counter xor round key 0 on entry.
727bdf
+	xxlor	23+32, 10, 10		# v23 = vs10, the last round key when rounds is 10
727bdf
+
727bdf
+	cmpdi	10, 10
727bdf
+	beq	Do_final_1x
727bdf
+
727bdf
+	# 192 bits (also executed for 256 bits): rounds with vs10 and vs11
727bdf
+	xxlor	24+32, 11, 11
727bdf
+
727bdf
+	vcipher	15, 15, 23
727bdf
+	vcipher	15, 15, 24
727bdf
+
727bdf
+	xxlor	23+32, 12, 12		# v23 = vs12, the last round key when rounds is 12
727bdf
+
727bdf
+	cmpdi	10, 12
727bdf
+	beq	Do_final_1x
727bdf
+
727bdf
+	# 256 bits: rounds with vs12 and vs13
727bdf
+	xxlor	24+32, 13, 13
727bdf
+
727bdf
+	vcipher	15, 15, 23
727bdf
+	vcipher	15, 15, 24
727bdf
+
727bdf
+	xxlor	23+32, 14, 14		# v23 = vs14, the last round key when rounds is 14
727bdf
+
727bdf
+	cmpdi	10, 14
727bdf
+	beq	Do_final_1x
727bdf
+	# Both outcomes of the branch above reach Do_final_1x.
727bdf
+Do_final_1x:
727bdf
+	vcipherlast     15, 15, 23
727bdf
+	# NOTE(review): the load below reads a full 16 bytes although only r12 of them remain - confirm the input buffer is readable that far.
727bdf
+	lxvb16x	15, 0, 14		# load last block
727bdf
+	xxlxor	47, 47, 15
727bdf
+
727bdf
+	# create partial block mask: r12 bytes of 0xff followed by zero bytes
727bdf
+	li	15, 16
727bdf
+	sub	15, 15, 12		# index to the mask
727bdf
+
727bdf
+	vspltisb	16, -1		# first 16 bytes - 0xffff...ff
727bdf
+	vspltisb	17, 0		# second 16 bytes - 0x0000...00
727bdf
+	li	10, 192
727bdf
+	stvx	16, 10, 1		# 192(r1) .. 207(r1) = 0xff
727bdf
+	addi	10, 10, 16
727bdf
+	stvx	17, 10, 1		# 208(r1) .. 223(r1) = 0x00
727bdf
+
727bdf
+	addi	10, 1, 192
727bdf
+	lxvb16x	16, 15, 10		# load partial block mask
727bdf
+	xxland	47, 47, 16		# v15 keeps its first r12 bytes, the rest become zero
727bdf
+
727bdf
+	vmr	28, 15			# hash the zero-padded block
727bdf
+	ppc_update_hash_1x
727bdf
+
727bdf
+	# * should store only the remaining bytes.
727bdf
+	bl	Write_partial_block
727bdf
+
727bdf
+	b aes_gcm_out
727bdf
+
727bdf
+#
727bdf
+# Write partial block: copy the first r12 bytes of v15 to the output, one byte at a time
727bdf
+# r9 - output
727bdf
+# r12 - remaining bytes (used as the bdnz loop count, so expected to be non-zero)
727bdf
+# v15 - block to store, staged at 192(r1); r10, r14, r15, r16 and ctr are overwritten
727bdf
+#
727bdf
+Write_partial_block:
727bdf
+	li		10, 192
727bdf
+	stxvb16x	15+32, 10, 1		# last block
727bdf
+
727bdf
+	#add		10, 9, 11		# Output
727bdf
+	addi		10, 9, -1		# r10 = output - 1 (stbu pre-increments)
727bdf
+	addi		16, 1, 191		# r16 = staged block - 1 (lbzu pre-increments)
727bdf
+
727bdf
+        mtctr		12			# remaining bytes
727bdf
+	li		15, 0			# r15 is not read again in this routine
727bdf
+
727bdf
+Write_last_byte:
727bdf
+        lbzu		14, 1(16)
727bdf
+	stbu		14, 1(10)
727bdf
+        bdnz		Write_last_byte
727bdf
+	blr
727bdf
+
727bdf
+aes_gcm_out:
727bdf
+	# out = state. Common exit, also branched to from ppc_aes_gcm_decrypt.
727bdf
+	stxvb16x	32, 0, 8		# write out Xi
727bdf
+	add	3, 11, 12		# return count
727bdf
+	# Restore v20 - v31, LR and r14 - r21 from the slots written by the prologue, then release the 512-byte frame.
727bdf
+	li	9, 256
727bdf
+	lvx	20, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	lvx	21, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	lvx	22, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	lvx	23, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	lvx	24, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	lvx	25, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	lvx	26, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	lvx	27, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	lvx	28, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	lvx	29, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	lvx	30, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	lvx	31, 9, 1
727bdf
+
727bdf
+	ld	0, 528(1)
727bdf
+	ld      14,112(1)
727bdf
+	ld      15,120(1)
727bdf
+	ld      16,128(1)
727bdf
+	ld      17,136(1)
727bdf
+	ld      18,144(1)
727bdf
+	ld      19,152(1)
727bdf
+	ld      20,160(1)
727bdf
+	ld	21,168(1)
727bdf
+
727bdf
+	mtlr	0
727bdf
+	addi	1, 1, 512
727bdf
+	blr
727bdf
+
727bdf
+#
727bdf
+# 8x Decrypt
727bdf
+#
727bdf
+.global ppc_aes_gcm_decrypt
727bdf
+.align 5
727bdf
+ppc_aes_gcm_decrypt:
727bdf
+_ppc_aes_gcm_decrypt:
727bdf
+
727bdf
+	stdu 1,-512(1)
727bdf
+	mflr 0
727bdf
+
727bdf
+	std	14,112(1)
727bdf
+	std	15,120(1)
727bdf
+	std	16,128(1)
727bdf
+	std	17,136(1)
727bdf
+	std	18,144(1)
727bdf
+	std	19,152(1)
727bdf
+	std	20,160(1)
727bdf
+	std	21,168(1)
727bdf
+	li	9, 256
727bdf
+	stvx	20, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	21, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	22, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	23, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	24, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	25, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	26, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	27, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	28, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	29, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	30, 9, 1
727bdf
+	addi	9, 9, 16
727bdf
+	stvx	31, 9, 1
727bdf
+	std	0, 528(1)
727bdf
+
727bdf
+	# Load Xi
727bdf
+	lxvb16x	32, 0, 8	# load Xi
727bdf
+
727bdf
+	# load Hash - h^4, h^3, h^2, h
727bdf
+	li	10, 32
727bdf
+	lxvd2x	2+32, 10, 8	# H Poli
727bdf
+	li	10, 48
727bdf
+	lxvd2x	3+32, 10, 8	# Hl
727bdf
+	li	10, 64
727bdf
+	lxvd2x	4+32, 10, 8	# H
727bdf
+	li	10, 80
727bdf
+	lxvd2x	5+32, 10, 8	# Hh
727bdf
+
727bdf
+	li	10, 96
727bdf
+	lxvd2x	6+32, 10, 8	# H^2l
727bdf
+	li	10, 112
727bdf
+	lxvd2x	7+32, 10, 8	# H^2
727bdf
+	li	10, 128
727bdf
+	lxvd2x	8+32, 10, 8	# H^2h
727bdf
+
727bdf
+	li	10, 144
727bdf
+	lxvd2x	9+32, 10, 8	# H^3l
727bdf
+	li	10, 160
727bdf
+	lxvd2x	10+32, 10, 8	# H^3
727bdf
+	li	10, 176
727bdf
+	lxvd2x	11+32, 10, 8	# H^3h
727bdf
+
727bdf
+	li	10, 192
727bdf
+	lxvd2x	12+32, 10, 8	# H^4l
727bdf
+	li	10, 208
727bdf
+	lxvd2x	13+32, 10, 8	# H^4
727bdf
+	li	10, 224
727bdf
+	lxvd2x	14+32, 10, 8	# H^4h
727bdf
+
727bdf
+	# initialize ICB: GHASH( IV ), IV - r7
727bdf
+	lxvb16x	30+32, 0, 7	# load IV  - v30
727bdf
+
727bdf
+	mr	12, 5		# length
727bdf
+	li	11, 0		# block index
727bdf
+
727bdf
+	# counter 1
727bdf
+	vxor	31, 31, 31
727bdf
+	vspltisb 22, 1
727bdf
+	vsldoi	31, 31, 22,1	# counter 1
727bdf
+
727bdf
+	# load round key to VSR
727bdf
+	lxv	0, 0(6)
727bdf
+	lxv	1, 0x10(6)
727bdf
+	lxv	2, 0x20(6)
727bdf
+	lxv	3, 0x30(6)
727bdf
+	lxv	4, 0x40(6)
727bdf
+	lxv	5, 0x50(6)
727bdf
+	lxv	6, 0x60(6)
727bdf
+	lxv	7, 0x70(6)
727bdf
+	lxv	8, 0x80(6)
727bdf
+	lxv	9, 0x90(6)
727bdf
+	lxv	10, 0xa0(6)
727bdf
+
727bdf
+	# load rounds - 10 (128), 12 (192), 14 (256)
727bdf
+	lwz	9,240(6)
727bdf
+
727bdf
+	#
727bdf
+	# vxor	state, state, w # addroundkey
727bdf
+	xxlor	32+29, 0, 0
727bdf
+	vxor	15, 30, 29	# IV + round key - add round key 0
727bdf
+
727bdf
+	cmpdi	9, 10
727bdf
+	beq	Loop_aes_gcm_8x_dec
727bdf
+
727bdf
+	# load 2 more round keys (v11, v12)
727bdf
+	lxv	11, 0xb0(6)
727bdf
+	lxv	12, 0xc0(6)
727bdf
+
727bdf
+	cmpdi	9, 12
727bdf
+	beq	Loop_aes_gcm_8x_dec
727bdf
+
727bdf
+	# load 2 more round keys (v11, v12, v13, v14)
727bdf
+	lxv	13, 0xd0(6)
727bdf
+	lxv	14, 0xe0(6)
727bdf
+	cmpdi	9, 14
727bdf
+	beq	Loop_aes_gcm_8x_dec
727bdf
+
727bdf
+	b	aes_gcm_out
727bdf
+
727bdf
+.align 5
727bdf
+Loop_aes_gcm_8x_dec:
727bdf
+	mr	14, 3
727bdf
+	mr	9, 4
727bdf
+
727bdf
+	# n blocks
727bdf
+	li	10, 128
727bdf
+	divdu	10, 5, 10	# n 128 bytes-blocks
727bdf
+	cmpdi	10, 0
727bdf
+	beq	Loop_last_block_dec
727bdf
+
727bdf
+	vaddudm	30, 30, 31	# IV + counter
727bdf
+	vxor	16, 30, 29
727bdf
+	vaddudm	30, 30, 31
727bdf
+	vxor	17, 30, 29
727bdf
+	vaddudm	30, 30, 31
727bdf
+	vxor	18, 30, 29
727bdf
+	vaddudm	30, 30, 31
727bdf
+	vxor	19, 30, 29
727bdf
+	vaddudm	30, 30, 31
727bdf
+	vxor	20, 30, 29
727bdf
+	vaddudm	30, 30, 31
727bdf
+	vxor	21, 30, 29
727bdf
+	vaddudm	30, 30, 31
727bdf
+	vxor	22, 30, 29
727bdf
+
727bdf
+	mtctr	10
727bdf
+
727bdf
+	li	15, 16
727bdf
+	li	16, 32
727bdf
+	li	17, 48
727bdf
+	li	18, 64
727bdf
+	li	19, 80
727bdf
+	li	20, 96
727bdf
+	li	21, 112
727bdf
+
727bdf
+	lwz	10, 240(6)
727bdf
+
727bdf
+Loop_8x_block_dec:
727bdf
+
727bdf
+	lxvb16x		15, 0, 14	# load block
727bdf
+	lxvb16x		16, 15, 14	# load block
727bdf
+	lxvb16x		17, 16, 14	# load block
727bdf
+	lxvb16x		18, 17, 14	# load block
727bdf
+	lxvb16x		19, 18, 14	# load block
727bdf
+	lxvb16x		20, 19, 14	# load block
727bdf
+	lxvb16x		21, 20, 14	# load block
727bdf
+	lxvb16x		22, 21, 14	# load block
727bdf
+	addi		14, 14, 128
727bdf
+
727bdf
+	Loop_aes_middle8x
727bdf
+
727bdf
+	xxlor	23+32, 10, 10
727bdf
+
727bdf
+	cmpdi	10, 10
727bdf
+	beq	Do_last_aes_dec
727bdf
+
727bdf
+	# 192 bits
727bdf
+	xxlor	24+32, 11, 11
727bdf
+
727bdf
+	vcipher	15, 15, 23
727bdf
+	vcipher	16, 16, 23
727bdf
+	vcipher	17, 17, 23
727bdf
+	vcipher	18, 18, 23
727bdf
+	vcipher	19, 19, 23
727bdf
+	vcipher	20, 20, 23
727bdf
+	vcipher	21, 21, 23
727bdf
+	vcipher	22, 22, 23
727bdf
+
727bdf
+	vcipher	15, 15, 24
727bdf
+	vcipher	16, 16, 24
727bdf
+	vcipher	17, 17, 24
727bdf
+	vcipher	18, 18, 24
727bdf
+	vcipher	19, 19, 24
727bdf
+	vcipher	20, 20, 24
727bdf
+	vcipher	21, 21, 24
727bdf
+	vcipher	22, 22, 24
727bdf
+
727bdf
+	xxlor	23+32, 12, 12
727bdf
+
727bdf
+	cmpdi	10, 12
727bdf
+	beq	Do_last_aes_dec
727bdf
+
727bdf
+	# 256 bits
727bdf
+	xxlor	24+32, 13, 13
727bdf
+
727bdf
+	vcipher	15, 15, 23
727bdf
+	vcipher	16, 16, 23
727bdf
+	vcipher	17, 17, 23
727bdf
+	vcipher	18, 18, 23
727bdf
+	vcipher	19, 19, 23
727bdf
+	vcipher	20, 20, 23
727bdf
+	vcipher	21, 21, 23
727bdf
+	vcipher	22, 22, 23
727bdf
+
727bdf
+	vcipher	15, 15, 24
727bdf
+	vcipher	16, 16, 24
727bdf
+	vcipher	17, 17, 24
727bdf
+	vcipher	18, 18, 24
727bdf
+	vcipher	19, 19, 24
727bdf
+	vcipher	20, 20, 24
727bdf
+	vcipher	21, 21, 24
727bdf
+	vcipher	22, 22, 24
727bdf
+
727bdf
+	xxlor	23+32, 14, 14
727bdf
+
727bdf
+	cmpdi	10, 14
727bdf
+	beq	Do_last_aes_dec
727bdf
+	b	aes_gcm_out
727bdf
+
727bdf
+Do_last_aes_dec:
727bdf
+
727bdf
+	#
727bdf
+	# last round
727bdf
+	vcipherlast     15, 15, 23
727bdf
+	vcipherlast     16, 16, 23
727bdf
+
727bdf
+	xxlxor		47, 47, 15
727bdf
+	stxvb16x        47, 0, 9	# store output
727bdf
+	xxlxor		48, 48, 16
727bdf
+	stxvb16x        48, 15, 9	# store output
727bdf
+
727bdf
+	vcipherlast     17, 17, 23
727bdf
+	vcipherlast     18, 18, 23
727bdf
+
727bdf
+	xxlxor		49, 49, 17
727bdf
+	stxvb16x        49, 16, 9	# store output
727bdf
+	xxlxor		50, 50, 18
727bdf
+	stxvb16x        50, 17, 9	# store output
727bdf
+
727bdf
+	vcipherlast     19, 19, 23
727bdf
+	vcipherlast     20, 20, 23
727bdf
+
727bdf
+	xxlxor		51, 51, 19
727bdf
+	stxvb16x        51, 18, 9	# store output
727bdf
+	xxlxor		52, 52, 20
727bdf
+	stxvb16x        52, 19, 9	# store output
727bdf
+
727bdf
+	vcipherlast     21, 21, 23
727bdf
+	vcipherlast     22, 22, 23
727bdf
+
727bdf
+	xxlxor		53, 53, 21
727bdf
+	stxvb16x        53, 20, 9	# store output
727bdf
+	xxlxor		54, 54, 22
727bdf
+	stxvb16x        54, 21, 9	# store output
727bdf
+
727bdf
+	addi		9, 9, 128
727bdf
+
727bdf
+	xxlor		15+32, 15, 15
727bdf
+	xxlor		16+32, 16, 16
727bdf
+	xxlor		17+32, 17, 17
727bdf
+	xxlor		18+32, 18, 18
727bdf
+	xxlor		19+32, 19, 19
727bdf
+	xxlor		20+32, 20, 20
727bdf
+	xxlor		21+32, 21, 21
727bdf
+	xxlor		22+32, 22, 22
727bdf
+
727bdf
+	# ghash here
727bdf
+	ppc_aes_gcm_ghash2_4x
727bdf
+
727bdf
+	xxlor	27+32, 0, 0
727bdf
+	vaddudm 30, 30, 31		# IV + counter
727bdf
+	vmr	29, 30
727bdf
+	vxor    15, 30, 27		# add round key
727bdf
+	vaddudm 30, 30, 31
727bdf
+	vxor    16, 30, 27
727bdf
+	vaddudm 30, 30, 31
727bdf
+	vxor    17, 30, 27
727bdf
+	vaddudm 30, 30, 31
727bdf
+	vxor    18, 30, 27
727bdf
+	vaddudm 30, 30, 31
727bdf
+	vxor    19, 30, 27
727bdf
+	vaddudm 30, 30, 31
727bdf
+	vxor    20, 30, 27
727bdf
+	vaddudm 30, 30, 31
727bdf
+	vxor    21, 30, 27
727bdf
+	vaddudm 30, 30, 31
727bdf
+	vxor    22, 30, 27
727bdf
+	addi    12, 12, -128
727bdf
+	addi    11, 11, 128
727bdf
+
727bdf
+	bdnz	Loop_8x_block_dec
727bdf
+
727bdf
+	vmr	30, 29
727bdf
+
727bdf
+Loop_last_block_dec:
727bdf
+	cmpdi   12, 0
727bdf
+	beq     aes_gcm_out
727bdf
+
727bdf
+	# loop last few blocks
727bdf
+	li      10, 16
727bdf
+	divdu   10, 12, 10
727bdf
+
727bdf
+	mtctr   10
727bdf
+
727bdf
+	lwz	10,240(6)
727bdf
+
727bdf
+	cmpdi   12, 16
727bdf
+	blt     Final_block_dec
727bdf
+
727bdf
+Next_rem_block_dec:
727bdf
+	lxvb16x 15, 0, 14		# load block
727bdf
+
727bdf
+	Loop_aes_middle_1x
727bdf
+
727bdf
+	xxlor	23+32, 10, 10
727bdf
+
727bdf
+	cmpdi	10, 10
727bdf
+	beq	Do_next_1x_dec
727bdf
+
727bdf
+	# 192 bits
727bdf
+	xxlor	24+32, 11, 11
727bdf
+
727bdf
+	vcipher	15, 15, 23
727bdf
+	vcipher	15, 15, 24
727bdf
+
727bdf
+	xxlor	23+32, 12, 12
727bdf
+
727bdf
+	cmpdi	10, 12
727bdf
+	beq	Do_next_1x_dec
727bdf
+
727bdf
+	# 256 bits
727bdf
+	xxlor	24+32, 13, 13
727bdf
+
727bdf
+	vcipher	15, 15, 23
727bdf
+	vcipher	15, 15, 24
727bdf
+
727bdf
+	xxlor	23+32, 14, 14
727bdf
+
727bdf
+	cmpdi	10, 14
727bdf
+	beq	Do_next_1x_dec
727bdf
+
727bdf
+Do_next_1x_dec:
727bdf
+	vcipherlast     15, 15, 23
727bdf
+
727bdf
+	xxlxor  47, 47, 15
727bdf
+	stxvb16x        47, 0, 9	# store output
727bdf
+	addi	14, 14, 16
727bdf
+	addi	9, 9, 16
727bdf
+
727bdf
+	xxlor	28+32, 15, 15
727bdf
+	ppc_update_hash_1x
727bdf
+
727bdf
+	addi    12, 12, -16
727bdf
+	addi    11, 11, 16
727bdf
+	xxlor	19+32, 0, 0
727bdf
+	vaddudm 30, 30, 31		# IV + counter
727bdf
+	vxor	15, 30, 19		# add round key
727bdf
+
727bdf
+	bdnz	Next_rem_block_dec
727bdf
+
727bdf
+	cmpdi	12, 0
727bdf
+	beq	aes_gcm_out
727bdf
+
727bdf
+Final_block_dec:
727bdf
+	Loop_aes_middle_1x
727bdf
+
727bdf
+	xxlor	23+32, 10, 10
727bdf
+
727bdf
+	cmpdi	10, 10
727bdf
+	beq	Do_final_1x_dec
727bdf
+
727bdf
+	# 192 bits
727bdf
+	xxlor	24+32, 11, 11
727bdf
+
727bdf
+	vcipher	15, 15, 23
727bdf
+	vcipher	15, 15, 24
727bdf
+
727bdf
+	xxlor	23+32, 12, 12
727bdf
+
727bdf
+	cmpdi	10, 12
727bdf
+	beq	Do_final_1x_dec
727bdf
+
727bdf
+	# 256 bits
727bdf
+	xxlor	24+32, 13, 13
727bdf
+
727bdf
+	vcipher	15, 15, 23
727bdf
+	vcipher	15, 15, 24
727bdf
+
727bdf
+	xxlor	23+32, 14, 14
727bdf
+
727bdf
+	cmpdi	10, 14
727bdf
+	beq	Do_final_1x_dec
727bdf
+
727bdf
+Do_final_1x_dec:
727bdf
+	vcipherlast     15, 15, 23
727bdf
+
727bdf
+	lxvb16x	15, 0, 14		# load block
727bdf
+	xxlxor	47, 47, 15
727bdf
+
727bdf
+	# create partial block mask
727bdf
+	li	15, 16
727bdf
+	sub	15, 15, 12		# index to the mask
727bdf
+
727bdf
+	vspltisb	16, -1		# first 16 bytes - 0xffff...ff
727bdf
+	vspltisb	17, 0		# second 16 bytes - 0x0000...00
727bdf
+	li	10, 192
727bdf
+	stvx	16, 10, 1
727bdf
+	addi	10, 10, 16
727bdf
+	stvx	17, 10, 1
727bdf
+
727bdf
+	addi	10, 1, 192
727bdf
+	lxvb16x	16, 15, 10		# load block mask
727bdf
+	xxland	47, 47, 16
727bdf
+
727bdf
+	xxlor	28+32, 15, 15
727bdf
+	ppc_update_hash_1x
727bdf
+
727bdf
+	# * should store only the remaining bytes.
727bdf
+	bl	Write_partial_block
727bdf
+
727bdf
+	b aes_gcm_out
727bdf
+
727bdf
+
727bdf
+___
727bdf
+
727bdf
+foreach (split("\n",$code)) {	# post-process the generated text one line at a time
727bdf
+	s/\`([^\`]*)\`/eval $1/geo;	# replace each backquoted span with the value of eval'ing its contents
727bdf
+
727bdf
+	if ($flavour =~ /le$/o) {	# flavour ending in "le": little-endian target
727bdf
+	    s/le\?//o		or	# drop an "le?" tag; only if none was found...
727bdf
+	    s/be\?/#be#/o;		# ...rewrite a "be?" tag to "#be#"
727bdf
+	} else {			# any other flavour: the mirror image
727bdf
+	    s/le\?/#le#/o	or	# rewrite an "le?" tag to "#le#"; only if none was found...
727bdf
+	    s/be\?//o;			# ...drop a "be?" tag
727bdf
+	}
727bdf
+	print $_,"\n";
727bdf
+}
727bdf
+
727bdf
+close STDOUT or die "error closing STDOUT: $!"; # enforce flush
727bdf
diff --git a/crypto/modes/build.info b/crypto/modes/build.info
727bdf
index 687e872..0ea122e 100644
727bdf
--- a/crypto/modes/build.info
727bdf
+++ b/crypto/modes/build.info
727bdf
@@ -32,7 +32,7 @@ IF[{- !$disabled{asm} -}]
727bdf
   $MODESASM_parisc20_64=$MODESASM_parisc11
727bdf
   $MODESDEF_parisc20_64=$MODESDEF_parisc11
727bdf
 
727bdf
-  $MODESASM_ppc32=ghashp8-ppc.s
727bdf
+  $MODESASM_ppc32=ghashp8-ppc.s aes-gcm-ppc.s
727bdf
   $MODESDEF_ppc32=
727bdf
   $MODESASM_ppc64=$MODESASM_ppc32
727bdf
   $MODESDEF_ppc64=$MODESDEF_ppc32
727bdf
@@ -71,6 +71,7 @@ INCLUDE[ghash-sparcv9.o]=..
727bdf
 GENERATE[ghash-alpha.S]=asm/ghash-alpha.pl
727bdf
 GENERATE[ghash-parisc.s]=asm/ghash-parisc.pl
727bdf
 GENERATE[ghashp8-ppc.s]=asm/ghashp8-ppc.pl
727bdf
+GENERATE[aes-gcm-ppc.s]=asm/aes-gcm-ppc.pl
727bdf
 GENERATE[ghash-armv4.S]=asm/ghash-armv4.pl
727bdf
 INCLUDE[ghash-armv4.o]=..
727bdf
 GENERATE[ghashv8-armx.S]=asm/ghashv8-armx.pl
727bdf
diff --git a/include/crypto/aes_platform.h b/include/crypto/aes_platform.h
727bdf
index e95ad5a..0c281a3 100644
727bdf
--- a/include/crypto/aes_platform.h
727bdf
+++ b/include/crypto/aes_platform.h
727bdf
@@ -74,6 +74,26 @@ void AES_xts_decrypt(const unsigned char *inp, unsigned char *out, size_t len,
727bdf
 #   define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks
727bdf
 #   define HWAES_xts_encrypt aes_p8_xts_encrypt
727bdf
 #   define HWAES_xts_decrypt aes_p8_xts_decrypt
727bdf
+#   define PPC_AES_GCM_CAPABLE (OPENSSL_ppccap_P & PPC_MADD300)
727bdf
+#   define AES_GCM_ENC_BYTES 128
727bdf
+#   define AES_GCM_DEC_BYTES 128
727bdf
+size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out,
727bdf
+                           size_t len, const void *key, unsigned char ivec[16],
727bdf
+                           u64 *Xi);
727bdf
+size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out,
727bdf
+                           size_t len, const void *key, unsigned char ivec[16],
727bdf
+                           u64 *Xi);
727bdf
+size_t ppc_aes_gcm_encrypt_wrap(const unsigned char *in, unsigned char *out,
727bdf
+                                size_t len, const void *key,
727bdf
+                                unsigned char ivec[16], u64 *Xi);
727bdf
+size_t ppc_aes_gcm_decrypt_wrap(const unsigned char *in, unsigned char *out,
727bdf
+                                size_t len, const void *key,
727bdf
+                                unsigned char ivec[16], u64 *Xi);
727bdf
+#   define AES_gcm_encrypt ppc_aes_gcm_encrypt_wrap
727bdf
+#   define AES_gcm_decrypt ppc_aes_gcm_decrypt_wrap
727bdf
+#   define AES_GCM_ASM(gctx) ((gctx)->ctr==aes_p8_ctr32_encrypt_blocks && \
727bdf
+                              (gctx)->gcm.ghash==gcm_ghash_p8)
727bdf
+void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len);
727bdf
 #  endif /* PPC */
727bdf
 
727bdf
 #  if (defined(__arm__) || defined(__arm) || defined(__aarch64__))
727bdf
diff --git a/providers/implementations/ciphers/cipher_aes_gcm_hw.c b/providers/implementations/ciphers/cipher_aes_gcm_hw.c
727bdf
index 44fa9d4..789ec12 100644
727bdf
--- a/providers/implementations/ciphers/cipher_aes_gcm_hw.c
727bdf
+++ b/providers/implementations/ciphers/cipher_aes_gcm_hw.c
727bdf
@@ -141,6 +141,8 @@ static const PROV_GCM_HW aes_gcm = {
727bdf
 # include "cipher_aes_gcm_hw_t4.inc"
727bdf
 #elif defined(AES_PMULL_CAPABLE) && defined(AES_GCM_ASM)
727bdf
 # include "cipher_aes_gcm_hw_armv8.inc"
727bdf
+#elif defined(PPC_AES_GCM_CAPABLE)
727bdf
+# include "cipher_aes_gcm_hw_ppc.inc"
727bdf
 #else
727bdf
 const PROV_GCM_HW *ossl_prov_aes_hw_gcm(size_t keybits)
727bdf
 {
727bdf
diff --git a/providers/implementations/ciphers/cipher_aes_gcm_hw_ppc.inc b/providers/implementations/ciphers/cipher_aes_gcm_hw_ppc.inc
727bdf
new file mode 100644
727bdf
index 0000000..4eed0f4
727bdf
--- /dev/null
727bdf
+++ b/providers/implementations/ciphers/cipher_aes_gcm_hw_ppc.inc
727bdf
@@ -0,0 +1,119 @@
727bdf
+/*
727bdf
+ * Copyright 2001-2021 The OpenSSL Project Authors. All Rights Reserved.
727bdf
+ *
727bdf
+ * Licensed under the Apache License 2.0 (the "License").  You may not use
727bdf
+ * this file except in compliance with the License.  You can obtain a copy
727bdf
+ * in the file LICENSE in the source distribution or at
727bdf
+ * https://www.openssl.org/source/license.html
727bdf
+ */
727bdf
+
727bdf
+/*-
727bdf
+ * PPC support for AES GCM.
727bdf
+ * This file is included by cipher_aes_gcm_hw.c
727bdf
+ */
727bdf
+
727bdf
+static int aes_ppc_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
727bdf
+                               size_t keylen)
727bdf
+{
727bdf
+    PROV_AES_GCM_CTX *actx = (PROV_AES_GCM_CTX *)ctx; /* same object, viewed as the AES-specific context */
727bdf
+    AES_KEY *ks = &actx->ks.ks;                       /* AES key schedule held inside that context */
727bdf
+
727bdf
+    GCM_HW_SET_KEY_CTR_FN(ks, aes_p8_set_encrypt_key, aes_p8_encrypt,
727bdf
+                          aes_p8_ctr32_encrypt_blocks); /* macro defined elsewhere; key/keylen are not named here, so presumably it picks them up itself - confirm */
727bdf
+    return 1; /* unconditionally reports success */
727bdf
+}
727bdf
+
727bdf
+
727bdf
+extern size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, size_t len,
727bdf
+                                  const void *key, unsigned char ivec[16], u64 *Xi);
727bdf
+extern size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t len,
727bdf
+                                  const void *key, unsigned char ivec[16], u64 *Xi);
727bdf
+
727bdf
+static inline u32 UTO32(const unsigned char *buf) /* big-endian load of buf[0..3]; buf is only read */
727bdf
+{
727bdf
+    return ((u32) buf[0] << 24) | ((u32) buf[1] << 16) | ((u32) buf[2] << 8) | ((u32) buf[3]);
727bdf
+}
727bdf
+
727bdf
+static inline u32 add32TOU(unsigned char buf[4], u32 n)
727bdf
+{
727bdf
+    u32 sum;
727bdf
+
727bdf
+    sum = UTO32(buf) + n;       /* u32 addition of n to the value currently held in buf */
727bdf
+    /* write the sum back into buf, most significant byte first, and return it */
727bdf
+    buf[0] = (unsigned char) ((sum >> 24) & 0xFF);
727bdf
+    buf[1] = (unsigned char) ((sum >> 16) & 0xFF);
727bdf
+    buf[2] = (unsigned char) ((sum >> 8) & 0xFF);
727bdf
+    buf[3] = (unsigned char) (sum & 0xFF);
727bdf
+    return sum;
727bdf
+}
727bdf
+
727bdf
+static size_t aes_p10_gcm_crypt(const unsigned char *in, unsigned char *out, size_t len,
727bdf
+                                const void *key, unsigned char ivec[16], u64 *Xi, int encrypt)
727bdf
+{
727bdf
+    size_t s = 0;               /* result of one assembly call (size_t, like its prototype) */
727bdf
+    size_t ndone = 0;           /* running total returned to the caller; size_t so it cannot overflow an int */
727bdf
+    int ctr_reset = 0;          /* set when a chunk stops at the 32-bit counter wrap */
727bdf
+    u64 blocks_unused;          /* blocks available before the counter in ivec[12..15] wraps */
727bdf
+    u64 nb = len / 16;          /* whole 16-byte blocks left; a partial tail of len is not processed */
727bdf
+    u64 next_ctr = 0;           /* blocks to add to the counter after the call */
727bdf
+    unsigned char ctr_saved[12]; /* first 12 bytes of ivec as passed in */
727bdf
+
727bdf
+    memcpy(ctr_saved, ivec, 12);
727bdf
+    /* Feed the assembly in chunks so that no single call crosses the 32-bit counter wrap. */
727bdf
+    while (nb) {
727bdf
+        blocks_unused = (u64) 0xffffffffU + 1 - (u64) UTO32 (ivec + 12);
727bdf
+        if (nb > blocks_unused) {
727bdf
+            len = blocks_unused * 16;
727bdf
+            nb -= blocks_unused;
727bdf
+            next_ctr = blocks_unused;
727bdf
+            ctr_reset = 1;
727bdf
+        } else {
727bdf
+            len = nb * 16;
727bdf
+            next_ctr = nb;
727bdf
+            nb = 0;
727bdf
+        }
727bdf
+
727bdf
+        s = encrypt ? ppc_aes_gcm_encrypt(in, out, len, key, ivec, Xi)
727bdf
+                    : ppc_aes_gcm_decrypt(in, out, len, key, ivec, Xi);
727bdf
+
727bdf
+        /* advance the counter in ivec[12..15]; the u32 arithmetic wraps it to 0 at the limit */
727bdf
+        add32TOU(ivec + 12, (u32) next_ctr);
727bdf
+        if (ctr_reset) {        /* more blocks remain: step past the chunk just handled */
727bdf
+            ctr_reset = 0;
727bdf
+            in += len;
727bdf
+            out += len;
727bdf
+        }
727bdf
+        memcpy(ivec, ctr_saved, 12); /* put back the 12 bytes saved on entry */
727bdf
+        ndone += s;
727bdf
+    }
727bdf
+
727bdf
+    return ndone;
727bdf
+}
727bdf
+
727bdf
+size_t ppc_aes_gcm_encrypt_wrap(const unsigned char *in, unsigned char *out, size_t len,
727bdf
+                                const void *key, unsigned char ivec[16], u64 *Xi)
727bdf
+{
727bdf
+    return aes_p10_gcm_crypt(in, out, len, key, ivec, Xi, 1); /* encrypt = 1: selects ppc_aes_gcm_encrypt */
727bdf
+}
727bdf
+
727bdf
+size_t ppc_aes_gcm_decrypt_wrap(const unsigned char *in, unsigned char *out, size_t len,
727bdf
+                                const void *key, unsigned char ivec[16], u64 *Xi)
727bdf
+{
727bdf
+    return aes_p10_gcm_crypt(in, out, len, key, ivec, Xi, 0); /* encrypt = 0: selects ppc_aes_gcm_decrypt */
727bdf
+}
727bdf
+
727bdf
+
727bdf
+static const PROV_GCM_HW aes_ppc_gcm = { /* table returned by ossl_prov_aes_hw_gcm() when PPC_AES_GCM_CAPABLE is true */
727bdf
+    aes_ppc_gcm_initkey,                 /* the only entry defined in this .inc file */
727bdf
+    ossl_gcm_setiv,
727bdf
+    ossl_gcm_aad_update,
727bdf
+    generic_aes_gcm_cipher_update,
727bdf
+    ossl_gcm_cipher_final,
727bdf
+    ossl_gcm_one_shot
727bdf
+};
727bdf
+
727bdf
+const PROV_GCM_HW *ossl_prov_aes_hw_gcm(size_t keybits) /* keybits is not consulted in this body */
727bdf
+{
727bdf
+    return PPC_AES_GCM_CAPABLE ? &aes_ppc_gcm : &aes_gcm; /* PPC table if capable, else aes_gcm (defined outside this .inc) */
727bdf
+}
727bdf
+