Blame SOURCES/openssl-1.0.1e-ppc-asm-update.patch

83c29f
diff --git a/Configure b/Configure
83c29f
index 9c803dc..5a5c2d8 100755
83c29f
--- a/Configure
83c29f
+++ b/Configure
83c29f
@@ -139,8 +139,8 @@ my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes
83c29f
 my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::void";
83c29f
 my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32";
83c29f
 my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64";
83c29f
-my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::";
83c29f
-my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::::";
83c29f
+my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o vpaes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o:";
83c29f
+my $ppc32_asm=$ppc64_asm;
83c29f
 my $no_asm=":::::::::::::::void";
83c29f
 
83c29f
 # As for $BSDthreads. Idea is to maintain "collective" set of flags,
83c29f
@@ -357,6 +357,7 @@ my %table=(
83c29f
 ####
83c29f
 "linux-generic64","gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
83c29f
 "linux-ppc64",	"gcc:-m64 -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:${ppc64_asm}:linux64:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",
83c29f
+"linux-ppc64le","gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:$ppc64_asm:linux64le:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::",
83c29f
 "linux-ia64",	"gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
83c29f
 "linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
83c29f
 "linux-ia64-icc","icc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
83c29f
@@ -462,8 +463,8 @@ my %table=(
83c29f
 
83c29f
 #### IBM's AIX.
83c29f
 "aix3-cc",  "cc:-O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::BN_LLONG RC4_CHAR:::",
83c29f
-"aix-gcc",  "gcc:-O -DB_ENDIAN::-pthread:AIX::BN_LLONG RC4_CHAR:${ppc32_asm}:aix32:dlfcn:aix-shared::-shared -Wl,-G:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X32",
83c29f
-"aix64-gcc","gcc:-maix64 -O -DB_ENDIAN::-pthread:AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR:${ppc64_asm}:aix64:dlfcn:aix-shared::-maix64 -shared -Wl,-G:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X64",
83c29f
+"aix-gcc",  "gcc:-O -DB_ENDIAN::-pthread:AIX::BN_LLONG RC4_CHAR:$ppc32_asm:aix32:dlfcn:aix-shared::-shared -Wl,-G:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X32",
83c29f
+"aix64-gcc","gcc:-maix64 -O -DB_ENDIAN::-pthread:AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR:$ppc64_asm:aix64:dlfcn:aix-shared::-maix64 -shared -Wl,-G:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X64",
83c29f
 # Below targets assume AIX 5. Idea is to effectively disregard $OBJECT_MODE
83c29f
 # at build time. $OBJECT_MODE is respected at ./config stage!
83c29f
 "aix-cc",   "cc:-q32 -O -DB_ENDIAN -qmaxmem=16384 -qro -qroconst::-qthreaded -D_THREAD_SAFE:AIX::BN_LLONG RC4_CHAR:${ppc32_asm}:aix32:dlfcn:aix-shared::-q32 -G:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 32",
83c29f
@@ -1525,7 +1526,7 @@ else	{
83c29f
 	$wp_obj="wp_block.o";
83c29f
 	}
83c29f
 $cmll_obj=$cmll_enc	unless ($cmll_obj =~ /.o$/);
83c29f
-if ($modes_obj =~ /ghash/)
83c29f
+if ($modes_obj =~ /ghash\-/)
83c29f
 	{
83c29f
 	$cflags.=" -DGHASH_ASM";
83c29f
 	}
83c29f
diff --git a/config b/config
83c29f
index 88b9bc6..8b80802 100755
83c29f
--- a/config
83c29f
+++ b/config
83c29f
@@ -587,13 +587,20 @@ case "$GUESSOS" in
83c29f
 	fi
83c29f
 	;;
83c29f
   ppc64-*-linux2)
83c29f
-	echo "WARNING! If you wish to build 64-bit library, then you have to"
83c29f
-	echo "         invoke './Configure linux-ppc64' *manually*."
83c29f
-	if [ "$TEST" = "false" -a -t 1 ]; then
83c29f
-	    echo "         You have about 5 seconds to press Ctrl-C to abort."
83c29f
-	    (trap "stty `stty -g`" 2 0; stty -icanon min 0 time 50; read waste) <&1
83c29f
+	if [ -z "$KERNEL_BITS" ]; then
83c29f
+	    echo "WARNING! If you wish to build 64-bit library, then you have to"
83c29f
+	    echo "         invoke './Configure linux-ppc64' *manually*."
83c29f
+	    if [ "$TEST" = "false" -a -t 1 ]; then
83c29f
+		echo "         You have about 5 seconds to press Ctrl-C to abort."
83c29f
+		(trap "stty `stty -g`" 2 0; stty -icanon min 0 time 50; read waste) <&1
83c29f
+	    fi
83c29f
+	fi
83c29f
+	if [ "$KERNEL_BITS" = "64" ]; then
83c29f
+	    OUT="linux-ppc64"
83c29f
+	else
83c29f
+	    OUT="linux-ppc"
83c29f
+	    (echo "__LP64__" | gcc -E -x c - 2>/dev/null | grep "^__LP64__" >/dev/null 2>&1) || options="$options -m32"	# add -m32 unless gcc leaves __LP64__ unexpanded (i.e. already targets 32-bit)
83c29f
 	fi
83c29f
-	OUT="linux-ppc"
83c29f
 	;;
83c29f
   ppc-*-linux2) OUT="linux-ppc" ;;
83c29f
   ppc60x-*-vxworks*) OUT="vxworks-ppc60x" ;;
83c29f
diff --git a/crypto/aes/Makefile b/crypto/aes/Makefile
83c29f
index 45ede0a..847f4ee 100644
83c29f
--- a/crypto/aes/Makefile
83c29f
+++ b/crypto/aes/Makefile
83c29f
@@ -71,6 +71,10 @@ aes-sparcv9.s: asm/aes-sparcv9.pl
83c29f
 
83c29f
 aes-ppc.s:	asm/aes-ppc.pl
83c29f
 	$(PERL) asm/aes-ppc.pl $(PERLASM_SCHEME) $@
83c29f
+vpaes-ppc.s:	asm/vpaes-ppc.pl
83c29f
+	$(PERL) asm/vpaes-ppc.pl $(PERLASM_SCHEME) $@
83c29f
+aesp8-ppc.s:	asm/aesp8-ppc.pl
83c29f
+	$(PERL) asm/aesp8-ppc.pl $(PERLASM_SCHEME) $@
83c29f
 
83c29f
 aes-parisc.s:	asm/aes-parisc.pl
83c29f
 	$(PERL) asm/aes-parisc.pl $(PERLASM_SCHEME) $@
83c29f
diff --git a/crypto/aes/asm/aes-ppc.pl b/crypto/aes/asm/aes-ppc.pl
83c29f
index 7c52cbe..7a99fc3 100644
83c29f
--- a/crypto/aes/asm/aes-ppc.pl
83c29f
+++ b/crypto/aes/asm/aes-ppc.pl
83c29f
@@ -45,6 +45,8 @@ if ($flavour =~ /64/) {
83c29f
 	$PUSH	="stw";
83c29f
 } else { die "nonsense $flavour"; }
83c29f
 
83c29f
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
83c29f
+
83c29f
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
83c29f
 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
83c29f
 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
83c29f
@@ -68,7 +70,7 @@ $key="r5";
83c29f
 $Tbl0="r3";
83c29f
 $Tbl1="r6";
83c29f
 $Tbl2="r7";
83c29f
-$Tbl3="r2";
83c29f
+$Tbl3=$out;	# stay away from "r2"; $out is offloaded to stack
83c29f
 
83c29f
 $s0="r8";
83c29f
 $s1="r9";
83c29f
@@ -76,7 +78,7 @@ $s2="r10";
83c29f
 $s3="r11";
83c29f
 
83c29f
 $t0="r12";
83c29f
-$t1="r13";
83c29f
+$t1="r0";	# stay away from "r13";
83c29f
 $t2="r14";
83c29f
 $t3="r15";
83c29f
 
83c29f
@@ -100,9 +102,6 @@ $acc13="r29";
83c29f
 $acc14="r30";
83c29f
 $acc15="r31";
83c29f
 
83c29f
-# stay away from TLS pointer
83c29f
-if ($SIZE_T==8)	{ die if ($t1 ne "r13");  $t1="r0";		}
83c29f
-else		{ die if ($Tbl3 ne "r2"); $Tbl3=$t0; $t0="r0";	}
83c29f
 $mask80=$Tbl2;
83c29f
 $mask1b=$Tbl3;
83c29f
 
83c29f
@@ -337,8 +336,7 @@ $code.=<<___;
83c29f
 	$STU	$sp,-$FRAME($sp)
83c29f
 	mflr	r0
83c29f
 
83c29f
-	$PUSH	$toc,`$FRAME-$SIZE_T*20`($sp)
83c29f
-	$PUSH	r13,`$FRAME-$SIZE_T*19`($sp)
83c29f
+	$PUSH	$out,`$FRAME-$SIZE_T*19`($sp)
83c29f
 	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
83c29f
 	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
83c29f
 	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
83c29f
@@ -365,16 +363,61 @@ $code.=<<___;
83c29f
 	bne	Lenc_unaligned
83c29f
 
83c29f
 Lenc_unaligned_ok:
83c29f
+___
83c29f
+$code.=<<___ if (!$LITTLE_ENDIAN);
83c29f
 	lwz	$s0,0($inp)
83c29f
 	lwz	$s1,4($inp)
83c29f
 	lwz	$s2,8($inp)
83c29f
 	lwz	$s3,12($inp)
83c29f
+___
83c29f
+$code.=<<___ if ($LITTLE_ENDIAN);
83c29f
+	lwz	$t0,0($inp)
83c29f
+	lwz	$t1,4($inp)
83c29f
+	lwz	$t2,8($inp)
83c29f
+	lwz	$t3,12($inp)
83c29f
+	rotlwi	$s0,$t0,8
83c29f
+	rotlwi	$s1,$t1,8
83c29f
+	rotlwi	$s2,$t2,8
83c29f
+	rotlwi	$s3,$t3,8
83c29f
+	rlwimi	$s0,$t0,24,0,7
83c29f
+	rlwimi	$s1,$t1,24,0,7
83c29f
+	rlwimi	$s2,$t2,24,0,7
83c29f
+	rlwimi	$s3,$t3,24,0,7
83c29f
+	rlwimi	$s0,$t0,24,16,23
83c29f
+	rlwimi	$s1,$t1,24,16,23
83c29f
+	rlwimi	$s2,$t2,24,16,23
83c29f
+	rlwimi	$s3,$t3,24,16,23
83c29f
+___
83c29f
+$code.=<<___;
83c29f
 	bl	LAES_Te
83c29f
 	bl	Lppc_AES_encrypt_compact
83c29f
+	$POP	$out,`$FRAME-$SIZE_T*19`($sp)
83c29f
+___
83c29f
+$code.=<<___ if ($LITTLE_ENDIAN);
83c29f
+	rotlwi	$t0,$s0,8
83c29f
+	rotlwi	$t1,$s1,8
83c29f
+	rotlwi	$t2,$s2,8
83c29f
+	rotlwi	$t3,$s3,8
83c29f
+	rlwimi	$t0,$s0,24,0,7
83c29f
+	rlwimi	$t1,$s1,24,0,7
83c29f
+	rlwimi	$t2,$s2,24,0,7
83c29f
+	rlwimi	$t3,$s3,24,0,7
83c29f
+	rlwimi	$t0,$s0,24,16,23
83c29f
+	rlwimi	$t1,$s1,24,16,23
83c29f
+	rlwimi	$t2,$s2,24,16,23
83c29f
+	rlwimi	$t3,$s3,24,16,23
83c29f
+	stw	$t0,0($out)
83c29f
+	stw	$t1,4($out)
83c29f
+	stw	$t2,8($out)
83c29f
+	stw	$t3,12($out)
83c29f
+___
83c29f
+$code.=<<___ if (!$LITTLE_ENDIAN);
83c29f
 	stw	$s0,0($out)
83c29f
 	stw	$s1,4($out)
83c29f
 	stw	$s2,8($out)
83c29f
 	stw	$s3,12($out)
83c29f
+___
83c29f
+$code.=<<___;
83c29f
 	b	Lenc_done
83c29f
 
83c29f
 Lenc_unaligned:
83c29f
@@ -417,6 +460,7 @@ Lenc_xpage:
83c29f
 
83c29f
 	bl	LAES_Te
83c29f
 	bl	Lppc_AES_encrypt_compact
83c29f
+	$POP	$out,`$FRAME-$SIZE_T*19`($sp)
83c29f
 
83c29f
 	extrwi	$acc00,$s0,8,0
83c29f
 	extrwi	$acc01,$s0,8,8
83c29f
@@ -449,8 +493,6 @@ Lenc_xpage:
83c29f
 
83c29f
 Lenc_done:
83c29f
 	$POP	r0,`$FRAME+$LRSAVE`($sp)
83c29f
-	$POP	$toc,`$FRAME-$SIZE_T*20`($sp)
83c29f
-	$POP	r13,`$FRAME-$SIZE_T*19`($sp)
83c29f
 	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
83c29f
 	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
83c29f
 	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
83c29f
@@ -764,6 +806,7 @@ Lenc_compact_done:
83c29f
 	blr
83c29f
 	.long	0
83c29f
 	.byte	0,12,0x14,0,0,0,0,0
83c29f
+.size	.AES_encrypt,.-.AES_encrypt
83c29f
 
83c29f
 .globl	.AES_decrypt
83c29f
 .align	7
83c29f
@@ -771,8 +814,7 @@ Lenc_compact_done:
83c29f
 	$STU	$sp,-$FRAME($sp)
83c29f
 	mflr	r0
83c29f
 
83c29f
-	$PUSH	$toc,`$FRAME-$SIZE_T*20`($sp)
83c29f
-	$PUSH	r13,`$FRAME-$SIZE_T*19`($sp)
83c29f
+	$PUSH	$out,`$FRAME-$SIZE_T*19`($sp)
83c29f
 	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
83c29f
 	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
83c29f
 	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
83c29f
@@ -799,16 +841,61 @@ Lenc_compact_done:
83c29f
 	bne	Ldec_unaligned
83c29f
 
83c29f
 Ldec_unaligned_ok:
83c29f
+___
83c29f
+$code.=<<___ if (!$LITTLE_ENDIAN);
83c29f
 	lwz	$s0,0($inp)
83c29f
 	lwz	$s1,4($inp)
83c29f
 	lwz	$s2,8($inp)
83c29f
 	lwz	$s3,12($inp)
83c29f
+___
83c29f
+$code.=<<___ if ($LITTLE_ENDIAN);
83c29f
+	lwz	$t0,0($inp)
83c29f
+	lwz	$t1,4($inp)
83c29f
+	lwz	$t2,8($inp)
83c29f
+	lwz	$t3,12($inp)
83c29f
+	rotlwi	$s0,$t0,8
83c29f
+	rotlwi	$s1,$t1,8
83c29f
+	rotlwi	$s2,$t2,8
83c29f
+	rotlwi	$s3,$t3,8
83c29f
+	rlwimi	$s0,$t0,24,0,7
83c29f
+	rlwimi	$s1,$t1,24,0,7
83c29f
+	rlwimi	$s2,$t2,24,0,7
83c29f
+	rlwimi	$s3,$t3,24,0,7
83c29f
+	rlwimi	$s0,$t0,24,16,23
83c29f
+	rlwimi	$s1,$t1,24,16,23
83c29f
+	rlwimi	$s2,$t2,24,16,23
83c29f
+	rlwimi	$s3,$t3,24,16,23
83c29f
+___
83c29f
+$code.=<<___;
83c29f
 	bl	LAES_Td
83c29f
 	bl	Lppc_AES_decrypt_compact
83c29f
+	$POP	$out,`$FRAME-$SIZE_T*19`($sp)
83c29f
+___
83c29f
+$code.=<<___ if ($LITTLE_ENDIAN);
83c29f
+	rotlwi	$t0,$s0,8
83c29f
+	rotlwi	$t1,$s1,8
83c29f
+	rotlwi	$t2,$s2,8
83c29f
+	rotlwi	$t3,$s3,8
83c29f
+	rlwimi	$t0,$s0,24,0,7
83c29f
+	rlwimi	$t1,$s1,24,0,7
83c29f
+	rlwimi	$t2,$s2,24,0,7
83c29f
+	rlwimi	$t3,$s3,24,0,7
83c29f
+	rlwimi	$t0,$s0,24,16,23
83c29f
+	rlwimi	$t1,$s1,24,16,23
83c29f
+	rlwimi	$t2,$s2,24,16,23
83c29f
+	rlwimi	$t3,$s3,24,16,23
83c29f
+	stw	$t0,0($out)
83c29f
+	stw	$t1,4($out)
83c29f
+	stw	$t2,8($out)
83c29f
+	stw	$t3,12($out)
83c29f
+___
83c29f
+$code.=<<___ if (!$LITTLE_ENDIAN);
83c29f
 	stw	$s0,0($out)
83c29f
 	stw	$s1,4($out)
83c29f
 	stw	$s2,8($out)
83c29f
 	stw	$s3,12($out)
83c29f
+___
83c29f
+$code.=<<___;
83c29f
 	b	Ldec_done
83c29f
 
83c29f
 Ldec_unaligned:
83c29f
@@ -851,6 +938,7 @@ Ldec_xpage:
83c29f
 
83c29f
 	bl	LAES_Td
83c29f
 	bl	Lppc_AES_decrypt_compact
83c29f
+	$POP	$out,`$FRAME-$SIZE_T*19`($sp)
83c29f
 
83c29f
 	extrwi	$acc00,$s0,8,0
83c29f
 	extrwi	$acc01,$s0,8,8
83c29f
@@ -883,8 +971,6 @@ Ldec_xpage:
83c29f
 
83c29f
 Ldec_done:
83c29f
 	$POP	r0,`$FRAME+$LRSAVE`($sp)
83c29f
-	$POP	$toc,`$FRAME-$SIZE_T*20`($sp)
83c29f
-	$POP	r13,`$FRAME-$SIZE_T*19`($sp)
83c29f
 	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
83c29f
 	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
83c29f
 	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
83c29f
@@ -1355,6 +1441,7 @@ Ldec_compact_done:
83c29f
 	blr
83c29f
 	.long	0
83c29f
 	.byte	0,12,0x14,0,0,0,0,0
83c29f
+.size	.AES_decrypt,.-.AES_decrypt
83c29f
 
83c29f
 .asciz	"AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
83c29f
 .align	7
83c29f
diff --git a/crypto/aes/asm/aesp8-ppc.pl b/crypto/aes/asm/aesp8-ppc.pl
83c29f
new file mode 100755
83c29f
index 0000000..3ee8979
83c29f
--- /dev/null
83c29f
+++ b/crypto/aes/asm/aesp8-ppc.pl
83c29f
@@ -0,0 +1,1940 @@
83c29f
+#!/usr/bin/env perl
83c29f
+#
83c29f
+# ====================================================================
83c29f
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
83c29f
+# project. The module is, however, dual licensed under OpenSSL and
83c29f
+# CRYPTOGAMS licenses depending on where you obtain it. For further
83c29f
+# details see http://www.openssl.org/~appro/cryptogams/.
83c29f
+# ====================================================================
83c29f
+#
83c29f
+# This module implements support for AES instructions as per PowerISA
83c29f
+# specification version 2.07, first implemented by POWER8 processor.
83c29f
+# The module is endian-agnostic in the sense that it supports both big-
83c29f
+# and little-endian cases. Data alignment in parallelizable modes is
83c29f
+# handled with VSX loads and stores, which implies MSR.VSX flag being
83c29f
+# set. It should also be noted that the ISA specification doesn't prohibit
83c29f
+# alignment exceptions for these instructions on page boundaries.
83c29f
+# Initially alignment was handled in pure AltiVec/VMX way [when data
83c29f
+# is aligned programmatically, which in turn guarantees exception-
83c29f
+# free execution], but it turned out to hamper performance when vcipher
83c29f
+# instructions are interleaved. It's reckoned that eventual
83c29f
+# misalignment penalties at page boundaries are on average lower
83c29f
+# than additional overhead in pure AltiVec approach.
83c29f
+
83c29f
+$flavour = shift;
83c29f
+
83c29f
+if ($flavour =~ /64/) {
83c29f
+	$SIZE_T	=8;
83c29f
+	$LRSAVE	=2*$SIZE_T;
83c29f
+	$STU	="stdu";
83c29f
+	$POP	="ld";
83c29f
+	$PUSH	="std";
83c29f
+	$UCMP	="cmpld";
83c29f
+	$SHL	="sldi";
83c29f
+} elsif ($flavour =~ /32/) {
83c29f
+	$SIZE_T	=4;
83c29f
+	$LRSAVE	=$SIZE_T;
83c29f
+	$STU	="stwu";
83c29f
+	$POP	="lwz";
83c29f
+	$PUSH	="stw";
83c29f
+	$UCMP	="cmplw";
83c29f
+	$SHL	="slwi";
83c29f
+} else { die "nonsense $flavour"; }
83c29f
+
83c29f
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
83c29f
+
83c29f
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
83c29f
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
83c29f
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
83c29f
+die "can't locate ppc-xlate.pl";
83c29f
+
83c29f
+open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";	# "or", not "||": "||" binds to the (always true) command string, so die could never fire
83c29f
+
83c29f
+$FRAME=8*$SIZE_T;
83c29f
+$prefix="aes_p8";
83c29f
+
83c29f
+$sp="r1";
83c29f
+$vrsave="r12";
83c29f
+
83c29f
+#########################################################################
83c29f
+{{{	# Key setup procedures						#
83c29f
+my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
83c29f
+my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
83c29f
+my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
83c29f
+
83c29f
+$code.=<<___;
83c29f
+.machine	"any"
83c29f
+
83c29f
+.text
83c29f
+
83c29f
+.align	7
83c29f
+rcon:
83c29f
+.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev
83c29f
+.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
83c29f
+.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
83c29f
+.long	0,0,0,0						?asis
83c29f
+Lconsts:
83c29f
+	mflr	r0
83c29f
+	bcl	20,31,\$+4
83c29f
+	mflr	$ptr	 #vvvvv distance between "." and rcon
83c29f
+	addi	$ptr,$ptr,-0x48
83c29f
+	mtlr	r0
83c29f
+	blr
83c29f
+	.long	0
83c29f
+	.byte	0,12,0x14,0,0,0,0,0
83c29f
+.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
83c29f
+
83c29f
+.globl	.${prefix}_set_encrypt_key
83c29f
+.align	5
83c29f
+.${prefix}_set_encrypt_key:
83c29f
+Lset_encrypt_key:
83c29f
+	mflr		r11
83c29f
+	$PUSH		r11,$LRSAVE($sp)
83c29f
+
83c29f
+	li		$ptr,-1
83c29f
+	${UCMP}i	$inp,0
83c29f
+	beq-		Lenc_key_abort		# if ($inp==0) return -1;
83c29f
+	${UCMP}i	$out,0
83c29f
+	beq-		Lenc_key_abort		# if ($out==0) return -1;
83c29f
+	li		$ptr,-2
83c29f
+	cmpwi		$bits,128
83c29f
+	blt-		Lenc_key_abort
83c29f
+	cmpwi		$bits,256
83c29f
+	bgt-		Lenc_key_abort
83c29f
+	andi.		r0,$bits,0x3f
83c29f
+	bne-		Lenc_key_abort
83c29f
+
83c29f
+	lis		r0,0xfff0
83c29f
+	mfspr		$vrsave,256
83c29f
+	mtspr		256,r0
83c29f
+
83c29f
+	bl		Lconsts
83c29f
+	mtlr		r11
83c29f
+
83c29f
+	neg		r9,$inp
83c29f
+	lvx		$in0,0,$inp
83c29f
+	addi		$inp,$inp,15		# 15 is not typo
83c29f
+	lvsr		$key,0,r9		# borrow $key
83c29f
+	li		r8,0x20
83c29f
+	cmpwi		$bits,192
83c29f
+	lvx		$in1,0,$inp
83c29f
+	le?vspltisb	$mask,0x0f		# borrow $mask
83c29f
+	lvx		$rcon,0,$ptr
83c29f
+	le?vxor		$key,$key,$mask		# adjust for byte swap
83c29f
+	lvx		$mask,r8,$ptr
83c29f
+	addi		$ptr,$ptr,0x10
83c29f
+	vperm		$in0,$in0,$in1,$key	# align [and byte swap in LE]
83c29f
+	li		$cnt,8
83c29f
+	vxor		$zero,$zero,$zero
83c29f
+	mtctr		$cnt
83c29f
+
83c29f
+	?lvsr		$outperm,0,$out
83c29f
+	vspltisb	$outmask,-1
83c29f
+	lvx		$outhead,0,$out
83c29f
+	?vperm		$outmask,$zero,$outmask,$outperm
83c29f
+
83c29f
+	blt		Loop128
83c29f
+	addi		$inp,$inp,8
83c29f
+	beq		L192
83c29f
+	addi		$inp,$inp,8
83c29f
+	b		L256
83c29f
+
83c29f
+.align	4
83c29f
+Loop128:
83c29f
+	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
83c29f
+	vsldoi		$tmp,$zero,$in0,12	# >>32
83c29f
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
83c29f
+	 vsel		$stage,$outhead,$outtail,$outmask
83c29f
+	 vmr		$outhead,$outtail
83c29f
+	vcipherlast	$key,$key,$rcon
83c29f
+	 stvx		$stage,0,$out
83c29f
+	 addi		$out,$out,16
83c29f
+
83c29f
+	vxor		$in0,$in0,$tmp
83c29f
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
83c29f
+	vxor		$in0,$in0,$tmp
83c29f
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
83c29f
+	vxor		$in0,$in0,$tmp
83c29f
+	 vadduwm	$rcon,$rcon,$rcon
83c29f
+	vxor		$in0,$in0,$key
83c29f
+	bdnz		Loop128
83c29f
+
83c29f
+	lvx		$rcon,0,$ptr		# last two round keys
83c29f
+
83c29f
+	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
83c29f
+	vsldoi		$tmp,$zero,$in0,12	# >>32
83c29f
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
83c29f
+	 vsel		$stage,$outhead,$outtail,$outmask
83c29f
+	 vmr		$outhead,$outtail
83c29f
+	vcipherlast	$key,$key,$rcon
83c29f
+	 stvx		$stage,0,$out
83c29f
+	 addi		$out,$out,16
83c29f
+
83c29f
+	vxor		$in0,$in0,$tmp
83c29f
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
83c29f
+	vxor		$in0,$in0,$tmp
83c29f
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
83c29f
+	vxor		$in0,$in0,$tmp
83c29f
+	 vadduwm	$rcon,$rcon,$rcon
83c29f
+	vxor		$in0,$in0,$key
83c29f
+
83c29f
+	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
83c29f
+	vsldoi		$tmp,$zero,$in0,12	# >>32
83c29f
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
83c29f
+	 vsel		$stage,$outhead,$outtail,$outmask
83c29f
+	 vmr		$outhead,$outtail
83c29f
+	vcipherlast	$key,$key,$rcon
83c29f
+	 stvx		$stage,0,$out
83c29f
+	 addi		$out,$out,16
83c29f
+
83c29f
+	vxor		$in0,$in0,$tmp
83c29f
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
83c29f
+	vxor		$in0,$in0,$tmp
83c29f
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
83c29f
+	vxor		$in0,$in0,$tmp
83c29f
+	vxor		$in0,$in0,$key
83c29f
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
83c29f
+	 vsel		$stage,$outhead,$outtail,$outmask
83c29f
+	 vmr		$outhead,$outtail
83c29f
+	 stvx		$stage,0,$out
83c29f
+
83c29f
+	addi		$inp,$out,15		# 15 is not typo
83c29f
+	addi		$out,$out,0x50
83c29f
+
83c29f
+	li		$rounds,10
83c29f
+	b		Ldone
83c29f
+
83c29f
+.align	4
83c29f
+L192:
83c29f
+	lvx		$tmp,0,$inp
83c29f
+	li		$cnt,4
83c29f
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
83c29f
+	 vsel		$stage,$outhead,$outtail,$outmask
83c29f
+	 vmr		$outhead,$outtail
83c29f
+	 stvx		$stage,0,$out
83c29f
+	 addi		$out,$out,16
83c29f
+	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
83c29f
+	vspltisb	$key,8			# borrow $key
83c29f
+	mtctr		$cnt
83c29f
+	vsububm		$mask,$mask,$key	# adjust the mask
83c29f
+
83c29f
+Loop192:
83c29f
+	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
83c29f
+	vsldoi		$tmp,$zero,$in0,12	# >>32
83c29f
+	vcipherlast	$key,$key,$rcon
83c29f
+
83c29f
+	vxor		$in0,$in0,$tmp
83c29f
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
83c29f
+	vxor		$in0,$in0,$tmp
83c29f
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
83c29f
+	vxor		$in0,$in0,$tmp
83c29f
+
83c29f
+	 vsldoi		$stage,$zero,$in1,8
83c29f
+	vspltw		$tmp,$in0,3
83c29f
+	vxor		$tmp,$tmp,$in1
83c29f
+	vsldoi		$in1,$zero,$in1,12	# >>32
83c29f
+	 vadduwm	$rcon,$rcon,$rcon
83c29f
+	vxor		$in1,$in1,$tmp
83c29f
+	vxor		$in0,$in0,$key
83c29f
+	vxor		$in1,$in1,$key
83c29f
+	 vsldoi		$stage,$stage,$in0,8
83c29f
+
83c29f
+	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
83c29f
+	vsldoi		$tmp,$zero,$in0,12	# >>32
83c29f
+	 vperm		$outtail,$stage,$stage,$outperm	# rotate
83c29f
+	 vsel		$stage,$outhead,$outtail,$outmask
83c29f
+	 vmr		$outhead,$outtail
83c29f
+	vcipherlast	$key,$key,$rcon
83c29f
+	 stvx		$stage,0,$out
83c29f
+	 addi		$out,$out,16
83c29f
+
83c29f
+	 vsldoi		$stage,$in0,$in1,8
83c29f
+	vxor		$in0,$in0,$tmp
83c29f
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
83c29f
+	 vperm		$outtail,$stage,$stage,$outperm	# rotate
83c29f
+	 vsel		$stage,$outhead,$outtail,$outmask
83c29f
+	 vmr		$outhead,$outtail
83c29f
+	vxor		$in0,$in0,$tmp
83c29f
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
83c29f
+	vxor		$in0,$in0,$tmp
83c29f
+	 stvx		$stage,0,$out
83c29f
+	 addi		$out,$out,16
83c29f
+
83c29f
+	vspltw		$tmp,$in0,3
83c29f
+	vxor		$tmp,$tmp,$in1
83c29f
+	vsldoi		$in1,$zero,$in1,12	# >>32
83c29f
+	 vadduwm	$rcon,$rcon,$rcon
83c29f
+	vxor		$in1,$in1,$tmp
83c29f
+	vxor		$in0,$in0,$key
83c29f
+	vxor		$in1,$in1,$key
83c29f
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
83c29f
+	 vsel		$stage,$outhead,$outtail,$outmask
83c29f
+	 vmr		$outhead,$outtail
83c29f
+	 stvx		$stage,0,$out
83c29f
+	 addi		$inp,$out,15		# 15 is not typo
83c29f
+	 addi		$out,$out,16
83c29f
+	bdnz		Loop192
83c29f
+
83c29f
+	li		$rounds,12
83c29f
+	addi		$out,$out,0x20
83c29f
+	b		Ldone
83c29f
+
83c29f
+.align	4
83c29f
+L256:
83c29f
+	lvx		$tmp,0,$inp
83c29f
+	li		$cnt,7
83c29f
+	li		$rounds,14
83c29f
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
83c29f
+	 vsel		$stage,$outhead,$outtail,$outmask
83c29f
+	 vmr		$outhead,$outtail
83c29f
+	 stvx		$stage,0,$out
83c29f
+	 addi		$out,$out,16
83c29f
+	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
83c29f
+	mtctr		$cnt
83c29f
+
83c29f
+Loop256:
83c29f
+	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
83c29f
+	vsldoi		$tmp,$zero,$in0,12	# >>32
83c29f
+	 vperm		$outtail,$in1,$in1,$outperm	# rotate
83c29f
+	 vsel		$stage,$outhead,$outtail,$outmask
83c29f
+	 vmr		$outhead,$outtail
83c29f
+	vcipherlast	$key,$key,$rcon
83c29f
+	 stvx		$stage,0,$out
83c29f
+	 addi		$out,$out,16
83c29f
+
83c29f
+	vxor		$in0,$in0,$tmp
83c29f
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
83c29f
+	vxor		$in0,$in0,$tmp
83c29f
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
83c29f
+	vxor		$in0,$in0,$tmp
83c29f
+	 vadduwm	$rcon,$rcon,$rcon
83c29f
+	vxor		$in0,$in0,$key
83c29f
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
83c29f
+	 vsel		$stage,$outhead,$outtail,$outmask
83c29f
+	 vmr		$outhead,$outtail
83c29f
+	 stvx		$stage,0,$out
83c29f
+	 addi		$inp,$out,15		# 15 is not typo
83c29f
+	 addi		$out,$out,16
83c29f
+	bdz		Ldone
83c29f
+
83c29f
+	vspltw		$key,$in0,3		# just splat
83c29f
+	vsldoi		$tmp,$zero,$in1,12	# >>32
83c29f
+	vsbox		$key,$key
83c29f
+
83c29f
+	vxor		$in1,$in1,$tmp
83c29f
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
83c29f
+	vxor		$in1,$in1,$tmp
83c29f
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
83c29f
+	vxor		$in1,$in1,$tmp
83c29f
+
83c29f
+	vxor		$in1,$in1,$key
83c29f
+	b		Loop256
83c29f
+
83c29f
+.align	4
83c29f
+Ldone:
83c29f
+	lvx		$in1,0,$inp		# redundant in aligned case
83c29f
+	vsel		$in1,$outhead,$in1,$outmask
83c29f
+	stvx		$in1,0,$inp
83c29f
+	li		$ptr,0
83c29f
+	mtspr		256,$vrsave
83c29f
+	stw		$rounds,0($out)
83c29f
+
83c29f
+Lenc_key_abort:
83c29f
+	mr		r3,$ptr
83c29f
+	blr
83c29f
+	.long		0
83c29f
+	.byte		0,12,0x14,1,0,0,3,0
83c29f
+	.long		0
83c29f
+.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
83c29f
+
83c29f
+.globl	.${prefix}_set_decrypt_key
83c29f
+.align	5
83c29f
+.${prefix}_set_decrypt_key:
83c29f
+	$STU		$sp,-$FRAME($sp)
83c29f
+	mflr		r10
83c29f
+	$PUSH		r10,$FRAME+$LRSAVE($sp)
83c29f
+	bl		Lset_encrypt_key
83c29f
+	mtlr		r10
83c29f
+
83c29f
+	cmpwi		r3,0
83c29f
+	bne-		Ldec_key_abort
83c29f
+
83c29f
+	slwi		$cnt,$rounds,4
83c29f
+	subi		$inp,$out,240		# first round key
83c29f
+	srwi		$rounds,$rounds,1
83c29f
+	add		$out,$inp,$cnt		# last round key
83c29f
+	mtctr		$rounds
83c29f
+
83c29f
+Ldeckey:
83c29f
+	lwz		r0, 0($inp)
83c29f
+	lwz		r6, 4($inp)
83c29f
+	lwz		r7, 8($inp)
83c29f
+	lwz		r8, 12($inp)
83c29f
+	addi		$inp,$inp,16
83c29f
+	lwz		r9, 0($out)
83c29f
+	lwz		r10,4($out)
83c29f
+	lwz		r11,8($out)
83c29f
+	lwz		r12,12($out)
83c29f
+	stw		r0, 0($out)
83c29f
+	stw		r6, 4($out)
83c29f
+	stw		r7, 8($out)
83c29f
+	stw		r8, 12($out)
83c29f
+	subi		$out,$out,16
83c29f
+	stw		r9, -16($inp)
83c29f
+	stw		r10,-12($inp)
83c29f
+	stw		r11,-8($inp)
83c29f
+	stw		r12,-4($inp)
83c29f
+	bdnz		Ldeckey
83c29f
+
83c29f
+	xor		r3,r3,r3		# return value
83c29f
+Ldec_key_abort:
83c29f
+	addi		$sp,$sp,$FRAME
83c29f
+	blr
83c29f
+	.long		0
83c29f
+	.byte		0,12,4,1,0x80,0,3,0
83c29f
+	.long		0
83c29f
+.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
83c29f
+___
83c29f
+}}}
83c29f
+#########################################################################
83c29f
+{{{	# Single block en- and decrypt procedures			#
83c29f
+sub gen_block () {
83c29f
+my $dir = shift;
83c29f
+my $n   = $dir eq "de" ? "n" : "";
83c29f
+my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
83c29f
+
83c29f
+$code.=<<___;
83c29f
+.globl	.${prefix}_${dir}crypt
83c29f
+.align	5
83c29f
+.${prefix}_${dir}crypt:
83c29f
+	lwz		$rounds,240($key)
83c29f
+	lis		r0,0xfc00
83c29f
+	mfspr		$vrsave,256
83c29f
+	li		$idx,15			# 15 is not typo
83c29f
+	mtspr		256,r0
83c29f
+
83c29f
+	lvx		v0,0,$inp
83c29f
+	neg		r11,$out
83c29f
+	lvx		v1,$idx,$inp
83c29f
+	lvsl		v2,0,$inp		# inpperm
83c29f
+	le?vspltisb	v4,0x0f
83c29f
+	?lvsl		v3,0,r11		# outperm
83c29f
+	le?vxor		v2,v2,v4
83c29f
+	li		$idx,16
83c29f
+	vperm		v0,v0,v1,v2		# align [and byte swap in LE]
83c29f
+	lvx		v1,0,$key
83c29f
+	?lvsl		v5,0,$key		# keyperm
83c29f
+	srwi		$rounds,$rounds,1
83c29f
+	lvx		v2,$idx,$key
83c29f
+	addi		$idx,$idx,16
83c29f
+	subi		$rounds,$rounds,1
83c29f
+	?vperm		v1,v1,v2,v5		# align round key
83c29f
+
83c29f
+	vxor		v0,v0,v1
83c29f
+	lvx		v1,$idx,$key
83c29f
+	addi		$idx,$idx,16
83c29f
+	mtctr		$rounds
83c29f
+
83c29f
+Loop_${dir}c:
83c29f
+	?vperm		v2,v2,v1,v5
83c29f
+	v${n}cipher	v0,v0,v2
83c29f
+	lvx		v2,$idx,$key
83c29f
+	addi		$idx,$idx,16
83c29f
+	?vperm		v1,v1,v2,v5
83c29f
+	v${n}cipher	v0,v0,v1
83c29f
+	lvx		v1,$idx,$key
83c29f
+	addi		$idx,$idx,16
83c29f
+	bdnz		Loop_${dir}c
83c29f
+
83c29f
+	?vperm		v2,v2,v1,v5
83c29f
+	v${n}cipher	v0,v0,v2
83c29f
+	lvx		v2,$idx,$key
83c29f
+	?vperm		v1,v1,v2,v5
83c29f
+	v${n}cipherlast	v0,v0,v1
83c29f
+
83c29f
+	vspltisb	v2,-1
83c29f
+	vxor		v1,v1,v1
83c29f
+	li		$idx,15			# 15 is not typo
83c29f
+	?vperm		v2,v1,v2,v3		# outmask
83c29f
+	le?vxor		v3,v3,v4
83c29f
+	lvx		v1,0,$out		# outhead
83c29f
+	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE]
83c29f
+	vsel		v1,v1,v0,v2
83c29f
+	lvx		v4,$idx,$out
83c29f
+	stvx		v1,0,$out
83c29f
+	vsel		v0,v0,v4,v2
83c29f
+	stvx		v0,$idx,$out
83c29f
+
83c29f
+	mtspr		256,$vrsave
83c29f
+	blr
83c29f
+	.long		0
83c29f
+	.byte		0,12,0x14,0,0,0,3,0
83c29f
+	.long		0
83c29f
+.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
83c29f
+___
83c29f
+}
83c29f
+&gen_block("en");
83c29f
+&gen_block("de");
83c29f
+}}}
83c29f
+#########################################################################
83c29f
+{{{	# CBC en- and decrypt procedures				#
83c29f
+my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
83c29f
+my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
83c29f
+my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
83c29f
+						map("v$_",(4..10));
83c29f
+$code.=<<___;
83c29f
+.globl	.${prefix}_cbc_encrypt
83c29f
+.align	5
83c29f
+.${prefix}_cbc_encrypt:
83c29f
+	${UCMP}i	$len,16
83c29f
+	bltlr-
83c29f
+
83c29f
+	cmpwi		$enc,0			# test direction
83c29f
+	lis		r0,0xffe0
83c29f
+	mfspr		$vrsave,256
83c29f
+	mtspr		256,r0
83c29f
+
83c29f
+	li		$idx,15
83c29f
+	vxor		$rndkey0,$rndkey0,$rndkey0
83c29f
+	le?vspltisb	$tmp,0x0f
83c29f
+
83c29f
+	lvx		$ivec,0,$ivp		# load [unaligned] iv
83c29f
+	lvsl		$inpperm,0,$ivp
83c29f
+	lvx		$inptail,$idx,$ivp
83c29f
+	le?vxor		$inpperm,$inpperm,$tmp
83c29f
+	vperm		$ivec,$ivec,$inptail,$inpperm
83c29f
+
83c29f
+	neg		r11,$inp
83c29f
+	?lvsl		$keyperm,0,$key		# prepare for unaligned key
83c29f
+	lwz		$rounds,240($key)
83c29f
+
83c29f
+	lvsr		$inpperm,0,r11		# prepare for unaligned load
83c29f
+	lvx		$inptail,0,$inp
83c29f
+	addi		$inp,$inp,15		# 15 is not typo
83c29f
+	le?vxor		$inpperm,$inpperm,$tmp
83c29f
+
83c29f
+	?lvsr		$outperm,0,$out		# prepare for unaligned store
83c29f
+	vspltisb	$outmask,-1
83c29f
+	lvx		$outhead,0,$out
83c29f
+	?vperm		$outmask,$rndkey0,$outmask,$outperm
83c29f
+	le?vxor		$outperm,$outperm,$tmp
83c29f
+
83c29f
+	srwi		$rounds,$rounds,1
83c29f
+	li		$idx,16
83c29f
+	subi		$rounds,$rounds,1
83c29f
+	beq		Lcbc_dec
83c29f
+
83c29f
+Lcbc_enc:
83c29f
+	vmr		$inout,$inptail
83c29f
+	lvx		$inptail,0,$inp
83c29f
+	addi		$inp,$inp,16
83c29f
+	mtctr		$rounds
83c29f
+	subi		$len,$len,16		# len-=16
83c29f
+
83c29f
+	lvx		$rndkey0,0,$key
83c29f
+	 vperm		$inout,$inout,$inptail,$inpperm
83c29f
+	lvx		$rndkey1,$idx,$key
83c29f
+	addi		$idx,$idx,16
83c29f
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
83c29f
+	vxor		$inout,$inout,$rndkey0
83c29f
+	lvx		$rndkey0,$idx,$key
83c29f
+	addi		$idx,$idx,16
83c29f
+	vxor		$inout,$inout,$ivec
83c29f
+
83c29f
+Loop_cbc_enc:
83c29f
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
83c29f
+	vcipher		$inout,$inout,$rndkey1
83c29f
+	lvx		$rndkey1,$idx,$key
83c29f
+	addi		$idx,$idx,16
83c29f
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
83c29f
+	vcipher		$inout,$inout,$rndkey0
83c29f
+	lvx		$rndkey0,$idx,$key
83c29f
+	addi		$idx,$idx,16
83c29f
+	bdnz		Loop_cbc_enc
83c29f
+
83c29f
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
83c29f
+	vcipher		$inout,$inout,$rndkey1
83c29f
+	lvx		$rndkey1,$idx,$key
83c29f
+	li		$idx,16
83c29f
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
83c29f
+	vcipherlast	$ivec,$inout,$rndkey0
83c29f
+	${UCMP}i	$len,16
83c29f
+
83c29f
+	vperm		$tmp,$ivec,$ivec,$outperm
83c29f
+	vsel		$inout,$outhead,$tmp,$outmask
83c29f
+	vmr		$outhead,$tmp
83c29f
+	stvx		$inout,0,$out
83c29f
+	addi		$out,$out,16
83c29f
+	bge		Lcbc_enc
83c29f
+
83c29f
+	b		Lcbc_done
83c29f
+
83c29f
+.align	4
83c29f
+Lcbc_dec:
83c29f
+	${UCMP}i	$len,128
83c29f
+	bge		_aesp8_cbc_decrypt8x
83c29f
+	vmr		$tmp,$inptail
83c29f
+	lvx		$inptail,0,$inp
83c29f
+	addi		$inp,$inp,16
83c29f
+	mtctr		$rounds
83c29f
+	subi		$len,$len,16		# len-=16
83c29f
+
83c29f
+	lvx		$rndkey0,0,$key
83c29f
+	 vperm		$tmp,$tmp,$inptail,$inpperm
83c29f
+	lvx		$rndkey1,$idx,$key
83c29f
+	addi		$idx,$idx,16
83c29f
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
83c29f
+	vxor		$inout,$tmp,$rndkey0
83c29f
+	lvx		$rndkey0,$idx,$key
83c29f
+	addi		$idx,$idx,16
83c29f
+
83c29f
+Loop_cbc_dec:
83c29f
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
83c29f
+	vncipher	$inout,$inout,$rndkey1
83c29f
+	lvx		$rndkey1,$idx,$key
83c29f
+	addi		$idx,$idx,16
83c29f
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
83c29f
+	vncipher	$inout,$inout,$rndkey0
83c29f
+	lvx		$rndkey0,$idx,$key
83c29f
+	addi		$idx,$idx,16
83c29f
+	bdnz		Loop_cbc_dec
83c29f
+
83c29f
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
83c29f
+	vncipher	$inout,$inout,$rndkey1
83c29f
+	lvx		$rndkey1,$idx,$key
83c29f
+	li		$idx,16
83c29f
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
83c29f
+	vncipherlast	$inout,$inout,$rndkey0
83c29f
+	${UCMP}i	$len,16
83c29f
+
83c29f
+	vxor		$inout,$inout,$ivec
83c29f
+	vmr		$ivec,$tmp
83c29f
+	vperm		$tmp,$inout,$inout,$outperm
83c29f
+	vsel		$inout,$outhead,$tmp,$outmask
83c29f
+	vmr		$outhead,$tmp
83c29f
+	stvx		$inout,0,$out
83c29f
+	addi		$out,$out,16
83c29f
+	bge		Lcbc_dec
83c29f
+
83c29f
+Lcbc_done:
83c29f
+	addi		$out,$out,-1
83c29f
+	lvx		$inout,0,$out		# redundant in aligned case
83c29f
+	vsel		$inout,$outhead,$inout,$outmask
83c29f
+	stvx		$inout,0,$out
83c29f
+
83c29f
+	neg		$enc,$ivp		# write [unaligned] iv
83c29f
+	li		$idx,15			# 15 is not typo
83c29f
+	vxor		$rndkey0,$rndkey0,$rndkey0
83c29f
+	vspltisb	$outmask,-1
83c29f
+	le?vspltisb	$tmp,0x0f
83c29f
+	?lvsl		$outperm,0,$enc
83c29f
+	?vperm		$outmask,$rndkey0,$outmask,$outperm
83c29f
+	le?vxor		$outperm,$outperm,$tmp
83c29f
+	lvx		$outhead,0,$ivp
83c29f
+	vperm		$ivec,$ivec,$ivec,$outperm
83c29f
+	vsel		$inout,$outhead,$ivec,$outmask
83c29f
+	lvx		$inptail,$idx,$ivp
83c29f
+	stvx		$inout,0,$ivp
83c29f
+	vsel		$inout,$ivec,$inptail,$outmask
83c29f
+	stvx		$inout,$idx,$ivp
83c29f
+
83c29f
+	mtspr		256,$vrsave
83c29f
+	blr
83c29f
+	.long		0
83c29f
+	.byte		0,12,0x14,0,0,0,6,0
83c29f
+	.long		0
83c29f
+___
83c29f
+#########################################################################
83c29f
+{{	# Optimized CBC decrypt procedure				#
83c29f
+my $key_="r11";	# pointer into the on-stack copy of the early round keys
83c29f
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));	# index regs: r8,r26-r31 are loaded with 0x10..0x70; r0 as index presumably reads as 0
83c29f
+my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));	# eight input blocks
83c29f
+my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));	# eight blocks being decrypted
83c29f
+my $rndkey0="v23";	# v24-v25 rotating buffer for the first round keys
83c29f
+			# v26-v31 hold the last 6 round keys
83c29f
+my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
83c29f
+
83c29f
+$code.=<<___;
83c29f
+.align	5
83c29f
+_aesp8_cbc_decrypt8x:			# reached from Lcbc_dec when len>=128
83c29f
+	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
83c29f
+	li		r10,`$FRAME+8*16+15`
83c29f
+	li		r11,`$FRAME+8*16+31`
83c29f
+	stvx		v20,r10,$sp		# save v20-v31, ABI says so
83c29f
+	addi		r10,r10,32
83c29f
+	stvx		v21,r11,$sp
83c29f
+	addi		r11,r11,32
83c29f
+	stvx		v22,r10,$sp
83c29f
+	addi		r10,r10,32
83c29f
+	stvx		v23,r11,$sp
83c29f
+	addi		r11,r11,32
83c29f
+	stvx		v24,r10,$sp
83c29f
+	addi		r10,r10,32
83c29f
+	stvx		v25,r11,$sp
83c29f
+	addi		r11,r11,32
83c29f
+	stvx		v26,r10,$sp
83c29f
+	addi		r10,r10,32
83c29f
+	stvx		v27,r11,$sp
83c29f
+	addi		r11,r11,32
83c29f
+	stvx		v28,r10,$sp
83c29f
+	addi		r10,r10,32
83c29f
+	stvx		v29,r11,$sp
83c29f
+	addi		r11,r11,32
83c29f
+	stvx		v30,r10,$sp
83c29f
+	stvx		v31,r11,$sp
83c29f
+	li		r0,-1			# all-ones value written to vrsave below
83c29f
+	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
83c29f
+	li		$x10,0x10		# r8,r26-r31 become constant offsets 0x10..0x70
83c29f
+	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
83c29f
+	li		$x20,0x20
83c29f
+	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
83c29f
+	li		$x30,0x30
83c29f
+	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
83c29f
+	li		$x40,0x40
83c29f
+	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
83c29f
+	li		$x50,0x50
83c29f
+	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
83c29f
+	li		$x60,0x60
83c29f
+	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
83c29f
+	li		$x70,0x70
83c29f
+	mtspr		256,r0			# vrsave = -1
83c29f
+
83c29f
+	subi		$rounds,$rounds,3	# -4 in total
83c29f
+	subi		$len,$len,128		# bias, undone by addic. after the 8x loop
83c29f
+
83c29f
+	lvx		$rndkey0,$x00,$key	# load key schedule
83c29f
+	lvx		v30,$x10,$key
83c29f
+	addi		$key,$key,0x20
83c29f
+	lvx		v31,$x00,$key
83c29f
+	?vperm		$rndkey0,$rndkey0,v30,$keyperm
83c29f
+	addi		$key_,$sp,$FRAME+15	# start of the on-stack key buffer
83c29f
+	mtctr		$rounds
83c29f
+
83c29f
+Load_cbc_dec_key:			# two round keys per iteration
83c29f
+	?vperm		v24,v30,v31,$keyperm
83c29f
+	lvx		v30,$x10,$key
83c29f
+	addi		$key,$key,0x20
83c29f
+	stvx		v24,$x00,$key_		# off-load round[1]
83c29f
+	?vperm		v25,v31,v30,$keyperm
83c29f
+	lvx		v31,$x00,$key
83c29f
+	stvx		v25,$x10,$key_		# off-load round[2]
83c29f
+	addi		$key_,$key_,0x20
83c29f
+	bdnz		Load_cbc_dec_key
83c29f
+
83c29f
+	lvx		v26,$x10,$key
83c29f
+	?vperm		v24,v30,v31,$keyperm
83c29f
+	lvx		v27,$x20,$key
83c29f
+	stvx		v24,$x00,$key_		# off-load round[3]
83c29f
+	?vperm		v25,v31,v26,$keyperm
83c29f
+	lvx		v28,$x30,$key
83c29f
+	stvx		v25,$x10,$key_		# off-load round[4]
83c29f
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
83c29f
+	?vperm		v26,v26,v27,$keyperm
83c29f
+	lvx		v29,$x40,$key
83c29f
+	?vperm		v27,v27,v28,$keyperm
83c29f
+	lvx		v30,$x50,$key
83c29f
+	?vperm		v28,v28,v29,$keyperm
83c29f
+	lvx		v31,$x60,$key
83c29f
+	?vperm		v29,v29,v30,$keyperm
83c29f
+	lvx		$out0,$x70,$key		# borrow $out0
83c29f
+	?vperm		v30,v30,v31,$keyperm
83c29f
+	lvx		v24,$x00,$key_		# pre-load round[1]
83c29f
+	?vperm		v31,v31,$out0,$keyperm
83c29f
+	lvx		v25,$x10,$key_		# pre-load round[2]
83c29f
+
83c29f
+	#lvx		$inptail,0,$inp		# "caller" already did this
83c29f
+	#addi		$inp,$inp,15		# 15 is not typo
83c29f
+	subi		$inp,$inp,15		# undo "caller"
83c29f
+
83c29f
+	 le?li		$idx,8
83c29f
+	lvx_u		$in0,$x00,$inp		# load first 8 "words"
83c29f
+	 le?lvsl	$inpperm,0,$idx
83c29f
+	 le?vspltisb	$tmp,0x0f
83c29f
+	lvx_u		$in1,$x10,$inp
83c29f
+	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
83c29f
+	lvx_u		$in2,$x20,$inp
83c29f
+	 le?vperm	$in0,$in0,$in0,$inpperm
83c29f
+	lvx_u		$in3,$x30,$inp
83c29f
+	 le?vperm	$in1,$in1,$in1,$inpperm
83c29f
+	lvx_u		$in4,$x40,$inp
83c29f
+	 le?vperm	$in2,$in2,$in2,$inpperm
83c29f
+	vxor		$out0,$in0,$rndkey0	# input xored with rndkey[0]
83c29f
+	lvx_u		$in5,$x50,$inp
83c29f
+	 le?vperm	$in3,$in3,$in3,$inpperm
83c29f
+	vxor		$out1,$in1,$rndkey0
83c29f
+	lvx_u		$in6,$x60,$inp
83c29f
+	 le?vperm	$in4,$in4,$in4,$inpperm
83c29f
+	vxor		$out2,$in2,$rndkey0
83c29f
+	lvx_u		$in7,$x70,$inp
83c29f
+	addi		$inp,$inp,0x80		# advance input by 8 blocks
83c29f
+	 le?vperm	$in5,$in5,$in5,$inpperm
83c29f
+	vxor		$out3,$in3,$rndkey0
83c29f
+	 le?vperm	$in6,$in6,$in6,$inpperm
83c29f
+	vxor		$out4,$in4,$rndkey0
83c29f
+	 le?vperm	$in7,$in7,$in7,$inpperm
83c29f
+	vxor		$out5,$in5,$rndkey0
83c29f
+	vxor		$out6,$in6,$rndkey0
83c29f
+	vxor		$out7,$in7,$rndkey0
83c29f
+
83c29f
+	mtctr		$rounds
83c29f
+	b		Loop_cbc_dec8x
83c29f
+.align	5
83c29f
+Loop_cbc_dec8x:				# two rounds per pass over all 8 blocks
83c29f
+	vncipher	$out0,$out0,v24
83c29f
+	vncipher	$out1,$out1,v24
83c29f
+	vncipher	$out2,$out2,v24
83c29f
+	vncipher	$out3,$out3,v24
83c29f
+	vncipher	$out4,$out4,v24
83c29f
+	vncipher	$out5,$out5,v24
83c29f
+	vncipher	$out6,$out6,v24
83c29f
+	vncipher	$out7,$out7,v24
83c29f
+	lvx		v24,$x20,$key_		# round[3]
83c29f
+	addi		$key_,$key_,0x20
83c29f
+
83c29f
+	vncipher	$out0,$out0,v25
83c29f
+	vncipher	$out1,$out1,v25
83c29f
+	vncipher	$out2,$out2,v25
83c29f
+	vncipher	$out3,$out3,v25
83c29f
+	vncipher	$out4,$out4,v25
83c29f
+	vncipher	$out5,$out5,v25
83c29f
+	vncipher	$out6,$out6,v25
83c29f
+	vncipher	$out7,$out7,v25
83c29f
+	lvx		v25,$x10,$key_		# round[4]
83c29f
+	bdnz		Loop_cbc_dec8x
83c29f
+
83c29f
+	subic		$len,$len,128		# $len-=128
83c29f
+	vncipher	$out0,$out0,v24
83c29f
+	vncipher	$out1,$out1,v24
83c29f
+	vncipher	$out2,$out2,v24
83c29f
+	vncipher	$out3,$out3,v24
83c29f
+	vncipher	$out4,$out4,v24
83c29f
+	vncipher	$out5,$out5,v24
83c29f
+	vncipher	$out6,$out6,v24
83c29f
+	vncipher	$out7,$out7,v24
83c29f
+
83c29f
+	subfe.		r0,r0,r0		# borrow?-1:0
83c29f
+	vncipher	$out0,$out0,v25
83c29f
+	vncipher	$out1,$out1,v25
83c29f
+	vncipher	$out2,$out2,v25
83c29f
+	vncipher	$out3,$out3,v25
83c29f
+	vncipher	$out4,$out4,v25
83c29f
+	vncipher	$out5,$out5,v25
83c29f
+	vncipher	$out6,$out6,v25
83c29f
+	vncipher	$out7,$out7,v25
83c29f
+
83c29f
+	and		r0,r0,$len		# r0 = len if borrowed, else 0
83c29f
+	vncipher	$out0,$out0,v26
83c29f
+	vncipher	$out1,$out1,v26
83c29f
+	vncipher	$out2,$out2,v26
83c29f
+	vncipher	$out3,$out3,v26
83c29f
+	vncipher	$out4,$out4,v26
83c29f
+	vncipher	$out5,$out5,v26
83c29f
+	vncipher	$out6,$out6,v26
83c29f
+	vncipher	$out7,$out7,v26
83c29f
+
83c29f
+	add		$inp,$inp,r0		# $inp is adjusted in such
83c29f
+						# way that at exit from the
83c29f
+						# loop inX-in7 are loaded
83c29f
+						# with last "words"
83c29f
+	vncipher	$out0,$out0,v27
83c29f
+	vncipher	$out1,$out1,v27
83c29f
+	vncipher	$out2,$out2,v27
83c29f
+	vncipher	$out3,$out3,v27
83c29f
+	vncipher	$out4,$out4,v27
83c29f
+	vncipher	$out5,$out5,v27
83c29f
+	vncipher	$out6,$out6,v27
83c29f
+	vncipher	$out7,$out7,v27
83c29f
+
83c29f
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
83c29f
+	vncipher	$out0,$out0,v28
83c29f
+	vncipher	$out1,$out1,v28
83c29f
+	vncipher	$out2,$out2,v28
83c29f
+	vncipher	$out3,$out3,v28
83c29f
+	vncipher	$out4,$out4,v28
83c29f
+	vncipher	$out5,$out5,v28
83c29f
+	vncipher	$out6,$out6,v28
83c29f
+	vncipher	$out7,$out7,v28
83c29f
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
83c29f
+
83c29f
+	vncipher	$out0,$out0,v29
83c29f
+	vncipher	$out1,$out1,v29
83c29f
+	vncipher	$out2,$out2,v29
83c29f
+	vncipher	$out3,$out3,v29
83c29f
+	vncipher	$out4,$out4,v29
83c29f
+	vncipher	$out5,$out5,v29
83c29f
+	vncipher	$out6,$out6,v29
83c29f
+	vncipher	$out7,$out7,v29
83c29f
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
83c29f
+
83c29f
+	vncipher	$out0,$out0,v30
83c29f
+	 vxor		$ivec,$ivec,v31		# xor with last round key
83c29f
+	vncipher	$out1,$out1,v30
83c29f
+	 vxor		$in0,$in0,v31
83c29f
+	vncipher	$out2,$out2,v30
83c29f
+	 vxor		$in1,$in1,v31
83c29f
+	vncipher	$out3,$out3,v30
83c29f
+	 vxor		$in2,$in2,v31
83c29f
+	vncipher	$out4,$out4,v30
83c29f
+	 vxor		$in3,$in3,v31
83c29f
+	vncipher	$out5,$out5,v30
83c29f
+	 vxor		$in4,$in4,v31
83c29f
+	vncipher	$out6,$out6,v30
83c29f
+	 vxor		$in5,$in5,v31
83c29f
+	vncipher	$out7,$out7,v30
83c29f
+	 vxor		$in6,$in6,v31
83c29f
+
83c29f
+	vncipherlast	$out0,$out0,$ivec	# last round and CBC xor in one step
83c29f
+	vncipherlast	$out1,$out1,$in0
83c29f
+	 lvx_u		$in0,$x00,$inp		# load next input block
83c29f
+	vncipherlast	$out2,$out2,$in1
83c29f
+	 lvx_u		$in1,$x10,$inp
83c29f
+	vncipherlast	$out3,$out3,$in2
83c29f
+	 le?vperm	$in0,$in0,$in0,$inpperm
83c29f
+	 lvx_u		$in2,$x20,$inp
83c29f
+	vncipherlast	$out4,$out4,$in3
83c29f
+	 le?vperm	$in1,$in1,$in1,$inpperm
83c29f
+	 lvx_u		$in3,$x30,$inp
83c29f
+	vncipherlast	$out5,$out5,$in4
83c29f
+	 le?vperm	$in2,$in2,$in2,$inpperm
83c29f
+	 lvx_u		$in4,$x40,$inp
83c29f
+	vncipherlast	$out6,$out6,$in5
83c29f
+	 le?vperm	$in3,$in3,$in3,$inpperm
83c29f
+	 lvx_u		$in5,$x50,$inp
83c29f
+	vncipherlast	$out7,$out7,$in6
83c29f
+	 le?vperm	$in4,$in4,$in4,$inpperm
83c29f
+	 lvx_u		$in6,$x60,$inp
83c29f
+	vmr		$ivec,$in7		# last input block of the batch is the next iv
83c29f
+	 le?vperm	$in5,$in5,$in5,$inpperm
83c29f
+	 lvx_u		$in7,$x70,$inp
83c29f
+	 addi		$inp,$inp,0x80
83c29f
+
83c29f
+	le?vperm	$out0,$out0,$out0,$inpperm
83c29f
+	le?vperm	$out1,$out1,$out1,$inpperm
83c29f
+	stvx_u		$out0,$x00,$out
83c29f
+	 le?vperm	$in6,$in6,$in6,$inpperm
83c29f
+	 vxor		$out0,$in0,$rndkey0
83c29f
+	le?vperm	$out2,$out2,$out2,$inpperm
83c29f
+	stvx_u		$out1,$x10,$out
83c29f
+	 le?vperm	$in7,$in7,$in7,$inpperm
83c29f
+	 vxor		$out1,$in1,$rndkey0
83c29f
+	le?vperm	$out3,$out3,$out3,$inpperm
83c29f
+	stvx_u		$out2,$x20,$out
83c29f
+	 vxor		$out2,$in2,$rndkey0
83c29f
+	le?vperm	$out4,$out4,$out4,$inpperm
83c29f
+	stvx_u		$out3,$x30,$out
83c29f
+	 vxor		$out3,$in3,$rndkey0
83c29f
+	le?vperm	$out5,$out5,$out5,$inpperm
83c29f
+	stvx_u		$out4,$x40,$out
83c29f
+	 vxor		$out4,$in4,$rndkey0
83c29f
+	le?vperm	$out6,$out6,$out6,$inpperm
83c29f
+	stvx_u		$out5,$x50,$out
83c29f
+	 vxor		$out5,$in5,$rndkey0
83c29f
+	le?vperm	$out7,$out7,$out7,$inpperm
83c29f
+	stvx_u		$out6,$x60,$out
83c29f
+	 vxor		$out6,$in6,$rndkey0
83c29f
+	stvx_u		$out7,$x70,$out
83c29f
+	addi		$out,$out,0x80
83c29f
+	 vxor		$out7,$in7,$rndkey0
83c29f
+
83c29f
+	mtctr		$rounds
83c29f
+	beq		Loop_cbc_dec8x		# loop again unless $len-=128 borrowed
83c29f
+
83c29f
+	addic.		$len,$len,128		# undo the bias
83c29f
+	beq		Lcbc_dec8x_done		# nothing left over
83c29f
+	nop
83c29f
+	nop
83c29f
+
83c29f
+Loop_cbc_dec8x_tail:				# up to 7 "words" tail...
83c29f
+	vncipher	$out1,$out1,v24
83c29f
+	vncipher	$out2,$out2,v24
83c29f
+	vncipher	$out3,$out3,v24
83c29f
+	vncipher	$out4,$out4,v24
83c29f
+	vncipher	$out5,$out5,v24
83c29f
+	vncipher	$out6,$out6,v24
83c29f
+	vncipher	$out7,$out7,v24
83c29f
+	lvx		v24,$x20,$key_		# round[3]
83c29f
+	addi		$key_,$key_,0x20
83c29f
+
83c29f
+	vncipher	$out1,$out1,v25
83c29f
+	vncipher	$out2,$out2,v25
83c29f
+	vncipher	$out3,$out3,v25
83c29f
+	vncipher	$out4,$out4,v25
83c29f
+	vncipher	$out5,$out5,v25
83c29f
+	vncipher	$out6,$out6,v25
83c29f
+	vncipher	$out7,$out7,v25
83c29f
+	lvx		v25,$x10,$key_		# round[4]
83c29f
+	bdnz		Loop_cbc_dec8x_tail
83c29f
+
83c29f
+	vncipher	$out1,$out1,v24
83c29f
+	vncipher	$out2,$out2,v24
83c29f
+	vncipher	$out3,$out3,v24
83c29f
+	vncipher	$out4,$out4,v24
83c29f
+	vncipher	$out5,$out5,v24
83c29f
+	vncipher	$out6,$out6,v24
83c29f
+	vncipher	$out7,$out7,v24
83c29f
+
83c29f
+	vncipher	$out1,$out1,v25
83c29f
+	vncipher	$out2,$out2,v25
83c29f
+	vncipher	$out3,$out3,v25
83c29f
+	vncipher	$out4,$out4,v25
83c29f
+	vncipher	$out5,$out5,v25
83c29f
+	vncipher	$out6,$out6,v25
83c29f
+	vncipher	$out7,$out7,v25
83c29f
+
83c29f
+	vncipher	$out1,$out1,v26
83c29f
+	vncipher	$out2,$out2,v26
83c29f
+	vncipher	$out3,$out3,v26
83c29f
+	vncipher	$out4,$out4,v26
83c29f
+	vncipher	$out5,$out5,v26
83c29f
+	vncipher	$out6,$out6,v26
83c29f
+	vncipher	$out7,$out7,v26
83c29f
+
83c29f
+	vncipher	$out1,$out1,v27
83c29f
+	vncipher	$out2,$out2,v27
83c29f
+	vncipher	$out3,$out3,v27
83c29f
+	vncipher	$out4,$out4,v27
83c29f
+	vncipher	$out5,$out5,v27
83c29f
+	vncipher	$out6,$out6,v27
83c29f
+	vncipher	$out7,$out7,v27
83c29f
+
83c29f
+	vncipher	$out1,$out1,v28
83c29f
+	vncipher	$out2,$out2,v28
83c29f
+	vncipher	$out3,$out3,v28
83c29f
+	vncipher	$out4,$out4,v28
83c29f
+	vncipher	$out5,$out5,v28
83c29f
+	vncipher	$out6,$out6,v28
83c29f
+	vncipher	$out7,$out7,v28
83c29f
+
83c29f
+	vncipher	$out1,$out1,v29
83c29f
+	vncipher	$out2,$out2,v29
83c29f
+	vncipher	$out3,$out3,v29
83c29f
+	vncipher	$out4,$out4,v29
83c29f
+	vncipher	$out5,$out5,v29
83c29f
+	vncipher	$out6,$out6,v29
83c29f
+	vncipher	$out7,$out7,v29
83c29f
+
83c29f
+	vncipher	$out1,$out1,v30
83c29f
+	 vxor		$ivec,$ivec,v31		# last round key
83c29f
+	vncipher	$out2,$out2,v30
83c29f
+	 vxor		$in1,$in1,v31
83c29f
+	vncipher	$out3,$out3,v30
83c29f
+	 vxor		$in2,$in2,v31
83c29f
+	vncipher	$out4,$out4,v30
83c29f
+	 vxor		$in3,$in3,v31
83c29f
+	vncipher	$out5,$out5,v30
83c29f
+	 vxor		$in4,$in4,v31
83c29f
+	vncipher	$out6,$out6,v30
83c29f
+	 vxor		$in5,$in5,v31
83c29f
+	vncipher	$out7,$out7,v30
83c29f
+	 vxor		$in6,$in6,v31
83c29f
+
83c29f
+	cmplwi		$len,32			# switch($len)
83c29f
+	blt		Lcbc_dec8x_one
83c29f
+	nop
83c29f
+	beq		Lcbc_dec8x_two
83c29f
+	cmplwi		$len,64
83c29f
+	blt		Lcbc_dec8x_three
83c29f
+	nop
83c29f
+	beq		Lcbc_dec8x_four
83c29f
+	cmplwi		$len,96
83c29f
+	blt		Lcbc_dec8x_five
83c29f
+	nop
83c29f
+	beq		Lcbc_dec8x_six
83c29f
+
83c29f
+Lcbc_dec8x_seven:			# len > 96: finish out1-out7
83c29f
+	vncipherlast	$out1,$out1,$ivec
83c29f
+	vncipherlast	$out2,$out2,$in1
83c29f
+	vncipherlast	$out3,$out3,$in2
83c29f
+	vncipherlast	$out4,$out4,$in3
83c29f
+	vncipherlast	$out5,$out5,$in4
83c29f
+	vncipherlast	$out6,$out6,$in5
83c29f
+	vncipherlast	$out7,$out7,$in6
83c29f
+	vmr		$ivec,$in7
83c29f
+
83c29f
+	le?vperm	$out1,$out1,$out1,$inpperm
83c29f
+	le?vperm	$out2,$out2,$out2,$inpperm
83c29f
+	stvx_u		$out1,$x00,$out
83c29f
+	le?vperm	$out3,$out3,$out3,$inpperm
83c29f
+	stvx_u		$out2,$x10,$out
83c29f
+	le?vperm	$out4,$out4,$out4,$inpperm
83c29f
+	stvx_u		$out3,$x20,$out
83c29f
+	le?vperm	$out5,$out5,$out5,$inpperm
83c29f
+	stvx_u		$out4,$x30,$out
83c29f
+	le?vperm	$out6,$out6,$out6,$inpperm
83c29f
+	stvx_u		$out5,$x40,$out
83c29f
+	le?vperm	$out7,$out7,$out7,$inpperm
83c29f
+	stvx_u		$out6,$x50,$out
83c29f
+	stvx_u		$out7,$x60,$out
83c29f
+	addi		$out,$out,0x70
83c29f
+	b		Lcbc_dec8x_done
83c29f
+
83c29f
+.align	5
83c29f
+Lcbc_dec8x_six:				# len == 96: finish out2-out7
83c29f
+	vncipherlast	$out2,$out2,$ivec
83c29f
+	vncipherlast	$out3,$out3,$in2
83c29f
+	vncipherlast	$out4,$out4,$in3
83c29f
+	vncipherlast	$out5,$out5,$in4
83c29f
+	vncipherlast	$out6,$out6,$in5
83c29f
+	vncipherlast	$out7,$out7,$in6
83c29f
+	vmr		$ivec,$in7
83c29f
+
83c29f
+	le?vperm	$out2,$out2,$out2,$inpperm
83c29f
+	le?vperm	$out3,$out3,$out3,$inpperm
83c29f
+	stvx_u		$out2,$x00,$out
83c29f
+	le?vperm	$out4,$out4,$out4,$inpperm
83c29f
+	stvx_u		$out3,$x10,$out
83c29f
+	le?vperm	$out5,$out5,$out5,$inpperm
83c29f
+	stvx_u		$out4,$x20,$out
83c29f
+	le?vperm	$out6,$out6,$out6,$inpperm
83c29f
+	stvx_u		$out5,$x30,$out
83c29f
+	le?vperm	$out7,$out7,$out7,$inpperm
83c29f
+	stvx_u		$out6,$x40,$out
83c29f
+	stvx_u		$out7,$x50,$out
83c29f
+	addi		$out,$out,0x60
83c29f
+	b		Lcbc_dec8x_done
83c29f
+
83c29f
+.align	5
83c29f
+Lcbc_dec8x_five:			# 64 < len < 96: finish out3-out7
83c29f
+	vncipherlast	$out3,$out3,$ivec
83c29f
+	vncipherlast	$out4,$out4,$in3
83c29f
+	vncipherlast	$out5,$out5,$in4
83c29f
+	vncipherlast	$out6,$out6,$in5
83c29f
+	vncipherlast	$out7,$out7,$in6
83c29f
+	vmr		$ivec,$in7
83c29f
+
83c29f
+	le?vperm	$out3,$out3,$out3,$inpperm
83c29f
+	le?vperm	$out4,$out4,$out4,$inpperm
83c29f
+	stvx_u		$out3,$x00,$out
83c29f
+	le?vperm	$out5,$out5,$out5,$inpperm
83c29f
+	stvx_u		$out4,$x10,$out
83c29f
+	le?vperm	$out6,$out6,$out6,$inpperm
83c29f
+	stvx_u		$out5,$x20,$out
83c29f
+	le?vperm	$out7,$out7,$out7,$inpperm
83c29f
+	stvx_u		$out6,$x30,$out
83c29f
+	stvx_u		$out7,$x40,$out
83c29f
+	addi		$out,$out,0x50
83c29f
+	b		Lcbc_dec8x_done
83c29f
+
83c29f
+.align	5
83c29f
+Lcbc_dec8x_four:			# len == 64: finish out4-out7
83c29f
+	vncipherlast	$out4,$out4,$ivec
83c29f
+	vncipherlast	$out5,$out5,$in4
83c29f
+	vncipherlast	$out6,$out6,$in5
83c29f
+	vncipherlast	$out7,$out7,$in6
83c29f
+	vmr		$ivec,$in7
83c29f
+
83c29f
+	le?vperm	$out4,$out4,$out4,$inpperm
83c29f
+	le?vperm	$out5,$out5,$out5,$inpperm
83c29f
+	stvx_u		$out4,$x00,$out
83c29f
+	le?vperm	$out6,$out6,$out6,$inpperm
83c29f
+	stvx_u		$out5,$x10,$out
83c29f
+	le?vperm	$out7,$out7,$out7,$inpperm
83c29f
+	stvx_u		$out6,$x20,$out
83c29f
+	stvx_u		$out7,$x30,$out
83c29f
+	addi		$out,$out,0x40
83c29f
+	b		Lcbc_dec8x_done
83c29f
+
83c29f
+.align	5
83c29f
+Lcbc_dec8x_three:			# 32 < len < 64: finish out5-out7
83c29f
+	vncipherlast	$out5,$out5,$ivec
83c29f
+	vncipherlast	$out6,$out6,$in5
83c29f
+	vncipherlast	$out7,$out7,$in6
83c29f
+	vmr		$ivec,$in7
83c29f
+
83c29f
+	le?vperm	$out5,$out5,$out5,$inpperm
83c29f
+	le?vperm	$out6,$out6,$out6,$inpperm
83c29f
+	stvx_u		$out5,$x00,$out
83c29f
+	le?vperm	$out7,$out7,$out7,$inpperm
83c29f
+	stvx_u		$out6,$x10,$out
83c29f
+	stvx_u		$out7,$x20,$out
83c29f
+	addi		$out,$out,0x30
83c29f
+	b		Lcbc_dec8x_done
83c29f
+
83c29f
+.align	5
83c29f
+Lcbc_dec8x_two:				# len == 32: finish out6-out7
83c29f
+	vncipherlast	$out6,$out6,$ivec
83c29f
+	vncipherlast	$out7,$out7,$in6
83c29f
+	vmr		$ivec,$in7
83c29f
+
83c29f
+	le?vperm	$out6,$out6,$out6,$inpperm
83c29f
+	le?vperm	$out7,$out7,$out7,$inpperm
83c29f
+	stvx_u		$out6,$x00,$out
83c29f
+	stvx_u		$out7,$x10,$out
83c29f
+	addi		$out,$out,0x20
83c29f
+	b		Lcbc_dec8x_done
83c29f
+
83c29f
+.align	5
83c29f
+Lcbc_dec8x_one:				# len < 32: finish out7 only
83c29f
+	vncipherlast	$out7,$out7,$ivec
83c29f
+	vmr		$ivec,$in7
83c29f
+
83c29f
+	le?vperm	$out7,$out7,$out7,$inpperm
83c29f
+	stvx_u		$out7,0,$out
83c29f
+	addi		$out,$out,0x10
83c29f
+
83c29f
+Lcbc_dec8x_done:
83c29f
+	le?vperm	$ivec,$ivec,$ivec,$inpperm
83c29f
+	stvx_u		$ivec,0,$ivp		# write [unaligned] iv
83c29f
+
83c29f
+	li		r10,`$FRAME+15`
83c29f
+	li		r11,`$FRAME+31`
83c29f
+	stvx		$inpperm,r10,$sp	# wipe copies of round keys
83c29f
+	addi		r10,r10,32
83c29f
+	stvx		$inpperm,r11,$sp
83c29f
+	addi		r11,r11,32
83c29f
+	stvx		$inpperm,r10,$sp
83c29f
+	addi		r10,r10,32
83c29f
+	stvx		$inpperm,r11,$sp
83c29f
+	addi		r11,r11,32
83c29f
+	stvx		$inpperm,r10,$sp
83c29f
+	addi		r10,r10,32
83c29f
+	stvx		$inpperm,r11,$sp
83c29f
+	addi		r11,r11,32
83c29f
+	stvx		$inpperm,r10,$sp
83c29f
+	addi		r10,r10,32		# r10 is now the offset of the v20 save slot
83c29f
+	stvx		$inpperm,r11,$sp
83c29f
+	addi		r11,r11,32		# r11 is now the offset of the v21 save slot
83c29f
+
83c29f
+	mtspr		256,$vrsave		# restore vrsave
83c29f
+	lvx		v20,r10,$sp		# restore v20-v31, ABI says so
83c29f
+	addi		r10,r10,32
83c29f
+	lvx		v21,r11,$sp
83c29f
+	addi		r11,r11,32
83c29f
+	lvx		v22,r10,$sp
83c29f
+	addi		r10,r10,32
83c29f
+	lvx		v23,r11,$sp
83c29f
+	addi		r11,r11,32
83c29f
+	lvx		v24,r10,$sp
83c29f
+	addi		r10,r10,32
83c29f
+	lvx		v25,r11,$sp
83c29f
+	addi		r11,r11,32
83c29f
+	lvx		v26,r10,$sp
83c29f
+	addi		r10,r10,32
83c29f
+	lvx		v27,r11,$sp
83c29f
+	addi		r11,r11,32
83c29f
+	lvx		v28,r10,$sp
83c29f
+	addi		r10,r10,32
83c29f
+	lvx		v29,r11,$sp
83c29f
+	addi		r11,r11,32
83c29f
+	lvx		v30,r10,$sp
83c29f
+	lvx		v31,r11,$sp
83c29f
+	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
83c29f
+	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
83c29f
+	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
83c29f
+	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
83c29f
+	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
83c29f
+	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
83c29f
+	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
83c29f
+	blr
83c29f
+	.long		0
83c29f
+	.byte		0,12,0x14,0,0x80,6,6,0
83c29f
+	.long		0
83c29f
+.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
83c29f
+___
83c29f
+}}	}}}
83c29f
+
83c29f
+#########################################################################
83c29f
+{{{	# CTR procedure[s]						#
83c29f
+my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));	# r3..r10
83c29f
+my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));	# v0..v3
83c29f
+my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
83c29f
+						map("v$_",(4..11));	# v4..v11
83c29f
+my $dat=$tmp;	# input data shares v3 with $tmp
83c29f
+
83c29f
+$code.=<<___;
83c29f
+.globl	.${prefix}_ctr32_encrypt_blocks
83c29f
+.align	5
83c29f
+.${prefix}_ctr32_encrypt_blocks:
83c29f
+	${UCMP}i	$len,1
83c29f
+	bltlr-					# nothing to do for zero blocks
83c29f
+
83c29f
+	lis		r0,0xfff0		# vrsave mask with the top 12 bits set
83c29f
+	mfspr		$vrsave,256		# remember incoming vrsave
83c29f
+	mtspr		256,r0
83c29f
+
83c29f
+	li		$idx,15
83c29f
+	vxor		$rndkey0,$rndkey0,$rndkey0
83c29f
+	le?vspltisb	$tmp,0x0f
83c29f
+
83c29f
+	lvx		$ivec,0,$ivp		# load [unaligned] iv
83c29f
+	lvsl		$inpperm,0,$ivp
83c29f
+	lvx		$inptail,$idx,$ivp
83c29f
+	 vspltisb	$one,1
83c29f
+	le?vxor		$inpperm,$inpperm,$tmp
83c29f
+	vperm		$ivec,$ivec,$inptail,$inpperm
83c29f
+	 vsldoi		$one,$rndkey0,$one,1	# one = 15 zero bytes, then 0x01
83c29f
+
83c29f
+	neg		r11,$inp
83c29f
+	?lvsl		$keyperm,0,$key		# prepare for unaligned key
83c29f
+	lwz		$rounds,240($key)	# round count is read from offset 240 of the key
83c29f
+
83c29f
+	lvsr		$inpperm,0,r11		# prepare for unaligned load
83c29f
+	lvx		$inptail,0,$inp
83c29f
+	addi		$inp,$inp,15		# 15 is not typo
83c29f
+	le?vxor		$inpperm,$inpperm,$tmp
83c29f
+
83c29f
+	srwi		$rounds,$rounds,1
83c29f
+	li		$idx,16
83c29f
+	subi		$rounds,$rounds,1	# loop count = rounds/2 - 1
83c29f
+
83c29f
+	${UCMP}i	$len,8
83c29f
+	bge		_aesp8_ctr32_encrypt8x	# 8 or more blocks take the 8x path
83c29f
+
83c29f
+	?lvsr		$outperm,0,$out		# prepare for unaligned store
83c29f
+	vspltisb	$outmask,-1
83c29f
+	lvx		$outhead,0,$out
83c29f
+	?vperm		$outmask,$rndkey0,$outmask,$outperm
83c29f
+	le?vxor		$outperm,$outperm,$tmp
83c29f
+
83c29f
+	lvx		$rndkey0,0,$key
83c29f
+	mtctr		$rounds
83c29f
+	lvx		$rndkey1,$idx,$key
83c29f
+	addi		$idx,$idx,16
83c29f
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
83c29f
+	vxor		$inout,$ivec,$rndkey0	# counter block xored with rndkey[0]
83c29f
+	lvx		$rndkey0,$idx,$key
83c29f
+	addi		$idx,$idx,16
83c29f
+	b		Loop_ctr32_enc
83c29f
+
83c29f
+.align	5
83c29f
+Loop_ctr32_enc:
83c29f
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
83c29f
+	vcipher		$inout,$inout,$rndkey1
83c29f
+	lvx		$rndkey1,$idx,$key
83c29f
+	addi		$idx,$idx,16
83c29f
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
83c29f
+	vcipher		$inout,$inout,$rndkey0
83c29f
+	lvx		$rndkey0,$idx,$key
83c29f
+	addi		$idx,$idx,16
83c29f
+	bdnz		Loop_ctr32_enc
83c29f
+
83c29f
+	vadduwm		$ivec,$ivec,$one	# bump the last 32-bit word of the counter
83c29f
+	 vmr		$dat,$inptail
83c29f
+	 lvx		$inptail,0,$inp
83c29f
+	 addi		$inp,$inp,16
83c29f
+	 subic.		$len,$len,1		# blocks--
83c29f
+
83c29f
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
83c29f
+	vcipher		$inout,$inout,$rndkey1
83c29f
+	lvx		$rndkey1,$idx,$key
83c29f
+	 vperm		$dat,$dat,$inptail,$inpperm
83c29f
+	 li		$idx,16
83c29f
+	?vperm		$rndkey1,$rndkey0,$rndkey1,$keyperm
83c29f
+	 lvx		$rndkey0,0,$key
83c29f
+	vxor		$dat,$dat,$rndkey1	# last round key
83c29f
+	vcipherlast	$inout,$inout,$dat	# last round, input already folded into the key
83c29f
+
83c29f
+	 lvx		$rndkey1,$idx,$key
83c29f
+	 addi		$idx,$idx,16
83c29f
+	vperm		$inout,$inout,$inout,$outperm
83c29f
+	vsel		$dat,$outhead,$inout,$outmask
83c29f
+	 mtctr		$rounds
83c29f
+	 ?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
83c29f
+	vmr		$outhead,$inout
83c29f
+	 vxor		$inout,$ivec,$rndkey0
83c29f
+	 lvx		$rndkey0,$idx,$key
83c29f
+	 addi		$idx,$idx,16
83c29f
+	stvx		$dat,0,$out
83c29f
+	addi		$out,$out,16
83c29f
+	bne		Loop_ctr32_enc		# CR0 still holds the subic. result
83c29f
+
83c29f
+	addi		$out,$out,-1
83c29f
+	lvx		$inout,0,$out		# redundant in aligned case
83c29f
+	vsel		$inout,$outhead,$inout,$outmask
83c29f
+	stvx		$inout,0,$out
83c29f
+
83c29f
+	mtspr		256,$vrsave		# restore vrsave
83c29f
+	blr
83c29f
+	.long		0
83c29f
+	.byte		0,12,0x14,0,0,0,6,0
83c29f
+	.long		0
83c29f
+___
83c29f
+#########################################################################
83c29f
+{{	# Optimized CTR procedure					#
83c29f
+my $key_="r11";
83c29f
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
83c29f
+my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
83c29f
+my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
83c29f
+my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
83c29f
+			# v26-v31 last 6 round keys
83c29f
+my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
83c29f
+my ($two,$three,$four)=($outhead,$outperm,$outmask);
83c29f
+
83c29f
+$code.=<<___;
83c29f
+.align	5
83c29f
+_aesp8_ctr32_encrypt8x:
83c29f
+	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
83c29f
+	li		r10,`$FRAME+8*16+15`
83c29f
+	li		r11,`$FRAME+8*16+31`
83c29f
+	stvx		v20,r10,$sp		# ABI says so
83c29f
+	addi		r10,r10,32
83c29f
+	stvx		v21,r11,$sp
83c29f
+	addi		r11,r11,32
83c29f
+	stvx		v22,r10,$sp
83c29f
+	addi		r10,r10,32
83c29f
+	stvx		v23,r11,$sp
83c29f
+	addi		r11,r11,32
83c29f
+	stvx		v24,r10,$sp
83c29f
+	addi		r10,r10,32
83c29f
+	stvx		v25,r11,$sp
83c29f
+	addi		r11,r11,32
83c29f
+	stvx		v26,r10,$sp
83c29f
+	addi		r10,r10,32
83c29f
+	stvx		v27,r11,$sp
83c29f
+	addi		r11,r11,32
83c29f
+	stvx		v28,r10,$sp
83c29f
+	addi		r10,r10,32
83c29f
+	stvx		v29,r11,$sp
83c29f
+	addi		r11,r11,32
83c29f
+	stvx		v30,r10,$sp
83c29f
+	stvx		v31,r11,$sp
83c29f
+	li		r0,-1
83c29f
+	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
83c29f
+	li		$x10,0x10
83c29f
+	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
83c29f
+	li		$x20,0x20
83c29f
+	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
83c29f
+	li		$x30,0x30
83c29f
+	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
83c29f
+	li		$x40,0x40
83c29f
+	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
83c29f
+	li		$x50,0x50
83c29f
+	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
83c29f
+	li		$x60,0x60
83c29f
+	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
83c29f
+	li		$x70,0x70
83c29f
+	mtspr		256,r0
83c29f
+
83c29f
+	subi		$rounds,$rounds,3	# -4 in total
83c29f
+
83c29f
+	lvx		$rndkey0,$x00,$key	# load key schedule
83c29f
+	lvx		v30,$x10,$key
83c29f
+	addi		$key,$key,0x20
83c29f
+	lvx		v31,$x00,$key
83c29f
+	?vperm		$rndkey0,$rndkey0,v30,$keyperm
83c29f
+	addi		$key_,$sp,$FRAME+15
83c29f
+	mtctr		$rounds
83c29f
+
83c29f
+Load_ctr32_enc_key:
83c29f
+	?vperm		v24,v30,v31,$keyperm
83c29f
+	lvx		v30,$x10,$key
83c29f
+	addi		$key,$key,0x20
83c29f
+	stvx		v24,$x00,$key_		# off-load round[1]
83c29f
+	?vperm		v25,v31,v30,$keyperm
83c29f
+	lvx		v31,$x00,$key
83c29f
+	stvx		v25,$x10,$key_		# off-load round[2]
83c29f
+	addi		$key_,$key_,0x20
83c29f
+	bdnz		Load_ctr32_enc_key
83c29f
+
83c29f
+	lvx		v26,$x10,$key
83c29f
+	?vperm		v24,v30,v31,$keyperm
83c29f
+	lvx		v27,$x20,$key
83c29f
+	stvx		v24,$x00,$key_		# off-load round[3]
83c29f
+	?vperm		v25,v31,v26,$keyperm
83c29f
+	lvx		v28,$x30,$key
83c29f
+	stvx		v25,$x10,$key_		# off-load round[4]
83c29f
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
83c29f
+	?vperm		v26,v26,v27,$keyperm
83c29f
+	lvx		v29,$x40,$key
83c29f
+	?vperm		v27,v27,v28,$keyperm
83c29f
+	lvx		v30,$x50,$key
83c29f
+	?vperm		v28,v28,v29,$keyperm
83c29f
+	lvx		v31,$x60,$key
83c29f
+	?vperm		v29,v29,v30,$keyperm
83c29f
+	lvx		$out0,$x70,$key		# borrow $out0
83c29f
+	?vperm		v30,v30,v31,$keyperm
83c29f
+	lvx		v24,$x00,$key_		# pre-load round[1]
83c29f
+	?vperm		v31,v31,$out0,$keyperm
83c29f
+	lvx		v25,$x10,$key_		# pre-load round[2]
83c29f
+
83c29f
+	vadduwm		$two,$one,$one
83c29f
+	subi		$inp,$inp,15		# undo "caller"
83c29f
+	$SHL		$len,$len,4
83c29f
+
83c29f
+	vadduwm		$out1,$ivec,$one	# counter values ...
83c29f
+	vadduwm		$out2,$ivec,$two
83c29f
+	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
83c29f
+	 le?li		$idx,8
83c29f
+	vadduwm		$out3,$out1,$two
83c29f
+	vxor		$out1,$out1,$rndkey0
83c29f
+	 le?lvsl	$inpperm,0,$idx
83c29f
+	vadduwm		$out4,$out2,$two
83c29f
+	vxor		$out2,$out2,$rndkey0
83c29f
+	 le?vspltisb	$tmp,0x0f
83c29f
+	vadduwm		$out5,$out3,$two
83c29f
+	vxor		$out3,$out3,$rndkey0
83c29f
+	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
83c29f
+	vadduwm		$out6,$out4,$two
83c29f
+	vxor		$out4,$out4,$rndkey0
83c29f
+	vadduwm		$out7,$out5,$two
83c29f
+	vxor		$out5,$out5,$rndkey0
83c29f
+	vadduwm		$ivec,$out6,$two	# next counter value
83c29f
+	vxor		$out6,$out6,$rndkey0
83c29f
+	vxor		$out7,$out7,$rndkey0
83c29f
+
83c29f
+	mtctr		$rounds
83c29f
+	b		Loop_ctr32_enc8x
83c29f
+.align	5
83c29f
+Loop_ctr32_enc8x:
83c29f
+	vcipher 	$out0,$out0,v24
83c29f
+	vcipher 	$out1,$out1,v24
83c29f
+	vcipher 	$out2,$out2,v24
83c29f
+	vcipher 	$out3,$out3,v24
83c29f
+	vcipher 	$out4,$out4,v24
83c29f
+	vcipher 	$out5,$out5,v24
83c29f
+	vcipher 	$out6,$out6,v24
83c29f
+	vcipher 	$out7,$out7,v24
83c29f
+Loop_ctr32_enc8x_middle:
83c29f
+	lvx		v24,$x20,$key_		# round[3]
83c29f
+	addi		$key_,$key_,0x20
83c29f
+
83c29f
+	vcipher 	$out0,$out0,v25
83c29f
+	vcipher 	$out1,$out1,v25
83c29f
+	vcipher 	$out2,$out2,v25
83c29f
+	vcipher 	$out3,$out3,v25
83c29f
+	vcipher 	$out4,$out4,v25
83c29f
+	vcipher 	$out5,$out5,v25
83c29f
+	vcipher 	$out6,$out6,v25
83c29f
+	vcipher 	$out7,$out7,v25
83c29f
+	lvx		v25,$x10,$key_		# round[4]
83c29f
+	bdnz		Loop_ctr32_enc8x
83c29f
+
83c29f
+	subic		r11,$len,256		# $len-256, borrow $key_
83c29f
+	vcipher 	$out0,$out0,v24
83c29f
+	vcipher 	$out1,$out1,v24
83c29f
+	vcipher 	$out2,$out2,v24
83c29f
+	vcipher 	$out3,$out3,v24
83c29f
+	vcipher 	$out4,$out4,v24
83c29f
+	vcipher 	$out5,$out5,v24
83c29f
+	vcipher 	$out6,$out6,v24
83c29f
+	vcipher 	$out7,$out7,v24
83c29f
+
83c29f
+	subfe		r0,r0,r0		# borrow?-1:0
83c29f
+	vcipher 	$out0,$out0,v25
83c29f
+	vcipher 	$out1,$out1,v25
83c29f
+	vcipher 	$out2,$out2,v25
83c29f
+	vcipher 	$out3,$out3,v25
83c29f
+	vcipher 	$out4,$out4,v25
83c29f
+	vcipher		$out5,$out5,v25
83c29f
+	vcipher		$out6,$out6,v25
83c29f
+	vcipher		$out7,$out7,v25
83c29f
+
83c29f
+	and		r0,r0,r11
83c29f
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
83c29f
+	vcipher		$out0,$out0,v26
83c29f
+	vcipher		$out1,$out1,v26
83c29f
+	vcipher		$out2,$out2,v26
83c29f
+	vcipher		$out3,$out3,v26
83c29f
+	vcipher		$out4,$out4,v26
83c29f
+	vcipher		$out5,$out5,v26
83c29f
+	vcipher		$out6,$out6,v26
83c29f
+	vcipher		$out7,$out7,v26
83c29f
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
83c29f
+
83c29f
+	subic		$len,$len,129		# $len-=129
83c29f
+	vcipher		$out0,$out0,v27
83c29f
+	addi		$len,$len,1		# $len-=128 really
83c29f
+	vcipher		$out1,$out1,v27
83c29f
+	vcipher		$out2,$out2,v27
83c29f
+	vcipher		$out3,$out3,v27
83c29f
+	vcipher		$out4,$out4,v27
83c29f
+	vcipher		$out5,$out5,v27
83c29f
+	vcipher		$out6,$out6,v27
83c29f
+	vcipher		$out7,$out7,v27
83c29f
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
83c29f
+
83c29f
+	vcipher		$out0,$out0,v28		# rounds continue while the next eight input vectors are fetched
83c29f
+	 lvx_u		$in0,$x00,$inp		# load input
83c29f
+	vcipher		$out1,$out1,v28
83c29f
+	 lvx_u		$in1,$x10,$inp
83c29f
+	vcipher		$out2,$out2,v28
83c29f
+	 lvx_u		$in2,$x20,$inp
83c29f
+	vcipher		$out3,$out3,v28
83c29f
+	 lvx_u		$in3,$x30,$inp
83c29f
+	vcipher		$out4,$out4,v28
83c29f
+	 lvx_u		$in4,$x40,$inp
83c29f
+	vcipher		$out5,$out5,v28
83c29f
+	 lvx_u		$in5,$x50,$inp
83c29f
+	vcipher		$out6,$out6,v28
83c29f
+	 lvx_u		$in6,$x60,$inp
83c29f
+	vcipher		$out7,$out7,v28
83c29f
+	 lvx_u		$in7,$x70,$inp
83c29f
+	 addi		$inp,$inp,0x80		# step past the eight 16-byte vectors just loaded
83c29f
+
83c29f
+	vcipher		$out0,$out0,v29
83c29f
+	 le?vperm	$in0,$in0,$in0,$inpperm	# permute input via $inpperm (le prefix: presumably little-endian builds only)
83c29f
+	vcipher		$out1,$out1,v29
83c29f
+	 le?vperm	$in1,$in1,$in1,$inpperm
83c29f
+	vcipher		$out2,$out2,v29
83c29f
+	 le?vperm	$in2,$in2,$in2,$inpperm
83c29f
+	vcipher		$out3,$out3,v29
83c29f
+	 le?vperm	$in3,$in3,$in3,$inpperm
83c29f
+	vcipher		$out4,$out4,v29
83c29f
+	 le?vperm	$in4,$in4,$in4,$inpperm
83c29f
+	vcipher		$out5,$out5,v29
83c29f
+	 le?vperm	$in5,$in5,$in5,$inpperm
83c29f
+	vcipher		$out6,$out6,v29
83c29f
+	 le?vperm	$in6,$in6,$in6,$inpperm
83c29f
+	vcipher		$out7,$out7,v29
83c29f
+	 le?vperm	$in7,$in7,$in7,$inpperm
83c29f
+
83c29f
+	add		$inp,$inp,r0		# $inp is adjusted in such
83c29f
+						# way that at exit from the
83c29f
+						# loop inX-in7 are loaded
83c29f
+						# with last "words"
83c29f
+	subfe.		r0,r0,r0		# borrow?-1:0
83c29f
+	vcipher		$out0,$out0,v30
83c29f
+	 vxor		$in0,$in0,v31		# xor with last round key
83c29f
+	vcipher		$out1,$out1,v30
83c29f
+	 vxor		$in1,$in1,v31
83c29f
+	vcipher		$out2,$out2,v30
83c29f
+	 vxor		$in2,$in2,v31
83c29f
+	vcipher		$out3,$out3,v30
83c29f
+	 vxor		$in3,$in3,v31
83c29f
+	vcipher		$out4,$out4,v30
83c29f
+	 vxor		$in4,$in4,v31
83c29f
+	vcipher		$out5,$out5,v30
83c29f
+	 vxor		$in5,$in5,v31
83c29f
+	vcipher		$out6,$out6,v30
83c29f
+	 vxor		$in6,$in6,v31
83c29f
+	vcipher		$out7,$out7,v30
83c29f
+	 vxor		$in7,$in7,v31
83c29f
+
83c29f
+	bne		Lctr32_enc8x_break	# did $len-129 borrow?
83c29f
+
83c29f
+	vcipherlast	$in0,$out0,$in0		# final round keyed by (input xor last round key): in0 becomes the output block
83c29f
+	vcipherlast	$in1,$out1,$in1
83c29f
+	 vadduwm	$out1,$ivec,$one	# counter values ...
83c29f
+	vcipherlast	$in2,$out2,$in2
83c29f
+	 vadduwm	$out2,$ivec,$two
83c29f
+	 vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
83c29f
+	vcipherlast	$in3,$out3,$in3
83c29f
+	 vadduwm	$out3,$out1,$two	# reads out1 before it is xored with rndkey[0] on the next line
83c29f
+	 vxor		$out1,$out1,$rndkey0
83c29f
+	vcipherlast	$in4,$out4,$in4
83c29f
+	 vadduwm	$out4,$out2,$two
83c29f
+	 vxor		$out2,$out2,$rndkey0
83c29f
+	vcipherlast	$in5,$out5,$in5
83c29f
+	 vadduwm	$out5,$out3,$two
83c29f
+	 vxor		$out3,$out3,$rndkey0
83c29f
+	vcipherlast	$in6,$out6,$in6
83c29f
+	 vadduwm	$out6,$out4,$two
83c29f
+	 vxor		$out4,$out4,$rndkey0
83c29f
+	vcipherlast	$in7,$out7,$in7
83c29f
+	 vadduwm	$out7,$out5,$two
83c29f
+	 vxor		$out5,$out5,$rndkey0
83c29f
+	le?vperm	$in0,$in0,$in0,$inpperm	# permute the finished block before it is stored
83c29f
+	 vadduwm	$ivec,$out6,$two	# next counter value
83c29f
+	 vxor		$out6,$out6,$rndkey0
83c29f
+	le?vperm	$in1,$in1,$in1,$inpperm
83c29f
+	 vxor		$out7,$out7,$rndkey0
83c29f
+	mtctr		$rounds			# reload CTR for the round loop
83c29f
+
83c29f
+	 vcipher	$out0,$out0,v24		# next batch: round with v24 (re-pre-loaded with round[1] earlier)
83c29f
+	stvx_u		$in0,$x00,$out		# store a finished block of the current batch
83c29f
+	le?vperm	$in2,$in2,$in2,$inpperm
83c29f
+	 vcipher	$out1,$out1,v24
83c29f
+	stvx_u		$in1,$x10,$out
83c29f
+	le?vperm	$in3,$in3,$in3,$inpperm
83c29f
+	 vcipher	$out2,$out2,v24
83c29f
+	stvx_u		$in2,$x20,$out
83c29f
+	le?vperm	$in4,$in4,$in4,$inpperm
83c29f
+	 vcipher	$out3,$out3,v24
83c29f
+	stvx_u		$in3,$x30,$out
83c29f
+	le?vperm	$in5,$in5,$in5,$inpperm
83c29f
+	 vcipher	$out4,$out4,v24
83c29f
+	stvx_u		$in4,$x40,$out
83c29f
+	le?vperm	$in6,$in6,$in6,$inpperm
83c29f
+	 vcipher	$out5,$out5,v24
83c29f
+	stvx_u		$in5,$x50,$out
83c29f
+	le?vperm	$in7,$in7,$in7,$inpperm
83c29f
+	 vcipher	$out6,$out6,v24
83c29f
+	stvx_u		$in6,$x60,$out
83c29f
+	 vcipher	$out7,$out7,v24
83c29f
+	stvx_u		$in7,$x70,$out
83c29f
+	addi		$out,$out,0x80		# step past the eight 16-byte vectors just written
83c29f
+
83c29f
+	b		Loop_ctr32_enc8x_middle	# the v24 round is already done here, so rejoin mid-loop
83c29f
+
83c29f
+.align	5
83c29f
+Lctr32_enc8x_break:			# taken when $len-129 borrowed; $len has already been reduced by 128
83c29f
+	cmpwi		$len,-0x60		# dispatch on the remaining length, in steps of 0x10
83c29f
+	blt		Lctr32_enc8x_one	# below -0x60
83c29f
+	nop					# NOTE(review): purpose not evident here, presumably scheduling padding
83c29f
+	beq		Lctr32_enc8x_two	# equal to -0x60 (same compare as the blt above)
83c29f
+	cmpwi		$len,-0x40
83c29f
+	blt		Lctr32_enc8x_three	# between -0x60 and -0x40
83c29f
+	nop
83c29f
+	beq		Lctr32_enc8x_four	# equal to -0x40
83c29f
+	cmpwi		$len,-0x20
83c29f
+	blt		Lctr32_enc8x_five	# between -0x40 and -0x20
83c29f
+	nop
83c29f
+	beq		Lctr32_enc8x_six	# equal to -0x20
83c29f
+	cmpwi		$len,0x00
83c29f
+	blt		Lctr32_enc8x_seven	# between -0x20 and 0; otherwise fall through
83c29f
+
83c29f
+Lctr32_enc8x_eight:			# reached by fall-through when $len is not negative: finish and store all eight
83c29f
+	vcipherlast	$out0,$out0,$in0	# final round; in0..in7 were xored with the last round key before the break
83c29f
+	vcipherlast	$out1,$out1,$in1
83c29f
+	vcipherlast	$out2,$out2,$in2
83c29f
+	vcipherlast	$out3,$out3,$in3
83c29f
+	vcipherlast	$out4,$out4,$in4
83c29f
+	vcipherlast	$out5,$out5,$in5
83c29f
+	vcipherlast	$out6,$out6,$in6
83c29f
+	vcipherlast	$out7,$out7,$in7
83c29f
+
83c29f
+	le?vperm	$out0,$out0,$out0,$inpperm	# permute each result through $inpperm before storing
83c29f
+	le?vperm	$out1,$out1,$out1,$inpperm
83c29f
+	stvx_u		$out0,$x00,$out		# stores are interleaved with the remaining permutes
83c29f
+	le?vperm	$out2,$out2,$out2,$inpperm
83c29f
+	stvx_u		$out1,$x10,$out
83c29f
+	le?vperm	$out3,$out3,$out3,$inpperm
83c29f
+	stvx_u		$out2,$x20,$out
83c29f
+	le?vperm	$out4,$out4,$out4,$inpperm
83c29f
+	stvx_u		$out3,$x30,$out
83c29f
+	le?vperm	$out5,$out5,$out5,$inpperm
83c29f
+	stvx_u		$out4,$x40,$out
83c29f
+	le?vperm	$out6,$out6,$out6,$inpperm
83c29f
+	stvx_u		$out5,$x50,$out
83c29f
+	le?vperm	$out7,$out7,$out7,$inpperm
83c29f
+	stvx_u		$out6,$x60,$out
83c29f
+	stvx_u		$out7,$x70,$out
83c29f
+	addi		$out,$out,0x80		# step past the eight 16-byte vectors just written
83c29f
+	b		Lctr32_enc8x_done
83c29f
+
83c29f
+.align	5
83c29f
+Lctr32_enc8x_seven:
83c29f
+	vcipherlast	$out0,$out0,$in1
83c29f
+	vcipherlast	$out1,$out1,$in2
83c29f
+	vcipherlast	$out2,$out2,$in3
83c29f
+	vcipherlast	$out3,$out3,$in4
83c29f
+	vcipherlast	$out4,$out4,$in5
83c29f
+	vcipherlast	$out5,$out5,$in6
83c29f
+	vcipherlast	$out6,$out6,$in7
83c29f
+
83c29f
+	le?vperm	$out0,$out0,$out0,$inpperm
83c29f
+	le?vperm	$out1,$out1,$out1,$inpperm
83c29f
+	stvx_u		$out0,$x00,$out
83c29f
+	le?vperm	$out2,$out2,$out2,$inpperm
83c29f
+	stvx_u		$out1,$x10,$out