Blame SOURCES/openssl-1.0.2k-ppc-update.patch

cfec1a
diff -up openssl-1.0.2k/crypto/aes/asm/aesp8-ppc.pl.ppc-update openssl-1.0.2k/crypto/aes/asm/aesp8-ppc.pl
cfec1a
--- openssl-1.0.2k/crypto/aes/asm/aesp8-ppc.pl.ppc-update	2017-01-26 14:22:03.000000000 +0100
cfec1a
+++ openssl-1.0.2k/crypto/aes/asm/aesp8-ppc.pl	2017-04-13 09:51:40.611133165 +0200
cfec1a
@@ -20,6 +20,19 @@
cfec1a
 # instructions are interleaved. It's reckoned that eventual
cfec1a
 # misalignment penalties at page boundaries are in average lower
cfec1a
 # than additional overhead in pure AltiVec approach.
cfec1a
+#
cfec1a
+# May 2016
cfec1a
+#
cfec1a
+# Add XTS subroutine; improvements of 9x on little-endian and 12x on
cfec1a
+# big-endian systems were measured.
cfec1a
+#
cfec1a
+######################################################################
cfec1a
+# Current large-block performance in cycles per byte processed with
cfec1a
+# 128-bit key (less is better).
cfec1a
+#
cfec1a
+#		CBC en-/decrypt	CTR	XTS
cfec1a
+# POWER8[le]	3.96/0.72	0.74	1.1
cfec1a
+# POWER8[be]	3.75/0.65	0.66	1.0
cfec1a
 
cfec1a
 $flavour = shift;
cfec1a
 
cfec1a
@@ -486,6 +499,8 @@ $code.=<<___;
cfec1a
 .globl	.${prefix}_cbc_encrypt
cfec1a
 .align	5
cfec1a
 .${prefix}_cbc_encrypt:
cfec1a
+	li		$idx,15
cfec1a
+	andc		$len,$len,$idx
cfec1a
 	${UCMP}i	$len,16
cfec1a
 	bltlr-
cfec1a
 
cfec1a
@@ -494,7 +509,6 @@ $code.=<<___;
cfec1a
 	mfspr		$vrsave,256
cfec1a
 	mtspr		256,r0
cfec1a
 
cfec1a
-	li		$idx,15
cfec1a
 	vxor		$rndkey0,$rndkey0,$rndkey0
cfec1a
 	le?vspltisb	$tmp,0x0f
cfec1a
 
cfec1a
@@ -1887,6 +1901,1849 @@ Lctr32_enc8x_done:
cfec1a
 ___
cfec1a
 }}	}}}
cfec1a
 
cfec1a
+#########################################################################
cfec1a
+{{{	# XTS procedures						#
cfec1a
+# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len,	#
cfec1a
+#                             const AES_KEY *key1, const AES_KEY *key2,	#
cfec1a
+#                             [const] unsigned char iv[16]);		#
cfec1a
+# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which	#
cfec1a
+# input tweak value is assumed to be encrypted already, and last tweak	#
cfec1a
+# value, one suitable for consecutive call on same chunk of data, is	#
cfec1a
+# written back to original buffer. In addition, in "tweak chaining"	#
cfec1a
+# mode only complete input blocks are processed.			#
cfec1a
+
cfec1a
+my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) =	map("r$_",(3..10));	# r3..r10, in this order
cfec1a
+my ($rndkey0,$rndkey1,$inout) =				map("v$_",(0..2));	# v0..v2
cfec1a
+my ($output,$inptail,$inpperm,$leperm,$keyperm) =	map("v$_",(3..7));	# v3..v7
cfec1a
+my ($tweak,$seven,$eighty7,$tmp,$tweak1) =		map("v$_",(8..12));	# v8..v12
cfec1a
+my $taillen = $key2;					# same register as $key2 (r7)
cfec1a
+
cfec1a
+   ($inp,$idx) = ($idx,$inp);				# reassign: $inp is now r10, $idx is r3
cfec1a
+
cfec1a
+$code.=<<___;
cfec1a
+.globl	.${prefix}_xts_encrypt
cfec1a
+.align	5
cfec1a
+.${prefix}_xts_encrypt:
cfec1a
+	mr		$inp,r3				# reassign
cfec1a
+	li		r3,-1				# preset return value for the early exit
cfec1a
+	${UCMP}i	$len,16
cfec1a
+	bltlr-						# fewer than 16 bytes: return -1
cfec1a
+
cfec1a
+	lis		r0,0xfff0			# new vrsave value, upper halfword 0xfff0
cfec1a
+	mfspr		r12,256				# save vrsave
cfec1a
+	li		r11,0
cfec1a
+	mtspr		256,r0				# install the new vrsave value
cfec1a
+
cfec1a
+	vspltisb	$seven,0x07			# 0x070707..07
cfec1a
+	le?lvsl		$leperm,r11,r11			# 0x000102..0f, r11 is zero
cfec1a
+	le?vspltisb	$tmp,0x0f
cfec1a
+	le?vxor		$leperm,$leperm,$seven		# 0x0706..00,0x0f0e..08
cfec1a
+
cfec1a
+	li		$idx,15
cfec1a
+	lvx		$tweak,0,$ivp			# load [unaligned] iv
cfec1a
+	lvsl		$inpperm,0,$ivp
cfec1a
+	lvx		$inptail,$idx,$ivp
cfec1a
+	le?vxor		$inpperm,$inpperm,$tmp
cfec1a
+	vperm		$tweak,$tweak,$inptail,$inpperm	# merge both aligned loads into the iv
cfec1a
+
cfec1a
+	neg		r11,$inp
cfec1a
+	lvsr		$inpperm,0,r11			# prepare for unaligned load
cfec1a
+	lvx		$inout,0,$inp
cfec1a
+	addi		$inp,$inp,15			# 15 is not typo
cfec1a
+	le?vxor		$inpperm,$inpperm,$tmp
cfec1a
+
cfec1a
+	${UCMP}i	$key2,0				# key2==NULL?
cfec1a
+	beq		Lxts_enc_no_key2
cfec1a
+
cfec1a
+	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
cfec1a
+	lwz		$rounds,240($key2)		# word at offset 240 of key2 is used as round count
cfec1a
+	srwi		$rounds,$rounds,1		# the loop below does two rounds per iteration
cfec1a
+	subi		$rounds,$rounds,1		# last two rounds are done after the loop
cfec1a
+	li		$idx,16
cfec1a
+
cfec1a
+	lvx		$rndkey0,0,$key2
cfec1a
+	lvx		$rndkey1,$idx,$key2
cfec1a
+	addi		$idx,$idx,16
cfec1a
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
cfec1a
+	vxor		$tweak,$tweak,$rndkey0		# xor iv with the first round key
cfec1a
+	lvx		$rndkey0,$idx,$key2
cfec1a
+	addi		$idx,$idx,16
cfec1a
+	mtctr		$rounds
cfec1a
+
cfec1a
+Ltweak_xts_enc:
cfec1a
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
cfec1a
+	vcipher		$tweak,$tweak,$rndkey1		# encrypt the iv with key2
cfec1a
+	lvx		$rndkey1,$idx,$key2
cfec1a
+	addi		$idx,$idx,16
cfec1a
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
cfec1a
+	vcipher		$tweak,$tweak,$rndkey0
cfec1a
+	lvx		$rndkey0,$idx,$key2
cfec1a
+	addi		$idx,$idx,16
cfec1a
+	bdnz		Ltweak_xts_enc
cfec1a
+
cfec1a
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
cfec1a
+	vcipher		$tweak,$tweak,$rndkey1
cfec1a
+	lvx		$rndkey1,$idx,$key2
cfec1a
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
cfec1a
+	vcipherlast	$tweak,$tweak,$rndkey0		# $tweak is the iv encrypted with key2
cfec1a
+
cfec1a
+	li		$ivp,0				# don't chain the tweak
cfec1a
+	b		Lxts_enc
cfec1a
+
cfec1a
+Lxts_enc_no_key2:
cfec1a
+	li		$idx,-16			# mask that clears the low four bits
cfec1a
+	and		$len,$len,$idx			# in "tweak chaining"
cfec1a
+							# mode only complete
cfec1a
+							# blocks are processed
cfec1a
+Lxts_enc:
cfec1a
+	lvx		$inptail,0,$inp
cfec1a
+	addi		$inp,$inp,16
cfec1a
+
cfec1a
+	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
cfec1a
+	lwz		$rounds,240($key1)		# word at offset 240 of key1 is used as round count
cfec1a
+	srwi		$rounds,$rounds,1		# the loop does two rounds per iteration
cfec1a
+	subi		$rounds,$rounds,1		# last two rounds are done after the loop
cfec1a
+	li		$idx,16
cfec1a
+
cfec1a
+	vslb		$eighty7,$seven,$seven		# 0x808080..80
cfec1a
+	vor		$eighty7,$eighty7,$seven	# 0x878787..87
cfec1a
+	vspltisb	$tmp,1				# 0x010101..01
cfec1a
+	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
cfec1a
+
cfec1a
+	${UCMP}i	$len,96
cfec1a
+	bge		_aesp8_xts_encrypt6x		# 96 bytes or more go to the 6x routine
cfec1a
+
cfec1a
+	andi.		$taillen,$len,15		# len%16
cfec1a
+	subic		r0,$len,32			# carry is clear only if len<32
cfec1a
+	subi		$taillen,$taillen,16		# len%16 - 16, always negative
cfec1a
+	subfe		r0,r0,r0			# r0 = -1 if len<32, else 0
cfec1a
+	and		r0,r0,$taillen			# r0 = $taillen if len<32, else 0
cfec1a
+	add		$inp,$inp,r0			# pull $inp back by 16-len%16 if len<32
cfec1a
+
cfec1a
+	lvx		$rndkey0,0,$key1
cfec1a
+	lvx		$rndkey1,$idx,$key1
cfec1a
+	addi		$idx,$idx,16
cfec1a
+	vperm		$inout,$inout,$inptail,$inpperm	# assemble the first input block
cfec1a
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
cfec1a
+	vxor		$inout,$inout,$tweak		# xor input with the tweak
cfec1a
+	vxor		$inout,$inout,$rndkey0		# xor with the first round key
cfec1a
+	lvx		$rndkey0,$idx,$key1
cfec1a
+	addi		$idx,$idx,16
cfec1a
+	mtctr		$rounds
cfec1a
+	b		Loop_xts_enc
cfec1a
+
cfec1a
+.align	5
cfec1a
+Loop_xts_enc:
cfec1a
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
cfec1a
+	vcipher		$inout,$inout,$rndkey1		# two rounds per iteration
cfec1a
+	lvx		$rndkey1,$idx,$key1
cfec1a
+	addi		$idx,$idx,16
cfec1a
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
cfec1a
+	vcipher		$inout,$inout,$rndkey0
cfec1a
+	lvx		$rndkey0,$idx,$key1
cfec1a
+	addi		$idx,$idx,16
cfec1a
+	bdnz		Loop_xts_enc
cfec1a
+
cfec1a
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
cfec1a
+	vcipher		$inout,$inout,$rndkey1
cfec1a
+	lvx		$rndkey1,$idx,$key1
cfec1a
+	li		$idx,16				# rewind key offset for the next block
cfec1a
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
cfec1a
+	vxor		$rndkey0,$rndkey0,$tweak	# fold the tweak into the last round key
cfec1a
+	vcipherlast	$output,$inout,$rndkey0		# last round, result already xored with tweak
cfec1a
+
cfec1a
+	le?vperm	$tmp,$output,$output,$leperm
cfec1a
+	be?nop
cfec1a
+	le?stvx_u	$tmp,0,$out
cfec1a
+	be?stvx_u	$output,0,$out
cfec1a
+	addi		$out,$out,16
cfec1a
+
cfec1a
+	subic.		$len,$len,16			# one block done
cfec1a
+	beq		Lxts_enc_done			# nothing left
cfec1a
+
cfec1a
+	vmr		$inout,$inptail
cfec1a
+	lvx		$inptail,0,$inp
cfec1a
+	addi		$inp,$inp,16
cfec1a
+	lvx		$rndkey0,0,$key1
cfec1a
+	lvx		$rndkey1,$idx,$key1
cfec1a
+	addi		$idx,$idx,16
cfec1a
+
cfec1a
+	subic		r0,$len,32			# carry is clear only if len<32
cfec1a
+	subfe		r0,r0,r0			# r0 = -1 if len<32, else 0
cfec1a
+	and		r0,r0,$taillen			# r0 = $taillen if len<32, else 0
cfec1a
+	add		$inp,$inp,r0			# adjust $inp by r0
cfec1a
+
cfec1a
+	vsrab		$tmp,$tweak,$seven		# next tweak value
cfec1a
+	vaddubm		$tweak,$tweak,$tweak		# double every byte (shift left by one bit)
cfec1a
+	vsldoi		$tmp,$tmp,$tmp,15		# rotate the per-byte sign masks by one byte
cfec1a
+	vand		$tmp,$tmp,$eighty7		# keep 0x01 carries and the 0x87 feedback
cfec1a
+	vxor		$tweak,$tweak,$tmp		# merge carries into the doubled tweak
cfec1a
+
cfec1a
+	vperm		$inout,$inout,$inptail,$inpperm	# assemble the next input block
cfec1a
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
cfec1a
+	vxor		$inout,$inout,$tweak		# xor input with the tweak
cfec1a
+	vxor		$output,$output,$rndkey0	# just in case $len<16
cfec1a
+	vxor		$inout,$inout,$rndkey0		# xor with the first round key
cfec1a
+	lvx		$rndkey0,$idx,$key1
cfec1a
+	addi		$idx,$idx,16
cfec1a
+
cfec1a
+	mtctr		$rounds
cfec1a
+	${UCMP}i	$len,16
cfec1a
+	bge		Loop_xts_enc			# a full block remains
cfec1a
+
cfec1a
+	vxor		$output,$output,$tweak		# $output now carries tweak and first round key, like $inout
cfec1a
+	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
cfec1a
+	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
cfec1a
+	vspltisb	$tmp,-1				# 0xffffff..ff
cfec1a
+	vperm		$inptail,$inptail,$tmp,$inpperm	# mask: 0x00 in the first $len bytes, 0xff after
cfec1a
+	vsel		$inout,$inout,$output,$inptail	# $output bytes where mask is set, $inout elsewhere
cfec1a
+
cfec1a
+	subi		r11,$out,17			# one byte before the block just stored
cfec1a
+	subi		$out,$out,16			# rewind $out to the block just stored
cfec1a
+	mtctr		$len				# number of tail bytes to copy
cfec1a
+	li		$len,16				# exactly one more block to process
cfec1a
+Loop_xts_enc_steal:
cfec1a
+	lbzu		r0,1(r11)			# next byte of the block just stored
cfec1a
+	stb		r0,16(r11)			# copy it 16 bytes further into the output
cfec1a
+	bdnz		Loop_xts_enc_steal
cfec1a
+
cfec1a
+	mtctr		$rounds
cfec1a
+	b		Loop_xts_enc			# one more time...
cfec1a
+
cfec1a
+Lxts_enc_done:
cfec1a
+	${UCMP}i	$ivp,0				# $ivp was zeroed if key2 was supplied
cfec1a
+	beq		Lxts_enc_ret			# no tweak write-back in that case
cfec1a
+
cfec1a
+	vsrab		$tmp,$tweak,$seven		# next tweak value
cfec1a
+	vaddubm		$tweak,$tweak,$tweak		# double every byte (shift left by one bit)
cfec1a
+	vsldoi		$tmp,$tmp,$tmp,15		# rotate the per-byte sign masks by one byte
cfec1a
+	vand		$tmp,$tmp,$eighty7		# keep 0x01 carries and the 0x87 feedback
cfec1a
+	vxor		$tweak,$tweak,$tmp		# merge carries into the doubled tweak
cfec1a
+
cfec1a
+	le?vperm	$tweak,$tweak,$tweak,$leperm
cfec1a
+	stvx_u		$tweak,0,$ivp			# write the next tweak back to the iv buffer
cfec1a
+
cfec1a
+Lxts_enc_ret:
cfec1a
+	mtspr		256,r12				# restore vrsave
cfec1a
+	li		r3,0				# return 0
cfec1a
+	blr
cfec1a
+	.long		0
cfec1a
+	.byte		0,12,0x04,0,0x80,6,6,0
cfec1a
+	.long		0
cfec1a
+.size	.${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
cfec1a
+
cfec1a
+.globl	.${prefix}_xts_decrypt
cfec1a
+.align	5
cfec1a
+.${prefix}_xts_decrypt:
cfec1a
+	mr		$inp,r3				# reassign
cfec1a
+	li		r3,-1				# preset return value for the early exit
cfec1a
+	${UCMP}i	$len,16
cfec1a
+	bltlr-						# fewer than 16 bytes: return -1
cfec1a
+
cfec1a
+	lis		r0,0xfff8			# new vrsave value, upper halfword 0xfff8
cfec1a
+	mfspr		r12,256				# save vrsave
cfec1a
+	li		r11,0
cfec1a
+	mtspr		256,r0				# install the new vrsave value
cfec1a
+
cfec1a
+	andi.		r0,$len,15			# len%16
cfec1a
+	neg		r0,r0
cfec1a
+	andi.		r0,r0,16			# r0 = 16 if len%16 is non-zero, else 0
cfec1a
+	sub		$len,$len,r0			# hold back one full block if there is a partial tail
cfec1a
+
cfec1a
+	vspltisb	$seven,0x07			# 0x070707..07
cfec1a
+	le?lvsl		$leperm,r11,r11			# 0x000102..0f, r11 is zero
cfec1a
+	le?vspltisb	$tmp,0x0f
cfec1a
+	le?vxor		$leperm,$leperm,$seven		# 0x0706..00,0x0f0e..08
cfec1a
+
cfec1a
+	li		$idx,15
cfec1a
+	lvx		$tweak,0,$ivp			# load [unaligned] iv
cfec1a
+	lvsl		$inpperm,0,$ivp
cfec1a
+	lvx		$inptail,$idx,$ivp
cfec1a
+	le?vxor		$inpperm,$inpperm,$tmp
cfec1a
+	vperm		$tweak,$tweak,$inptail,$inpperm	# merge both aligned loads into the iv
cfec1a
+
cfec1a
+	neg		r11,$inp
cfec1a
+	lvsr		$inpperm,0,r11			# prepare for unaligned load
cfec1a
+	lvx		$inout,0,$inp
cfec1a
+	addi		$inp,$inp,15			# 15 is not typo
cfec1a
+	le?vxor		$inpperm,$inpperm,$tmp
cfec1a
+
cfec1a
+	${UCMP}i	$key2,0				# key2==NULL?
cfec1a
+	beq		Lxts_dec_no_key2
cfec1a
+
cfec1a
+	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
cfec1a
+	lwz		$rounds,240($key2)		# word at offset 240 of key2 is used as round count
cfec1a
+	srwi		$rounds,$rounds,1		# the loop below does two rounds per iteration
cfec1a
+	subi		$rounds,$rounds,1		# last two rounds are done after the loop
cfec1a
+	li		$idx,16
cfec1a
+
cfec1a
+	lvx		$rndkey0,0,$key2
cfec1a
+	lvx		$rndkey1,$idx,$key2
cfec1a
+	addi		$idx,$idx,16
cfec1a
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
cfec1a
+	vxor		$tweak,$tweak,$rndkey0		# xor iv with the first round key
cfec1a
+	lvx		$rndkey0,$idx,$key2
cfec1a
+	addi		$idx,$idx,16
cfec1a
+	mtctr		$rounds
cfec1a
+
cfec1a
+Ltweak_xts_dec:
cfec1a
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
cfec1a
+	vcipher		$tweak,$tweak,$rndkey1		# the tweak is encrypted even when decrypting
cfec1a
+	lvx		$rndkey1,$idx,$key2
cfec1a
+	addi		$idx,$idx,16
cfec1a
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
cfec1a
+	vcipher		$tweak,$tweak,$rndkey0
cfec1a
+	lvx		$rndkey0,$idx,$key2
cfec1a
+	addi		$idx,$idx,16
cfec1a
+	bdnz		Ltweak_xts_dec
cfec1a
+
cfec1a
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
cfec1a
+	vcipher		$tweak,$tweak,$rndkey1
cfec1a
+	lvx		$rndkey1,$idx,$key2
cfec1a
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
cfec1a
+	vcipherlast	$tweak,$tweak,$rndkey0		# $tweak is the iv encrypted with key2
cfec1a
+
cfec1a
+	li		$ivp,0				# don't chain the tweak
cfec1a
+	b		Lxts_dec
cfec1a
+
cfec1a
+Lxts_dec_no_key2:
cfec1a
+	neg		$idx,$len
cfec1a
+	andi.		$idx,$idx,15			# distance from $len up to the next multiple of 16
cfec1a
+	add		$len,$len,$idx			# in "tweak chaining"
cfec1a
+							# mode only complete
cfec1a
+							# blocks are processed
cfec1a
+Lxts_dec:
cfec1a
+	lvx		$inptail,0,$inp
cfec1a
+	addi		$inp,$inp,16
cfec1a
+
cfec1a
+	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
cfec1a
+	lwz		$rounds,240($key1)		# word at offset 240 of key1 is used as round count
cfec1a
+	srwi		$rounds,$rounds,1		# the loops do two rounds per iteration
cfec1a
+	subi		$rounds,$rounds,1		# last two rounds are done after the loop
cfec1a
+	li		$idx,16
cfec1a
+
cfec1a
+	vslb		$eighty7,$seven,$seven		# 0x808080..80
cfec1a
+	vor		$eighty7,$eighty7,$seven	# 0x878787..87
cfec1a
+	vspltisb	$tmp,1				# 0x010101..01
cfec1a
+	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
cfec1a
+
cfec1a
+	${UCMP}i	$len,96
cfec1a
+	bge		_aesp8_xts_decrypt6x		# 96 bytes or more go to the 6x routine
cfec1a
+
cfec1a
+	lvx		$rndkey0,0,$key1
cfec1a
+	lvx		$rndkey1,$idx,$key1
cfec1a
+	addi		$idx,$idx,16
cfec1a
+	vperm		$inout,$inout,$inptail,$inpperm	# assemble the first input block
cfec1a
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
cfec1a
+	vxor		$inout,$inout,$tweak		# xor input with the tweak
cfec1a
+	vxor		$inout,$inout,$rndkey0		# xor with the first round key
cfec1a
+	lvx		$rndkey0,$idx,$key1
cfec1a
+	addi		$idx,$idx,16
cfec1a
+	mtctr		$rounds
cfec1a
+
cfec1a
+	${UCMP}i	$len,16
cfec1a
+	blt		Ltail_xts_dec			# less than 16 left: go straight to tail handling
cfec1a
+	be?b		Loop_xts_dec
cfec1a
+
cfec1a
+.align	5
cfec1a
+Loop_xts_dec:
cfec1a
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
cfec1a
+	vncipher	$inout,$inout,$rndkey1		# two rounds per iteration
cfec1a
+	lvx		$rndkey1,$idx,$key1
cfec1a
+	addi		$idx,$idx,16
cfec1a
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
cfec1a
+	vncipher	$inout,$inout,$rndkey0
cfec1a
+	lvx		$rndkey0,$idx,$key1
cfec1a
+	addi		$idx,$idx,16
cfec1a
+	bdnz		Loop_xts_dec
cfec1a
+
cfec1a
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
cfec1a
+	vncipher	$inout,$inout,$rndkey1
cfec1a
+	lvx		$rndkey1,$idx,$key1
cfec1a
+	li		$idx,16				# rewind key offset for the next block
cfec1a
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
cfec1a
+	vxor		$rndkey0,$rndkey0,$tweak	# fold the tweak into the last round key
cfec1a
+	vncipherlast	$output,$inout,$rndkey0		# last round, result already xored with tweak
cfec1a
+
cfec1a
+	le?vperm	$tmp,$output,$output,$leperm
cfec1a
+	be?nop
cfec1a
+	le?stvx_u	$tmp,0,$out
cfec1a
+	be?stvx_u	$output,0,$out
cfec1a
+	addi		$out,$out,16
cfec1a
+
cfec1a
+	subic.		$len,$len,16			# one block done
cfec1a
+	beq		Lxts_dec_done			# nothing left
cfec1a
+
cfec1a
+	vmr		$inout,$inptail
cfec1a
+	lvx		$inptail,0,$inp
cfec1a
+	addi		$inp,$inp,16
cfec1a
+	lvx		$rndkey0,0,$key1
cfec1a
+	lvx		$rndkey1,$idx,$key1
cfec1a
+	addi		$idx,$idx,16
cfec1a
+
cfec1a
+	vsrab		$tmp,$tweak,$seven		# next tweak value
cfec1a
+	vaddubm		$tweak,$tweak,$tweak		# double every byte (shift left by one bit)
cfec1a
+	vsldoi		$tmp,$tmp,$tmp,15		# rotate the per-byte sign masks by one byte
cfec1a
+	vand		$tmp,$tmp,$eighty7		# keep 0x01 carries and the 0x87 feedback
cfec1a
+	vxor		$tweak,$tweak,$tmp		# merge carries into the doubled tweak
cfec1a
+
cfec1a
+	vperm		$inout,$inout,$inptail,$inpperm	# assemble the next input block
cfec1a
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
cfec1a
+	vxor		$inout,$inout,$tweak		# xor input with the tweak
cfec1a
+	vxor		$inout,$inout,$rndkey0		# xor with the first round key
cfec1a
+	lvx		$rndkey0,$idx,$key1
cfec1a
+	addi		$idx,$idx,16
cfec1a
+
cfec1a
+	mtctr		$rounds
cfec1a
+	${UCMP}i	$len,16
cfec1a
+	bge		Loop_xts_dec			# a full block remains
cfec1a
+
cfec1a
+Ltail_xts_dec:
cfec1a
+	vsrab		$tmp,$tweak,$seven		# next tweak value
cfec1a
+	vaddubm		$tweak1,$tweak,$tweak		# result goes to $tweak1, $tweak is preserved
cfec1a
+	vsldoi		$tmp,$tmp,$tmp,15		# rotate the per-byte sign masks by one byte
cfec1a
+	vand		$tmp,$tmp,$eighty7		# keep 0x01 carries and the 0x87 feedback
cfec1a
+	vxor		$tweak1,$tweak1,$tmp		# $tweak1 is the tweak following $tweak
cfec1a
+
cfec1a
+	subi		$inp,$inp,16
cfec1a
+	add		$inp,$inp,$len			# $inp moves by len-16 in total
cfec1a
+
cfec1a
+	vxor		$inout,$inout,$tweak		# :-( undo the earlier xor with $tweak
cfec1a
+	vxor		$inout,$inout,$tweak1		# :-) and use $tweak1 for this block
cfec1a
+
cfec1a
+Loop_xts_dec_short:
cfec1a
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
cfec1a
+	vncipher	$inout,$inout,$rndkey1		# two rounds per iteration
cfec1a
+	lvx		$rndkey1,$idx,$key1
cfec1a
+	addi		$idx,$idx,16
cfec1a
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
cfec1a
+	vncipher	$inout,$inout,$rndkey0
cfec1a
+	lvx		$rndkey0,$idx,$key1
cfec1a
+	addi		$idx,$idx,16
cfec1a
+	bdnz		Loop_xts_dec_short
cfec1a
+
cfec1a
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
cfec1a
+	vncipher	$inout,$inout,$rndkey1
cfec1a
+	lvx		$rndkey1,$idx,$key1
cfec1a
+	li		$idx,16				# rewind key offset for the final block
cfec1a
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
cfec1a
+	vxor		$rndkey0,$rndkey0,$tweak1	# fold $tweak1 into the last round key
cfec1a
+	vncipherlast	$output,$inout,$rndkey0
cfec1a
+
cfec1a
+	le?vperm	$tmp,$output,$output,$leperm
cfec1a
+	be?nop
cfec1a
+	le?stvx_u	$tmp,0,$out
cfec1a
+	be?stvx_u	$output,0,$out
cfec1a
+
cfec1a
+	vmr		$inout,$inptail
cfec1a
+	lvx		$inptail,0,$inp
cfec1a
+	#addi		$inp,$inp,16
cfec1a
+	lvx		$rndkey0,0,$key1
cfec1a
+	lvx		$rndkey1,$idx,$key1
cfec1a
+	addi		$idx,$idx,16
cfec1a
+	vperm		$inout,$inout,$inptail,$inpperm
cfec1a
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
cfec1a
+
cfec1a
+	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
cfec1a
+	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
cfec1a
+	vspltisb	$tmp,-1				# 0xffffff..ff
cfec1a
+	vperm		$inptail,$inptail,$tmp,$inpperm	# mask: 0x00 in the first $len bytes, 0xff after
cfec1a
+	vsel		$inout,$inout,$output,$inptail	# $output bytes where mask is set, $inout elsewhere
cfec1a
+
cfec1a
+	vxor		$rndkey0,$rndkey0,$tweak	# the final block uses $tweak, not $tweak1
cfec1a
+	vxor		$inout,$inout,$rndkey0		# xor with tweak and first round key
cfec1a
+	lvx		$rndkey0,$idx,$key1
cfec1a
+	addi		$idx,$idx,16
cfec1a
+
cfec1a
+	subi		r11,$out,1			# one byte before the block just stored
cfec1a
+	mtctr		$len				# number of tail bytes to copy
cfec1a
+	li		$len,16				# exactly one more block to process
cfec1a
+Loop_xts_dec_steal:
cfec1a
+	lbzu		r0,1(r11)			# next byte of the block just stored
cfec1a
+	stb		r0,16(r11)			# copy it 16 bytes further into the output
cfec1a
+	bdnz		Loop_xts_dec_steal
cfec1a
+
cfec1a
+	mtctr		$rounds
cfec1a
+	b		Loop_xts_dec			# one more time...
cfec1a
+
cfec1a
+Lxts_dec_done:
cfec1a
+	${UCMP}i	$ivp,0				# $ivp was zeroed if key2 was supplied
cfec1a
+	beq		Lxts_dec_ret			# no tweak write-back in that case
cfec1a
+
cfec1a
+	vsrab		$tmp,$tweak,$seven		# next tweak value
cfec1a
+	vaddubm		$tweak,$tweak,$tweak		# double every byte (shift left by one bit)
cfec1a
+	vsldoi		$tmp,$tmp,$tmp,15		# rotate the per-byte sign masks by one byte
cfec1a
+	vand		$tmp,$tmp,$eighty7		# keep 0x01 carries and the 0x87 feedback
cfec1a
+	vxor		$tweak,$tweak,$tmp		# merge carries into the doubled tweak
cfec1a
+
cfec1a
+	le?vperm	$tweak,$tweak,$tweak,$leperm
cfec1a
+	stvx_u		$tweak,0,$ivp			# write the next tweak back to the iv buffer
cfec1a
+
cfec1a
+Lxts_dec_ret:
cfec1a
+	mtspr		256,r12				# restore vrsave
cfec1a
+	li		r3,0				# return 0
cfec1a
+	blr
cfec1a
+	.long		0
cfec1a
+	.byte		0,12,0x04,0,0x80,6,6,0
cfec1a
+	.long		0
cfec1a
+.size	.${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
cfec1a
+___
cfec1a
+#########################################################################
cfec1a
+{{	# Optimized XTS procedures					#
cfec1a
+my $key_=$key2;
cfec1a
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
cfec1a
+    $x00=0 if ($flavour =~ /osx/);
cfec1a
+my ($in0,  $in1,  $in2,  $in3,  $in4,  $in5 )=map("v$_",(0..5));
cfec1a
+my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
cfec1a
+my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
cfec1a
+my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
cfec1a
+			# v26-v31 last 6 round keys
cfec1a
+my ($keyperm)=($out0);	# aliases with "caller", redundant assignment
cfec1a
+my $taillen=$x70;
cfec1a
+
cfec1a
+$code.=<<___;
cfec1a
+.align	5
cfec1a
+_aesp8_xts_encrypt6x:
cfec1a
+	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
cfec1a
+	mflr		r11
cfec1a
+	li		r7,`$FRAME+8*16+15`
cfec1a
+	li		r3,`$FRAME+8*16+31`
cfec1a
+	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
cfec1a
+	stvx		v20,r7,$sp		# ABI says so
cfec1a
+	addi		r7,r7,32
cfec1a
+	stvx		v21,r3,$sp
cfec1a
+	addi		r3,r3,32
cfec1a
+	stvx		v22,r7,$sp
cfec1a
+	addi		r7,r7,32
cfec1a
+	stvx		v23,r3,$sp
cfec1a
+	addi		r3,r3,32
cfec1a
+	stvx		v24,r7,$sp
cfec1a
+	addi		r7,r7,32
cfec1a
+	stvx		v25,r3,$sp
cfec1a
+	addi		r3,r3,32
cfec1a
+	stvx		v26,r7,$sp
cfec1a
+	addi		r7,r7,32
cfec1a
+	stvx		v27,r3,$sp
cfec1a
+	addi		r3,r3,32
cfec1a
+	stvx		v28,r7,$sp
cfec1a
+	addi		r7,r7,32
cfec1a
+	stvx		v29,r3,$sp
cfec1a
+	addi		r3,r3,32
cfec1a
+	stvx		v30,r7,$sp
cfec1a
+	stvx		v31,r3,$sp
cfec1a
+	li		r0,-1
cfec1a
+	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
cfec1a
+	li		$x10,0x10
cfec1a
+	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
cfec1a
+	li		$x20,0x20
cfec1a
+	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
cfec1a
+	li		$x30,0x30
cfec1a
+	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
cfec1a
+	li		$x40,0x40
cfec1a
+	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
cfec1a
+	li		$x50,0x50
cfec1a
+	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
cfec1a
+	li		$x60,0x60
cfec1a
+	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
cfec1a
+	li		$x70,0x70
cfec1a
+	mtspr		256,r0
cfec1a
+
cfec1a
+	subi		$rounds,$rounds,3	# -4 in total
cfec1a
+
cfec1a
+	lvx		$rndkey0,$x00,$key1	# load key schedule
cfec1a
+	lvx		v30,$x10,$key1
cfec1a
+	addi		$key1,$key1,0x20
cfec1a
+	lvx		v31,$x00,$key1
cfec1a
+	?vperm		$rndkey0,$rndkey0,v30,$keyperm
cfec1a
+	addi		$key_,$sp,$FRAME+15
cfec1a
+	mtctr		$rounds
cfec1a
+
cfec1a
+Load_xts_enc_key:
cfec1a
+	?vperm		v24,v30,v31,$keyperm
cfec1a
+	lvx		v30,$x10,$key1
cfec1a
+	addi		$key1,$key1,0x20
cfec1a
+	stvx		v24,$x00,$key_		# off-load round[1]
cfec1a
+	?vperm		v25,v31,v30,$keyperm
cfec1a
+	lvx		v31,$x00,$key1
cfec1a
+	stvx		v25,$x10,$key_		# off-load round[2]
cfec1a
+	addi		$key_,$key_,0x20
cfec1a
+	bdnz		Load_xts_enc_key
cfec1a
+
cfec1a
+	lvx		v26,$x10,$key1
cfec1a
+	?vperm		v24,v30,v31,$keyperm
cfec1a
+	lvx		v27,$x20,$key1
cfec1a
+	stvx		v24,$x00,$key_		# off-load round[3]
cfec1a
+	?vperm		v25,v31,v26,$keyperm
cfec1a
+	lvx		v28,$x30,$key1
cfec1a
+	stvx		v25,$x10,$key_		# off-load round[4]
cfec1a
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
cfec1a
+	?vperm		v26,v26,v27,$keyperm
cfec1a
+	lvx		v29,$x40,$key1
cfec1a
+	?vperm		v27,v27,v28,$keyperm
cfec1a
+	lvx		v30,$x50,$key1
cfec1a
+	?vperm		v28,v28,v29,$keyperm
cfec1a
+	lvx		v31,$x60,$key1
cfec1a
+	?vperm		v29,v29,v30,$keyperm
cfec1a
+	lvx		$twk5,$x70,$key1	# borrow $twk5
cfec1a
+	?vperm		v30,v30,v31,$keyperm
cfec1a
+	lvx		v24,$x00,$key_		# pre-load round[1]
cfec1a
+	?vperm		v31,v31,$twk5,$keyperm
cfec1a
+	lvx		v25,$x10,$key_		# pre-load round[2]
cfec1a
+
cfec1a
+	 vperm		$in0,$inout,$inptail,$inpperm
cfec1a
+	 subi		$inp,$inp,31		# undo "caller"
cfec1a
+	vxor		$twk0,$tweak,$rndkey0
cfec1a
+	vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	vaddubm		$tweak,$tweak,$tweak
cfec1a
+	vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	vand		$tmp,$tmp,$eighty7
cfec1a
+	 vxor		$out0,$in0,$twk0
cfec1a
+	vxor		$tweak,$tweak,$tmp
cfec1a
+
cfec1a
+	 lvx_u		$in1,$x10,$inp
cfec1a
+	vxor		$twk1,$tweak,$rndkey0
cfec1a
+	vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	vaddubm		$tweak,$tweak,$tweak
cfec1a
+	vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	 le?vperm	$in1,$in1,$in1,$leperm
cfec1a
+	vand		$tmp,$tmp,$eighty7
cfec1a
+	 vxor		$out1,$in1,$twk1
cfec1a
+	vxor		$tweak,$tweak,$tmp
cfec1a
+
cfec1a
+	 lvx_u		$in2,$x20,$inp
cfec1a
+	 andi.		$taillen,$len,15
cfec1a
+	vxor		$twk2,$tweak,$rndkey0
cfec1a
+	vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	vaddubm		$tweak,$tweak,$tweak
cfec1a
+	vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	 le?vperm	$in2,$in2,$in2,$leperm
cfec1a
+	vand		$tmp,$tmp,$eighty7
cfec1a
+	 vxor		$out2,$in2,$twk2
cfec1a
+	vxor		$tweak,$tweak,$tmp
cfec1a
+
cfec1a
+	 lvx_u		$in3,$x30,$inp
cfec1a
+	 sub		$len,$len,$taillen
cfec1a
+	vxor		$twk3,$tweak,$rndkey0
cfec1a
+	vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	vaddubm		$tweak,$tweak,$tweak
cfec1a
+	vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	 le?vperm	$in3,$in3,$in3,$leperm
cfec1a
+	vand		$tmp,$tmp,$eighty7
cfec1a
+	 vxor		$out3,$in3,$twk3
cfec1a
+	vxor		$tweak,$tweak,$tmp
cfec1a
+
cfec1a
+	 lvx_u		$in4,$x40,$inp
cfec1a
+	 subi		$len,$len,0x60
cfec1a
+	vxor		$twk4,$tweak,$rndkey0
cfec1a
+	vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	vaddubm		$tweak,$tweak,$tweak
cfec1a
+	vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	 le?vperm	$in4,$in4,$in4,$leperm
cfec1a
+	vand		$tmp,$tmp,$eighty7
cfec1a
+	 vxor		$out4,$in4,$twk4
cfec1a
+	vxor		$tweak,$tweak,$tmp
cfec1a
+
cfec1a
+	 lvx_u		$in5,$x50,$inp
cfec1a
+	 addi		$inp,$inp,0x60
cfec1a
+	vxor		$twk5,$tweak,$rndkey0
cfec1a
+	vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	vaddubm		$tweak,$tweak,$tweak
cfec1a
+	vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	 le?vperm	$in5,$in5,$in5,$leperm
cfec1a
+	vand		$tmp,$tmp,$eighty7
cfec1a
+	 vxor		$out5,$in5,$twk5
cfec1a
+	vxor		$tweak,$tweak,$tmp
cfec1a
+
cfec1a
+	vxor		v31,v31,$rndkey0
cfec1a
+	mtctr		$rounds
cfec1a
+	b		Loop_xts_enc6x
cfec1a
+
cfec1a
+.align	5
cfec1a
+Loop_xts_enc6x:
cfec1a
+	vcipher		$out0,$out0,v24
cfec1a
+	vcipher		$out1,$out1,v24
cfec1a
+	vcipher		$out2,$out2,v24
cfec1a
+	vcipher		$out3,$out3,v24
cfec1a
+	vcipher		$out4,$out4,v24
cfec1a
+	vcipher		$out5,$out5,v24
cfec1a
+	lvx		v24,$x20,$key_		# round[3]
cfec1a
+	addi		$key_,$key_,0x20
cfec1a
+
cfec1a
+	vcipher		$out0,$out0,v25
cfec1a
+	vcipher		$out1,$out1,v25
cfec1a
+	vcipher		$out2,$out2,v25
cfec1a
+	vcipher		$out3,$out3,v25
cfec1a
+	vcipher		$out4,$out4,v25
cfec1a
+	vcipher		$out5,$out5,v25
cfec1a
+	lvx		v25,$x10,$key_		# round[4]
cfec1a
+	bdnz		Loop_xts_enc6x
cfec1a
+
cfec1a
+	subic		$len,$len,96		# $len-=96
cfec1a
+	 vxor		$in0,$twk0,v31		# xor with last round key
cfec1a
+	vcipher		$out0,$out0,v24
cfec1a
+	vcipher		$out1,$out1,v24
cfec1a
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	 vxor		$twk0,$tweak,$rndkey0
cfec1a
+	 vaddubm	$tweak,$tweak,$tweak
cfec1a
+	vcipher		$out2,$out2,v24
cfec1a
+	vcipher		$out3,$out3,v24
cfec1a
+	 vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	vcipher		$out4,$out4,v24
cfec1a
+	vcipher		$out5,$out5,v24
cfec1a
+
cfec1a
+	subfe.		r0,r0,r0		# borrow?-1:0
cfec1a
+	 vand		$tmp,$tmp,$eighty7
cfec1a
+	vcipher		$out0,$out0,v25
cfec1a
+	vcipher		$out1,$out1,v25
cfec1a
+	 vxor		$tweak,$tweak,$tmp
cfec1a
+	vcipher		$out2,$out2,v25
cfec1a
+	vcipher		$out3,$out3,v25
cfec1a
+	 vxor		$in1,$twk1,v31
cfec1a
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	 vxor		$twk1,$tweak,$rndkey0
cfec1a
+	vcipher		$out4,$out4,v25
cfec1a
+	vcipher		$out5,$out5,v25
cfec1a
+
cfec1a
+	and		r0,r0,$len
cfec1a
+	 vaddubm	$tweak,$tweak,$tweak
cfec1a
+	 vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	vcipher		$out0,$out0,v26
cfec1a
+	vcipher		$out1,$out1,v26
cfec1a
+	 vand		$tmp,$tmp,$eighty7
cfec1a
+	vcipher		$out2,$out2,v26
cfec1a
+	vcipher		$out3,$out3,v26
cfec1a
+	 vxor		$tweak,$tweak,$tmp
cfec1a
+	vcipher		$out4,$out4,v26
cfec1a
+	vcipher		$out5,$out5,v26
cfec1a
+
cfec1a
+	add		$inp,$inp,r0		# $inp is adjusted in such
cfec1a
+						# way that at exit from the
cfec1a
+						# loop inX-in5 are loaded
cfec1a
+						# with last "words"
cfec1a
+	 vxor		$in2,$twk2,v31
cfec1a
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	 vxor		$twk2,$tweak,$rndkey0
cfec1a
+	 vaddubm	$tweak,$tweak,$tweak
cfec1a
+	vcipher		$out0,$out0,v27
cfec1a
+	vcipher		$out1,$out1,v27
cfec1a
+	 vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	vcipher		$out2,$out2,v27
cfec1a
+	vcipher		$out3,$out3,v27
cfec1a
+	 vand		$tmp,$tmp,$eighty7
cfec1a
+	vcipher		$out4,$out4,v27
cfec1a
+	vcipher		$out5,$out5,v27
cfec1a
+
cfec1a
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
cfec1a
+	 vxor		$tweak,$tweak,$tmp
cfec1a
+	vcipher		$out0,$out0,v28
cfec1a
+	vcipher		$out1,$out1,v28
cfec1a
+	 vxor		$in3,$twk3,v31
cfec1a
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	 vxor		$twk3,$tweak,$rndkey0
cfec1a
+	vcipher		$out2,$out2,v28
cfec1a
+	vcipher		$out3,$out3,v28
cfec1a
+	 vaddubm	$tweak,$tweak,$tweak
cfec1a
+	 vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	vcipher		$out4,$out4,v28
cfec1a
+	vcipher		$out5,$out5,v28
cfec1a
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
cfec1a
+	 vand		$tmp,$tmp,$eighty7
cfec1a
+
cfec1a
+	vcipher		$out0,$out0,v29
cfec1a
+	vcipher		$out1,$out1,v29
cfec1a
+	 vxor		$tweak,$tweak,$tmp
cfec1a
+	vcipher		$out2,$out2,v29
cfec1a
+	vcipher		$out3,$out3,v29
cfec1a
+	 vxor		$in4,$twk4,v31
cfec1a
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	 vxor		$twk4,$tweak,$rndkey0
cfec1a
+	vcipher		$out4,$out4,v29
cfec1a
+	vcipher		$out5,$out5,v29
cfec1a
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
cfec1a
+	 vaddubm	$tweak,$tweak,$tweak
cfec1a
+	 vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+
cfec1a
+	vcipher		$out0,$out0,v30
cfec1a
+	vcipher		$out1,$out1,v30
cfec1a
+	 vand		$tmp,$tmp,$eighty7
cfec1a
+	vcipher		$out2,$out2,v30
cfec1a
+	vcipher		$out3,$out3,v30
cfec1a
+	 vxor		$tweak,$tweak,$tmp
cfec1a
+	vcipher		$out4,$out4,v30
cfec1a
+	vcipher		$out5,$out5,v30
cfec1a
+	 vxor		$in5,$twk5,v31
cfec1a
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	 vxor		$twk5,$tweak,$rndkey0
cfec1a
+
cfec1a
+	vcipherlast	$out0,$out0,$in0
cfec1a
+	 lvx_u		$in0,$x00,$inp		# load next input block
cfec1a
+	 vaddubm	$tweak,$tweak,$tweak
cfec1a
+	 vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	vcipherlast	$out1,$out1,$in1
cfec1a
+	 lvx_u		$in1,$x10,$inp
cfec1a
+	vcipherlast	$out2,$out2,$in2
cfec1a
+	 le?vperm	$in0,$in0,$in0,$leperm
cfec1a
+	 lvx_u		$in2,$x20,$inp
cfec1a
+	 vand		$tmp,$tmp,$eighty7
cfec1a
+	vcipherlast	$out3,$out3,$in3
cfec1a
+	 le?vperm	$in1,$in1,$in1,$leperm
cfec1a
+	 lvx_u		$in3,$x30,$inp
cfec1a
+	vcipherlast	$out4,$out4,$in4
cfec1a
+	 le?vperm	$in2,$in2,$in2,$leperm
cfec1a
+	 lvx_u		$in4,$x40,$inp
cfec1a
+	 vxor		$tweak,$tweak,$tmp
cfec1a
+	vcipherlast	$tmp,$out5,$in5		# last block might be needed
cfec1a
+						# in stealing mode
cfec1a
+	 le?vperm	$in3,$in3,$in3,$leperm
cfec1a
+	 lvx_u		$in5,$x50,$inp
cfec1a
+	 addi		$inp,$inp,0x60
cfec1a
+	 le?vperm	$in4,$in4,$in4,$leperm
cfec1a
+	 le?vperm	$in5,$in5,$in5,$leperm
cfec1a
+
cfec1a
+	le?vperm	$out0,$out0,$out0,$leperm
cfec1a
+	le?vperm	$out1,$out1,$out1,$leperm
cfec1a
+	stvx_u		$out0,$x00,$out		# store output
cfec1a
+	 vxor		$out0,$in0,$twk0
cfec1a
+	le?vperm	$out2,$out2,$out2,$leperm
cfec1a
+	stvx_u		$out1,$x10,$out
cfec1a
+	 vxor		$out1,$in1,$twk1
cfec1a
+	le?vperm	$out3,$out3,$out3,$leperm
cfec1a
+	stvx_u		$out2,$x20,$out
cfec1a
+	 vxor		$out2,$in2,$twk2
cfec1a
+	le?vperm	$out4,$out4,$out4,$leperm
cfec1a
+	stvx_u		$out3,$x30,$out
cfec1a
+	 vxor		$out3,$in3,$twk3
cfec1a
+	le?vperm	$out5,$tmp,$tmp,$leperm
cfec1a
+	stvx_u		$out4,$x40,$out
cfec1a
+	 vxor		$out4,$in4,$twk4
cfec1a
+	le?stvx_u	$out5,$x50,$out
cfec1a
+	be?stvx_u	$tmp, $x50,$out
cfec1a
+	 vxor		$out5,$in5,$twk5
cfec1a
+	addi		$out,$out,0x60
cfec1a
+
cfec1a
+	mtctr		$rounds
cfec1a
+	beq		Loop_xts_enc6x		# did $len-=96 borrow?
cfec1a
+
cfec1a
+	addic.		$len,$len,0x60
cfec1a
+	beq		Lxts_enc6x_zero
cfec1a
+	cmpwi		$len,0x20
cfec1a
+	blt		Lxts_enc6x_one
cfec1a
+	nop
cfec1a
+	beq		Lxts_enc6x_two
cfec1a
+	cmpwi		$len,0x40
cfec1a
+	blt		Lxts_enc6x_three
cfec1a
+	nop
cfec1a
+	beq		Lxts_enc6x_four
cfec1a
+
cfec1a
+Lxts_enc6x_five:
cfec1a
+	vxor		$out0,$in1,$twk0
cfec1a
+	vxor		$out1,$in2,$twk1
cfec1a
+	vxor		$out2,$in3,$twk2
cfec1a
+	vxor		$out3,$in4,$twk3
cfec1a
+	vxor		$out4,$in5,$twk4
cfec1a
+
cfec1a
+	bl		_aesp8_xts_enc5x
cfec1a
+
cfec1a
+	le?vperm	$out0,$out0,$out0,$leperm
cfec1a
+	vmr		$twk0,$twk5		# unused tweak
cfec1a
+	le?vperm	$out1,$out1,$out1,$leperm
cfec1a
+	stvx_u		$out0,$x00,$out		# store output
cfec1a
+	le?vperm	$out2,$out2,$out2,$leperm
cfec1a
+	stvx_u		$out1,$x10,$out
cfec1a
+	le?vperm	$out3,$out3,$out3,$leperm
cfec1a
+	stvx_u		$out2,$x20,$out
cfec1a
+	vxor		$tmp,$out4,$twk5	# last block prep for stealing
cfec1a
+	le?vperm	$out4,$out4,$out4,$leperm
cfec1a
+	stvx_u		$out3,$x30,$out
cfec1a
+	stvx_u		$out4,$x40,$out
cfec1a
+	addi		$out,$out,0x50
cfec1a
+	bne		Lxts_enc6x_steal
cfec1a
+	b		Lxts_enc6x_done
cfec1a
+
cfec1a
+.align	4
cfec1a
+Lxts_enc6x_four:
cfec1a
+	vxor		$out0,$in2,$twk0
cfec1a
+	vxor		$out1,$in3,$twk1
cfec1a
+	vxor		$out2,$in4,$twk2
cfec1a
+	vxor		$out3,$in5,$twk3
cfec1a
+	vxor		$out4,$out4,$out4
cfec1a
+
cfec1a
+	bl		_aesp8_xts_enc5x
cfec1a
+
cfec1a
+	le?vperm	$out0,$out0,$out0,$leperm
cfec1a
+	vmr		$twk0,$twk4		# unused tweak
cfec1a
+	le?vperm	$out1,$out1,$out1,$leperm
cfec1a
+	stvx_u		$out0,$x00,$out		# store output
cfec1a
+	le?vperm	$out2,$out2,$out2,$leperm
cfec1a
+	stvx_u		$out1,$x10,$out
cfec1a
+	vxor		$tmp,$out3,$twk4	# last block prep for stealing
cfec1a
+	le?vperm	$out3,$out3,$out3,$leperm
cfec1a
+	stvx_u		$out2,$x20,$out
cfec1a
+	stvx_u		$out3,$x30,$out
cfec1a
+	addi		$out,$out,0x40
cfec1a
+	bne		Lxts_enc6x_steal
cfec1a
+	b		Lxts_enc6x_done
cfec1a
+
cfec1a
+.align	4
cfec1a
+Lxts_enc6x_three:
cfec1a
+	vxor		$out0,$in3,$twk0
cfec1a
+	vxor		$out1,$in4,$twk1
cfec1a
+	vxor		$out2,$in5,$twk2
cfec1a
+	vxor		$out3,$out3,$out3
cfec1a
+	vxor		$out4,$out4,$out4
cfec1a
+
cfec1a
+	bl		_aesp8_xts_enc5x
cfec1a
+
cfec1a
+	le?vperm	$out0,$out0,$out0,$leperm
cfec1a
+	vmr		$twk0,$twk3		# unused tweak
cfec1a
+	le?vperm	$out1,$out1,$out1,$leperm
cfec1a
+	stvx_u		$out0,$x00,$out		# store output
cfec1a
+	vxor		$tmp,$out2,$twk3	# last block prep for stealing
cfec1a
+	le?vperm	$out2,$out2,$out2,$leperm
cfec1a
+	stvx_u		$out1,$x10,$out
cfec1a
+	stvx_u		$out2,$x20,$out
cfec1a
+	addi		$out,$out,0x30
cfec1a
+	bne		Lxts_enc6x_steal
cfec1a
+	b		Lxts_enc6x_done
cfec1a
+
cfec1a
+.align	4
cfec1a
+Lxts_enc6x_two:
cfec1a
+	vxor		$out0,$in4,$twk0
cfec1a
+	vxor		$out1,$in5,$twk1
cfec1a
+	vxor		$out2,$out2,$out2
cfec1a
+	vxor		$out3,$out3,$out3
cfec1a
+	vxor		$out4,$out4,$out4
cfec1a
+
cfec1a
+	bl		_aesp8_xts_enc5x
cfec1a
+
cfec1a
+	le?vperm	$out0,$out0,$out0,$leperm
cfec1a
+	vmr		$twk0,$twk2		# unused tweak
cfec1a
+	vxor		$tmp,$out1,$twk2	# last block prep for stealing
cfec1a
+	le?vperm	$out1,$out1,$out1,$leperm
cfec1a
+	stvx_u		$out0,$x00,$out		# store output
cfec1a
+	stvx_u		$out1,$x10,$out
cfec1a
+	addi		$out,$out,0x20
cfec1a
+	bne		Lxts_enc6x_steal
cfec1a
+	b		Lxts_enc6x_done
cfec1a
+
cfec1a
+.align	4
cfec1a
+Lxts_enc6x_one:
cfec1a
+	vxor		$out0,$in5,$twk0
cfec1a
+	nop
cfec1a
+Loop_xts_enc1x:
cfec1a
+	vcipher		$out0,$out0,v24
cfec1a
+	lvx		v24,$x20,$key_		# round[3]
cfec1a
+	addi		$key_,$key_,0x20
cfec1a
+
cfec1a
+	vcipher		$out0,$out0,v25
cfec1a
+	lvx		v25,$x10,$key_		# round[4]
cfec1a
+	bdnz		Loop_xts_enc1x
cfec1a
+
cfec1a
+	add		$inp,$inp,$taillen
cfec1a
+	cmpwi		$taillen,0
cfec1a
+	vcipher		$out0,$out0,v24
cfec1a
+
cfec1a
+	subi		$inp,$inp,16
cfec1a
+	vcipher		$out0,$out0,v25
cfec1a
+
cfec1a
+	lvsr		$inpperm,0,$taillen
cfec1a
+	vcipher		$out0,$out0,v26
cfec1a
+
cfec1a
+	lvx_u		$in0,0,$inp
cfec1a
+	vcipher		$out0,$out0,v27
cfec1a
+
cfec1a
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
cfec1a
+	vcipher		$out0,$out0,v28
cfec1a
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
cfec1a
+
cfec1a
+	vcipher		$out0,$out0,v29
cfec1a
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
cfec1a
+	 vxor		$twk0,$twk0,v31
cfec1a
+
cfec1a
+	le?vperm	$in0,$in0,$in0,$leperm
cfec1a
+	vcipher		$out0,$out0,v30
cfec1a
+
cfec1a
+	vperm		$in0,$in0,$in0,$inpperm
cfec1a
+	vcipherlast	$out0,$out0,$twk0
cfec1a
+
cfec1a
+	vmr		$twk0,$twk1		# unused tweak
cfec1a
+	vxor		$tmp,$out0,$twk1	# last block prep for stealing
cfec1a
+	le?vperm	$out0,$out0,$out0,$leperm
cfec1a
+	stvx_u		$out0,$x00,$out		# store output
cfec1a
+	addi		$out,$out,0x10
cfec1a
+	bne		Lxts_enc6x_steal
cfec1a
+	b		Lxts_enc6x_done
cfec1a
+
cfec1a
+.align	4
cfec1a
+Lxts_enc6x_zero:
cfec1a
+	cmpwi		$taillen,0
cfec1a
+	beq		Lxts_enc6x_done
cfec1a
+
cfec1a
+	add		$inp,$inp,$taillen
cfec1a
+	subi		$inp,$inp,16
cfec1a
+	lvx_u		$in0,0,$inp
cfec1a
+	lvsr		$inpperm,0,$taillen	# $in5 is no more
cfec1a
+	le?vperm	$in0,$in0,$in0,$leperm
cfec1a
+	vperm		$in0,$in0,$in0,$inpperm
cfec1a
+	vxor		$tmp,$tmp,$twk0
cfec1a
+Lxts_enc6x_steal:
cfec1a
+	vxor		$in0,$in0,$twk0
cfec1a
+	vxor		$out0,$out0,$out0
cfec1a
+	vspltisb	$out1,-1
cfec1a
+	vperm		$out0,$out0,$out1,$inpperm
cfec1a
+	vsel		$out0,$in0,$tmp,$out0	# $tmp is last block, remember?
cfec1a
+
cfec1a
+	subi		r30,$out,17
cfec1a
+	subi		$out,$out,16
cfec1a
+	mtctr		$taillen
cfec1a
+Loop_xts_enc6x_steal:
cfec1a
+	lbzu		r0,1(r30)
cfec1a
+	stb		r0,16(r30)
cfec1a
+	bdnz		Loop_xts_enc6x_steal
cfec1a
+
cfec1a
+	li		$taillen,0
cfec1a
+	mtctr		$rounds
cfec1a
+	b		Loop_xts_enc1x		# one more time...
cfec1a
+
cfec1a
+.align	4
cfec1a
+Lxts_enc6x_done:
cfec1a
+	${UCMP}i	$ivp,0
cfec1a
+	beq		Lxts_enc6x_ret
cfec1a
+
cfec1a
+	vxor		$tweak,$twk0,$rndkey0
cfec1a
+	le?vperm	$tweak,$tweak,$tweak,$leperm
cfec1a
+	stvx_u		$tweak,0,$ivp
cfec1a
+
cfec1a
+Lxts_enc6x_ret:
cfec1a
+	mtlr		r11
cfec1a
+	li		r10,`$FRAME+15`
cfec1a
+	li		r11,`$FRAME+31`
cfec1a
+	stvx		$seven,r10,$sp		# wipe copies of round keys
cfec1a
+	addi		r10,r10,32
cfec1a
+	stvx		$seven,r11,$sp
cfec1a
+	addi		r11,r11,32
cfec1a
+	stvx		$seven,r10,$sp
cfec1a
+	addi		r10,r10,32
cfec1a
+	stvx		$seven,r11,$sp
cfec1a
+	addi		r11,r11,32
cfec1a
+	stvx		$seven,r10,$sp
cfec1a
+	addi		r10,r10,32
cfec1a
+	stvx		$seven,r11,$sp
cfec1a
+	addi		r11,r11,32
cfec1a
+	stvx		$seven,r10,$sp
cfec1a
+	addi		r10,r10,32
cfec1a
+	stvx		$seven,r11,$sp
cfec1a
+	addi		r11,r11,32
cfec1a
+
cfec1a
+	mtspr		256,$vrsave
cfec1a
+	lvx		v20,r10,$sp		# ABI says so
cfec1a
+	addi		r10,r10,32
cfec1a
+	lvx		v21,r11,$sp
cfec1a
+	addi		r11,r11,32
cfec1a
+	lvx		v22,r10,$sp
cfec1a
+	addi		r10,r10,32
cfec1a
+	lvx		v23,r11,$sp
cfec1a
+	addi		r11,r11,32
cfec1a
+	lvx		v24,r10,$sp
cfec1a
+	addi		r10,r10,32
cfec1a
+	lvx		v25,r11,$sp
cfec1a
+	addi		r11,r11,32
cfec1a
+	lvx		v26,r10,$sp
cfec1a
+	addi		r10,r10,32
cfec1a
+	lvx		v27,r11,$sp
cfec1a
+	addi		r11,r11,32
cfec1a
+	lvx		v28,r10,$sp
cfec1a
+	addi		r10,r10,32
cfec1a
+	lvx		v29,r11,$sp
cfec1a
+	addi		r11,r11,32
cfec1a
+	lvx		v30,r10,$sp
cfec1a
+	lvx		v31,r11,$sp
cfec1a
+	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
cfec1a
+	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
cfec1a
+	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
cfec1a
+	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
cfec1a
+	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
cfec1a
+	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
cfec1a
+	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
cfec1a
+	blr
cfec1a
+	.long		0
cfec1a
+	.byte		0,12,0x04,1,0x80,6,6,0
cfec1a
+	.long		0
cfec1a
+
cfec1a
+.align	5
cfec1a
+_aesp8_xts_enc5x:			# helper: run $out0-$out4 through the cipher rounds in parallel
cfec1a
+	vcipher		$out0,$out0,v24
cfec1a
+	vcipher		$out1,$out1,v24
cfec1a
+	vcipher		$out2,$out2,v24
cfec1a
+	vcipher		$out3,$out3,v24
cfec1a
+	vcipher		$out4,$out4,v24
cfec1a
+	lvx		v24,$x20,$key_		# round[3]
cfec1a
+	addi		$key_,$key_,0x20
cfec1a
+# second round of the pair uses v25, which is then refilled from $key_ as well
cfec1a
+	vcipher		$out0,$out0,v25
cfec1a
+	vcipher		$out1,$out1,v25
cfec1a
+	vcipher		$out2,$out2,v25
cfec1a
+	vcipher		$out3,$out3,v25
cfec1a
+	vcipher		$out4,$out4,v25
cfec1a
+	lvx		v25,$x10,$key_		# round[4]
cfec1a
+	bdnz		_aesp8_xts_enc5x	# CTR counts the round pairs fetched through $key_
cfec1a
+# counted loop done: the rounds below use v24, v25 and then v26-v30 directly
cfec1a
+	add		$inp,$inp,$taillen	# with the subi below: $inp += $taillen - 16
cfec1a
+	cmpwi		$taillen,0		# nothing below touches cr0, so it survives the blr
cfec1a
+	vcipher		$out0,$out0,v24
cfec1a
+	vcipher		$out1,$out1,v24
cfec1a
+	vcipher		$out2,$out2,v24
cfec1a
+	vcipher		$out3,$out3,v24
cfec1a
+	vcipher		$out4,$out4,v24
cfec1a
+
cfec1a
+	subi		$inp,$inp,16
cfec1a
+	vcipher		$out0,$out0,v25
cfec1a
+	vcipher		$out1,$out1,v25
cfec1a
+	vcipher		$out2,$out2,v25
cfec1a
+	vcipher		$out3,$out3,v25
cfec1a
+	vcipher		$out4,$out4,v25
cfec1a
+	 vxor		$twk0,$twk0,v31		# fold v31 into tweak 0 for vcipherlast
cfec1a
+# NOTE(review): v31 is presumably the last round key xor $rndkey0, prepared by the caller - confirm
cfec1a
+	vcipher		$out0,$out0,v26
cfec1a
+	lvsr		$inpperm,r0,$taillen	# $in5 is no more
cfec1a
+	vcipher		$out1,$out1,v26
cfec1a
+	vcipher		$out2,$out2,v26
cfec1a
+	vcipher		$out3,$out3,v26
cfec1a
+	vcipher		$out4,$out4,v26
cfec1a
+	 vxor		$in1,$twk1,v31		# $in1-$in4 are reused for tweak xor v31
cfec1a
+
cfec1a
+	vcipher		$out0,$out0,v27
cfec1a
+	lvx_u		$in0,0,$inp		# 16 bytes at the adjusted $inp
cfec1a
+	vcipher		$out1,$out1,v27
cfec1a
+	vcipher		$out2,$out2,v27
cfec1a
+	vcipher		$out3,$out3,v27
cfec1a
+	vcipher		$out4,$out4,v27
cfec1a
+	 vxor		$in2,$twk2,v31
cfec1a
+
cfec1a
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
cfec1a
+	vcipher		$out0,$out0,v28
cfec1a
+	vcipher		$out1,$out1,v28
cfec1a
+	vcipher		$out2,$out2,v28
cfec1a
+	vcipher		$out3,$out3,v28
cfec1a
+	vcipher		$out4,$out4,v28
cfec1a
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
cfec1a
+	 vxor		$in3,$twk3,v31
cfec1a
+
cfec1a
+	vcipher		$out0,$out0,v29
cfec1a
+	le?vperm	$in0,$in0,$in0,$leperm
cfec1a
+	vcipher		$out1,$out1,v29
cfec1a
+	vcipher		$out2,$out2,v29
cfec1a
+	vcipher		$out3,$out3,v29
cfec1a
+	vcipher		$out4,$out4,v29
cfec1a
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
cfec1a
+	 vxor		$in4,$twk4,v31
cfec1a
+
cfec1a
+	vcipher		$out0,$out0,v30
cfec1a
+	vperm		$in0,$in0,$in0,$inpperm	# permute $in0 with itself under the lvsr control
cfec1a
+	vcipher		$out1,$out1,v30
cfec1a
+	vcipher		$out2,$out2,v30
cfec1a
+	vcipher		$out3,$out3,v30
cfec1a
+	vcipher		$out4,$out4,v30
cfec1a
+# final round: the last operand is the per-block tweak already xored with v31
cfec1a
+	vcipherlast	$out0,$out0,$twk0
cfec1a
+	vcipherlast	$out1,$out1,$in1
cfec1a
+	vcipherlast	$out2,$out2,$in2
cfec1a
+	vcipherlast	$out3,$out3,$in3
cfec1a
+	vcipherlast	$out4,$out4,$in4
cfec1a
+	blr					# CTR is not reloaded on this path
cfec1a
+        .long   	0
cfec1a
+        .byte   	0,12,0x14,0,0,0,0,0
cfec1a
+
cfec1a
+.align	5
cfec1a
+_aesp8_xts_decrypt6x:			# six-blocks-at-a-time XTS decrypt path; prologue follows
cfec1a
+	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
cfec1a
+	mflr		r11			# LR is kept in r11 until the mtlr at exit
cfec1a
+	li		r7,`$FRAME+8*16+15`
cfec1a
+	li		r3,`$FRAME+8*16+31`
cfec1a
+	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
cfec1a
+	stvx		v20,r7,$sp		# ABI says so
cfec1a
+	addi		r7,r7,32		# r7 and r3 leapfrog through the v20-v31 save slots
cfec1a
+	stvx		v21,r3,$sp
cfec1a
+	addi		r3,r3,32
cfec1a
+	stvx		v22,r7,$sp
cfec1a
+	addi		r7,r7,32
cfec1a
+	stvx		v23,r3,$sp
cfec1a
+	addi		r3,r3,32
cfec1a
+	stvx		v24,r7,$sp
cfec1a
+	addi		r7,r7,32
cfec1a
+	stvx		v25,r3,$sp
cfec1a
+	addi		r3,r3,32
cfec1a
+	stvx		v26,r7,$sp
cfec1a
+	addi		r7,r7,32
cfec1a
+	stvx		v27,r3,$sp
cfec1a
+	addi		r3,r3,32
cfec1a
+	stvx		v28,r7,$sp
cfec1a
+	addi		r7,r7,32
cfec1a
+	stvx		v29,r3,$sp
cfec1a
+	addi		r3,r3,32
cfec1a
+	stvx		v30,r7,$sp
cfec1a
+	stvx		v31,r3,$sp
cfec1a
+	li		r0,-1			# written to SPR 256 by the mtspr below
cfec1a
+	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
cfec1a
+	li		$x10,0x10		# $x10-$x70: constant byte offsets used for indexed loads/stores
cfec1a
+	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
cfec1a
+	li		$x20,0x20
cfec1a
+	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
cfec1a
+	li		$x30,0x30
cfec1a
+	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
cfec1a
+	li		$x40,0x40
cfec1a
+	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
cfec1a
+	li		$x50,0x50
cfec1a
+	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
cfec1a
+	li		$x60,0x60
cfec1a
+	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
cfec1a
+	li		$x70,0x70
cfec1a
+	mtspr		256,r0			# SPR 256 = all ones; restored from $vrsave at exit
cfec1a
+
cfec1a
+	subi		$rounds,$rounds,3	# -4 in total
cfec1a
+
cfec1a
+	lvx		$rndkey0,$x00,$key1	# load key schedule
cfec1a
+	lvx		v30,$x10,$key1
cfec1a
+	addi		$key1,$key1,0x20
cfec1a
+	lvx		v31,$x00,$key1
cfec1a
+	?vperm		$rndkey0,$rndkey0,v30,$keyperm
cfec1a
+	addi		$key_,$sp,$FRAME+15	# $key_ = start of the on-stack round-key copies
cfec1a
+	mtctr		$rounds			# CTR drives the key off-load loop that follows
cfec1a
+
cfec1a
+Load_xts_dec_key:			# permute key pairs with $keyperm and copy them to the stack
cfec1a
+	?vperm		v24,v30,v31,$keyperm
cfec1a
+	lvx		v30,$x10,$key1
cfec1a
+	addi		$key1,$key1,0x20
cfec1a
+	stvx		v24,$x00,$key_		# off-load round[1]
cfec1a
+	?vperm		v25,v31,v30,$keyperm
cfec1a
+	lvx		v31,$x00,$key1
cfec1a
+	stvx		v25,$x10,$key_		# off-load round[2]
cfec1a
+	addi		$key_,$key_,0x20
cfec1a
+	bdnz		Load_xts_dec_key
cfec1a
+# schedule tail: two more keys go to the stack, the final six stay resident in v26-v31
cfec1a
+	lvx		v26,$x10,$key1
cfec1a
+	?vperm		v24,v30,v31,$keyperm
cfec1a
+	lvx		v27,$x20,$key1
cfec1a
+	stvx		v24,$x00,$key_		# off-load round[3]
cfec1a
+	?vperm		v25,v31,v26,$keyperm
cfec1a
+	lvx		v28,$x30,$key1
cfec1a
+	stvx		v25,$x10,$key_		# off-load round[4]
cfec1a
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
cfec1a
+	?vperm		v26,v26,v27,$keyperm
cfec1a
+	lvx		v29,$x40,$key1
cfec1a
+	?vperm		v27,v27,v28,$keyperm
cfec1a
+	lvx		v30,$x50,$key1
cfec1a
+	?vperm		v28,v28,v29,$keyperm
cfec1a
+	lvx		v31,$x60,$key1
cfec1a
+	?vperm		v29,v29,v30,$keyperm
cfec1a
+	lvx		$twk5,$x70,$key1	# borrow $twk5
cfec1a
+	?vperm		v30,v30,v31,$keyperm
cfec1a
+	lvx		v24,$x00,$key_		# pre-load round[1]
cfec1a
+	?vperm		v31,v31,$twk5,$keyperm
cfec1a
+	lvx		v25,$x10,$key_		# pre-load round[2]
cfec1a
+# first six blocks: each gets tweak xor $rndkey0, then $tweak is advanced for the next one
cfec1a
+	 vperm		$in0,$inout,$inptail,$inpperm
cfec1a
+	 subi		$inp,$inp,31		# undo "caller"
cfec1a
+	vxor		$twk0,$tweak,$rndkey0	# tweaks are kept pre-xored with the first round key
cfec1a
+	vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	vaddubm		$tweak,$tweak,$tweak
cfec1a
+	vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	vand		$tmp,$tmp,$eighty7
cfec1a
+	 vxor		$out0,$in0,$twk0	# block 0 = input xor tweak xor first round key
cfec1a
+	vxor		$tweak,$tweak,$tmp
cfec1a
+# NOTE(review): the five-step update is presumably the XTS multiply-by-x; $seven and $eighty7 are set outside this view
cfec1a
+	 lvx_u		$in1,$x10,$inp
cfec1a
+	vxor		$twk1,$tweak,$rndkey0
cfec1a
+	vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	vaddubm		$tweak,$tweak,$tweak
cfec1a
+	vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	 le?vperm	$in1,$in1,$in1,$leperm
cfec1a
+	vand		$tmp,$tmp,$eighty7
cfec1a
+	 vxor		$out1,$in1,$twk1
cfec1a
+	vxor		$tweak,$tweak,$tmp
cfec1a
+
cfec1a
+	 lvx_u		$in2,$x20,$inp
cfec1a
+	 andi.		$taillen,$len,15	# $taillen = $len mod 16
cfec1a
+	vxor		$twk2,$tweak,$rndkey0
cfec1a
+	vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	vaddubm		$tweak,$tweak,$tweak
cfec1a
+	vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	 le?vperm	$in2,$in2,$in2,$leperm
cfec1a
+	vand		$tmp,$tmp,$eighty7
cfec1a
+	 vxor		$out2,$in2,$twk2
cfec1a
+	vxor		$tweak,$tweak,$tmp
cfec1a
+
cfec1a
+	 lvx_u		$in3,$x30,$inp
cfec1a
+	 sub		$len,$len,$taillen	# $len now covers whole blocks only
cfec1a
+	vxor		$twk3,$tweak,$rndkey0
cfec1a
+	vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	vaddubm		$tweak,$tweak,$tweak
cfec1a
+	vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	 le?vperm	$in3,$in3,$in3,$leperm
cfec1a
+	vand		$tmp,$tmp,$eighty7
cfec1a
+	 vxor		$out3,$in3,$twk3
cfec1a
+	vxor		$tweak,$tweak,$tmp
cfec1a
+
cfec1a
+	 lvx_u		$in4,$x40,$inp
cfec1a
+	 subi		$len,$len,0x60		# six blocks (0x60 bytes) are in flight
cfec1a
+	vxor		$twk4,$tweak,$rndkey0
cfec1a
+	vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	vaddubm		$tweak,$tweak,$tweak
cfec1a
+	vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	 le?vperm	$in4,$in4,$in4,$leperm
cfec1a
+	vand		$tmp,$tmp,$eighty7
cfec1a
+	 vxor		$out4,$in4,$twk4
cfec1a
+	vxor		$tweak,$tweak,$tmp
cfec1a
+
cfec1a
+	 lvx_u		$in5,$x50,$inp
cfec1a
+	 addi		$inp,$inp,0x60		# advance past the six blocks just loaded
cfec1a
+	vxor		$twk5,$tweak,$rndkey0
cfec1a
+	vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	vaddubm		$tweak,$tweak,$tweak
cfec1a
+	vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	 le?vperm	$in5,$in5,$in5,$leperm
cfec1a
+	vand		$tmp,$tmp,$eighty7
cfec1a
+	 vxor		$out5,$in5,$twk5
cfec1a
+	vxor		$tweak,$tweak,$tmp
cfec1a
+
cfec1a
+	vxor		v31,v31,$rndkey0	# cancels the $rndkey0 pre-xor when a tweak is xored with v31
cfec1a
+	mtctr		$rounds
cfec1a
+	b		Loop_xts_dec6x
cfec1a
+
cfec1a
+.align	5
cfec1a
+Loop_xts_dec6x:				# main loop: six blocks per pass, round keys streamed via v24/v25
cfec1a
+	vncipher	$out0,$out0,v24
cfec1a
+	vncipher	$out1,$out1,v24
cfec1a
+	vncipher	$out2,$out2,v24
cfec1a
+	vncipher	$out3,$out3,v24
cfec1a
+	vncipher	$out4,$out4,v24
cfec1a
+	vncipher	$out5,$out5,v24
cfec1a
+	lvx		v24,$x20,$key_		# round[3]
cfec1a
+	addi		$key_,$key_,0x20
cfec1a
+
cfec1a
+	vncipher	$out0,$out0,v25
cfec1a
+	vncipher	$out1,$out1,v25
cfec1a
+	vncipher	$out2,$out2,v25
cfec1a
+	vncipher	$out3,$out3,v25
cfec1a
+	vncipher	$out4,$out4,v25
cfec1a
+	vncipher	$out5,$out5,v25
cfec1a
+	lvx		v25,$x10,$key_		# round[4]
cfec1a
+	bdnz		Loop_xts_dec6x
cfec1a
+# remaining rounds run from registers, interleaved with the tweaks for the next six blocks
cfec1a
+	subic		$len,$len,96		# $len-=96
cfec1a
+	 vxor		$in0,$twk0,v31		# xor with last round key
cfec1a
+	vncipher	$out0,$out0,v24
cfec1a
+	vncipher	$out1,$out1,v24
cfec1a
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	 vxor		$twk0,$tweak,$rndkey0
cfec1a
+	 vaddubm	$tweak,$tweak,$tweak
cfec1a
+	vncipher	$out2,$out2,v24
cfec1a
+	vncipher	$out3,$out3,v24
cfec1a
+	 vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	vncipher	$out4,$out4,v24
cfec1a
+	vncipher	$out5,$out5,v24
cfec1a
+
cfec1a
+	subfe.		r0,r0,r0		# borrow?-1:0
cfec1a
+	 vand		$tmp,$tmp,$eighty7
cfec1a
+	vncipher	$out0,$out0,v25
cfec1a
+	vncipher	$out1,$out1,v25
cfec1a
+	 vxor		$tweak,$tweak,$tmp
cfec1a
+	vncipher	$out2,$out2,v25
cfec1a
+	vncipher	$out3,$out3,v25
cfec1a
+	 vxor		$in1,$twk1,v31
cfec1a
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	 vxor		$twk1,$tweak,$rndkey0
cfec1a
+	vncipher	$out4,$out4,v25
cfec1a
+	vncipher	$out5,$out5,v25
cfec1a
+
cfec1a
+	and		r0,r0,$len		# r0 = $len when the subtraction borrowed, else 0
cfec1a
+	 vaddubm	$tweak,$tweak,$tweak
cfec1a
+	 vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	vncipher	$out0,$out0,v26
cfec1a
+	vncipher	$out1,$out1,v26
cfec1a
+	 vand		$tmp,$tmp,$eighty7
cfec1a
+	vncipher	$out2,$out2,v26
cfec1a
+	vncipher	$out3,$out3,v26
cfec1a
+	 vxor		$tweak,$tweak,$tmp
cfec1a
+	vncipher	$out4,$out4,v26
cfec1a
+	vncipher	$out5,$out5,v26
cfec1a
+
cfec1a
+	add		$inp,$inp,r0		# $inp is adjusted in such
cfec1a
+						# way that at exit from the
cfec1a
+						# loop inX-in5 are loaded
cfec1a
+						# with last "words"
cfec1a
+	 vxor		$in2,$twk2,v31
cfec1a
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	 vxor		$twk2,$tweak,$rndkey0
cfec1a
+	 vaddubm	$tweak,$tweak,$tweak
cfec1a
+	vncipher	$out0,$out0,v27
cfec1a
+	vncipher	$out1,$out1,v27
cfec1a
+	 vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	vncipher	$out2,$out2,v27
cfec1a
+	vncipher	$out3,$out3,v27
cfec1a
+	 vand		$tmp,$tmp,$eighty7
cfec1a
+	vncipher	$out4,$out4,v27
cfec1a
+	vncipher	$out5,$out5,v27
cfec1a
+
cfec1a
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
cfec1a
+	 vxor		$tweak,$tweak,$tmp
cfec1a
+	vncipher	$out0,$out0,v28
cfec1a
+	vncipher	$out1,$out1,v28
cfec1a
+	 vxor		$in3,$twk3,v31
cfec1a
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	 vxor		$twk3,$tweak,$rndkey0
cfec1a
+	vncipher	$out2,$out2,v28
cfec1a
+	vncipher	$out3,$out3,v28
cfec1a
+	 vaddubm	$tweak,$tweak,$tweak
cfec1a
+	 vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	vncipher	$out4,$out4,v28
cfec1a
+	vncipher	$out5,$out5,v28
cfec1a
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
cfec1a
+	 vand		$tmp,$tmp,$eighty7
cfec1a
+
cfec1a
+	vncipher	$out0,$out0,v29
cfec1a
+	vncipher	$out1,$out1,v29
cfec1a
+	 vxor		$tweak,$tweak,$tmp
cfec1a
+	vncipher	$out2,$out2,v29
cfec1a
+	vncipher	$out3,$out3,v29
cfec1a
+	 vxor		$in4,$twk4,v31
cfec1a
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	 vxor		$twk4,$tweak,$rndkey0
cfec1a
+	vncipher	$out4,$out4,v29
cfec1a
+	vncipher	$out5,$out5,v29
cfec1a
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
cfec1a
+	 vaddubm	$tweak,$tweak,$tweak
cfec1a
+	 vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+
cfec1a
+	vncipher	$out0,$out0,v30
cfec1a
+	vncipher	$out1,$out1,v30
cfec1a
+	 vand		$tmp,$tmp,$eighty7
cfec1a
+	vncipher	$out2,$out2,v30
cfec1a
+	vncipher	$out3,$out3,v30
cfec1a
+	 vxor		$tweak,$tweak,$tmp
cfec1a
+	vncipher	$out4,$out4,v30
cfec1a
+	vncipher	$out5,$out5,v30
cfec1a
+	 vxor		$in5,$twk5,v31
cfec1a
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
cfec1a
+	 vxor		$twk5,$tweak,$rndkey0
cfec1a
+# last round consumes $in0-$in5 (tweak xor v31); each register is then reloaded with input
cfec1a
+	vncipherlast	$out0,$out0,$in0
cfec1a
+	 lvx_u		$in0,$x00,$inp		# load next input block
cfec1a
+	 vaddubm	$tweak,$tweak,$tweak
cfec1a
+	 vsldoi		$tmp,$tmp,$tmp,15
cfec1a
+	vncipherlast	$out1,$out1,$in1
cfec1a
+	 lvx_u		$in1,$x10,$inp
cfec1a
+	vncipherlast	$out2,$out2,$in2
cfec1a
+	 le?vperm	$in0,$in0,$in0,$leperm
cfec1a
+	 lvx_u		$in2,$x20,$inp
cfec1a
+	 vand		$tmp,$tmp,$eighty7
cfec1a
+	vncipherlast	$out3,$out3,$in3
cfec1a
+	 le?vperm	$in1,$in1,$in1,$leperm
cfec1a
+	 lvx_u		$in3,$x30,$inp
cfec1a
+	vncipherlast	$out4,$out4,$in4
cfec1a
+	 le?vperm	$in2,$in2,$in2,$leperm
cfec1a
+	 lvx_u		$in4,$x40,$inp
cfec1a
+	 vxor		$tweak,$tweak,$tmp
cfec1a
+	vncipherlast	$out5,$out5,$in5
cfec1a
+	 le?vperm	$in3,$in3,$in3,$leperm
cfec1a
+	 lvx_u		$in5,$x50,$inp
cfec1a
+	 addi		$inp,$inp,0x60
cfec1a
+	 le?vperm	$in4,$in4,$in4,$leperm
cfec1a
+	 le?vperm	$in5,$in5,$in5,$leperm
cfec1a
+# store six results while pre-whitening the next six inputs with their tweaks
cfec1a
+	le?vperm	$out0,$out0,$out0,$leperm
cfec1a
+	le?vperm	$out1,$out1,$out1,$leperm
cfec1a
+	stvx_u		$out0,$x00,$out		# store output
cfec1a
+	 vxor		$out0,$in0,$twk0
cfec1a
+	le?vperm	$out2,$out2,$out2,$leperm
cfec1a
+	stvx_u		$out1,$x10,$out
cfec1a
+	 vxor		$out1,$in1,$twk1
cfec1a
+	le?vperm	$out3,$out3,$out3,$leperm
cfec1a
+	stvx_u		$out2,$x20,$out
cfec1a
+	 vxor		$out2,$in2,$twk2
cfec1a
+	le?vperm	$out4,$out4,$out4,$leperm
cfec1a
+	stvx_u		$out3,$x30,$out
cfec1a
+	 vxor		$out3,$in3,$twk3
cfec1a
+	le?vperm	$out5,$out5,$out5,$leperm
cfec1a
+	stvx_u		$out4,$x40,$out
cfec1a
+	 vxor		$out4,$in4,$twk4
cfec1a
+	stvx_u		$out5,$x50,$out
cfec1a
+	 vxor		$out5,$in5,$twk5
cfec1a
+	addi		$out,$out,0x60
cfec1a
+
cfec1a
+	mtctr		$rounds			# reload the round counter for the next pass
cfec1a
+	beq		Loop_xts_dec6x		# did $len-=96 borrow?
cfec1a
+# fewer than six whole blocks left: restore $len and dispatch on the 0x00-0x50 remainder
cfec1a
+	addic.		$len,$len,0x60		# undo the last subtraction and set cr0
cfec1a
+	beq		Lxts_dec6x_zero
cfec1a
+	cmpwi		$len,0x20
cfec1a
+	blt		Lxts_dec6x_one
cfec1a
+	nop
cfec1a
+	beq		Lxts_dec6x_two
cfec1a
+	cmpwi		$len,0x40
cfec1a
+	blt		Lxts_dec6x_three
cfec1a
+	nop
cfec1a
+	beq		Lxts_dec6x_four		# anything else falls through to Lxts_dec6x_five
cfec1a
+
cfec1a
+Lxts_dec6x_five:			# five whole blocks remain, held in $in1-$in5
cfec1a
+	vxor		$out0,$in1,$twk0
cfec1a
+	vxor		$out1,$in2,$twk1
cfec1a
+	vxor		$out2,$in3,$twk2
cfec1a
+	vxor		$out3,$in4,$twk3
cfec1a
+	vxor		$out4,$in5,$twk4
cfec1a
+
cfec1a
+	bl		_aesp8_xts_dec5x	# also reloads $in0, sets cr0 from $taillen, resets CTR
cfec1a
+
cfec1a
+	le?vperm	$out0,$out0,$out0,$leperm
cfec1a
+	vmr		$twk0,$twk5		# unused tweak
cfec1a
+	vxor		$twk1,$tweak,$rndkey0	# the tweak after $twk5, used first on the steal path
cfec1a
+	le?vperm	$out1,$out1,$out1,$leperm
cfec1a
+	stvx_u		$out0,$x00,$out		# store output
cfec1a
+	vxor		$out0,$in0,$twk1	# pre-whiten the block _aesp8_xts_dec5x loaded into $in0
cfec1a
+	le?vperm	$out2,$out2,$out2,$leperm
cfec1a
+	stvx_u		$out1,$x10,$out
cfec1a
+	le?vperm	$out3,$out3,$out3,$leperm
cfec1a
+	stvx_u		$out2,$x20,$out
cfec1a
+	le?vperm	$out4,$out4,$out4,$leperm
cfec1a
+	stvx_u		$out3,$x30,$out
cfec1a
+	stvx_u		$out4,$x40,$out
cfec1a
+	addi		$out,$out,0x50
cfec1a
+	bne		Lxts_dec6x_steal	# taken when $taillen is non-zero
cfec1a
+	b		Lxts_dec6x_done
cfec1a
+
cfec1a
+.align	4
cfec1a
+Lxts_dec6x_four:			# four whole blocks remain, held in $in2-$in5
cfec1a
+	vxor		$out0,$in2,$twk0
cfec1a
+	vxor		$out1,$in3,$twk1
cfec1a
+	vxor		$out2,$in4,$twk2
cfec1a
+	vxor		$out3,$in5,$twk3
cfec1a
+	vxor		$out4,$out4,$out4	# unused lane is zeroed, its result is never stored
cfec1a
+
cfec1a
+	bl		_aesp8_xts_dec5x	# also reloads $in0, sets cr0 from $taillen, resets CTR
cfec1a
+
cfec1a
+	le?vperm	$out0,$out0,$out0,$leperm
cfec1a
+	vmr		$twk0,$twk4		# unused tweak
cfec1a
+	vmr		$twk1,$twk5		# following tweak, used first on the steal path
cfec1a
+	le?vperm	$out1,$out1,$out1,$leperm
cfec1a
+	stvx_u		$out0,$x00,$out		# store output
cfec1a
+	vxor		$out0,$in0,$twk5	# pre-whiten the block _aesp8_xts_dec5x loaded into $in0
cfec1a
+	le?vperm	$out2,$out2,$out2,$leperm
cfec1a
+	stvx_u		$out1,$x10,$out
cfec1a
+	le?vperm	$out3,$out3,$out3,$leperm
cfec1a
+	stvx_u		$out2,$x20,$out
cfec1a
+	stvx_u		$out3,$x30,$out
cfec1a
+	addi		$out,$out,0x40
cfec1a
+	bne		Lxts_dec6x_steal	# taken when $taillen is non-zero
cfec1a
+	b		Lxts_dec6x_done
cfec1a
+
cfec1a
+.align	4
cfec1a
+Lxts_dec6x_three:			# three whole blocks remain, held in $in3-$in5
cfec1a
+	vxor		$out0,$in3,$twk0
cfec1a
+	vxor		$out1,$in4,$twk1
cfec1a
+	vxor		$out2,$in5,$twk2
cfec1a
+	vxor		$out3,$out3,$out3	# unused lanes are zeroed, their results are never stored
cfec1a
+	vxor		$out4,$out4,$out4
cfec1a
+
cfec1a
+	bl		_aesp8_xts_dec5x	# also reloads $in0, sets cr0 from $taillen, resets CTR
cfec1a
+
cfec1a
+	le?vperm	$out0,$out0,$out0,$leperm
cfec1a
+	vmr		$twk0,$twk3		# unused tweak
cfec1a
+	vmr		$twk1,$twk4		# following tweak, used first on the steal path
cfec1a
+	le?vperm	$out1,$out1,$out1,$leperm
cfec1a
+	stvx_u		$out0,$x00,$out		# store output
cfec1a
+	vxor		$out0,$in0,$twk4	# pre-whiten the block _aesp8_xts_dec5x loaded into $in0
cfec1a
+	le?vperm	$out2,$out2,$out2,$leperm
cfec1a
+	stvx_u		$out1,$x10,$out
cfec1a
+	stvx_u		$out2,$x20,$out
cfec1a
+	addi		$out,$out,0x30
cfec1a
+	bne		Lxts_dec6x_steal	# taken when $taillen is non-zero
cfec1a
+	b		Lxts_dec6x_done
cfec1a
+
cfec1a
+.align	4
cfec1a
+Lxts_dec6x_two:				# two whole blocks remain, held in $in4 and $in5
cfec1a
+	vxor		$out0,$in4,$twk0
cfec1a
+	vxor		$out1,$in5,$twk1
cfec1a
+	vxor		$out2,$out2,$out2	# unused lanes are zeroed, their results are never stored
cfec1a
+	vxor		$out3,$out3,$out3
cfec1a
+	vxor		$out4,$out4,$out4
cfec1a
+
cfec1a
+	bl		_aesp8_xts_dec5x	# also reloads $in0, sets cr0 from $taillen, resets CTR
cfec1a
+
cfec1a
+	le?vperm	$out0,$out0,$out0,$leperm
cfec1a
+	vmr		$twk0,$twk2		# unused tweak
cfec1a
+	vmr		$twk1,$twk3		# following tweak, used first on the steal path
cfec1a
+	le?vperm	$out1,$out1,$out1,$leperm
cfec1a
+	stvx_u		$out0,$x00,$out		# store output
cfec1a
+	vxor		$out0,$in0,$twk3	# pre-whiten the block _aesp8_xts_dec5x loaded into $in0
cfec1a
+	stvx_u		$out1,$x10,$out
cfec1a
+	addi		$out,$out,0x20
cfec1a
+	bne		Lxts_dec6x_steal	# taken when $taillen is non-zero
cfec1a
+	b		Lxts_dec6x_done
cfec1a
+
cfec1a
+.align	4
cfec1a
+Lxts_dec6x_one:				# one whole block remains, held in $in5
cfec1a
+	vxor		$out0,$in5,$twk0
cfec1a
+	nop
cfec1a
+Loop_xts_dec1x:				# single-block rounds; re-entered from the steal path with $out0 preset
cfec1a
+	vncipher	$out0,$out0,v24
cfec1a
+	lvx		v24,$x20,$key_		# round[3]
cfec1a
+	addi		$key_,$key_,0x20
cfec1a
+
cfec1a
+	vncipher	$out0,$out0,v25
cfec1a
+	lvx		v25,$x10,$key_		# round[4]
cfec1a
+	bdnz		Loop_xts_dec1x
cfec1a
+# r0 = ($taillen - 1) & 16: 16 when $taillen is 0, else 0 for the 1-15 range andi. produces
cfec1a
+	subi		r0,$taillen,1
cfec1a
+	vncipher	$out0,$out0,v24
cfec1a
+
cfec1a
+	andi.		r0,r0,16
cfec1a
+	cmpwi		$taillen,0		# final cr0 value, tested by the bne at the end
cfec1a
+	vncipher	$out0,$out0,v25
cfec1a
+
cfec1a
+	sub		$inp,$inp,r0		# step back one block only when there is no tail
cfec1a
+	vncipher	$out0,$out0,v26
cfec1a
+
cfec1a
+	lvx_u		$in0,0,$inp		# candidate block for the steal path
cfec1a
+	vncipher	$out0,$out0,v27
cfec1a
+
cfec1a
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
cfec1a
+	vncipher	$out0,$out0,v28
cfec1a
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
cfec1a
+
cfec1a
+	vncipher	$out0,$out0,v29
cfec1a
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
cfec1a
+	 vxor		$twk0,$twk0,v31		# tweak xor v31 feeds vncipherlast
cfec1a
+
cfec1a
+	le?vperm	$in0,$in0,$in0,$leperm
cfec1a
+	vncipher	$out0,$out0,v30
cfec1a
+
cfec1a
+	mtctr		$rounds			# Lxts_dec6x_steal starts with a bdnz loop
cfec1a
+	vncipherlast	$out0,$out0,$twk0
cfec1a
+
cfec1a
+	vmr		$twk0,$twk1		# unused tweak
cfec1a
+	vmr		$twk1,$twk2		# following tweak, used first on the steal path
cfec1a
+	le?vperm	$out0,$out0,$out0,$leperm
cfec1a
+	stvx_u		$out0,$x00,$out		# store output
cfec1a
+	addi		$out,$out,0x10
cfec1a
+	vxor		$out0,$in0,$twk2	# pre-whiten $in0 with the tweak now also in $twk1
cfec1a
+	bne		Lxts_dec6x_steal	# taken when $taillen is non-zero
cfec1a
+	b		Lxts_dec6x_done
cfec1a
+
cfec1a
+.align	4
cfec1a
+Lxts_dec6x_zero:			# no whole block pending; only a partial tail may remain
cfec1a
+	cmpwi		$taillen,0
cfec1a
+	beq		Lxts_dec6x_done
cfec1a
+
cfec1a
+	lvx_u		$in0,0,$inp
cfec1a
+	le?vperm	$in0,$in0,$in0,$leperm
cfec1a
+	vxor		$out0,$in0,$twk1	# steal path decrypts this block with $twk1, not $twk0
cfec1a
+Lxts_dec6x_steal:			# ciphertext stealing: decrypt $out0 under $twk1 first
cfec1a
+	vncipher	$out0,$out0,v24
cfec1a
+	lvx		v24,$x20,$key_		# round[3]
cfec1a
+	addi		$key_,$key_,0x20
cfec1a
+
cfec1a
+	vncipher	$out0,$out0,v25
cfec1a
+	lvx		v25,$x10,$key_		# round[4]
cfec1a
+	bdnz		Lxts_dec6x_steal
cfec1a
+
cfec1a
+	add		$inp,$inp,$taillen	# move $inp forward by the tail length
cfec1a
+	vncipher	$out0,$out0,v24
cfec1a
+
cfec1a
+	cmpwi		$taillen,0
cfec1a
+	vncipher	$out0,$out0,v25
cfec1a
+
cfec1a
+	lvx_u		$in0,0,$inp		# 16 bytes at the moved $inp, merged below
cfec1a
+	vncipher	$out0,$out0,v26
cfec1a
+
cfec1a
+	lvsr		$inpperm,0,$taillen	# $in5 is no more
cfec1a
+	vncipher	$out0,$out0,v27
cfec1a
+
cfec1a
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
cfec1a
+	vncipher	$out0,$out0,v28
cfec1a
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
cfec1a
+
cfec1a
+	vncipher	$out0,$out0,v29
cfec1a
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
cfec1a
+	 vxor		$twk1,$twk1,v31
cfec1a
+
cfec1a
+	le?vperm	$in0,$in0,$in0,$leperm
cfec1a
+	vncipher	$out0,$out0,v30
cfec1a
+
cfec1a
+	vperm		$in0,$in0,$in0,$inpperm
cfec1a
+	vncipherlast	$tmp,$out0,$twk1	# $tmp = block decrypted under $twk1
cfec1a
+
cfec1a
+	le?vperm	$out0,$tmp,$tmp,$leperm
cfec1a
+	le?stvx_u	$out0,0,$out
cfec1a
+	be?stvx_u	$tmp,0,$out
cfec1a
+# build a byte mask from $inpperm (zeros and ones), then splice $in0 and $tmp with it
cfec1a
+	vxor		$out0,$out0,$out0
cfec1a
+	vspltisb	$out1,-1
cfec1a
+	vperm		$out0,$out0,$out1,$inpperm
cfec1a
+	vsel		$out0,$in0,$tmp,$out0
cfec1a
+	vxor		$out0,$out0,$twk0	# pre-whiten the spliced block with $twk0
cfec1a
+
cfec1a
+	subi		r30,$out,1		# lbzu pre-increments, so start one byte early
cfec1a
+	mtctr		$taillen
cfec1a
+Loop_xts_dec6x_steal:			# copy $taillen bytes from $out to $out+16
cfec1a
+	lbzu		r0,1(r30)
cfec1a
+	stb		r0,16(r30)
cfec1a
+	bdnz		Loop_xts_dec6x_steal
cfec1a
+
cfec1a
+	li		$taillen,0		# second pass must leave via Lxts_dec6x_done
cfec1a
+	mtctr		$rounds
cfec1a
+	b		Loop_xts_dec1x		# one more time...
cfec1a
+
cfec1a
+.align	4
cfec1a
+Lxts_dec6x_done:			# write the next tweak back when $ivp is non-zero
cfec1a
+	${UCMP}i	$ivp,0
cfec1a
+	beq		Lxts_dec6x_ret
cfec1a
+
cfec1a
+	vxor		$tweak,$twk0,$rndkey0	# xor $rndkey0 back out of $twk0 before storing
cfec1a
+	le?vperm	$tweak,$tweak,$tweak,$leperm
cfec1a
+	stvx_u		$tweak,0,$ivp
cfec1a
+
cfec1a
+Lxts_dec6x_ret:				# epilogue: wipe key copies, restore registers, pop frame
cfec1a
+	mtlr		r11			# must precede the reuse of r11 as an offset below
cfec1a
+	li		r10,`$FRAME+15`
cfec1a
+	li		r11,`$FRAME+31`
cfec1a
+	stvx		$seven,r10,$sp		# wipe copies of round keys
cfec1a
+	addi		r10,r10,32
cfec1a
+	stvx		$seven,r11,$sp
cfec1a
+	addi		r11,r11,32
cfec1a
+	stvx		$seven,r10,$sp
cfec1a
+	addi		r10,r10,32
cfec1a
+	stvx		$seven,r11,$sp
cfec1a
+	addi		r11,r11,32
cfec1a
+	stvx		$seven,r10,$sp
cfec1a
+	addi		r10,r10,32
cfec1a
+	stvx		$seven,r11,$sp
cfec1a
+	addi		r11,r11,32
cfec1a
+	stvx		$seven,r10,$sp
cfec1a
+	addi		r10,r10,32
cfec1a
+	stvx		$seven,r11,$sp
cfec1a
+	addi		r11,r11,32		# eight slots wiped; r10/r11 now index the v20/v21 save slots
cfec1a
+
cfec1a
+	mtspr		256,$vrsave		# put the saved value back into SPR 256
cfec1a
+	lvx		v20,r10,$sp		# ABI says so
cfec1a
+	addi		r10,r10,32
cfec1a
+	lvx		v21,r11,$sp
cfec1a
+	addi		r11,r11,32
cfec1a
+	lvx		v22,r10,$sp
cfec1a
+	addi		r10,r10,32
cfec1a
+	lvx		v23,r11,$sp
cfec1a
+	addi		r11,r11,32
cfec1a
+	lvx		v24,r10,$sp
cfec1a
+	addi		r10,r10,32
cfec1a
+	lvx		v25,r11,$sp
cfec1a
+	addi		r11,r11,32
cfec1a
+	lvx		v26,r10,$sp
cfec1a
+	addi		r10,r10,32
cfec1a
+	lvx		v27,r11,$sp
cfec1a
+	addi		r11,r11,32
cfec1a
+	lvx		v28,r10,$sp
cfec1a
+	addi		r10,r10,32
cfec1a
+	lvx		v29,r11,$sp
cfec1a
+	addi		r11,r11,32
cfec1a
+	lvx		v30,r10,$sp
cfec1a
+	lvx		v31,r11,$sp
cfec1a
+	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
cfec1a
+	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
cfec1a
+	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
cfec1a
+	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
cfec1a
+	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
cfec1a
+	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
cfec1a
+	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
cfec1a
+	blr
cfec1a
+	.long		0
cfec1a
+	.byte		0,12,0x04,1,0x80,6,6,0
cfec1a
+	.long		0
cfec1a
+
cfec1a
+.align	5
cfec1a
+_aesp8_xts_dec5x:			# helper: run $out0-$out4 through the inverse cipher rounds in parallel
cfec1a
+	vncipher	$out0,$out0,v24
cfec1a
+	vncipher	$out1,$out1,v24
cfec1a
+	vncipher	$out2,$out2,v24
cfec1a
+	vncipher	$out3,$out3,v24
cfec1a
+	vncipher	$out4,$out4,v24
cfec1a
+	lvx		v24,$x20,$key_		# round[3]
cfec1a
+	addi		$key_,$key_,0x20
cfec1a
+
cfec1a
+	vncipher	$out0,$out0,v25
cfec1a
+	vncipher	$out1,$out1,v25
cfec1a
+	vncipher	$out2,$out2,v25
cfec1a
+	vncipher	$out3,$out3,v25
cfec1a
+	vncipher	$out4,$out4,v25
cfec1a
+	lvx		v25,$x10,$key_		# round[4]
cfec1a
+	bdnz		_aesp8_xts_dec5x	# CTR counts the round pairs fetched through $key_
cfec1a
+# r0 = ($taillen - 1) & 16: 16 when $taillen is 0, else 0 for the 1-15 range andi. produces
cfec1a
+	subi		r0,$taillen,1
cfec1a
+	vncipher	$out0,$out0,v24
cfec1a
+	vncipher	$out1,$out1,v24
cfec1a
+	vncipher	$out2,$out2,v24
cfec1a
+	vncipher	$out3,$out3,v24
cfec1a
+	vncipher	$out4,$out4,v24
cfec1a
+
cfec1a
+	andi.		r0,r0,16
cfec1a
+	cmpwi		$taillen,0		# final cr0 value; callers branch on it after return
cfec1a
+	vncipher	$out0,$out0,v25
cfec1a
+	vncipher	$out1,$out1,v25
cfec1a
+	vncipher	$out2,$out2,v25
cfec1a
+	vncipher	$out3,$out3,v25
cfec1a
+	vncipher	$out4,$out4,v25
cfec1a
+	 vxor		$twk0,$twk0,v31		# fold v31 into tweak 0 for vncipherlast
cfec1a
+
cfec1a
+	sub		$inp,$inp,r0		# step back one block only when there is no tail
cfec1a
+	vncipher	$out0,$out0,v26
cfec1a
+	vncipher	$out1,$out1,v26
cfec1a
+	vncipher	$out2,$out2,v26
cfec1a
+	vncipher	$out3,$out3,v26
cfec1a
+	vncipher	$out4,$out4,v26
cfec1a
+	 vxor		$in1,$twk1,v31		# $in1-$in4 are reused for tweak xor v31
cfec1a
+
cfec1a
+	vncipher	$out0,$out0,v27
cfec1a
+	lvx_u		$in0,0,$inp		# block the caller pre-whitens for the steal path
cfec1a
+	vncipher	$out1,$out1,v27
cfec1a
+	vncipher	$out2,$out2,v27
cfec1a
+	vncipher	$out3,$out3,v27
cfec1a
+	vncipher	$out4,$out4,v27
cfec1a
+	 vxor		$in2,$twk2,v31
cfec1a
+
cfec1a
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
cfec1a
+	vncipher	$out0,$out0,v28
cfec1a
+	vncipher	$out1,$out1,v28
cfec1a
+	vncipher	$out2,$out2,v28
cfec1a
+	vncipher	$out3,$out3,v28
cfec1a
+	vncipher	$out4,$out4,v28
cfec1a
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
cfec1a
+	 vxor		$in3,$twk3,v31
cfec1a
+
cfec1a
+	vncipher	$out0,$out0,v29
cfec1a
+	le?vperm	$in0,$in0,$in0,$leperm
cfec1a
+	vncipher	$out1,$out1,v29
cfec1a
+	vncipher	$out2,$out2,v29
cfec1a
+	vncipher	$out3,$out3,v29
cfec1a
+	vncipher	$out4,$out4,v29
cfec1a
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
cfec1a
+	 vxor		$in4,$twk4,v31
cfec1a
+
cfec1a
+	vncipher	$out0,$out0,v30
cfec1a
+	vncipher	$out1,$out1,v30
cfec1a
+	vncipher	$out2,$out2,v30
cfec1a
+	vncipher	$out3,$out3,v30
cfec1a
+	vncipher	$out4,$out4,v30
cfec1a
+# final round: the last operand is the per-block tweak already xored with v31
cfec1a
+	vncipherlast	$out0,$out0,$twk0
cfec1a
+	vncipherlast	$out1,$out1,$in1
cfec1a
+	vncipherlast	$out2,$out2,$in2
cfec1a
+	vncipherlast	$out3,$out3,$in3
cfec1a
+	vncipherlast	$out4,$out4,$in4
cfec1a
+	mtctr		$rounds			# callers may branch straight into a bdnz loop
cfec1a
+	blr
cfec1a
+        .long   	0
cfec1a
+        .byte   	0,12,0x14,0,0,0,0,0
cfec1a
+___
cfec1a
+}}	}}}
cfec1a
+
cfec1a
 my $consts=1;
cfec1a
 foreach(split("\n",$code)) {
cfec1a
         s/\`([^\`]*)\`/eval($1)/geo;
cfec1a
diff -up openssl-1.0.2k/crypto/evp/e_aes.c.ppc-update openssl-1.0.2k/crypto/evp/e_aes.c
cfec1a
--- openssl-1.0.2k/crypto/evp/e_aes.c.ppc-update	2017-03-09 17:59:26.303232439 +0100
cfec1a
+++ openssl-1.0.2k/crypto/evp/e_aes.c	2017-03-09 17:59:26.314232696 +0100
cfec1a
@@ -172,6 +172,8 @@ void AES_xts_decrypt(const unsigned char
cfec1a
 #  define HWAES_decrypt aes_p8_decrypt
cfec1a
 #  define HWAES_cbc_encrypt aes_p8_cbc_encrypt
cfec1a
 #  define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks
cfec1a
+#  define HWAES_xts_encrypt aes_p8_xts_encrypt
cfec1a
+#  define HWAES_xts_decrypt aes_p8_xts_decrypt
cfec1a
 # endif
cfec1a
 
cfec1a
 # if     defined(AES_ASM) && !defined(I386_ONLY) &&      (  \
cfec1a
@@ -911,6 +913,12 @@ void HWAES_cbc_encrypt(const unsigned ch
cfec1a
 void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
cfec1a
                                 size_t len, const AES_KEY *key,
cfec1a
                                 const unsigned char ivec[16]);
cfec1a
+void HWAES_xts_encrypt(const unsigned char *inp, unsigned char *out,
cfec1a
+                       size_t len, const AES_KEY *key1,
cfec1a
+                       const AES_KEY *key2, const unsigned char iv[16]);
cfec1a
+void HWAES_xts_decrypt(const unsigned char *inp, unsigned char *out,
cfec1a
+                       size_t len, const AES_KEY *key1,
cfec1a
+                       const AES_KEY *key2, const unsigned char iv[16]);
cfec1a
 # endif
cfec1a
 
cfec1a
 # define BLOCK_CIPHER_generic_pack(nid,keylen,flags)             \
cfec1a
@@ -1664,10 +1672,16 @@ static int aes_xts_init_key(EVP_CIPHER_C
cfec1a
                     HWAES_set_encrypt_key(key, ctx->key_len * 4,
cfec1a
                                           &xctx->ks1.ks);
cfec1a
                     xctx->xts.block1 = (block128_f) HWAES_encrypt;
cfec1a
+# ifdef HWAES_xts_encrypt
cfec1a
+                    xctx->stream = HWAES_xts_encrypt;
cfec1a
+# endif
cfec1a
                 } else {
cfec1a
                     HWAES_set_decrypt_key(key, ctx->key_len * 4,
cfec1a
                                           &xctx->ks1.ks);
cfec1a
                     xctx->xts.block1 = (block128_f) HWAES_decrypt;
cfec1a
+# ifdef HWAES_xts_decrypt
cfec1a
+                    xctx->stream = HWAES_xts_decrypt;
cfec1a
+#endif
cfec1a
                 }
cfec1a
 
cfec1a
                 HWAES_set_encrypt_key(key + ctx->key_len / 2,