diff -up openssl-1.1.1e/crypto/chacha/asm/chacha-s390x.pl.s390x-update openssl-1.1.1e/crypto/chacha/asm/chacha-s390x.pl --- openssl-1.1.1e/crypto/chacha/asm/chacha-s390x.pl.s390x-update 2020-03-17 15:31:17.000000000 +0100 +++ openssl-1.1.1e/crypto/chacha/asm/chacha-s390x.pl 2020-03-19 16:45:05.483440129 +0100 @@ -20,41 +20,53 @@ # # 3 times faster than compiler-generated code. -$flavour = shift; +# +# August 2018 +# +# Add vx code path: 4x"vertical". +# +# Copyright IBM Corp. 2018 +# Author: Patrick Steuer + +# +# February 2019 +# +# Add 6x"horizontal" VX implementation. It's ~25% faster than IBM's +# 4x"vertical" submission [on z13] and >3 faster than scalar code. +# But to harness overheads revert to transliteration of VSX code path +# from chacha-ppc module, which is also 4x"vertical", to handle inputs +# not longer than 256 bytes. + +use strict; +use FindBin qw($Bin); +use lib "$Bin/../.."; +use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL INCLUDE); +my $flavour = shift; + +my ($z,$SIZE_T); if ($flavour =~ /3[12]/) { + $z=0; # S/390 ABI $SIZE_T=4; - $g=""; } else { + $z=1; # zSeries ABI $SIZE_T=8; - $g="g"; } +my $output; while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; - -sub AUTOLOAD() # thunk [simplified] x86-style perlasm -{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; - $code .= "\t$opcode\t".join(',',@_)."\n"; -} my $sp="%r15"; - my $stdframe=16*$SIZE_T+4*8; -my $frame=$stdframe+4*20; - -my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6)); +sub ROUND { my @x=map("%r$_",(0..7,"x","x","x","x",(10..13))); my @t=map("%r$_",(8,9)); - -sub ROUND { my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); -my ($xc,$xc_)=map("\"$_\"",@t); -my @x=map("\"$_\"",@x); +my ($xc,$xc_)=map("$_",@t); # Consider order in which variables are addressed by their # index: @@ -78,249 +90,967 @@ my @x=map("\"$_\"",@x); # 'c' stores and loads in the middle, but none in the beginning # or end. 
- ( - "&alr (@x[$a0],@x[$b0])", # Q1 - "&alr (@x[$a1],@x[$b1])", # Q2 - "&xr (@x[$d0],@x[$a0])", - "&xr (@x[$d1],@x[$a1])", - "&rll (@x[$d0],@x[$d0],16)", - "&rll (@x[$d1],@x[$d1],16)", - - "&alr ($xc,@x[$d0])", - "&alr ($xc_,@x[$d1])", - "&xr (@x[$b0],$xc)", - "&xr (@x[$b1],$xc_)", - "&rll (@x[$b0],@x[$b0],12)", - "&rll (@x[$b1],@x[$b1],12)", - - "&alr (@x[$a0],@x[$b0])", - "&alr (@x[$a1],@x[$b1])", - "&xr (@x[$d0],@x[$a0])", - "&xr (@x[$d1],@x[$a1])", - "&rll (@x[$d0],@x[$d0],8)", - "&rll (@x[$d1],@x[$d1],8)", - - "&alr ($xc,@x[$d0])", - "&alr ($xc_,@x[$d1])", - "&xr (@x[$b0],$xc)", - "&xr (@x[$b1],$xc_)", - "&rll (@x[$b0],@x[$b0],7)", - "&rll (@x[$b1],@x[$b1],7)", - - "&stm ($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')", # reload pair of 'c's - "&lm ($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')", - - "&alr (@x[$a2],@x[$b2])", # Q3 - "&alr (@x[$a3],@x[$b3])", # Q4 - "&xr (@x[$d2],@x[$a2])", - "&xr (@x[$d3],@x[$a3])", - "&rll (@x[$d2],@x[$d2],16)", - "&rll (@x[$d3],@x[$d3],16)", - - "&alr ($xc,@x[$d2])", - "&alr ($xc_,@x[$d3])", - "&xr (@x[$b2],$xc)", - "&xr (@x[$b3],$xc_)", - "&rll (@x[$b2],@x[$b2],12)", - "&rll (@x[$b3],@x[$b3],12)", - - "&alr (@x[$a2],@x[$b2])", - "&alr (@x[$a3],@x[$b3])", - "&xr (@x[$d2],@x[$a2])", - "&xr (@x[$d3],@x[$a3])", - "&rll (@x[$d2],@x[$d2],8)", - "&rll (@x[$d3],@x[$d3],8)", - - "&alr ($xc,@x[$d2])", - "&alr ($xc_,@x[$d3])", - "&xr (@x[$b2],$xc)", - "&xr (@x[$b3],$xc_)", - "&rll (@x[$b2],@x[$b2],7)", - "&rll (@x[$b3],@x[$b3],7)" - ); -} - -$code.=<<___; -.text - -.globl ChaCha20_ctr32 -.type ChaCha20_ctr32,\@function -.align 32 -ChaCha20_ctr32: - lt${g}r $len,$len # $len==0? - bzr %r14 - a${g}hi $len,-64 - l${g}hi %r1,-$frame - stm${g} %r6,%r15,`6*$SIZE_T`($sp) - sl${g}r $out,$inp # difference - la $len,0($inp,$len) # end of input minus 64 - larl %r7,.Lsigma - lgr %r0,$sp - la $sp,0(%r1,$sp) - st${g} %r0,0($sp) - - lmg %r8,%r11,0($key) # load key - lmg %r12,%r13,0($counter) # load counter - lmg %r6,%r7,0(%r7) # load sigma constant - - la %r14,0($inp) - st${g} $out,$frame+3*$SIZE_T($sp) - st${g} $len,$frame+4*$SIZE_T($sp) - stmg %r6,%r13,$stdframe($sp) # copy key schedule to stack - srlg @x[12],%r12,32 # 32-bit counter value - j .Loop_outer - -.align 16 -.Loop_outer: - lm @x[0],@x[7],$stdframe+4*0($sp) # load x[0]-x[7] - lm @t[0],@t[1],$stdframe+4*10($sp) # load x[10]-x[11] - lm @x[13],@x[15],$stdframe+4*13($sp) # load x[13]-x[15] - stm @t[0],@t[1],$stdframe+4*8+4*10($sp) # offload x[10]-x[11] - lm @t[0],@t[1],$stdframe+4*8($sp) # load x[8]-x[9] - st @x[12],$stdframe+4*12($sp) # save counter - st${g} %r14,$frame+2*$SIZE_T($sp) # save input pointer - lhi %r14,10 - j .Loop - -.align 4 -.Loop: -___ - foreach (&ROUND(0, 4, 8,12)) { eval; } - foreach (&ROUND(0, 5,10,15)) { eval; } -$code.=<<___; - brct %r14,.Loop - - l${g} %r14,$frame+2*$SIZE_T($sp) # pull input pointer - stm @t[0],@t[1],$stdframe+4*8+4*8($sp) # offload x[8]-x[9] - lm${g} @t[0],@t[1],$frame+3*$SIZE_T($sp) - - al @x[0],$stdframe+4*0($sp) # accumulate key schedule - al @x[1],$stdframe+4*1($sp) - al @x[2],$stdframe+4*2($sp) - al @x[3],$stdframe+4*3($sp) - al @x[4],$stdframe+4*4($sp) - al @x[5],$stdframe+4*5($sp) - al @x[6],$stdframe+4*6($sp) - al @x[7],$stdframe+4*7($sp) - lrvr @x[0],@x[0] - lrvr @x[1],@x[1] - lrvr @x[2],@x[2] - lrvr @x[3],@x[3] - lrvr @x[4],@x[4] - lrvr @x[5],@x[5] - lrvr @x[6],@x[6] - lrvr @x[7],@x[7] - al @x[12],$stdframe+4*12($sp) - al @x[13],$stdframe+4*13($sp) - al @x[14],$stdframe+4*14($sp) - al @x[15],$stdframe+4*15($sp) - lrvr @x[12],@x[12] - lrvr @x[13],@x[13] - lrvr @x[14],@x[14] - lrvr 
@x[15],@x[15] - - la @t[0],0(@t[0],%r14) # reconstruct output pointer - cl${g}r %r14,@t[1] - jh .Ltail - - x @x[0],4*0(%r14) # xor with input - x @x[1],4*1(%r14) - st @x[0],4*0(@t[0]) # store output - x @x[2],4*2(%r14) - st @x[1],4*1(@t[0]) - x @x[3],4*3(%r14) - st @x[2],4*2(@t[0]) - x @x[4],4*4(%r14) - st @x[3],4*3(@t[0]) - lm @x[0],@x[3],$stdframe+4*8+4*8($sp) # load x[8]-x[11] - x @x[5],4*5(%r14) - st @x[4],4*4(@t[0]) - x @x[6],4*6(%r14) - al @x[0],$stdframe+4*8($sp) - st @x[5],4*5(@t[0]) - x @x[7],4*7(%r14) - al @x[1],$stdframe+4*9($sp) - st @x[6],4*6(@t[0]) - x @x[12],4*12(%r14) - al @x[2],$stdframe+4*10($sp) - st @x[7],4*7(@t[0]) - x @x[13],4*13(%r14) - al @x[3],$stdframe+4*11($sp) - st @x[12],4*12(@t[0]) - x @x[14],4*14(%r14) - st @x[13],4*13(@t[0]) - x @x[15],4*15(%r14) - st @x[14],4*14(@t[0]) - lrvr @x[0],@x[0] - st @x[15],4*15(@t[0]) - lrvr @x[1],@x[1] - lrvr @x[2],@x[2] - lrvr @x[3],@x[3] - lhi @x[12],1 - x @x[0],4*8(%r14) - al @x[12],$stdframe+4*12($sp) # increment counter - x @x[1],4*9(%r14) - st @x[0],4*8(@t[0]) - x @x[2],4*10(%r14) - st @x[1],4*9(@t[0]) - x @x[3],4*11(%r14) - st @x[2],4*10(@t[0]) - st @x[3],4*11(@t[0]) - - cl${g}r %r14,@t[1] # done yet? - la %r14,64(%r14) - jl .Loop_outer - -.Ldone: - xgr %r0,%r0 - xgr %r1,%r1 - xgr %r2,%r2 - xgr %r3,%r3 - stmg %r0,%r3,$stdframe+4*4($sp) # wipe key copy - stmg %r0,%r3,$stdframe+4*12($sp) - - lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) - br %r14 - -.align 16 -.Ltail: - la @t[1],64($t[1]) - stm @x[0],@x[7],$stdframe+4*0($sp) - sl${g}r @t[1],%r14 - lm @x[0],@x[3],$stdframe+4*8+4*8($sp) - l${g}hi @x[6],0 - stm @x[12],@x[15],$stdframe+4*12($sp) - al @x[0],$stdframe+4*8($sp) - al @x[1],$stdframe+4*9($sp) - al @x[2],$stdframe+4*10($sp) - al @x[3],$stdframe+4*11($sp) - lrvr @x[0],@x[0] - lrvr @x[1],@x[1] - lrvr @x[2],@x[2] - lrvr @x[3],@x[3] - stm @x[0],@x[3],$stdframe+4*8($sp) - -.Loop_tail: - llgc @x[4],0(@x[6],%r14) - llgc @x[5],$stdframe(@x[6],$sp) - xr @x[5],@x[4] - stc @x[5],0(@x[6],@t[0]) - la @x[6],1(@x[6]) - brct @t[1],.Loop_tail - - j .Ldone -.size ChaCha20_ctr32,.-ChaCha20_ctr32 - -.align 32 -.Lsigma: -.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral -.asciz "ChaCha20 for s390x, CRYPTOGAMS by " -.align 4 -___ + alr (@x[$a0],@x[$b0]); # Q1 + alr (@x[$a1],@x[$b1]); # Q2 + xr (@x[$d0],@x[$a0]); + xr (@x[$d1],@x[$a1]); + rll (@x[$d0],@x[$d0],16); + rll (@x[$d1],@x[$d1],16); + + alr ($xc,@x[$d0]); + alr ($xc_,@x[$d1]); + xr (@x[$b0],$xc); + xr (@x[$b1],$xc_); + rll (@x[$b0],@x[$b0],12); + rll (@x[$b1],@x[$b1],12); + + alr (@x[$a0],@x[$b0]); + alr (@x[$a1],@x[$b1]); + xr (@x[$d0],@x[$a0]); + xr (@x[$d1],@x[$a1]); + rll (@x[$d0],@x[$d0],8); + rll (@x[$d1],@x[$d1],8); + + alr ($xc,@x[$d0]); + alr ($xc_,@x[$d1]); + xr (@x[$b0],$xc); + xr (@x[$b1],$xc_); + rll (@x[$b0],@x[$b0],7); + rll (@x[$b1],@x[$b1],7); + + stm ($xc,$xc_,"$stdframe+4*8+4*$c0($sp)"); # reload pair of 'c's + lm ($xc,$xc_,"$stdframe+4*8+4*$c2($sp)"); + + alr (@x[$a2],@x[$b2]); # Q3 + alr (@x[$a3],@x[$b3]); # Q4 + xr (@x[$d2],@x[$a2]); + xr (@x[$d3],@x[$a3]); + rll (@x[$d2],@x[$d2],16); + rll (@x[$d3],@x[$d3],16); + + alr ($xc,@x[$d2]); + alr ($xc_,@x[$d3]); + xr (@x[$b2],$xc); + xr (@x[$b3],$xc_); + rll (@x[$b2],@x[$b2],12); + rll (@x[$b3],@x[$b3],12); + + alr (@x[$a2],@x[$b2]); + alr (@x[$a3],@x[$b3]); + xr (@x[$d2],@x[$a2]); + xr (@x[$d3],@x[$a3]); + rll (@x[$d2],@x[$d2],8); + rll (@x[$d3],@x[$d3],8); + + alr ($xc,@x[$d2]); + alr ($xc_,@x[$d3]); + xr (@x[$b2],$xc); + xr (@x[$b3],$xc_); + rll (@x[$b2],@x[$b2],7); + rll (@x[$b3],@x[$b3],7); +} + +sub 
VX_lane_ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my @x=map("%v$_",(0..15)); -foreach (split("\n",$code)) { - s/\`([^\`]*)\`/eval $1/ge; + vaf (@x[$a0],@x[$a0],@x[$b0]); # Q1 + vx (@x[$d0],@x[$d0],@x[$a0]); + verllf (@x[$d0],@x[$d0],16); + vaf (@x[$a1],@x[$a1],@x[$b1]); # Q2 + vx (@x[$d1],@x[$d1],@x[$a1]); + verllf (@x[$d1],@x[$d1],16); + vaf (@x[$a2],@x[$a2],@x[$b2]); # Q3 + vx (@x[$d2],@x[$d2],@x[$a2]); + verllf (@x[$d2],@x[$d2],16); + vaf (@x[$a3],@x[$a3],@x[$b3]); # Q4 + vx (@x[$d3],@x[$d3],@x[$a3]); + verllf (@x[$d3],@x[$d3],16); + + vaf (@x[$c0],@x[$c0],@x[$d0]); + vx (@x[$b0],@x[$b0],@x[$c0]); + verllf (@x[$b0],@x[$b0],12); + vaf (@x[$c1],@x[$c1],@x[$d1]); + vx (@x[$b1],@x[$b1],@x[$c1]); + verllf (@x[$b1],@x[$b1],12); + vaf (@x[$c2],@x[$c2],@x[$d2]); + vx (@x[$b2],@x[$b2],@x[$c2]); + verllf (@x[$b2],@x[$b2],12); + vaf (@x[$c3],@x[$c3],@x[$d3]); + vx (@x[$b3],@x[$b3],@x[$c3]); + verllf (@x[$b3],@x[$b3],12); + + vaf (@x[$a0],@x[$a0],@x[$b0]); + vx (@x[$d0],@x[$d0],@x[$a0]); + verllf (@x[$d0],@x[$d0],8); + vaf (@x[$a1],@x[$a1],@x[$b1]); + vx (@x[$d1],@x[$d1],@x[$a1]); + verllf (@x[$d1],@x[$d1],8); + vaf (@x[$a2],@x[$a2],@x[$b2]); + vx (@x[$d2],@x[$d2],@x[$a2]); + verllf (@x[$d2],@x[$d2],8); + vaf (@x[$a3],@x[$a3],@x[$b3]); + vx (@x[$d3],@x[$d3],@x[$a3]); + verllf (@x[$d3],@x[$d3],8); + + vaf (@x[$c0],@x[$c0],@x[$d0]); + vx (@x[$b0],@x[$b0],@x[$c0]); + verllf (@x[$b0],@x[$b0],7); + vaf (@x[$c1],@x[$c1],@x[$d1]); + vx (@x[$b1],@x[$b1],@x[$c1]); + verllf (@x[$b1],@x[$b1],7); + vaf (@x[$c2],@x[$c2],@x[$d2]); + vx (@x[$b2],@x[$b2],@x[$c2]); + verllf (@x[$b2],@x[$b2],7); + vaf (@x[$c3],@x[$c3],@x[$d3]); + vx (@x[$b3],@x[$b3],@x[$c3]); + verllf (@x[$b3],@x[$b3],7); +} - print $_,"\n"; +sub VX_ROUND { +my @a=@_[0..5]; +my @b=@_[6..11]; +my @c=@_[12..17]; +my @d=@_[18..23]; +my $odd=@_[24]; + + vaf (@a[$_],@a[$_],@b[$_]) for (0..5); + vx (@d[$_],@d[$_],@a[$_]) for (0..5); + verllf (@d[$_],@d[$_],16) for (0..5); + + vaf (@c[$_],@c[$_],@d[$_]) for (0..5); + vx (@b[$_],@b[$_],@c[$_]) for (0..5); + verllf (@b[$_],@b[$_],12) for (0..5); + + vaf (@a[$_],@a[$_],@b[$_]) for (0..5); + vx (@d[$_],@d[$_],@a[$_]) for (0..5); + verllf (@d[$_],@d[$_],8) for (0..5); + + vaf (@c[$_],@c[$_],@d[$_]) for (0..5); + vx (@b[$_],@b[$_],@c[$_]) for (0..5); + verllf (@b[$_],@b[$_],7) for (0..5); + + vsldb (@c[$_],@c[$_],@c[$_],8) for (0..5); + vsldb (@b[$_],@b[$_],@b[$_],$odd?12:4) for (0..5); + vsldb (@d[$_],@d[$_],@d[$_],$odd?4:12) for (0..5); } -close STDOUT or die "error closing STDOUT: $!"; + +PERLASM_BEGIN($output); + +INCLUDE ("s390x_arch.h"); +TEXT (); + +################ +# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len, +# const unsigned int key[8], const unsigned int counter[4]) +my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6)); +{ +my $frame=$stdframe+4*20; +my @x=map("%r$_",(0..7,"x","x","x","x",(10..13))); +my @t=map("%r$_",(8,9)); + +GLOBL ("ChaCha20_ctr32"); +TYPE ("ChaCha20_ctr32","\@function"); +ALIGN (32); +LABEL ("ChaCha20_ctr32"); + larl ("%r1","OPENSSL_s390xcap_P"); + + lghi ("%r0",64); +&{$z? \<gr:\<r} ($len,$len); # len==0? + bzr ("%r14"); + lg ("%r1","S390X_STFLE+16(%r1)"); +&{$z? \&clgr:\&clr} ($len,"%r0"); + jle (".Lshort"); + + tmhh ("%r1",0x4000); # check for vx bit + jnz (".LChaCha20_ctr32_vx"); + +LABEL (".Lshort"); +&{$z? \&aghi:\&ahi} ($len,-64); +&{$z? 
\&lghi:\&lhi} ("%r1",-$frame); +&{$z? \&stmg:\&stm} ("%r6","%r15","6*$SIZE_T($sp)"); +&{$z? \&slgr:\&slr} ($out,$inp); # difference + la ($len,"0($inp,$len)"); # end of input minus 64 + larl ("%r7",".Lsigma"); + lgr ("%r0",$sp); + la ($sp,"0(%r1,$sp)"); +&{$z? \&stg:\&st} ("%r0","0($sp)"); + + lmg ("%r8","%r11","0($key)"); # load key + lmg ("%r12","%r13","0($counter)"); # load counter + lmg ("%r6","%r7","0(%r7)"); # load sigma constant + + la ("%r14","0($inp)"); +&{$z? \&stg:\&st} ($out,"$frame+3*$SIZE_T($sp)"); +&{$z? \&stg:\&st} ($len,"$frame+4*$SIZE_T($sp)"); + stmg ("%r6","%r13","$stdframe($sp)");# copy key schedule to stack + srlg (@x[12],"%r12",32); # 32-bit counter value + j (".Loop_outer"); + +ALIGN (16); +LABEL (".Loop_outer"); + lm (@x[0],@x[7],"$stdframe+4*0($sp)"); # load x[0]-x[7] + lm (@t[0],@t[1],"$stdframe+4*10($sp)"); # load x[10]-x[11] + lm (@x[13],@x[15],"$stdframe+4*13($sp)"); # load x[13]-x[15] + stm (@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11] + lm (@t[0],@t[1],"$stdframe+4*8($sp)"); # load x[8]-x[9] + st (@x[12],"$stdframe+4*12($sp)"); # save counter +&{$z? \&stg:\&st} ("%r14","$frame+2*$SIZE_T($sp)");# save input pointer + lhi ("%r14",10); + j (".Loop"); + +ALIGN (4); +LABEL (".Loop"); + ROUND (0, 4, 8,12); + ROUND (0, 5,10,15); + brct ("%r14",".Loop"); + +&{$z? \&lg:\&l} ("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer + stm (@t[0],@t[1],"$stdframe+4*8+4*8($sp)"); # offload x[8]-x[9] +&{$z? \&lmg:\&lm} (@t[0],@t[1],"$frame+3*$SIZE_T($sp)"); + + al (@x[0],"$stdframe+4*0($sp)"); # accumulate key schedule + al (@x[1],"$stdframe+4*1($sp)"); + al (@x[2],"$stdframe+4*2($sp)"); + al (@x[3],"$stdframe+4*3($sp)"); + al (@x[4],"$stdframe+4*4($sp)"); + al (@x[5],"$stdframe+4*5($sp)"); + al (@x[6],"$stdframe+4*6($sp)"); + al (@x[7],"$stdframe+4*7($sp)"); + lrvr (@x[0],@x[0]); + lrvr (@x[1],@x[1]); + lrvr (@x[2],@x[2]); + lrvr (@x[3],@x[3]); + lrvr (@x[4],@x[4]); + lrvr (@x[5],@x[5]); + lrvr (@x[6],@x[6]); + lrvr (@x[7],@x[7]); + al (@x[12],"$stdframe+4*12($sp)"); + al (@x[13],"$stdframe+4*13($sp)"); + al (@x[14],"$stdframe+4*14($sp)"); + al (@x[15],"$stdframe+4*15($sp)"); + lrvr (@x[12],@x[12]); + lrvr (@x[13],@x[13]); + lrvr (@x[14],@x[14]); + lrvr (@x[15],@x[15]); + + la (@t[0],"0(@t[0],%r14)"); # reconstruct output pointer +&{$z? 
\&clgr:\&clr} ("%r14",@t[1]); + jh (".Ltail"); + + x (@x[0],"4*0(%r14)"); # xor with input + x (@x[1],"4*1(%r14)"); + st (@x[0],"4*0(@t[0])"); # store output + x (@x[2],"4*2(%r14)"); + st (@x[1],"4*1(@t[0])"); + x (@x[3],"4*3(%r14)"); + st (@x[2],"4*2(@t[0])"); + x (@x[4],"4*4(%r14)"); + st (@x[3],"4*3(@t[0])"); + lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)"); # load x[8]-x[11] + x (@x[5],"4*5(%r14)"); + st (@x[4],"4*4(@t[0])"); + x (@x[6],"4*6(%r14)"); + al (@x[0],"$stdframe+4*8($sp)"); + st (@x[5],"4*5(@t[0])"); + x (@x[7],"4*7(%r14)"); + al (@x[1],"$stdframe+4*9($sp)"); + st (@x[6],"4*6(@t[0])"); + x (@x[12],"4*12(%r14)"); + al (@x[2],"$stdframe+4*10($sp)"); + st (@x[7],"4*7(@t[0])"); + x (@x[13],"4*13(%r14)"); + al (@x[3],"$stdframe+4*11($sp)"); + st (@x[12],"4*12(@t[0])"); + x (@x[14],"4*14(%r14)"); + st (@x[13],"4*13(@t[0])"); + x (@x[15],"4*15(%r14)"); + st (@x[14],"4*14(@t[0])"); + lrvr (@x[0],@x[0]); + st (@x[15],"4*15(@t[0])"); + lrvr (@x[1],@x[1]); + lrvr (@x[2],@x[2]); + lrvr (@x[3],@x[3]); + lhi (@x[12],1); + x (@x[0],"4*8(%r14)"); + al (@x[12],"$stdframe+4*12($sp)"); # increment counter + x (@x[1],"4*9(%r14)"); + st (@x[0],"4*8(@t[0])"); + x (@x[2],"4*10(%r14)"); + st (@x[1],"4*9(@t[0])"); + x (@x[3],"4*11(%r14)"); + st (@x[2],"4*10(@t[0])"); + st (@x[3],"4*11(@t[0])"); + +&{$z? \&clgr:\&clr} ("%r14",@t[1]); # done yet? + la ("%r14","64(%r14)"); + jl (".Loop_outer"); + +LABEL (".Ldone"); + xgr ("%r0","%r0"); + xgr ("%r1","%r1"); + xgr ("%r2","%r2"); + xgr ("%r3","%r3"); + stmg ("%r0","%r3","$stdframe+4*4($sp)"); # wipe key copy + stmg ("%r0","%r3","$stdframe+4*12($sp)"); + +&{$z? \&lmg:\&lm} ("%r6","%r15","$frame+6*$SIZE_T($sp)"); + br ("%r14"); + +ALIGN (16); +LABEL (".Ltail"); + la (@t[1],"64($t[1])"); + stm (@x[0],@x[7],"$stdframe+4*0($sp)"); +&{$z? \&slgr:\&slr} (@t[1],"%r14"); + lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)"); +&{$z? \&lghi:\&lhi} (@x[6],0); + stm (@x[12],@x[15],"$stdframe+4*12($sp)"); + al (@x[0],"$stdframe+4*8($sp)"); + al (@x[1],"$stdframe+4*9($sp)"); + al (@x[2],"$stdframe+4*10($sp)"); + al (@x[3],"$stdframe+4*11($sp)"); + lrvr (@x[0],@x[0]); + lrvr (@x[1],@x[1]); + lrvr (@x[2],@x[2]); + lrvr (@x[3],@x[3]); + stm (@x[0],@x[3],"$stdframe+4*8($sp)"); + +LABEL (".Loop_tail"); + llgc (@x[4],"0(@x[6],%r14)"); + llgc (@x[5],"$stdframe(@x[6],$sp)"); + xr (@x[5],@x[4]); + stc (@x[5],"0(@x[6],@t[0])"); + la (@x[6],"1(@x[6])"); + brct (@t[1],".Loop_tail"); + + j (".Ldone"); +SIZE ("ChaCha20_ctr32",".-ChaCha20_ctr32"); +} + +######################################################################## +# 4x"vertical" layout minimizes amount of instructions, but pipeline +# runs underutilized [because of vector instructions' high latency]. +# On the other hand minimum amount of data it takes to fully utilize +# the pipeline is higher, so that effectively, short inputs would be +# processed slower. Hence this code path targeting <=256 bytes lengths. +# +{ +my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%v$_",(0..15)); +my @K=map("%v$_",(16..19)); +my $CTR="%v26"; +my ($xt0,$xt1,$xt2,$xt3)=map("%v$_",(27..30)); +my $beperm="%v31"; +my ($x00,$x10,$x20,$x30)=(0,map("r$_",(8..10))); +my $FRAME=$stdframe+4*16; + +ALIGN (32); +LABEL ("ChaCha20_ctr32_4x"); +LABEL (".LChaCha20_ctr32_4x"); +&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)"); +if (!$z) { + std ("%f4","16*$SIZE_T+2*8($sp)"); + std ("%f6","16*$SIZE_T+3*8($sp)"); +} +&{$z? \&lghi:\&lhi} ("%r1",-$FRAME); + lgr ("%r0",$sp); + la ($sp,"0(%r1,$sp)"); +&{$z? 
\&stg:\&st} ("%r0","0($sp)"); # back-chain +if ($z) { + std ("%f8","$stdframe+8*0($sp)"); + std ("%f9","$stdframe+8*1($sp)"); + std ("%f10","$stdframe+8*2($sp)"); + std ("%f11","$stdframe+8*3($sp)"); + std ("%f12","$stdframe+8*4($sp)"); + std ("%f13","$stdframe+8*5($sp)"); + std ("%f14","$stdframe+8*6($sp)"); + std ("%f15","$stdframe+8*7($sp)"); +} + larl ("%r7",".Lsigma"); + lhi ("%r0",10); + lhi ("%r1",0); + + vl (@K[0],"0(%r7)"); # load sigma + vl (@K[1],"0($key)"); # load key + vl (@K[2],"16($key)"); + vl (@K[3],"0($counter)"); # load counter + + vl ($beperm,"0x40(%r7)"); + vl ($xt1,"0x50(%r7)"); + vrepf ($CTR,@K[3],0); + vlvgf (@K[3],"%r1",0); # clear @K[3].word[0] + vaf ($CTR,$CTR,$xt1); + +#LABEL (".Loop_outer_4x"); + vlm ($xa0,$xa3,"0x60(%r7)"); # load [smashed] sigma + + vrepf ($xb0,@K[1],0); # smash the key + vrepf ($xb1,@K[1],1); + vrepf ($xb2,@K[1],2); + vrepf ($xb3,@K[1],3); + + vrepf ($xc0,@K[2],0); + vrepf ($xc1,@K[2],1); + vrepf ($xc2,@K[2],2); + vrepf ($xc3,@K[2],3); + + vlr ($xd0,$CTR); + vrepf ($xd1,@K[3],1); + vrepf ($xd2,@K[3],2); + vrepf ($xd3,@K[3],3); + +LABEL (".Loop_4x"); + VX_lane_ROUND(0, 4, 8,12); + VX_lane_ROUND(0, 5,10,15); + brct ("%r0",".Loop_4x"); + + vaf ($xd0,$xd0,$CTR); + + vmrhf ($xt0,$xa0,$xa1); # transpose data + vmrhf ($xt1,$xa2,$xa3); + vmrlf ($xt2,$xa0,$xa1); + vmrlf ($xt3,$xa2,$xa3); + vpdi ($xa0,$xt0,$xt1,0b0000); + vpdi ($xa1,$xt0,$xt1,0b0101); + vpdi ($xa2,$xt2,$xt3,0b0000); + vpdi ($xa3,$xt2,$xt3,0b0101); + + vmrhf ($xt0,$xb0,$xb1); + vmrhf ($xt1,$xb2,$xb3); + vmrlf ($xt2,$xb0,$xb1); + vmrlf ($xt3,$xb2,$xb3); + vpdi ($xb0,$xt0,$xt1,0b0000); + vpdi ($xb1,$xt0,$xt1,0b0101); + vpdi ($xb2,$xt2,$xt3,0b0000); + vpdi ($xb3,$xt2,$xt3,0b0101); + + vmrhf ($xt0,$xc0,$xc1); + vmrhf ($xt1,$xc2,$xc3); + vmrlf ($xt2,$xc0,$xc1); + vmrlf ($xt3,$xc2,$xc3); + vpdi ($xc0,$xt0,$xt1,0b0000); + vpdi ($xc1,$xt0,$xt1,0b0101); + vpdi ($xc2,$xt2,$xt3,0b0000); + vpdi ($xc3,$xt2,$xt3,0b0101); + + vmrhf ($xt0,$xd0,$xd1); + vmrhf ($xt1,$xd2,$xd3); + vmrlf ($xt2,$xd0,$xd1); + vmrlf ($xt3,$xd2,$xd3); + vpdi ($xd0,$xt0,$xt1,0b0000); + vpdi ($xd1,$xt0,$xt1,0b0101); + vpdi ($xd2,$xt2,$xt3,0b0000); + vpdi ($xd3,$xt2,$xt3,0b0101); + + #vrepif ($xt0,4); + #vaf ($CTR,$CTR,$xt0); # next counter value + + vaf ($xa0,$xa0,@K[0]); + vaf ($xb0,$xb0,@K[1]); + vaf ($xc0,$xc0,@K[2]); + vaf ($xd0,$xd0,@K[3]); + + vperm ($xa0,$xa0,$xa0,$beperm); + vperm ($xb0,$xb0,$xb0,$beperm); + vperm ($xc0,$xc0,$xc0,$beperm); + vperm ($xd0,$xd0,$xd0,$beperm); + + #&{$z? \&clgfi:\&clfi} ($len,0x40); + #jl (".Ltail_4x"); + + vlm ($xt0,$xt3,"0($inp)"); + + vx ($xt0,$xt0,$xa0); + vx ($xt1,$xt1,$xb0); + vx ($xt2,$xt2,$xc0); + vx ($xt3,$xt3,$xd0); + + vstm ($xt0,$xt3,"0($out)"); + + la ($inp,"0x40($inp)"); + la ($out,"0x40($out)"); +&{$z? \&aghi:\&ahi} ($len,-0x40); + #je (".Ldone_4x"); + + vaf ($xa0,$xa1,@K[0]); + vaf ($xb0,$xb1,@K[1]); + vaf ($xc0,$xc1,@K[2]); + vaf ($xd0,$xd1,@K[3]); + + vperm ($xa0,$xa0,$xa0,$beperm); + vperm ($xb0,$xb0,$xb0,$beperm); + vperm ($xc0,$xc0,$xc0,$beperm); + vperm ($xd0,$xd0,$xd0,$beperm); + +&{$z? \&clgfi:\&clfi} ($len,0x40); + jl (".Ltail_4x"); + + vlm ($xt0,$xt3,"0($inp)"); + + vx ($xt0,$xt0,$xa0); + vx ($xt1,$xt1,$xb0); + vx ($xt2,$xt2,$xc0); + vx ($xt3,$xt3,$xd0); + + vstm ($xt0,$xt3,"0($out)"); + + la ($inp,"0x40($inp)"); + la ($out,"0x40($out)"); +&{$z? 
\&aghi:\&ahi} ($len,-0x40); + je (".Ldone_4x"); + + vaf ($xa0,$xa2,@K[0]); + vaf ($xb0,$xb2,@K[1]); + vaf ($xc0,$xc2,@K[2]); + vaf ($xd0,$xd2,@K[3]); + + vperm ($xa0,$xa0,$xa0,$beperm); + vperm ($xb0,$xb0,$xb0,$beperm); + vperm ($xc0,$xc0,$xc0,$beperm); + vperm ($xd0,$xd0,$xd0,$beperm); + +&{$z? \&clgfi:\&clfi} ($len,0x40); + jl (".Ltail_4x"); + + vlm ($xt0,$xt3,"0($inp)"); + + vx ($xt0,$xt0,$xa0); + vx ($xt1,$xt1,$xb0); + vx ($xt2,$xt2,$xc0); + vx ($xt3,$xt3,$xd0); + + vstm ($xt0,$xt3,"0($out)"); + + la ($inp,"0x40($inp)"); + la ($out,"0x40($out)"); +&{$z? \&aghi:\&ahi} ($len,-0x40); + je (".Ldone_4x"); + + vaf ($xa0,$xa3,@K[0]); + vaf ($xb0,$xb3,@K[1]); + vaf ($xc0,$xc3,@K[2]); + vaf ($xd0,$xd3,@K[3]); + + vperm ($xa0,$xa0,$xa0,$beperm); + vperm ($xb0,$xb0,$xb0,$beperm); + vperm ($xc0,$xc0,$xc0,$beperm); + vperm ($xd0,$xd0,$xd0,$beperm); + +&{$z? \&clgfi:\&clfi} ($len,0x40); + jl (".Ltail_4x"); + + vlm ($xt0,$xt3,"0($inp)"); + + vx ($xt0,$xt0,$xa0); + vx ($xt1,$xt1,$xb0); + vx ($xt2,$xt2,$xc0); + vx ($xt3,$xt3,$xd0); + + vstm ($xt0,$xt3,"0($out)"); + + #la $inp,0x40($inp)); + #la $out,0x40($out)); + #lhi %r0,10); + #&{$z? \&aghi:\&ahi} $len,-0x40); + #jne .Loop_outer_4x); + +LABEL (".Ldone_4x"); +if (!$z) { + ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)"); + ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)"); +} else { + ld ("%f8","$stdframe+8*0($sp)"); + ld ("%f9","$stdframe+8*1($sp)"); + ld ("%f10","$stdframe+8*2($sp)"); + ld ("%f11","$stdframe+8*3($sp)"); + ld ("%f12","$stdframe+8*4($sp)"); + ld ("%f13","$stdframe+8*5($sp)"); + ld ("%f14","$stdframe+8*6($sp)"); + ld ("%f15","$stdframe+8*7($sp)"); +} +&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)"); + la ($sp,"$FRAME($sp)"); + br ("%r14"); + +ALIGN (16); +LABEL (".Ltail_4x"); +if (!$z) { + vlr ($xt0,$xb0); + ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)"); + ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)"); + + vst ($xa0,"$stdframe+0x00($sp)"); + vst ($xt0,"$stdframe+0x10($sp)"); + vst ($xc0,"$stdframe+0x20($sp)"); + vst ($xd0,"$stdframe+0x30($sp)"); +} else { + vlr ($xt0,$xc0); + ld ("%f8","$stdframe+8*0($sp)"); + ld ("%f9","$stdframe+8*1($sp)"); + ld ("%f10","$stdframe+8*2($sp)"); + ld ("%f11","$stdframe+8*3($sp)"); + vlr ($xt1,$xd0); + ld ("%f12","$stdframe+8*4($sp)"); + ld ("%f13","$stdframe+8*5($sp)"); + ld ("%f14","$stdframe+8*6($sp)"); + ld ("%f15","$stdframe+8*7($sp)"); + + vst ($xa0,"$stdframe+0x00($sp)"); + vst ($xb0,"$stdframe+0x10($sp)"); + vst ($xt0,"$stdframe+0x20($sp)"); + vst ($xt1,"$stdframe+0x30($sp)"); +} + lghi ("%r1",0); + +LABEL (".Loop_tail_4x"); + llgc ("%r5","0(%r1,$inp)"); + llgc ("%r6","$stdframe(%r1,$sp)"); + xr ("%r6","%r5"); + stc ("%r6","0(%r1,$out)"); + la ("%r1","1(%r1)"); + brct ($len,".Loop_tail_4x"); + +&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)"); + la ($sp,"$FRAME($sp)"); + br ("%r14"); +SIZE ("ChaCha20_ctr32_4x",".-ChaCha20_ctr32_4x"); +} + +######################################################################## +# 6x"horizontal" layout is optimal fit for the platform in its current +# shape, more specifically for given vector instructions' latency. Well, +# computational part of 8x"vertical" would be faster, but it consumes +# all registers and dealing with that will diminish the return... 
+# +{ +my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1, + $a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3, + $a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5)=map("%v$_",(0..23)); +my @K=map("%v$_",(27,24..26)); +my ($t0,$t1,$t2,$t3)=map("%v$_",27..30); +my $beperm="%v31"; +my $FRAME=$stdframe + 4*16; + +GLOBL ("ChaCha20_ctr32_vx"); +ALIGN (32); +LABEL ("ChaCha20_ctr32_vx"); +LABEL (".LChaCha20_ctr32_vx"); +&{$z? \&clgfi:\&clfi} ($len,256); + jle (".LChaCha20_ctr32_4x"); +&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)"); +if (!$z) { + std ("%f4","16*$SIZE_T+2*8($sp)"); + std ("%f6","16*$SIZE_T+3*8($sp)"); +} +&{$z? \&lghi:\&lhi} ("%r1",-$FRAME); + lgr ("%r0",$sp); + la ($sp,"0(%r1,$sp)"); +&{$z? \&stg:\&st} ("%r0","0($sp)"); # back-chain +if ($z) { + std ("%f8","$FRAME-8*8($sp)"); + std ("%f9","$FRAME-8*7($sp)"); + std ("%f10","$FRAME-8*6($sp)"); + std ("%f11","$FRAME-8*5($sp)"); + std ("%f12","$FRAME-8*4($sp)"); + std ("%f13","$FRAME-8*3($sp)"); + std ("%f14","$FRAME-8*2($sp)"); + std ("%f15","$FRAME-8*1($sp)"); +} + larl ("%r7",".Lsigma"); + lhi ("%r0",10); + + vlm (@K[1],@K[2],"0($key)"); # load key + vl (@K[3],"0($counter)"); # load counter + + vlm (@K[0],"$beperm","0(%r7)"); # load sigma, increments, ... + +LABEL (".Loop_outer_vx"); + vlr ($a0,@K[0]); + vlr ($b0,@K[1]); + vlr ($a1,@K[0]); + vlr ($b1,@K[1]); + vlr ($a2,@K[0]); + vlr ($b2,@K[1]); + vlr ($a3,@K[0]); + vlr ($b3,@K[1]); + vlr ($a4,@K[0]); + vlr ($b4,@K[1]); + vlr ($a5,@K[0]); + vlr ($b5,@K[1]); + + vlr ($d0,@K[3]); + vaf ($d1,@K[3],$t1); # K[3]+1 + vaf ($d2,@K[3],$t2); # K[3]+2 + vaf ($d3,@K[3],$t3); # K[3]+3 + vaf ($d4,$d2,$t2); # K[3]+4 + vaf ($d5,$d2,$t3); # K[3]+5 + + vlr ($c0,@K[2]); + vlr ($c1,@K[2]); + vlr ($c2,@K[2]); + vlr ($c3,@K[2]); + vlr ($c4,@K[2]); + vlr ($c5,@K[2]); + + vlr ($t1,$d1); + vlr ($t2,$d2); + vlr ($t3,$d3); + +ALIGN (4); +LABEL (".Loop_vx"); + + VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5, + $b0,$b1,$b2,$b3,$b4,$b5, + $c0,$c1,$c2,$c3,$c4,$c5, + $d0,$d1,$d2,$d3,$d4,$d5, + 0); + + VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5, + $b0,$b1,$b2,$b3,$b4,$b5, + $c0,$c1,$c2,$c3,$c4,$c5, + $d0,$d1,$d2,$d3,$d4,$d5, + 1); + + brct ("%r0",".Loop_vx"); + + vaf ($a0,$a0,@K[0]); + vaf ($b0,$b0,@K[1]); + vaf ($c0,$c0,@K[2]); + vaf ($d0,$d0,@K[3]); + vaf ($a1,$a1,@K[0]); + vaf ($d1,$d1,$t1); # +K[3]+1 + + vperm ($a0,$a0,$a0,$beperm); + vperm ($b0,$b0,$b0,$beperm); + vperm ($c0,$c0,$c0,$beperm); + vperm ($d0,$d0,$d0,$beperm); + +&{$z? \&clgfi:\&clfi} ($len,0x40); + jl (".Ltail_vx"); + + vaf ($d2,$d2,$t2); # +K[3]+2 + vaf ($d3,$d3,$t3); # +K[3]+3 + vlm ($t0,$t3,"0($inp)"); + + vx ($a0,$a0,$t0); + vx ($b0,$b0,$t1); + vx ($c0,$c0,$t2); + vx ($d0,$d0,$t3); + + vlm (@K[0],$t3,"0(%r7)"); # re-load sigma and increments + + vstm ($a0,$d0,"0($out)"); + + la ($inp,"0x40($inp)"); + la ($out,"0x40($out)"); +&{$z? \&aghi:\&ahi} ($len,-0x40); + je (".Ldone_vx"); + + vaf ($b1,$b1,@K[1]); + vaf ($c1,$c1,@K[2]); + + vperm ($a0,$a1,$a1,$beperm); + vperm ($b0,$b1,$b1,$beperm); + vperm ($c0,$c1,$c1,$beperm); + vperm ($d0,$d1,$d1,$beperm); + +&{$z? \&clgfi:\&clfi} ($len,0x40); + jl (".Ltail_vx"); + + vlm ($a1,$d1,"0($inp)"); + + vx ($a0,$a0,$a1); + vx ($b0,$b0,$b1); + vx ($c0,$c0,$c1); + vx ($d0,$d0,$d1); + + vstm ($a0,$d0,"0($out)"); + + la ($inp,"0x40($inp)"); + la ($out,"0x40($out)"); +&{$z? \&aghi:\&ahi} ($len,-0x40); + je (".Ldone_vx"); + + vaf ($a2,$a2,@K[0]); + vaf ($b2,$b2,@K[1]); + vaf ($c2,$c2,@K[2]); + + vperm ($a0,$a2,$a2,$beperm); + vperm ($b0,$b2,$b2,$beperm); + vperm ($c0,$c2,$c2,$beperm); + vperm ($d0,$d2,$d2,$beperm); + +&{$z? 
\&clgfi:\&clfi} ($len,0x40); + jl (".Ltail_vx"); + + vlm ($a1,$d1,"0($inp)"); + + vx ($a0,$a0,$a1); + vx ($b0,$b0,$b1); + vx ($c0,$c0,$c1); + vx ($d0,$d0,$d1); + + vstm ($a0,$d0,"0($out)"); + + la ($inp,"0x40($inp)"); + la ($out,"0x40($out)"); +&{$z? \&aghi:\&ahi} ($len,-0x40); + je (".Ldone_vx"); + + vaf ($a3,$a3,@K[0]); + vaf ($b3,$b3,@K[1]); + vaf ($c3,$c3,@K[2]); + vaf ($d2,@K[3],$t3); # K[3]+3 + + vperm ($a0,$a3,$a3,$beperm); + vperm ($b0,$b3,$b3,$beperm); + vperm ($c0,$c3,$c3,$beperm); + vperm ($d0,$d3,$d3,$beperm); + +&{$z? \&clgfi:\&clfi} ($len,0x40); + jl (".Ltail_vx"); + + vaf ($d3,$d2,$t1); # K[3]+4 + vlm ($a1,$d1,"0($inp)"); + + vx ($a0,$a0,$a1); + vx ($b0,$b0,$b1); + vx ($c0,$c0,$c1); + vx ($d0,$d0,$d1); + + vstm ($a0,$d0,"0($out)"); + + la ($inp,"0x40($inp)"); + la ($out,"0x40($out)"); +&{$z? \&aghi:\&ahi} ($len,-0x40); + je (".Ldone_vx"); + + vaf ($a4,$a4,@K[0]); + vaf ($b4,$b4,@K[1]); + vaf ($c4,$c4,@K[2]); + vaf ($d4,$d4,$d3); # +K[3]+4 + vaf ($d3,$d3,$t1); # K[3]+5 + vaf (@K[3],$d2,$t3); # K[3]+=6 + + vperm ($a0,$a4,$a4,$beperm); + vperm ($b0,$b4,$b4,$beperm); + vperm ($c0,$c4,$c4,$beperm); + vperm ($d0,$d4,$d4,$beperm); + +&{$z? \&clgfi:\&clfi} ($len,0x40); + jl (".Ltail_vx"); + + vlm ($a1,$d1,"0($inp)"); + + vx ($a0,$a0,$a1); + vx ($b0,$b0,$b1); + vx ($c0,$c0,$c1); + vx ($d0,$d0,$d1); + + vstm ($a0,$d0,"0($out)"); + + la ($inp,"0x40($inp)"); + la ($out,"0x40($out)"); +&{$z? \&aghi:\&ahi} ($len,-0x40); + je (".Ldone_vx"); + + vaf ($a5,$a5,@K[0]); + vaf ($b5,$b5,@K[1]); + vaf ($c5,$c5,@K[2]); + vaf ($d5,$d5,$d3); # +K[3]+5 + + vperm ($a0,$a5,$a5,$beperm); + vperm ($b0,$b5,$b5,$beperm); + vperm ($c0,$c5,$c5,$beperm); + vperm ($d0,$d5,$d5,$beperm); + +&{$z? \&clgfi:\&clfi} ($len,0x40); + jl (".Ltail_vx"); + + vlm ($a1,$d1,"0($inp)"); + + vx ($a0,$a0,$a1); + vx ($b0,$b0,$b1); + vx ($c0,$c0,$c1); + vx ($d0,$d0,$d1); + + vstm ($a0,$d0,"0($out)"); + + la ($inp,"0x40($inp)"); + la ($out,"0x40($out)"); + lhi ("%r0",10); +&{$z? \&aghi:\&ahi} ($len,-0x40); + jne (".Loop_outer_vx"); + +LABEL (".Ldone_vx"); +if (!$z) { + ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)"); + ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)"); +} else { + ld ("%f8","$FRAME-8*8($sp)"); + ld ("%f9","$FRAME-8*7($sp)"); + ld ("%f10","$FRAME-8*6($sp)"); + ld ("%f11","$FRAME-8*5($sp)"); + ld ("%f12","$FRAME-8*4($sp)"); + ld ("%f13","$FRAME-8*3($sp)"); + ld ("%f14","$FRAME-8*2($sp)"); + ld ("%f15","$FRAME-8*1($sp)"); +} +&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)"); + la ($sp,"$FRAME($sp)"); + br ("%r14"); + +ALIGN (16); +LABEL (".Ltail_vx"); +if (!$z) { + ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)"); + ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)"); +} else { + ld ("%f8","$FRAME-8*8($sp)"); + ld ("%f9","$FRAME-8*7($sp)"); + ld ("%f10","$FRAME-8*6($sp)"); + ld ("%f11","$FRAME-8*5($sp)"); + ld ("%f12","$FRAME-8*4($sp)"); + ld ("%f13","$FRAME-8*3($sp)"); + ld ("%f14","$FRAME-8*2($sp)"); + ld ("%f15","$FRAME-8*1($sp)"); +} + vstm ($a0,$d0,"$stdframe($sp)"); + lghi ("%r1",0); + +LABEL (".Loop_tail_vx"); + llgc ("%r5","0(%r1,$inp)"); + llgc ("%r6","$stdframe(%r1,$sp)"); + xr ("%r6","%r5"); + stc ("%r6","0(%r1,$out)"); + la ("%r1","1(%r1)"); + brct ($len,".Loop_tail_vx"); + +&{$z? 
\&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)"); + la ($sp,"$FRAME($sp)"); + br ("%r14"); +SIZE ("ChaCha20_ctr32_vx",".-ChaCha20_ctr32_vx"); +} +################ + +ALIGN (32); +LABEL (".Lsigma"); +LONG (0x61707865,0x3320646e,0x79622d32,0x6b206574); # endian-neutral sigma +LONG (1,0,0,0); +LONG (2,0,0,0); +LONG (3,0,0,0); +LONG (0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c); # byte swap + +LONG (0,1,2,3); +LONG (0x61707865,0x61707865,0x61707865,0x61707865); # smashed sigma +LONG (0x3320646e,0x3320646e,0x3320646e,0x3320646e); +LONG (0x79622d32,0x79622d32,0x79622d32,0x79622d32); +LONG (0x6b206574,0x6b206574,0x6b206574,0x6b206574); + +ASCIZ ("\"ChaCha20 for s390x, CRYPTOGAMS by \""); +ALIGN (4); + +PERLASM_END(); diff -up openssl-1.1.1e/crypto/perlasm/s390x.pm.s390x-update openssl-1.1.1e/crypto/perlasm/s390x.pm --- openssl-1.1.1e/crypto/perlasm/s390x.pm.s390x-update 2020-03-19 16:20:22.039227394 +0100 +++ openssl-1.1.1e/crypto/perlasm/s390x.pm 2020-03-19 16:20:22.039227394 +0100 @@ -0,0 +1,3060 @@ +#!/usr/bin/env perl +# Copyright 2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# Copyright IBM Corp. 2018 +# Author: Patrick Steuer + +package perlasm::s390x; + +use strict; +use warnings; +use Carp qw(confess); +use Exporter qw(import); + +our @EXPORT=qw(PERLASM_BEGIN PERLASM_END); +our @EXPORT_OK=qw(AUTOLOAD LABEL INCLUDE stfle); +our %EXPORT_TAGS=( + MSA => [qw(kmac km kmc kimd klmd)], + MSA4 => [qw(kmf kmo pcc kmctr)], + MSA5 => [qw(ppno prno)], + MSA8 => [qw(kma)], + VX => [qw(vgef vgeg vgbm vzero vone vgm vgmb vgmh vgmf vgmg + vl vlr vlrep vlrepb vlreph vlrepf vlrepg vleb vleh vlef vleg vleib + vleih vleif vleig vlgv vlgvb vlgvh vlgvf vlgvg vllez vllezb vllezh + vllezf vllezg vlm vlbb vlvg vlvgb vlvgh vlvgf vlvgg vlvgp + vll vmrh vmrhb vmrhh vmrhf vmrhg vmrl vmrlb vmrlh vmrlf vmrlg vpk + vpkh vpkf vpkg vpks vpksh vpksf vpksg vpkshs vpksfs vpksgs vpkls + vpklsh vpklsf vpklsg vpklshs vpklsfs vpklsgs vperm vpdi vrep vrepb + vreph vrepf vrepg vrepi vrepib vrepih vrepif vrepig vscef vsceg + vsel vseg vsegb vsegh vsegf vst vsteb vsteh vstef vsteg vstm vstl + vuph vuphb vuphh vuphf vuplh vuplhb vuplhh vuplhf vupl vuplb vuplhw + vuplf vupll vupllb vupllh vupllf va vab vah vaf vag vaq vacc vaccb + vacch vaccf vaccg vaccq vac vacq vaccc vacccq vn vnc vavg vavgb + vavgh vavgf vavgg vavgl vavglb vavglh vavglf vavglg vcksm vec_ vecb + vech vecf vecg vecl veclb veclh veclf veclg vceq vceqb vceqh vceqf + vceqg vceqbs vceqhs vceqfs vceqgs vch vchb vchh vchf vchg vchbs + vchhs vchfs vchgs vchl vchlb vchlh vchlf vchlg vchlbs vchlhs vchlfs + vchlgs vclz vclzb vclzh vclzf vclzg vctz vctzb vctzh vctzf vctzg + vx vgfm vgfmb vgfmh vgfmf vgfmg vgfma vgfmab vgfmah vgfmaf vgfmag + vlc vlcb vlch vlcf vlcg vlp vlpb vlph vlpf vlpg vmx vmxb vmxh vmxf + vmxg vmxl vmxlb vmxlh vmxlf vmxlg vmn vmnb vmnh vmnf vmng vmnl + vmnlb vmnlh vmnlf vmnlg vmal vmalb vmalhw vmalf vmah vmahb vmahh + vmahf vmalh vmalhb vmalhh vmalhf vmae vmaeb vmaeh vmaef vmale + vmaleb vmaleh vmalef vmao vmaob vmaoh vmaof vmalo vmalob vmaloh + vmalof vmh vmhb vmhh vmhf vmlh vmlhb vmlhh vmlhf vml vmlb vmlhw + vmlf vme vmeb vmeh vmef vmle vmleb vmleh vmlef vmo vmob vmoh vmof + vmlo vmlob vmloh vmlof vno vnot vo vpopct verllv verllvb verllvh + verllvf verllvg verll verllb verllh verllf verllg verim 
verimb + verimh verimf verimg veslv veslvb veslvh veslvf veslvg vesl veslb + veslh veslf veslg vesrav vesravb vesravh vesravf vesravg vesra + vesrab vesrah vesraf vesrag vesrlv vesrlvb vesrlvh vesrlvf vesrlvg + vesrl vesrlb vesrlh vesrlf vesrlg vsl vslb vsldb vsra vsrab vsrl + vsrlb vs vsb vsh vsf vsg vsq vscbi vscbib vscbih vscbif vscbig + vscbiq vsbi vsbiq vsbcbi vsbcbiq vsumg vsumgh vsumgf vsumq vsumqf + vsumqg vsum vsumb vsumh vtm vfae vfaeb vfaeh vfaef vfaebs vfaehs + vfaefs vfaezb vfaezh vfaezf vfaezbs vfaezhs vfaezfs vfee vfeeb + vfeeh vfeef vfeebs vfeehs vfeefs vfeezb vfeezh vfeezf vfeezbs + vfeezhs vfeezfs vfene vfeneb vfeneh vfenef vfenebs vfenehs vfenefs + vfenezb vfenezh vfenezf vfenezbs vfenezhs vfenezfs vistr vistrb + vistrh vistrf vistrbs vistrhs vistrfs vstrc vstrcb vstrch vstrcf + vstrcbs vstrchs vstrcfs vstrczb vstrczh vstrczf vstrczbs vstrczhs + vstrczfs vfa vfadb wfadb wfc wfcdb wfk wfkdb vfce vfcedb wfcedb + vfcedbs wfcedbs vfch vfchdb wfchdb vfchdbs wfchdbs vfche vfchedb + wfchedb vfchedbs wfchedbs vcdg vcdgb wcdgb vcdlg vcdlgb wcdlgb vcgd + vcgdb wcgdb vclgd vclgdb wclgdb vfd vfddb wfddb vfi vfidb wfidb + vlde vldeb wldeb vled vledb wledb vfm vfmdb wfmdb vfma vfmadb + wfmadb vfms vfmsdb wfmsdb vfpso vfpsodb wfpsodb vflcdb wflcdb + vflndb wflndb vflpdb wflpdb vfsq vfsqdb wfsqdb vfs vfsdb wfsdb + vftci vftcidb wftcidb)], + VXE => [qw(vbperm vllezlf vmsl vmslg vnx vnn voc vpopctb vpopcth + vpopctf vpopctg vfasb wfasb wfaxb wfcsb wfcxb wfksb wfkxb vfcesb + vfcesbs wfcesb wfcesbs wfcexb wfcexbs vfchsb vfchsbs wfchsb wfchsbs + wfchxb wfchxbs vfchesb vfchesbs wfchesb wfchesbs wfchexb wfchexbs + vfdsb wfdsb wfdxb vfisb wfisb wfixb vfll vflls wflls wflld vflr + vflrd wflrd wflrx vfmax vfmaxsb vfmaxdb wfmaxsb wfmaxdb wfmaxxb + vfmin vfminsb vfmindb wfminsb wfmindb wfminxb vfmsb wfmsb wfmxb + vfnma vfnms vfmasb wfmasb wfmaxb vfmssb wfmssb wfmsxb vfnmasb + vfnmadb wfnmasb wfnmadb wfnmaxb vfnmssb vfnmsdb wfnmssb wfnmsdb + wfnmsxb vfpsosb wfpsosb vflcsb wflcsb vflnsb wflnsb vflpsb wflpsb + vfpsoxb wfpsoxb vflcxb wflcxb vflnxb wflnxb vflpxb wflpxb vfsqsb + wfsqsb wfsqxb vfssb wfssb wfsxb vftcisb wftcisb wftcixb)], + VXD => [qw(vlrlr vlrl vstrlr vstrl vap vcp vcvb vcvbg vcvd vcvdg vdp + vlip vmp vmsp vpkz vpsop vrp vsdp vsrp vsp vtp vupkz)], +); +Exporter::export_ok_tags(qw(MSA MSA4 MSA5 MSA8 VX VXE VXD)); + +our $AUTOLOAD; + +my $GR='(?:%r)?([0-9]|1[0-5])'; +my $VR='(?:%v)?([0-9]|1[0-9]|2[0-9]|3[0-1])'; + +my ($file,$out); + +sub PERLASM_BEGIN +{ + ($file,$out)=(shift,""); +} +sub PERLASM_END +{ + if (defined($file)) { + open(my $fd,'>',$file)||die("can't open $file: $!"); + print({$fd}$out); + close($fd); + } else { + print($out); + } +} + +sub AUTOLOAD { + confess(err("PARSE")) if (grep(!defined($_),@_)); + my $token; + for ($AUTOLOAD) { + $token=".$1" if (/^.*::([A-Z_]+)$/); # uppercase: directive + $token="\t$1" if (/^.*::([a-z]+)$/); # lowercase: mnemonic + confess(err("PARSE")) if (!defined($token)); + } + $token.="\t" if ($#_>=0); + $out.=$token.join(',',@_)."\n"; +} + +sub LABEL { # label directive + confess(err("ARGNUM")) if ($#_!=0); + my ($label)=@_; + $out.="$label:\n"; +} + +sub INCLUDE { + confess(err("ARGNUM")) if ($#_!=0); + my ($file)=@_; + $out.="#include \"$file\"\n"; +} + +# +# Mnemonics +# + +sub stfle { + confess(err("ARGNUM")) if ($#_!=0); + S(0xb2b0,@_); +} + +# MSA + +sub kmac { + confess(err("ARGNUM")) if ($#_!=1); + RRE(0xb91e,@_); +} + +sub km { + confess(err("ARGNUM")) if ($#_!=1); + RRE(0xb92e,@_); +} + +sub kmc { + confess(err("ARGNUM")) if ($#_!=1); 
+ RRE(0xb92f,@_); +} + +sub kimd { + confess(err("ARGNUM")) if ($#_!=1); + RRE(0xb93e,@_); +} + +sub klmd { + confess(err("ARGNUM")) if ($#_!=1); + RRE(0xb93f,@_); +} + +# MSA4 + +sub kmf { + confess(err("ARGNUM")) if ($#_!=1); + RRE(0xb92a,@_); +} + +sub kmo { + confess(err("ARGNUM")) if ($#_!=1); + RRE(0xb92b,@_); +} + +sub pcc { + confess(err("ARGNUM")) if ($#_!=-1); + RRE(0xb92c,@_); +} + +sub kmctr { + confess(err("ARGNUM")) if ($#_!=2); + RRFb(0xb92d,@_); +} + +# MSA5 + +sub prno { + ppno(@_); +} + +sub ppno { # deprecated, use prno + confess(err("ARGNUM")) if ($#_!=1); + RRE(0xb93c,@_); +} + +# MSA8 + +sub kma { + confess(err("ARGNUM")) if ($#_!=2); + RRFb(0xb929,@_); +} + +# VX - Support Instructions + +sub vgef { + confess(err("ARGNUM")) if ($#_!=2); + VRV(0xe713,@_); +} +sub vgeg { + confess(err("ARGNUM")) if ($#_!=2); + VRV(0xe712,@_); +} + +sub vgbm { + confess(err("ARGNUM")) if ($#_!=1); + VRIa(0xe744,@_); +} +sub vzero { + vgbm(@_,0); +} +sub vone { + vgbm(@_,0xffff); +} + +sub vgm { + confess(err("ARGNUM")) if ($#_!=3); + VRIb(0xe746,@_); +} +sub vgmb { + vgm(@_,0); +} +sub vgmh { + vgm(@_,1); +} +sub vgmf { + vgm(@_,2); +} +sub vgmg { + vgm(@_,3); +} + +sub vl { + confess(err("ARGNUM")) if ($#_<1||$#_>2); + VRX(0xe706,@_); +} + +sub vlr { + confess(err("ARGNUM")) if ($#_!=1); + VRRa(0xe756,@_); +} + +sub vlrep { + confess(err("ARGNUM")) if ($#_!=2); + VRX(0xe705,@_); +} +sub vlrepb { + vlrep(@_,0); +} +sub vlreph { + vlrep(@_,1); +} +sub vlrepf { + vlrep(@_,2); +} +sub vlrepg { + vlrep(@_,3); +} + +sub vleb { + confess(err("ARGNUM")) if ($#_!=2); + VRX(0xe700,@_); +} +sub vleh { + confess(err("ARGNUM")) if ($#_!=2); + VRX(0xe701,@_); +} +sub vlef { + confess(err("ARGNUM")) if ($#_!=2); + VRX(0xe703,@_); +} +sub vleg { + confess(err("ARGNUM")) if ($#_!=2); + VRX(0xe702,@_); +} + +sub vleib { + confess(err("ARGNUM")) if ($#_!=2); + VRIa(0xe740,@_); +} +sub vleih { + confess(err("ARGNUM")) if ($#_!=2); + VRIa(0xe741,@_); +} +sub vleif { + confess(err("ARGNUM")) if ($#_!=2); + VRIa(0xe743,@_); +} +sub vleig { + confess(err("ARGNUM")) if ($#_!=2); + VRIa(0xe742,@_); +} + +sub vlgv { + confess(err("ARGNUM")) if ($#_!=3); + VRSc(0xe721,@_); +} +sub vlgvb { + vlgv(@_,0); +} +sub vlgvh { + vlgv(@_,1); +} +sub vlgvf { + vlgv(@_,2); +} +sub vlgvg { + vlgv(@_,3); +} + +sub vllez { + confess(err("ARGNUM")) if ($#_!=2); + VRX(0xe704,@_); +} +sub vllezb { + vllez(@_,0); +} +sub vllezh { + vllez(@_,1); +} +sub vllezf { + vllez(@_,2); +} +sub vllezg { + vllez(@_,3); +} + +sub vlm { + confess(err("ARGNUM")) if ($#_<2||$#_>3); + VRSa(0xe736,@_); +} + +sub vlbb { + confess(err("ARGNUM")) if ($#_!=2); + VRX(0xe707,@_); +} + +sub vlvg { + confess(err("ARGNUM")) if ($#_!=3); + VRSb(0xe722,@_); +} +sub vlvgb { + vlvg(@_,0); +} +sub vlvgh { + vlvg(@_,1); +} +sub vlvgf { + vlvg(@_,2); +} +sub vlvgg { + vlvg(@_,3); +} + +sub vlvgp { + confess(err("ARGNUM")) if ($#_!=2); + VRRf(0xe762,@_); +} + +sub vll { + confess(err("ARGNUM")) if ($#_!=2); + VRSb(0xe737,@_); +} + +sub vmrh { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe761,@_); +} +sub vmrhb { + vmrh(@_,0); +} +sub vmrhh { + vmrh(@_,1); +} +sub vmrhf { + vmrh(@_,2); +} +sub vmrhg { + vmrh(@_,3); +} + +sub vmrl { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe760,@_); +} +sub vmrlb { + vmrl(@_,0); +} +sub vmrlh { + vmrl(@_,1); +} +sub vmrlf { + vmrl(@_,2); +} +sub vmrlg { + vmrl(@_,3); +} + +sub vpk { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe794,@_); +} +sub vpkh { + vpk(@_,1); +} +sub vpkf { + vpk(@_,2); +} +sub vpkg { + vpk(@_,3); +} + 
+sub vpks { + confess(err("ARGNUM")) if ($#_!=4); + VRRb(0xe797,@_); +} +sub vpksh { + vpks(@_,1,0); +} +sub vpksf { + vpks(@_,2,0); +} +sub vpksg { + vpks(@_,3,0); +} +sub vpkshs { + vpks(@_,1,1); +} +sub vpksfs { + vpks(@_,2,1); +} +sub vpksgs { + vpks(@_,3,1); +} + +sub vpkls { + confess(err("ARGNUM")) if ($#_!=4); + VRRb(0xe795,@_); +} +sub vpklsh { + vpkls(@_,1,0); +} +sub vpklsf { + vpkls(@_,2,0); +} +sub vpklsg { + vpkls(@_,3,0); +} +sub vpklshs { + vpkls(@_,1,1); +} +sub vpklsfs { + vpkls(@_,2,1); +} +sub vpklsgs { + vpkls(@_,3,1); +} + +sub vperm { + confess(err("ARGNUM")) if ($#_!=3); + VRRe(0xe78c,@_); +} + +sub vpdi { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe784,@_); +} + +sub vrep { + confess(err("ARGNUM")) if ($#_!=3); + VRIc(0xe74d,@_); +} +sub vrepb { + vrep(@_,0); +} +sub vreph { + vrep(@_,1); +} +sub vrepf { + vrep(@_,2); +} +sub vrepg { + vrep(@_,3); +} + +sub vrepi { + confess(err("ARGNUM")) if ($#_!=2); + VRIa(0xe745,@_); +} +sub vrepib { + vrepi(@_,0); +} +sub vrepih { + vrepi(@_,1); +} +sub vrepif { + vrepi(@_,2); +} +sub vrepig { + vrepi(@_,3); +} + +sub vscef { + confess(err("ARGNUM")) if ($#_!=2); + VRV(0xe71b,@_); +} +sub vsceg { + confess(err("ARGNUM")) if ($#_!=2); + VRV(0xe71a,@_); +} + +sub vsel { + confess(err("ARGNUM")) if ($#_!=3); + VRRe(0xe78d,@_); +} + +sub vseg { + confess(err("ARGNUM")) if ($#_!=2); + VRRa(0xe75f,@_); +} +sub vsegb { + vseg(@_,0); +} +sub vsegh { + vseg(@_,1); +} +sub vsegf { + vseg(@_,2); +} + +sub vst { + confess(err("ARGNUM")) if ($#_<1||$#_>2); + VRX(0xe70e,@_); +} + +sub vsteb { + confess(err("ARGNUM")) if ($#_!=2); + VRX(0xe708,@_); +} +sub vsteh { + confess(err("ARGNUM")) if ($#_!=2); + VRX(0xe709,@_); +} +sub vstef { + confess(err("ARGNUM")) if ($#_!=2); + VRX(0xe70b,@_); +} +sub vsteg { + confess(err("ARGNUM")) if ($#_!=2); + VRX(0xe70a,@_); +} + +sub vstm { + confess(err("ARGNUM")) if ($#_<2||$#_>3); + VRSa(0xe73e,@_); +} + +sub vstl { + confess(err("ARGNUM")) if ($#_!=2); + VRSb(0xe73f,@_); +} + +sub vuph { + confess(err("ARGNUM")) if ($#_!=2); + VRRa(0xe7d7,@_); +} +sub vuphb { + vuph(@_,0); +} +sub vuphh { + vuph(@_,1); +} +sub vuphf { + vuph(@_,2); +} + +sub vuplh { + confess(err("ARGNUM")) if ($#_!=2); + VRRa(0xe7d5,@_); +} +sub vuplhb { + vuplh(@_,0); +} +sub vuplhh { + vuplh(@_,1); +} +sub vuplhf { + vuplh(@_,2); +} + +sub vupl { + confess(err("ARGNUM")) if ($#_!=2); + VRRa(0xe7d6,@_); +} +sub vuplb { + vupl(@_,0); +} +sub vuplhw { + vupl(@_,1); +} +sub vuplf { + vupl(@_,2); +} + +sub vupll { + confess(err("ARGNUM")) if ($#_!=2); + VRRa(0xe7d4,@_); +} +sub vupllb { + vupll(@_,0); +} +sub vupllh { + vupll(@_,1); +} +sub vupllf { + vupll(@_,2); +} + +# VX - Integer Instructions + +sub va { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe7f3,@_); +} +sub vab { + va(@_,0); +} +sub vah { + va(@_,1); +} +sub vaf { + va(@_,2); +} +sub vag { + va(@_,3); +} +sub vaq { + va(@_,4); +} + +sub vacc { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe7f1,@_); +} +sub vaccb { + vacc(@_,0); +} +sub vacch { + vacc(@_,1); +} +sub vaccf { + vacc(@_,2); +} +sub vaccg { + vacc(@_,3); +} +sub vaccq { + vacc(@_,4); +} + +sub vac { + confess(err("ARGNUM")) if ($#_!=4); + VRRd(0xe7bb,@_); +} +sub vacq { + vac(@_,4); +} + +sub vaccc { + confess(err("ARGNUM")) if ($#_!=4); + VRRd(0xe7b9,@_); +} +sub vacccq { + vaccc(@_,4); +} + +sub vn { + confess(err("ARGNUM")) if ($#_!=2); + VRRc(0xe768,@_); +} + +sub vnc { + confess(err("ARGNUM")) if ($#_!=2); + VRRc(0xe769,@_); +} + +sub vavg { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe7f2,@_); 
+} +sub vavgb { + vavg(@_,0); +} +sub vavgh { + vavg(@_,1); +} +sub vavgf { + vavg(@_,2); +} +sub vavgg { + vavg(@_,3); +} + +sub vavgl { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe7f0,@_); +} +sub vavglb { + vavgl(@_,0); +} +sub vavglh { + vavgl(@_,1); +} +sub vavglf { + vavgl(@_,2); +} +sub vavglg { + vavgl(@_,3); +} + +sub vcksm { + confess(err("ARGNUM")) if ($#_!=2); + VRRc(0xe766,@_); +} + +sub vec_ { + confess(err("ARGNUM")) if ($#_!=2); + VRRa(0xe7db,@_); +} +sub vecb { + vec_(@_,0); +} +sub vech { + vec_(@_,1); +} +sub vecf { + vec_(@_,2); +} +sub vecg { + vec_(@_,3); +} + +sub vecl { + confess(err("ARGNUM")) if ($#_!=2); + VRRa(0xe7d9,@_); +} +sub veclb { + vecl(@_,0); +} +sub veclh { + vecl(@_,1); +} +sub veclf { + vecl(@_,2); +} +sub veclg { + vecl(@_,3); +} + +sub vceq { + confess(err("ARGNUM")) if ($#_!=4); + VRRb(0xe7f8,@_); +} +sub vceqb { + vceq(@_,0,0); +} +sub vceqh { + vceq(@_,1,0); +} +sub vceqf { + vceq(@_,2,0); +} +sub vceqg { + vceq(@_,3,0); +} +sub vceqbs { + vceq(@_,0,1); +} +sub vceqhs { + vceq(@_,1,1); +} +sub vceqfs { + vceq(@_,2,1); +} +sub vceqgs { + vceq(@_,3,1); +} + +sub vch { + confess(err("ARGNUM")) if ($#_!=4); + VRRb(0xe7fb,@_); +} +sub vchb { + vch(@_,0,0); +} +sub vchh { + vch(@_,1,0); +} +sub vchf { + vch(@_,2,0); +} +sub vchg { + vch(@_,3,0); +} +sub vchbs { + vch(@_,0,1); +} +sub vchhs { + vch(@_,1,1); +} +sub vchfs { + vch(@_,2,1); +} +sub vchgs { + vch(@_,3,1); +} + +sub vchl { + confess(err("ARGNUM")) if ($#_!=4); + VRRb(0xe7f9,@_); +} +sub vchlb { + vchl(@_,0,0); +} +sub vchlh { + vchl(@_,1,0); +} +sub vchlf { + vchl(@_,2,0); +} +sub vchlg { + vchl(@_,3,0); +} +sub vchlbs { + vchl(@_,0,1); +} +sub vchlhs { + vchl(@_,1,1); +} +sub vchlfs { + vchl(@_,2,1); +} +sub vchlgs { + vchl(@_,3,1); +} + +sub vclz { + confess(err("ARGNUM")) if ($#_!=2); + VRRa(0xe753,@_); +} +sub vclzb { + vclz(@_,0); +} +sub vclzh { + vclz(@_,1); +} +sub vclzf { + vclz(@_,2); +} +sub vclzg { + vclz(@_,3); +} + +sub vctz { + confess(err("ARGNUM")) if ($#_!=2); + VRRa(0xe752,@_); +} +sub vctzb { + vctz(@_,0); +} +sub vctzh { + vctz(@_,1); +} +sub vctzf { + vctz(@_,2); +} +sub vctzg { + vctz(@_,3); +} + +sub vx { + confess(err("ARGNUM")) if ($#_!=2); + VRRc(0xe76d,@_); +} + +sub vgfm { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe7b4,@_); +} +sub vgfmb { + vgfm(@_,0); +} +sub vgfmh { + vgfm(@_,1); +} +sub vgfmf { + vgfm(@_,2); +} +sub vgfmg { + vgfm(@_,3); +} + +sub vgfma { + confess(err("ARGNUM")) if ($#_!=4); + VRRd(0xe7bc,@_); +} +sub vgfmab { + vgfma(@_,0); +} +sub vgfmah { + vgfma(@_,1); +} +sub vgfmaf { + vgfma(@_,2); +} +sub vgfmag { + vgfma(@_,3); +} + +sub vlc { + confess(err("ARGNUM")) if ($#_!=2); + VRRa(0xe7de,@_); +} +sub vlcb { + vlc(@_,0); +} +sub vlch { + vlc(@_,1); +} +sub vlcf { + vlc(@_,2); +} +sub vlcg { + vlc(@_,3); +} + +sub vlp { + confess(err("ARGNUM")) if ($#_!=2); + VRRa(0xe7df,@_); +} +sub vlpb { + vlp(@_,0); +} +sub vlph { + vlp(@_,1); +} +sub vlpf { + vlp(@_,2); +} +sub vlpg { + vlp(@_,3); +} + +sub vmx { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe7ff,@_); +} +sub vmxb { + vmx(@_,0); +} +sub vmxh { + vmx(@_,1); +} +sub vmxf { + vmx(@_,2); +} +sub vmxg { + vmx(@_,3); +} + +sub vmxl { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe7fd,@_); +} +sub vmxlb { + vmxl(@_,0); +} +sub vmxlh { + vmxl(@_,1); +} +sub vmxlf { + vmxl(@_,2); +} +sub vmxlg { + vmxl(@_,3); +} + +sub vmn { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe7fe,@_); +} +sub vmnb { + vmn(@_,0); +} +sub vmnh { + vmn(@_,1); +} +sub vmnf { + vmn(@_,2); +} +sub vmng { + 
vmn(@_,3); +} + +sub vmnl { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe7fc,@_); +} +sub vmnlb { + vmnl(@_,0); +} +sub vmnlh { + vmnl(@_,1); +} +sub vmnlf { + vmnl(@_,2); +} +sub vmnlg { + vmnl(@_,3); +} + +sub vmal { + confess(err("ARGNUM")) if ($#_!=4); + VRRd(0xe7aa,@_); +} +sub vmalb { + vmal(@_,0); +} +sub vmalhw { + vmal(@_,1); +} +sub vmalf { + vmal(@_,2); +} + +sub vmah { + confess(err("ARGNUM")) if ($#_!=4); + VRRd(0xe7ab,@_); +} +sub vmahb { + vmah(@_,0); +} +sub vmahh { + vmah(@_,1); +} +sub vmahf { + vmah(@_,2); +} + +sub vmalh { + confess(err("ARGNUM")) if ($#_!=4); + VRRd(0xe7a9,@_); +} +sub vmalhb { + vmalh(@_,0); +} +sub vmalhh { + vmalh(@_,1); +} +sub vmalhf { + vmalh(@_,2); +} + +sub vmae { + confess(err("ARGNUM")) if ($#_!=4); + VRRd(0xe7ae,@_); +} +sub vmaeb { + vmae(@_,0); +} +sub vmaeh { + vmae(@_,1); +} +sub vmaef { + vmae(@_,2); +} + +sub vmale { + confess(err("ARGNUM")) if ($#_!=4); + VRRd(0xe7ac,@_); +} +sub vmaleb { + vmale(@_,0); +} +sub vmaleh { + vmale(@_,1); +} +sub vmalef { + vmale(@_,2); +} + +sub vmao { + confess(err("ARGNUM")) if ($#_!=4); + VRRd(0xe7af,@_); +} +sub vmaob { + vmao(@_,0); +} +sub vmaoh { + vmao(@_,1); +} +sub vmaof { + vmao(@_,2); +} + +sub vmalo { + confess(err("ARGNUM")) if ($#_!=4); + VRRd(0xe7ad,@_); +} +sub vmalob { + vmalo(@_,0); +} +sub vmaloh { + vmalo(@_,1); +} +sub vmalof { + vmalo(@_,2); +} + +sub vmh { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe7a3,@_); +} +sub vmhb { + vmh(@_,0); +} +sub vmhh { + vmh(@_,1); +} +sub vmhf { + vmh(@_,2); +} + +sub vmlh { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe7a1,@_); +} +sub vmlhb { + vmlh(@_,0); +} +sub vmlhh { + vmlh(@_,1); +} +sub vmlhf { + vmlh(@_,2); +} + +sub vml { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe7a2,@_); +} +sub vmlb { + vml(@_,0); +} +sub vmlhw { + vml(@_,1); +} +sub vmlf { + vml(@_,2); +} + +sub vme { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe7a6,@_); +} +sub vmeb { + vme(@_,0); +} +sub vmeh { + vme(@_,1); +} +sub vmef { + vme(@_,2); +} + +sub vmle { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe7a4,@_); +} +sub vmleb { + vmle(@_,0); +} +sub vmleh { + vmle(@_,1); +} +sub vmlef { + vmle(@_,2); +} + +sub vmo { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe7a7,@_); +} +sub vmob { + vmo(@_,0); +} +sub vmoh { + vmo(@_,1); +} +sub vmof { + vmo(@_,2); +} + +sub vmlo { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe7a5,@_); +} +sub vmlob { + vmlo(@_,0); +} +sub vmloh { + vmlo(@_,1); +} +sub vmlof { + vmlo(@_,2); +} + +sub vno { + confess(err("ARGNUM")) if ($#_!=2); + VRRc(0xe76b,@_); +} +sub vnot { + vno(@_,$_[1]); +} + +sub vo { + confess(err("ARGNUM")) if ($#_!=2); + VRRc(0xe76a,@_); +} + +sub vpopct { + confess(err("ARGNUM")) if ($#_!=2); + VRRa(0xe750,@_); +} + +sub verllv { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe773,@_); +} +sub verllvb { + verllv(@_,0); +} +sub verllvh { + verllv(@_,1); +} +sub verllvf { + verllv(@_,2); +} +sub verllvg { + verllv(@_,3); +} + +sub verll { + confess(err("ARGNUM")) if ($#_!=3); + VRSa(0xe733,@_); +} +sub verllb { + verll(@_,0); +} +sub verllh { + verll(@_,1); +} +sub verllf { + verll(@_,2); +} +sub verllg { + verll(@_,3); +} + +sub verim { + confess(err("ARGNUM")) if ($#_!=4); + VRId(0xe772,@_); +} +sub verimb { + verim(@_,0); +} +sub verimh { + verim(@_,1); +} +sub verimf { + verim(@_,2); +} +sub verimg { + verim(@_,3); +} + +sub veslv { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe770,@_); +} +sub veslvb { + veslv(@_,0); +} +sub veslvh { + veslv(@_,1); +} +sub veslvf { + veslv(@_,2); +} +sub veslvg { 
+ veslv(@_,3); +} + +sub vesl { + confess(err("ARGNUM")) if ($#_!=3); + VRSa(0xe730,@_); +} +sub veslb { + vesl(@_,0); +} +sub veslh { + vesl(@_,1); +} +sub veslf { + vesl(@_,2); +} +sub veslg { + vesl(@_,3); +} + +sub vesrav { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe77a,@_); +} +sub vesravb { + vesrav(@_,0); +} +sub vesravh { + vesrav(@_,1); +} +sub vesravf { + vesrav(@_,2); +} +sub vesravg { + vesrav(@_,3); +} + +sub vesra { + confess(err("ARGNUM")) if ($#_!=3); + VRSa(0xe73a,@_); +} +sub vesrab { + vesra(@_,0); +} +sub vesrah { + vesra(@_,1); +} +sub vesraf { + vesra(@_,2); +} +sub vesrag { + vesra(@_,3); +} + +sub vesrlv { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe778,@_); +} +sub vesrlvb { + vesrlv(@_,0); +} +sub vesrlvh { + vesrlv(@_,1); +} +sub vesrlvf { + vesrlv(@_,2); +} +sub vesrlvg { + vesrlv(@_,3); +} + +sub vesrl { + confess(err("ARGNUM")) if ($#_!=3); + VRSa(0xe738,@_); +} +sub vesrlb { + vesrl(@_,0); +} +sub vesrlh { + vesrl(@_,1); +} +sub vesrlf { + vesrl(@_,2); +} +sub vesrlg { + vesrl(@_,3); +} + +sub vsl { + confess(err("ARGNUM")) if ($#_!=2); + VRRc(0xe774,@_); +} + +sub vslb { + confess(err("ARGNUM")) if ($#_!=2); + VRRc(0xe775,@_); +} + +sub vsldb { + confess(err("ARGNUM")) if ($#_!=3); + VRId(0xe777,@_); +} + +sub vsra { + confess(err("ARGNUM")) if ($#_!=2); + VRRc(0xe77e,@_); +} + +sub vsrab { + confess(err("ARGNUM")) if ($#_!=2); + VRRc(0xe77f,@_); +} + +sub vsrl { + confess(err("ARGNUM")) if ($#_!=2); + VRRc(0xe77c,@_); +} + +sub vsrlb { + confess(err("ARGNUM")) if ($#_!=2); + VRRc(0xe77d,@_); +} + +sub vs { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe7f7,@_); +} +sub vsb { + vs(@_,0); +} +sub vsh { + vs(@_,1); +} +sub vsf { + vs(@_,2); +} +sub vsg { + vs(@_,3); +} +sub vsq { + vs(@_,4); +} + +sub vscbi { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe7f5,@_); +} +sub vscbib { + vscbi(@_,0); +} +sub vscbih { + vscbi(@_,1); +} +sub vscbif { + vscbi(@_,2); +} +sub vscbig { + vscbi(@_,3); +} +sub vscbiq { + vscbi(@_,4); +} + +sub vsbi { + confess(err("ARGNUM")) if ($#_!=4); + VRRd(0xe7bf,@_); +} +sub vsbiq { + vsbi(@_,4); +} + +sub vsbcbi { + confess(err("ARGNUM")) if ($#_!=4); + VRRd(0xe7bd,@_); +} +sub vsbcbiq { + vsbcbi(@_,4); +} + +sub vsumg { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe765,@_); +} +sub vsumgh { + vsumg(@_,1); +} +sub vsumgf { + vsumg(@_,2); +} + +sub vsumq { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe767,@_); +} +sub vsumqf { + vsumq(@_,2); +} +sub vsumqg { + vsumq(@_,3); +} + +sub vsum { + confess(err("ARGNUM")) if ($#_!=3); + VRRc(0xe764,@_); +} +sub vsumb { + vsum(@_,0); +} +sub vsumh { + vsum(@_,1); +} + +sub vtm { + confess(err("ARGNUM")) if ($#_!=1); + VRRa(0xe7d8,@_); +} + +# VX - String Instructions + +sub vfae { + confess(err("ARGNUM")) if ($#_<3||$#_>4); + VRRb(0xe782,@_); +} +sub vfaeb { + vfae(@_[0..2],0,$_[3]); +} +sub vfaeh { + vfae(@_[0..2],1,$_[3]); +} +sub vfaef { + vfae(@_[0..2],2,$_[3]); +} +sub vfaebs { + $_[3]=0 if (!defined($_[3])); + vfae(@_[0..2],0,0x1|$_[3]); +} +sub vfaehs { + $_[3]=0 if (!defined($_[3])); + vfae(@_[0..2],1,0x1|$_[3]); +} +sub vfaefs { + $_[3]=0 if (!defined($_[3])); + vfae(@_[0..2],2,0x1|$_[3]); +} +sub vfaezb { + $_[3]=0 if (!defined($_[3])); + vfae(@_[0..2],0,0x2|$_[3]); +} +sub vfaezh { + $_[3]=0 if (!defined($_[3])); + vfae(@_[0..2],1,0x2|$_[3]); +} +sub vfaezf { + $_[3]=0 if (!defined($_[3])); + vfae(@_[0..2],2,0x2|$_[3]); +} +sub vfaezbs { + $_[3]=0 if (!defined($_[3])); + vfae(@_[0..2],0,0x3|$_[3]); +} +sub vfaezhs { + $_[3]=0 if (!defined($_[3])); + 
vfae(@_[0..2],1,0x3|$_[3]); +} +sub vfaezfs { + $_[3]=0 if (!defined($_[3])); + vfae(@_[0..2],2,0x3|$_[3]); +} + +sub vfee { + confess(err("ARGNUM")) if ($#_<3||$#_>4); + VRRb(0xe780,@_); +} +sub vfeeb { + vfee(@_[0..2],0,$_[3]); +} +sub vfeeh { + vfee(@_[0..2],1,$_[3]); +} +sub vfeef { + vfee(@_[0..2],2,$_[3]); +} +sub vfeebs { + vfee(@_,0,1); +} +sub vfeehs { + vfee(@_,1,1); +} +sub vfeefs { + vfee(@_,2,1); +} +sub vfeezb { + vfee(@_,0,2); +} +sub vfeezh { + vfee(@_,1,2); +} +sub vfeezf { + vfee(@_,2,2); +} +sub vfeezbs { + vfee(@_,0,3); +} +sub vfeezhs { + vfee(@_,1,3); +} +sub vfeezfs { + vfee(@_,2,3); +} + +sub vfene { + confess(err("ARGNUM")) if ($#_<3||$#_>4); + VRRb(0xe781,@_); +} +sub vfeneb { + vfene(@_[0..2],0,$_[3]); +} +sub vfeneh { + vfene(@_[0..2],1,$_[3]); +} +sub vfenef { + vfene(@_[0..2],2,$_[3]); +} +sub vfenebs { + vfene(@_,0,1); +} +sub vfenehs { + vfene(@_,1,1); +} +sub vfenefs { + vfene(@_,2,1); +} +sub vfenezb { + vfene(@_,0,2); +} +sub vfenezh { + vfene(@_,1,2); +} +sub vfenezf { + vfene(@_,2,2); +} +sub vfenezbs { + vfene(@_,0,3); +} +sub vfenezhs { + vfene(@_,1,3); +} +sub vfenezfs { + vfene(@_,2,3); +} + +sub vistr { + confess(err("ARGNUM")) if ($#_<2||$#_>3); + VRRa(0xe75c,@_[0..2],0,$_[3]); +} +sub vistrb { + vistr(@_[0..1],0,$_[2]); +} +sub vistrh { + vistr(@_[0..1],1,$_[2]); +} +sub vistrf { + vistr(@_[0..1],2,$_[2]); +} +sub vistrbs { + vistr(@_,0,1); +} +sub vistrhs { + vistr(@_,1,1); +} +sub vistrfs { + vistr(@_,2,1); +} + +sub vstrc { + confess(err("ARGNUM")) if ($#_<4||$#_>5); + VRRd(0xe78a,@_); +} +sub vstrcb { + vstrc(@_[0..3],0,$_[4]); +} +sub vstrch { + vstrc(@_[0..3],1,$_[4]); +} +sub vstrcf { + vstrc(@_[0..3],2,$_[4]); +} +sub vstrcbs { + $_[4]=0 if (!defined($_[4])); + vstrc(@_[0..3],0,0x1|$_[4]); +} +sub vstrchs { + $_[4]=0 if (!defined($_[4])); + vstrc(@_[0..3],1,0x1|$_[4]); +} +sub vstrcfs { + $_[4]=0 if (!defined($_[4])); + vstrc(@_[0..3],2,0x1|$_[4]); +} +sub vstrczb { + $_[4]=0 if (!defined($_[4])); + vstrc(@_[0..3],0,0x2|$_[4]); +} +sub vstrczh { + $_[4]=0 if (!defined($_[4])); + vstrc(@_[0..3],1,0x2|$_[4]); +} +sub vstrczf { + $_[4]=0 if (!defined($_[4])); + vstrc(@_[0..3],2,0x2|$_[4]); +} +sub vstrczbs { + $_[4]=0 if (!defined($_[4])); + vstrc(@_[0..3],0,0x3|$_[4]); +} +sub vstrczhs { + $_[4]=0 if (!defined($_[4])); + vstrc(@_[0..3],1,0x3|$_[4]); +} +sub vstrczfs { + $_[4]=0 if (!defined($_[4])); + vstrc(@_[0..3],2,0x3|$_[4]); +} + +# VX - Floating-point Instructions + +sub vfa { + confess(err("ARGNUM")) if ($#_!=4); + VRRc(0xe7e3,@_); +} +sub vfadb { + vfa(@_,3,0); +} +sub wfadb { + vfa(@_,3,8); +} + +sub wfc { + confess(err("ARGNUM")) if ($#_!=3); + VRRa(0xe7cb,@_); +} +sub wfcdb { + wfc(@_,3,0); +} + +sub wfk { + confess(err("ARGNUM")) if ($#_!=3); + VRRa(0xe7ca,@_); +} +sub wfksb { + wfk(@_,2,0); +} +sub wfkdb { + wfk(@_,3,0); +} +sub wfkxb { + wfk(@_,4,0); +} + +sub vfce { + confess(err("ARGNUM")) if ($#_!=5); + VRRc(0xe7e8,@_); +} +sub vfcedb { + vfce(@_,3,0,0); +} +sub vfcedbs { + vfce(@_,3,0,1); +} +sub wfcedb { + vfce(@_,3,8,0); +} +sub wfcedbs { + vfce(@_,3,8,1); +} + +sub vfch { + confess(err("ARGNUM")) if ($#_!=5); + VRRc(0xe7eb,@_); +} +sub vfchdb { + vfch(@_,3,0,0); +} +sub vfchdbs { + vfch(@_,3,0,1); +} +sub wfchdb { + vfch(@_,3,8,0); +} +sub wfchdbs { + vfch(@_,3,8,1); +} + +sub vfche { + confess(err("ARGNUM")) if ($#_!=5); + VRRc(0xe7ea,@_); +} +sub vfchedb { + vfche(@_,3,0,0); +} +sub vfchedbs { + vfche(@_,3,0,1); +} +sub wfchedb { + vfche(@_,3,8,0); +} +sub wfchedbs { + vfche(@_,3,8,1); +} + +sub vcdg { + 
confess(err("ARGNUM")) if ($#_!=4); + VRRa(0xe7c3,@_); +} +sub vcdgb { + vcdg(@_[0..1],3,@_[2..3]); +} +sub wcdgb { + vcdg(@_[0..1],3,0x8|$_[2],$_[3]); +} + +sub vcdlg { + confess(err("ARGNUM")) if ($#_!=4); + VRRa(0xe7c1,@_); +} +sub vcdlgb { + vcdlg(@_[0..1],3,@_[2..3]); +} +sub wcdlgb { + vcdlg(@_[0..1],3,0x8|$_[2],$_[3]); +} + +sub vcgd { + confess(err("ARGNUM")) if ($#_!=4); + VRRa(0xe7c2,@_); +} +sub vcgdb { + vcgd(@_[0..1],3,@_[2..3]); +} +sub wcgdb { + vcgd(@_[0..1],3,0x8|$_[2],$_[3]); +} + +sub vclgd { + confess(err("ARGNUM")) if ($#_!=4); + VRRa(0xe7c0,@_); +} +sub vclgdb { + vclgd(@_[0..1],3,@_[2..3]); +} +sub wclgdb { + vclgd(@_[0..1],3,0x8|$_[2],$_[3]); +} + +sub vfd { + confess(err("ARGNUM")) if ($#_!=4); + VRRc(0xe7e5,@_); +} +sub vfddb { + vfd(@_,3,0); +} +sub wfddb { + vfd(@_,3,8); +} + +sub vfi { + confess(err("ARGNUM")) if ($#_!=4); + VRRa(0xe7c7,@_); +} +sub vfidb { + vfi(@_[0..1],3,@_[2..3]); +} +sub wfidb { + vfi(@_[0..1],3,0x8|$_[2],$_[3]); +} + +sub vlde { # deprecated, use vfll + confess(err("ARGNUM")) if ($#_!=3); + VRRa(0xe7c4,@_); +} +sub vldeb { # deprecated, use vflls + vlde(@_,2,0); +} +sub wldeb { # deprecated, use wflls + vlde(@_,2,8); +} + +sub vled { # deprecated, use vflr + confess(err("ARGNUM")) if ($#_!=4); + VRRa(0xe7c5,@_); +} +sub vledb { # deprecated, use vflrd + vled(@_[0..1],3,@_[2..3]); +} +sub wledb { # deprecated, use wflrd + vled(@_[0..1],3,0x8|$_[2],$_[3]); +} + +sub vfm { + confess(err("ARGNUM")) if ($#_!=4); + VRRc(0xe7e7,@_); +} +sub vfmdb { + vfm(@_,3,0); +} +sub wfmdb { + vfm(@_,3,8); +} + +sub vfma { + confess(err("ARGNUM")) if ($#_!=5); + VRRe(0xe78f,@_); +} +sub vfmadb { + vfma(@_,0,3); +} +sub wfmadb { + vfma(@_,8,3); +} + +sub vfms { + confess(err("ARGNUM")) if ($#_!=5); + VRRe(0xe78e,@_); +} +sub vfmsdb { + vfms(@_,0,3); +} +sub wfmsdb { + vfms(@_,8,3); +} + +sub vfpso { + confess(err("ARGNUM")) if ($#_!=4); + VRRa(0xe7cc,@_); +} +sub vfpsodb { + vfpso(@_[0..1],3,0,$_[2]); +} +sub wfpsodb { + vfpso(@_[0..1],3,8,$_[2]); +} +sub vflcdb { + vfpso(@_,3,0,0); +} +sub wflcdb { + vfpso(@_,3,8,0); +} +sub vflndb { + vfpso(@_,3,0,1); +} +sub wflndb { + vfpso(@_,3,8,1); +} +sub vflpdb { + vfpso(@_,3,0,2); +} +sub wflpdb { + vfpso(@_,3,8,2); +} + +sub vfsq { + confess(err("ARGNUM")) if ($#_!=3); + VRRa(0xe7ce,@_); +} +sub vfsqdb { + vfsq(@_,3,0); +} +sub wfsqdb { + vfsq(@_,3,8); +} + +sub vfs { + confess(err("ARGNUM")) if ($#_!=4); + VRRc(0xe7e2,@_); +} +sub vfsdb { + vfs(@_,3,0); +} +sub wfsdb { + vfs(@_,3,8); +} + +sub vftci { + confess(err("ARGNUM")) if ($#_!=4); + VRIe(0xe74a,@_); +} +sub vftcidb { + vftci(@_,3,0); +} +sub wftcidb { + vftci(@_,3,8); +} + +# VXE - Support Instructions + +sub vbperm { + confess(err("ARGNUM")) if ($#_!=2); + VRRc(0xe785,@_); +} + +sub vllezlf { + vllez(@_,6); +} + +# VXE - Integer Instructions + +sub vmsl { + confess(err("ARGNUM")) if ($#_!=5); + VRRd(0xe7b8,@_); +} +sub vmslg { + vmsl(@_[0..3],3,$_[4]); +} + +sub vnx { + confess(err("ARGNUM")) if ($#_!=2); + VRRc(0xe76c,@_); +} + +sub vnn { + confess(err("ARGNUM")) if ($#_!=2); + VRRc(0xe76e,@_); +} + +sub voc { + confess(err("ARGNUM")) if ($#_!=2); + VRRc(0xe76f,@_); +} + +sub vpopctb { + vpopct(@_,0); +} +sub vpopcth { + vpopct(@_,1); +} +sub vpopctf { + vpopct(@_,2); +} +sub vpopctg { + vpopct(@_,3); +} + +# VXE - Floating-Point Instructions + +sub vfasb { + vfa(@_,2,0); +} +sub wfasb { + vfa(@_,2,8); +} +sub wfaxb { + vfa(@_,4,8); +} + +sub wfcsb { + wfc(@_,2,0); +} +sub wfcxb { + wfc(@_,4,0); +} + +sub vfcesb { + vfce(@_,2,0,0); +} +sub vfcesbs { + 
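	# For the FP compare helpers the three trailing arguments are the BFP
	# format (2 = short, 3 = long, 4 = extended), the single-element bit
	# (8 for the w-prefixed forms) and the condition-code request (1 for
	# the s-suffixed forms) -- hence the 2,0,1 in the call below.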
vfce(@_,2,0,1); +} +sub wfcesb { + vfce(@_,2,8,0); +} +sub wfcesbs { + vfce(@_,2,8,1); +} +sub wfcexb { + vfce(@_,4,8,0); +} +sub wfcexbs { + vfce(@_,4,8,1); +} + +sub vfchsb { + vfch(@_,2,0,0); +} +sub vfchsbs { + vfch(@_,2,0,1); +} +sub wfchsb { + vfch(@_,2,8,0); +} +sub wfchsbs { + vfch(@_,2,8,1); +} +sub wfchxb { + vfch(@_,4,8,0); +} +sub wfchxbs { + vfch(@_,4,8,1); +} + +sub vfchesb { + vfche(@_,2,0,0); +} +sub vfchesbs { + vfche(@_,2,0,1); +} +sub wfchesb { + vfche(@_,2,8,0); +} +sub wfchesbs { + vfche(@_,2,8,1); +} +sub wfchexb { + vfche(@_,4,8,0); +} +sub wfchexbs { + vfche(@_,4,8,1); +} + +sub vfdsb { + vfd(@_,2,0); +} +sub wfdsb { + vfd(@_,2,8); +} +sub wfdxb { + vfd(@_,4,8); +} + +sub vfisb { + vfi(@_[0..1],2,@_[2..3]); +} +sub wfisb { + vfi(@_[0..1],2,0x8|$_[2],$_[3]); +} +sub wfixb { + vfi(@_[0..1],4,0x8|$_[2],$_[3]); +} + +sub vfll { + vlde(@_); +} +sub vflls { + vfll(@_,2,0); +} +sub wflls { + vfll(@_,2,8); +} +sub wflld { + vfll(@_,3,8); +} + +sub vflr { + vled(@_); +} +sub vflrd { + vflr(@_[0..1],3,@_[2..3]); +} +sub wflrd { + vflr(@_[0..1],3,0x8|$_[2],$_[3]); +} +sub wflrx { + vflr(@_[0..1],4,0x8|$_[2],$_[3]); +} + +sub vfmax { + confess(err("ARGNUM")) if ($#_!=5); + VRRc(0xe7ef,@_); +} +sub vfmaxsb { + vfmax(@_[0..2],2,0,$_[3]); +} +sub vfmaxdb { + vfmax(@_[0..2],3,0,$_[3]); +} +sub wfmaxsb { + vfmax(@_[0..2],2,8,$_[3]); +} +sub wfmaxdb { + vfmax(@_[0..2],3,8,$_[3]); +} +sub wfmaxxb { + vfmax(@_[0..2],4,8,$_[3]); +} + +sub vfmin { + confess(err("ARGNUM")) if ($#_!=5); + VRRc(0xe7ee,@_); +} +sub vfminsb { + vfmin(@_[0..2],2,0,$_[5]); +} +sub vfmindb { + vfmin(@_[0..2],3,0,$_[5]); +} +sub wfminsb { + vfmin(@_[0..2],2,8,$_[5]); +} +sub wfmindb { + vfmin(@_[0..2],3,8,$_[5]); +} +sub wfminxb { + vfmin(@_[0..2],4,8,$_[5]); +} + +sub vfmsb { + vfm(@_,2,0); +} +sub wfmsb { + vfm(@_,2,8); +} +sub wfmxb { + vfm(@_,4,8); +} + +sub vfmasb { + vfma(@_,0,2); +} +sub wfmasb { + vfma(@_,8,2); +} +sub wfmaxb { + vfma(@_,8,4); +} + +sub vfmssb { + vfms(@_,0,2); +} +sub wfmssb { + vfms(@_,8,2); +} +sub wfmsxb { + vfms(@_,8,4); +} + +sub vfnma { + confess(err("ARGNUM")) if ($#_!=5); + VRRe(0xe79f,@_); +} +sub vfnmasb { + vfnma(@_,0,2); +} +sub vfnmadb { + vfnma(@_,0,3); +} +sub wfnmasb { + vfnma(@_,8,2); +} +sub wfnmadb { + vfnma(@_,8,3); +} +sub wfnmaxb { + vfnma(@_,8,4); +} + +sub vfnms { + confess(err("ARGNUM")) if ($#_!=5); + VRRe(0xe79e,@_); +} +sub vfnmssb { + vfnms(@_,0,2); +} +sub vfnmsdb { + vfnms(@_,0,3); +} +sub wfnmssb { + vfnms(@_,8,2); +} +sub wfnmsdb { + vfnms(@_,8,3); +} +sub wfnmsxb { + vfnms(@_,8,4); +} + +sub vfpsosb { + vfpso(@_[0..1],2,0,$_[2]); +} +sub wfpsosb { + vfpso(@_[0..1],2,8,$_[2]); +} +sub vflcsb { + vfpso(@_,2,0,0); +} +sub wflcsb { + vfpso(@_,2,8,0); +} +sub vflnsb { + vfpso(@_,2,0,1); +} +sub wflnsb { + vfpso(@_,2,8,1); +} +sub vflpsb { + vfpso(@_,2,0,2); +} +sub wflpsb { + vfpso(@_,2,8,2); +} +sub vfpsoxb { + vfpso(@_[0..1],4,0,$_[2]); +} +sub wfpsoxb { + vfpso(@_[0..1],4,8,$_[2]); +} +sub vflcxb { + vfpso(@_,4,0,0); +} +sub wflcxb { + vfpso(@_,4,8,0); +} +sub vflnxb { + vfpso(@_,4,0,1); +} +sub wflnxb { + vfpso(@_,4,8,1); +} +sub vflpxb { + vfpso(@_,4,0,2); +} +sub wflpxb { + vfpso(@_,4,8,2); +} + +sub vfsqsb { + vfsq(@_,2,0); +} +sub wfsqsb { + vfsq(@_,2,8); +} +sub wfsqxb { + vfsq(@_,4,8); +} + +sub vfssb { + vfs(@_,2,0); +} +sub wfssb { + vfs(@_,2,8); +} +sub wfsxb { + vfs(@_,4,8); +} + +sub vftcisb { + vftci(@_,2,0); +} +sub wftcisb { + vftci(@_,2,8); +} +sub wftcixb { + vftci(@_,4,8); +} + +# VXD - Support Instructions + +sub vlrlr { + 
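	# VECTOR LOAD RIGHTMOST WITH LENGTH, register-length form; vstrlr,
	# vlrl and vstrl below are the store and immediate-length counterparts.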
confess(err("ARGNUM")) if ($#_!=2); + VRSd(0xe637,@_); +} + +sub vlrl { + confess(err("ARGNUM")) if ($#_!=2); + VSI(0xe635,@_); +} + +sub vstrlr { + confess(err("ARGNUM")) if ($#_!=2); + VRSd(0xe63f,@_); +} + +sub vstrl { + confess(err("ARGNUM")) if ($#_!=2); + VSI(0xe63d,@_); +} + +sub vap { + confess(err("ARGNUM")) if ($#_!=4); + VRIf(0xe671,@_); +} + +sub vcp { + confess(err("ARGNUM")) if ($#_!=2); + VRRh(0xe677,@_); +} + +sub vcvb { + confess(err("ARGNUM")) if ($#_!=2); + VRRi(0xe650,@_); +} + +sub vcvbg { + confess(err("ARGNUM")) if ($#_!=2); + VRRi(0xe652,@_); +} + +sub vcvd { + confess(err("ARGNUM")) if ($#_!=3); + VRIi(0xe658,@_); +} + +sub vcvdg { + confess(err("ARGNUM")) if ($#_!=3); + VRIi(0xe65a,@_); +} + +sub vdp { + confess(err("ARGNUM")) if ($#_!=4); + VRIf(0xe67a,@_); +} + +sub vlip { + confess(err("ARGNUM")) if ($#_!=2); + VRIh(0xe649,@_); +} + +sub vmp { + confess(err("ARGNUM")) if ($#_!=4); + VRIf(0xe678,@_); +} + +sub vmsp { + confess(err("ARGNUM")) if ($#_!=4); + VRIf(0xe679,@_); +} + +sub vpkz { + confess(err("ARGNUM")) if ($#_!=2); + VSI(0xe634,@_); +} + +sub vpsop { + confess(err("ARGNUM")) if ($#_!=4); + VRIg(0xe65b,@_); +} + +sub vrp { + confess(err("ARGNUM")) if ($#_!=4); + VRIf(0xe67b,@_); +} + +sub vsdp { + confess(err("ARGNUM")) if ($#_!=4); + VRIf(0xe67e,@_); +} + +sub vsrp { + confess(err("ARGNUM")) if ($#_!=4); + VRIg(0xe659,@_); +} + +sub vsp { + confess(err("ARGNUM")) if ($#_!=4); + VRIf(0xe673,@_); +} + +sub vtp { + confess(err("ARGNUM")) if ($#_!=0); + VRRg(0xe65f,@_); +} + +sub vupkz { + confess(err("ARGNUM")) if ($#_!=2); + VSI(0xe63c,@_); +} + +# +# Instruction Formats +# + +sub RRE { + confess(err("ARGNUM")) if ($#_<0||2<$#_); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$r1,$r2)=(shift,get_R(shift),get_R(shift)); + + $out.="\t.long\t".sprintf("%#010x",($opcode<<16|$r1<<4|$r2)); + $out.="\t# $memn\t$ops\n" +} + +sub RRFb { + confess(err("ARGNUM")) if ($#_<3||4<$#_); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$r1,$r3,$r2,$m4)=(shift,get_R(shift),get_R(shift) + ,get_R(shift),get_M(shift)); + + $out.="\t.long\t" + .sprintf("%#010x",($opcode<<16|$r3<<12|$m4<<8|$r1<<4|$r2)); + $out.="\t# $memn\t$ops\n" +} + +sub S { + confess(err("ARGNUM")) if ($#_<0||1<$#_); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$d2,$b2)=(shift,get_DB(shift)); + + $out.="\t.long\t".sprintf("%#010x",($opcode<<16|$b2<<12|$d2)); + $out.="\t# $memn\t$ops\n" +} + +sub VRIa { + confess(err("ARGNUM")) if ($#_<2||3<$#_); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$i2,$m3)=(shift,get_V(shift),get_I(shift,16), + get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)).","; + $out.=sprintf("%#06x",$i2).","; + $out.=sprintf("%#06x",($m3<<12|RXB($v1)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRIb { + confess(err("ARGNUM")) if ($#_!=4); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$i2,$i3,$m4)=(shift,get_V(shift),get_I(shift,8), + ,get_I(shift,8),get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)).","; + $out.=sprintf("%#06x",($i2<<8|$i3)).","; + $out.=sprintf("%#06x",($m4<<12|RXB($v1)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRIc { + confess(err("ARGNUM")) if ($#_!=4); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + 
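	# (caller(1))[3] is the fully qualified name of the mnemonic wrapper one
	# frame up; the substitution that follows strips the package prefix so
	# each emitted .word group is annotated with the plain mnemonic and its
	# operands.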
$memn=~s/^.*:://; + my ($opcode,$v1,$v3,$i2,$m4)=(shift,get_V(shift),get_V(shift), + ,get_I(shift,16),get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)|($v3&0xf)).","; + $out.=sprintf("%#06x",$i2).","; + $out.=sprintf("%#06x",($m4<<12|RXB($v1,$v3)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRId { + confess(err("ARGNUM")) if ($#_<4||$#_>5); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$v2,$v3,$i4,$m5)=(shift,get_V(shift),get_V(shift), + ,get_V(shift),get_I(shift,8),get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)|($v2&0xf)).","; + $out.=sprintf("%#06x",(($v3&0xf)<<12|$i4)).","; + $out.=sprintf("%#06x",($m5<<12|RXB($v1,$v2,$v3)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRIe { + confess(err("ARGNUM")) if ($#_!=5); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$v2,$i3,$m4,$m5)=(shift,get_V(shift),get_V(shift), + ,get_I(shift,12),get_M(shift),get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)|($v2&0xf)).","; + $out.=sprintf("%#06x",($i3<<4|$m5)).","; + $out.=sprintf("%#06x",($m4<<12|RXB($v1,$v2)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRIf { + confess(err("ARGNUM")) if ($#_!=5); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$v2,$v3,$i4,$m5)=(shift,get_V(shift),get_V(shift), + ,get_V(shift),get_I(shift,8),get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)|($v2&0xf)).","; + $out.=sprintf("%#06x",(($v3&0xf)<<12|$m5<<4)|$i4>>4).","; + $out.=sprintf("%#06x",(($i4&0xf)<<12|RXB($v1,$v2,$v3)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRIg { + confess(err("ARGNUM")) if ($#_!=5); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$v2,$i3,$i4,$m5)=(shift,get_V(shift),get_V(shift), + ,get_I(shift,8),get_I(shift,8),get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)|($v2&0xf)).","; + $out.=sprintf("%#06x",($i4<<8|$m5<<4|$i3>>4)).","; + $out.=sprintf("%#06x",(($i3&0xf)<<12|RXB($v1,$v2)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRIh { + confess(err("ARGNUM")) if ($#_!=3); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$i2,$i3)=(shift,get_V(shift),get_I(shift,16), + get_I(shift,4)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)).","; + $out.=sprintf("%#06x",$i2).","; + $out.=sprintf("%#06x",($i3<<12|RXB($v1)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRIi { + confess(err("ARGNUM")) if ($#_!=4); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$r2,$i3,$m4)=(shift,get_V(shift),get_R(shift), + ,get_I(shift,8),get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)|$r2).","; + $out.=sprintf("%#06x",($m4<<4|$i3>>4)).","; + $out.=sprintf("%#06x",(($i3&0xf)<<12|RXB($v1)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRRa { + confess(err("ARGNUM")) if ($#_<2||5<$#_); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$v2,$m3,$m4,$m5)=(shift,get_V(shift),get_V(shift), + get_M(shift),get_M(shift),get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v2&0xf))).","; + 
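	# The second and third halfwords carry the remaining mask fields, the
	# RXB bits that extend the 4-bit register fields so %v16-%v31 stay
	# reachable, and the low opcode byte; e.g. a VRR-a instruction such as
	# vlr ("%v17","%v2") should come out as .word 0xe712,0x0000,0x0856,
	# the 0x08 being the RXB bit for %v17.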
$out.=sprintf("%#06x",($m5<<4|$m4)).","; + $out.=sprintf("%#06x",($m3<<12|RXB($v1,$v2)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRRb { + confess(err("ARGNUM")) if ($#_<3||5<$#_); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$v2,$v3,$m4,$m5)=(shift,get_V(shift),get_V(shift), + get_V(shift),get_M(shift),get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v2&0xf))).","; + $out.=sprintf("%#06x",(($v3&0xf)<<12|$m5<<4)).","; + $out.=sprintf("%#06x",($m4<<12|RXB($v1,$v2,$v3)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRRc { + confess(err("ARGNUM")) if ($#_<3||6<$#_); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$v2,$v3,$m4,$m5,$m6)=(shift,get_V(shift),get_V(shift), + get_V(shift),get_M(shift),get_M(shift),get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v2&0xf))).","; + $out.=sprintf("%#06x",(($v3&0xf)<<12|$m6<<4|$m5)).","; + $out.=sprintf("%#06x",($m4<<12|RXB($v1,$v2,$v3)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRRd { + confess(err("ARGNUM")) if ($#_<4||6<$#_); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$v2,$v3,$v4,$m5,$m6)=(shift,get_V(shift),get_V(shift), + get_V(shift),get_V(shift),get_M(shift),get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v2&0xf))).","; + $out.=sprintf("%#06x",(($v3&0xf)<<12|$m5<<8|$m6<<4)).","; + $out.=sprintf("%#06x",(($v4&0xf)<<12|RXB($v1,$v2,$v3,$v4)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRRe { + confess(err("ARGNUM")) if ($#_<4||6<$#_); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$v2,$v3,$v4,$m5,$m6)=(shift,get_V(shift),get_V(shift), + get_V(shift),get_V(shift),get_M(shift),get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v2&0xf))).","; + $out.=sprintf("%#06x",(($v3&0xf)<<12|$m6<<8|$m5)).","; + $out.=sprintf("%#06x",(($v4&0xf)<<12|RXB($v1,$v2,$v3,$v4)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRRf { + confess(err("ARGNUM")) if ($#_!=3); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$r2,$r3)=(shift,get_V(shift),get_R(shift), + get_R(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|$r2)).","; + $out.=sprintf("%#06x",($r3<<12)).","; + $out.=sprintf("%#06x",(RXB($v1)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRRg { + confess(err("ARGNUM")) if ($#_!=1); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1)=(shift,get_V(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf))).","; + $out.=sprintf("%#06x",0x0000).","; + $out.=sprintf("%#06x",(RXB(0,$v1)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRRh { + confess(err("ARGNUM")) if ($#_<2||$#_>3); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$v2,$m3)=(shift,get_V(shift),get_V(shift), + get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf))).","; + $out.=sprintf("%#06x",(($v2&0xf)<<12|$m3<<4)).","; + $out.=sprintf("%#06x",(RXB(0,$v1,$v2)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRRi { + confess(err("ARGNUM")) if ($#_!=3); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + 
$memn=~s/^.*:://; + my ($opcode,$r1,$v2,$m3)=(shift,get_R(shift),get_V(shift), + get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|$r1<<4|($v2&0xf))).","; + $out.=sprintf("%#06x",($m3<<4))."\,"; + $out.=sprintf("%#06x",(RXB(0,$v2)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRSa { + confess(err("ARGNUM")) if ($#_<3||$#_>4); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$v3,$d2,$b2,$m4)=(shift,get_V(shift),get_V(shift), + get_DB(shift),get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v3&0xf))).","; + $out.=sprintf("%#06x",($b2<<12|$d2)).","; + $out.=sprintf("%#06x",($m4<<12|RXB($v1,$v3)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRSb { + confess(err("ARGNUM")) if ($#_<3||$#_>4); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$r3,$d2,$b2,$m4)=(shift,get_V(shift),get_R(shift), + get_DB(shift),get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|$r3)).","; + $out.=sprintf("%#06x",($b2<<12|$d2)).","; + $out.=sprintf("%#06x",($m4<<12|RXB($v1)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRSc { + confess(err("ARGNUM")) if ($#_!=4); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$r1,$v3,$d2,$b2,$m4)=(shift,get_R(shift),get_V(shift), + get_DB(shift),get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|$r1<<4|($v3&0xf))).","; + $out.=sprintf("%#06x",($b2<<12|$d2)).","; + $out.=sprintf("%#06x",($m4<<12|RXB(0,$v3)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRSd { + confess(err("ARGNUM")) if ($#_!=3); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$r3,$d2,$b2)=(shift,get_V(shift),get_R(shift), + get_DB(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|$r3)).","; + $out.=sprintf("%#06x",($b2<<12|$d2)).","; + $out.=sprintf("%#06x",(($v1&0xf)<<12|RXB(0,0,0,$v1)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRV { + confess(err("ARGNUM")) if ($#_<2||$#_>3); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$d2,$v2,$b2,$m3)=(shift,get_V(shift),get_DVB(shift), + get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v2&0xf))).","; + $out.=sprintf("%#06x",($b2<<12|$d2)).","; + $out.=sprintf("%#06x",($m3<<12|RXB($v1,$v2)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VRX { + confess(err("ARGNUM")) if ($#_<2||$#_>3); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$d2,$x2,$b2,$m3)=(shift,get_V(shift),get_DXB(shift), + get_M(shift)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($x2))).","; + $out.=sprintf("%#06x",($b2<<12|$d2)).","; + $out.=sprintf("%#06x",($m3<<12|RXB($v1)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +sub VSI { + confess(err("ARGNUM")) if ($#_!=3); + my $ops=join(',',@_[1..$#_]); + my $memn=(caller(1))[3]; + $memn=~s/^.*:://; + my ($opcode,$v1,$d2,$b2,$i3)=(shift,get_V(shift),get_DB(shift), + get_I(shift,8)); + + $out.="\t.word\t"; + $out.=sprintf("%#06x",($opcode&0xff00|$i3)).","; + $out.=sprintf("%#06x",($b2<<12|$d2)).","; + $out.=sprintf("%#06x",(($v1&0xf)<<12|RXB(0,0,0,$v1)<<8|$opcode&0xff)); + $out.="\t# $memn\t$ops\n" +} + +# +# Internal +# + +sub get_R { + confess(err("ARGNUM")) if ($#_!=0); + my 
$r; + + for (shift) { + if (!defined) { + $r=0; + } elsif (/^$GR$/) { + $r=$1; + } else { + confess(err("PARSE")); + } + } + confess(err("ARGRANGE")) if ($r&~0xf); + + return $r; +} + +sub get_V { + confess(err("ARGNUM")) if ($#_!=0); + my $v; + + for (shift) { + if (!defined) { + $v=0; + } elsif (/^$VR$/) { + $v=$1; + } else { + confess(err("PARSE")); + } + } + confess(err("ARGRANGE")) if ($v&~0x1f); + + return $v; +} + +sub get_I { + confess(err("ARGNUM")) if ($#_!=1); + my ($i,$bits)=(shift,shift); + + $i=defined($i)?(eval($i)):(0); + confess(err("PARSE")) if (!defined($i)); + confess(err("ARGRANGE")) if (abs($i)&~(2**$bits-1)); + + return $i&(2**$bits-1); +} + +sub get_M { + confess(err("ARGNUM")) if ($#_!=0); + my $m=shift; + + $m=defined($m)?(eval($m)):(0); + confess(err("PARSE")) if (!defined($m)); + confess(err("ARGRANGE")) if ($m&~0xf); + + return $m; +} + +sub get_DB +{ + confess(err("ARGNUM")) if ($#_!=0); + my ($d,$b); + + for (shift) { + if (!defined) { + ($d,$b)=(0,0); + } elsif (/^(.+)\($GR\)$/) { + ($d,$b)=(eval($1),$2); + confess(err("PARSE")) if (!defined($d)); + } elsif (/^(.+)$/) { + ($d,$b)=(eval($1),0); + confess(err("PARSE")) if (!defined($d)); + } else { + confess(err("PARSE")); + } + } + confess(err("ARGRANGE")) if ($d&~0xfff||$b&~0xf); + + return ($d,$b); +} + +sub get_DVB +{ + confess(err("ARGNUM")) if ($#_!=0); + my ($d,$v,$b); + + for (shift) { + if (!defined) { + ($d,$v,$b)=(0,0,0); + } elsif (/^(.+)\($VR,$GR\)$/) { + ($d,$v,$b)=(eval($1),$2,$3); + confess(err("PARSE")) if (!defined($d)); + } elsif (/^(.+)\($GR\)$/) { + ($d,$v,$b)=(eval($1),0,$2); + confess(err("PARSE")) if (!defined($d)); + } elsif (/^(.+)$/) { + ($d,$v,$b)=(eval($1),0,0); + confess(err("PARSE")) if (!defined($d)); + } else { + confess(err("PARSE")); + } + } + confess(err("ARGRANGE")) if ($d&~0xfff||$v&~0x1f||$b&~0xf); + + return ($d,$v,$b); +} + +sub get_DXB +{ + confess(err("ARGNUM")) if ($#_!=0); + my ($d,$x,$b); + + for (shift) { + if (!defined) { + ($d,$x,$b)=(0,0,0); + } elsif (/^(.+)\($GR,$GR\)$/) { + ($d,$x,$b)=(eval($1),$2,$3); + confess(err("PARSE")) if (!defined($d)); + } elsif (/^(.+)\($GR\)$/) { + ($d,$x,$b)=(eval($1),0,$2); + confess(err("PARSE")) if (!defined($d)); + } elsif (/^(.+)$/) { + ($d,$x,$b)=(eval($1),0,0); + confess(err("PARSE")) if (!defined($d)); + } else { + confess(err("PARSE")); + } + } + confess(err("ARGRANGE")) if ($d&~0xfff||$x&~0xf||$b&~0xf); + + return ($d,$x,$b); +} + +sub RXB +{ + confess(err("ARGNUM")) if ($#_<0||3<$#_); + my $rxb=0; + + $rxb|=0x08 if (defined($_[0])&&($_[0]&0x10)); + $rxb|=0x04 if (defined($_[1])&&($_[1]&0x10)); + $rxb|=0x02 if (defined($_[2])&&($_[2]&0x10)); + $rxb|=0x01 if (defined($_[3])&&($_[3]&0x10)); + + return $rxb; +} + +sub err { + my %ERR = + ( + ARGNUM => 'Wrong number of arguments', + ARGRANGE=> 'Argument out of range', + PARSE => 'Parse error', + ); + confess($ERR{ARGNUM}) if ($#_!=0); + + return $ERR{$_[0]}; +} + +1; diff -up openssl-1.1.1e/crypto/poly1305/asm/poly1305-s390x.pl.s390x-update openssl-1.1.1e/crypto/poly1305/asm/poly1305-s390x.pl --- openssl-1.1.1e/crypto/poly1305/asm/poly1305-s390x.pl.s390x-update 2020-03-19 16:20:22.041227359 +0100 +++ openssl-1.1.1e/crypto/poly1305/asm/poly1305-s390x.pl 2020-03-19 16:23:22.364098257 +0100 @@ -24,204 +24,961 @@ # # On side note, z13 enables vector base 2^26 implementation... -$flavour = shift; +# +# January 2019 +# +# Add vx code path (base 2^26). +# +# Copyright IBM Corp. 2019 +# Author: Patrick Steuer + +# +# January 2019 +# +# Add vector base 2^26 implementation. 
It's problematic to accurately +# measure performance, because reference system is hardly idle. But +# it's sub-cycle, i.e. less than 1 cycle per processed byte, and it's +# >=20% faster than IBM's submission on long inputs, and much faster on +# short ones, because calculation of key powers is postponed till we +# know that input is long enough to justify the additional overhead. + +use strict; +use FindBin qw($Bin); +use lib "$Bin/../.."; +use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL INCLUDE); + +my $flavour = shift; +my ($z,$SIZE_T); if ($flavour =~ /3[12]/) { + $z=0; # S/390 ABI $SIZE_T=4; - $g=""; } else { + $z=1; # zSeries ABI $SIZE_T=8; - $g="g"; } +my $output; while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; -$sp="%r15"; +my $stdframe=16*$SIZE_T+4*8; +my $sp="%r15"; my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5)); -$code.=<<___; -.text +PERLASM_BEGIN($output); -.globl poly1305_init -.type poly1305_init,\@function -.align 16 -poly1305_init: - lghi %r0,0 - lghi %r1,-1 - stg %r0,0($ctx) # zero hash value - stg %r0,8($ctx) - stg %r0,16($ctx) - - cl${g}r $inp,%r0 - je .Lno_key - - lrvg %r4,0($inp) # load little-endian key - lrvg %r5,8($inp) - - nihl %r1,0xffc0 # 0xffffffc0ffffffff - srlg %r0,%r1,4 # 0x0ffffffc0fffffff - srlg %r1,%r1,4 - nill %r1,0xfffc # 0x0ffffffc0ffffffc - - ngr %r4,%r0 - ngr %r5,%r1 - - stg %r4,32($ctx) - stg %r5,40($ctx) - -.Lno_key: - lghi %r2,0 - br %r14 -.size poly1305_init,.-poly1305_init -___ +INCLUDE ("s390x_arch.h"); +TEXT (); + +################ +# static void poly1305_init(void *ctx, const unsigned char key[16]) +{ +GLOBL ("poly1305_init"); +TYPE ("poly1305_init","\@function"); +ALIGN (16); +LABEL ("poly1305_init"); + lghi ("%r0",0); + lghi ("%r1",-1); + stg ("%r0","0($ctx)"); # zero hash value + stg ("%r0","8($ctx)"); + stg ("%r0","16($ctx)"); + st ("%r0","24($ctx)"); # clear is_base2_26 + lgr ("%r5",$ctx); # reassign $ctx + lghi ("%r2",0); + +&{$z? \&clgr:\&clr} ($inp,"%r0"); + je (".Lno_key"); + + lrvg ("%r2","0($inp)"); # load little-endian key + lrvg ("%r3","8($inp)"); + + nihl ("%r1",0xffc0); # 0xffffffc0ffffffff + srlg ("%r0","%r1",4); # 0x0ffffffc0fffffff + srlg ("%r1","%r1",4); + nill ("%r1",0xfffc); # 0x0ffffffc0ffffffc + + ngr ("%r2","%r0"); + ngr ("%r3","%r1"); + + stmg ("%r2","%r3","32(%r5)"); + + larl ("%r1","OPENSSL_s390xcap_P"); + lg ("%r0","16(%r1)"); + srlg ("%r0","%r0",62); + nill ("%r0",1); # extract vx bit + lcgr ("%r0","%r0"); + larl ("%r1",".Lpoly1305_blocks"); + larl ("%r2",".Lpoly1305_blocks_vx"); + larl ("%r3",".Lpoly1305_emit"); +&{$z? \&xgr:\&xr} ("%r2","%r1"); # select between scalar and vector +&{$z? \&ngr:\&nr} ("%r2","%r0"); +&{$z? \&xgr:\&xr} ("%r2","%r1"); +&{$z? 
\&stmg:\&stm}	("%r2","%r3","0(%r4)");
+	lghi	("%r2",1);
+LABEL	(".Lno_key");
+	br	("%r14");
+SIZE	("poly1305_init",".-poly1305_init");
+}
+
+################
+# static void poly1305_blocks(void *ctx, const unsigned char *inp,
+#                             size_t len, u32 padbit)
 {
 my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
 my ($r0,$r1,$s1) = map("%r$_",(0..2));
-$code.=<<___;
-.globl	poly1305_blocks
-.type	poly1305_blocks,\@function
-.align	16
-poly1305_blocks:
-	srl${g}	$len,4			# fixed-up in 64-bit build
-	lghi	%r0,0
-	cl${g}r	$len,%r0
-	je	.Lno_data
-
-	stm${g}	%r6,%r14,`6*$SIZE_T`($sp)
-
-	llgfr	$padbit,$padbit		# clear upper half, much needed with
-					# non-64-bit ABI
-	lg	$r0,32($ctx)		# load key
-	lg	$r1,40($ctx)
-
-	lg	$h0,0($ctx)		# load hash value
-	lg	$h1,8($ctx)
-	lg	$h2,16($ctx)
-
-	st$g	$ctx,`2*$SIZE_T`($sp)	# off-load $ctx
-	srlg	$s1,$r1,2
-	algr	$s1,$r1			# s1 = r1 + r1>>2
-	j	.Loop
-
-.align	16
-.Loop:
-	lrvg	$d0lo,0($inp)		# load little-endian input
-	lrvg	$d1lo,8($inp)
-	la	$inp,16($inp)
-
-	algr	$d0lo,$h0		# accumulate input
-	alcgr	$d1lo,$h1
-
-	lgr	$h0,$d0lo
-	mlgr	$d0hi,$r0		# h0*r0	-> $d0hi:$d0lo
-	lgr	$h1,$d1lo
-	mlgr	$d1hi,$s1		# h1*5*r1 -> $d1hi:$d1lo
-
-	mlgr	$t0,$r1			# h0*r1	-> $t0:$h0
-	mlgr	$t1,$r0			# h1*r0	-> $t1:$h1
-	alcgr	$h2,$padbit
-
-	algr	$d0lo,$d1lo
-	lgr	$d1lo,$h2
-	alcgr	$d0hi,$d1hi
-	lghi	$d1hi,0
-
-	algr	$h1,$h0
-	alcgr	$t1,$t0
-
-	msgr	$d1lo,$s1		# h2*s1
-	msgr	$h2,$r0			# h2*r0
-
-	algr	$h1,$d1lo
-	alcgr	$t1,$d1hi		# $d1hi is zero
-
-	algr	$h1,$d0hi
-	alcgr	$h2,$t1
-
-	lghi	$h0,-4			# final reduction step
-	ngr	$h0,$h2
-	srlg	$t0,$h2,2
-	algr	$h0,$t0
-	lghi	$t1,3
-	ngr	$h2,$t1
-
-	algr	$h0,$d0lo
-	alcgr	$h1,$d1hi		# $d1hi is still zero
-	alcgr	$h2,$d1hi		# $d1hi is still zero
-
-	brct$g	$len,.Loop
-
-	l$g	$ctx,`2*$SIZE_T`($sp)	# restore $ctx
-
-	stg	$h0,0($ctx)		# store hash value
-	stg	$h1,8($ctx)
-	stg	$h2,16($ctx)
-
-	lm${g}	%r6,%r14,`6*$SIZE_T`($sp)
-.Lno_data:
-	br	%r14
-.size	poly1305_blocks,.-poly1305_blocks
-___
+GLOBL	("poly1305_blocks");
+TYPE	("poly1305_blocks","\@function");
+ALIGN	(16);
+LABEL	("poly1305_blocks");
+LABEL	(".Lpoly1305_blocks");
+&{$z?	\&ltgr:\&ltr}	("%r0",$len);
+	jz	(".Lno_data");
+
+&{$z?	\&stmg:\&stm}	("%r6","%r14","6*$SIZE_T($sp)");
+
+	lg	($h0,"0($ctx)");	# load hash value
+	lg	($h1,"8($ctx)");
+	lg	($h2,"16($ctx)");
+
+LABEL	(".Lpoly1305_blocks_entry");
+if ($z) {
+	srlg	($len,$len,4);
+} else {
+	srl	($len,4);
+}
+	llgfr	($padbit,$padbit);	# clear upper half, much needed with
+					# non-64-bit ABI
+	lg	($r0,"32($ctx)");	# load key
+	lg	($r1,"40($ctx)");
+
+&{$z?
\&stg:\&st} ($ctx,"2*$SIZE_T($sp)"); # off-load $ctx + srlg ($s1,$r1,2); + algr ($s1,$r1); # s1 = r1 + r1>>2 + j (".Loop"); + +ALIGN (16); +LABEL (".Loop"); + lrvg ($d0lo,"0($inp)"); # load little-endian input + lrvg ($d1lo,"8($inp)"); + la ($inp,"16($inp)"); + + algr ($d0lo,$h0); # accumulate input + alcgr ($d1lo,$h1); + alcgr ($h2,$padbit); + + lgr ($h0,$d0lo); + mlgr ($d0hi,$r0); # h0*r0 -> $d0hi:$d0lo + lgr ($h1,$d1lo); + mlgr ($d1hi,$s1); # h1*5*r1 -> $d1hi:$d1lo + + mlgr ($t0,$r1); # h0*r1 -> $t0:$h0 + mlgr ($t1,$r0); # h1*r0 -> $t1:$h1 + + algr ($d0lo,$d1lo); + lgr ($d1lo,$h2); + alcgr ($d0hi,$d1hi); + lghi ($d1hi,0); + + algr ($h1,$h0); + alcgr ($t1,$t0); + + msgr ($d1lo,$s1); # h2*s1 + msgr ($h2,$r0); # h2*r0 + + algr ($h1,$d1lo); + alcgr ($t1,$d1hi); # $d1hi is zero + + algr ($h1,$d0hi); + alcgr ($h2,$t1); + + lghi ($h0,-4); # final reduction step + ngr ($h0,$h2); + srlg ($t0,$h2,2); + algr ($h0,$t0); + lghi ($t1,3); + ngr ($h2,$t1); + + algr ($h0,$d0lo); + alcgr ($h1,$d1hi); # $d1hi is still zero + alcgr ($h2,$d1hi); # $d1hi is still zero + +&{$z? \&brctg:\&brct} ($len,".Loop"); + +&{$z? \&lg:\&l} ($ctx,"2*$SIZE_T($sp)");# restore $ctx + + stg ($h0,"0($ctx)"); # store hash value + stg ($h1,"8($ctx)"); + stg ($h2,"16($ctx)"); + +&{$z? \&lmg:\&lm} ("%r6","%r14","6*$SIZE_T($sp)"); +LABEL (".Lno_data"); + br ("%r14"); +SIZE ("poly1305_blocks",".-poly1305_blocks"); } + +################ +# static void poly1305_blocks_vx(void *ctx, const unsigned char *inp, +# size_t len, u32 padbit) +{ +my ($H0, $H1, $H2, $H3, $H4) = map("%v$_",(0..4)); +my ($I0, $I1, $I2, $I3, $I4) = map("%v$_",(5..9)); +my ($R0, $R1, $S1, $R2, $S2) = map("%v$_",(10..14)); +my ($R3, $S3, $R4, $S4) = map("%v$_",(15..18)); +my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("%v$_",(19..23)); +my ($T1, $T2, $T3, $T4) = map("%v$_",(24..27)); +my ($mask26,$bswaplo,$bswaphi,$bswapmi) = map("%v$_",(28..31)); + +my ($d2,$d0,$h0,$d1,$h1,$h2)=map("%r$_",(9..14)); + +TYPE ("poly1305_blocks_vx","\@function"); +ALIGN (16); +LABEL ("poly1305_blocks_vx"); +LABEL (".Lpoly1305_blocks_vx"); +&{$z? \&clgfi:\&clfi} ($len,128); + jhe ("__poly1305_blocks_vx"); + +&{$z? 
\&stmg:\&stm}	("%r6","%r14","6*$SIZE_T($sp)");
+
+	lg	($d0,"0($ctx)");
+	lg	($d1,"8($ctx)");
+	lg	($d2,"16($ctx)");
+
+	llgfr	("%r0",$d0);	# base 2^26 -> base 2^64
+	srlg	($h0,$d0,32);
+	llgfr	("%r1",$d1);
+	srlg	($h1,$d1,32);
+	srlg	($h2,$d2,32);
+
+	sllg	("%r0","%r0",26);
+	algr	($h0,"%r0");
+	sllg	("%r0",$h1,52);
+	srlg	($h1,$h1,12);
+	sllg	("%r1","%r1",14);
+	algr	($h0,"%r0");
+	alcgr	($h1,"%r1");
+	sllg	("%r0",$h2,40);
+	srlg	($h2,$h2,24);
+	lghi	("%r1",0);
+	algr	($h1,"%r0");
+	alcgr	($h2,"%r1");
+
+	llgf	("%r0","24($ctx)");	# is_base2_26
+	lcgr	("%r0","%r0");
+
+	xgr	($h0,$d0);	# choose between radixes
+	xgr	($h1,$d1);
+	xgr	($h2,$d2);
+	ngr	($h0,"%r0");
+	ngr	($h1,"%r0");
+	ngr	($h2,"%r0");
+	xgr	($h0,$d0);
+	xgr	($h1,$d1);
+	xgr	($h2,$d2);
+
+	lhi	("%r0",0);
+	st	("%r0","24($ctx)");	# clear is_base2_26
+
+	j	(".Lpoly1305_blocks_entry");
+SIZE	("poly1305_blocks_vx",".-poly1305_blocks_vx");
+
+TYPE	("__poly1305_mul","\@function");
+ALIGN	(16);
+LABEL	("__poly1305_mul");
+	vmlof	($ACC0,$H0,$R0);
+	vmlof	($ACC1,$H0,$R1);
+	vmlof	($ACC2,$H0,$R2);
+	vmlof	($ACC3,$H0,$R3);
+	vmlof	($ACC4,$H0,$R4);
+
+	vmalof	($ACC0,$H1,$S4,$ACC0);
+	vmalof	($ACC1,$H1,$R0,$ACC1);
+	vmalof	($ACC2,$H1,$R1,$ACC2);
+	vmalof	($ACC3,$H1,$R2,$ACC3);
+	vmalof	($ACC4,$H1,$R3,$ACC4);
+
+	vmalof	($ACC0,$H2,$S3,$ACC0);
+	vmalof	($ACC1,$H2,$S4,$ACC1);
+	vmalof	($ACC2,$H2,$R0,$ACC2);
+	vmalof	($ACC3,$H2,$R1,$ACC3);
+	vmalof	($ACC4,$H2,$R2,$ACC4);
+
+	vmalof	($ACC0,$H3,$S2,$ACC0);
+	vmalof	($ACC1,$H3,$S3,$ACC1);
+	vmalof	($ACC2,$H3,$S4,$ACC2);
+	vmalof	($ACC3,$H3,$R0,$ACC3);
+	vmalof	($ACC4,$H3,$R1,$ACC4);
+
+	vmalof	($ACC0,$H4,$S1,$ACC0);
+	vmalof	($ACC1,$H4,$S2,$ACC1);
+	vmalof	($ACC2,$H4,$S3,$ACC2);
+	vmalof	($ACC3,$H4,$S4,$ACC3);
+	vmalof	($ACC4,$H4,$R0,$ACC4);
+
+	################################################################
+	# lazy reduction
+
+	vesrlg	($H4,$ACC3,26);
+	vesrlg	($H1,$ACC0,26);
+	vn	($H3,$ACC3,$mask26);
+	vn	($H0,$ACC0,$mask26);
+	vag	($H4,$H4,$ACC4);	# h3 -> h4
+	vag	($H1,$H1,$ACC1);	# h0 -> h1
+
+	vesrlg	($ACC4,$H4,26);
+	vesrlg	($ACC1,$H1,26);
+	vn	($H4,$H4,$mask26);
+	vn	($H1,$H1,$mask26);
+	vag	($H0,$H0,$ACC4);
+	vag	($H2,$ACC2,$ACC1);	# h1 -> h2
+
+	veslg	($ACC4,$ACC4,2);	# <<2
+	vesrlg	($ACC2,$H2,26);
+	vn	($H2,$H2,$mask26);
+	vag	($H0,$H0,$ACC4);	# h4 -> h0
+	vag	($H3,$H3,$ACC2);	# h2 -> h3
+
+	vesrlg	($ACC0,$H0,26);
+	vesrlg	($ACC3,$H3,26);
+	vn	($H0,$H0,$mask26);
+	vn	($H3,$H3,$mask26);
+	vag	($H1,$H1,$ACC0);	# h0 -> h1
+	vag	($H4,$H4,$ACC3);	# h3 -> h4
+	br	("%r14");
+SIZE	("__poly1305_mul",".-__poly1305_mul");
+
+TYPE	("__poly1305_blocks_vx","\@function");
+ALIGN	(16);
+LABEL	("__poly1305_blocks_vx");
+&{$z?	\&lgr:\&lr}	("%r0",$sp);
+&{$z?	\&stmg:\&stm}	("%r10","%r15","10*$SIZE_T($sp)");
+if (!$z) {
+	std	("%f4","16*$SIZE_T+2*8($sp)");
+	std	("%f6","16*$SIZE_T+3*8($sp)");
+	ahi	($sp,-$stdframe);
+	st	("%r0","0($sp)");	# back-chain
+
+	llgfr	($len,$len);	# so that srlg works on $len
+} else {
+	aghi	($sp,"-($stdframe+8*8)");
+	stg	("%r0","0($sp)");	# back-chain
+
+	std	("%f8","$stdframe+0*8($sp)");
+	std	("%f9","$stdframe+1*8($sp)");
+	std	("%f10","$stdframe+2*8($sp)");
+	std	("%f11","$stdframe+3*8($sp)");
+	std	("%f12","$stdframe+4*8($sp)");
+	std	("%f13","$stdframe+5*8($sp)");
+	std	("%f14","$stdframe+6*8($sp)");
+	std	("%f15","$stdframe+7*8($sp)");
+}
+	larl	("%r1",".Lconst");
+	vgmg	($mask26,38,63);
+	vlm	($bswaplo,$bswapmi,"16(%r1)");
+
+	&lt	("%r0","24($ctx)");	# is_base2_26?
+ jnz (".Lskip_init"); + + lg ($h0,"32($ctx)"); # load key base 2^64 + lg ($h1,"40($ctx)"); + + risbg ($d0,$h0,38,0x80+63,38); # base 2^64 -> 2^26 + srlg ($d1,$h0,52); + risbg ($h0,$h0,38,0x80+63,0); + vlvgg ($R0,$h0,0); + risbg ($d1,$h1,38,51,12); + vlvgg ($R1,$d0,0); + risbg ($d0,$h1,38,63,50); + vlvgg ($R2,$d1,0); + srlg ($d1,$h1,40); + vlvgg ($R3,$d0,0); + vlvgg ($R4,$d1,0); + + veslg ($S1,$R1,2); + veslg ($S2,$R2,2); + veslg ($S3,$R3,2); + veslg ($S4,$R4,2); + vlr ($H0,$R0); + vlr ($H1,$R1); + vlr ($H2,$R2); + vlr ($H3,$R3); + vlr ($H4,$R4); + vag ($S1,$S1,$R1); # * 5 + vag ($S2,$S2,$R2); + vag ($S3,$S3,$R3); + vag ($S4,$S4,$R4); + + brasl ("%r14","__poly1305_mul"); # r^1:- * r^1:- + + vpdi ($R0,$H0,$R0,0); # r^2:r^1 + vpdi ($R1,$H1,$R1,0); + vpdi ($R2,$H2,$R2,0); + vpdi ($R3,$H3,$R3,0); + vpdi ($R4,$H4,$R4,0); + vpdi ($H0,$H0,$H0,0); # r^2:r^2 + vpdi ($H1,$H1,$H1,0); + vpdi ($H2,$H2,$H2,0); + vpdi ($H3,$H3,$H3,0); + vpdi ($H4,$H4,$H4,0); + veslg ($S1,$R1,2); + veslg ($S2,$R2,2); + veslg ($S3,$R3,2); + veslg ($S4,$R4,2); + vag ($S1,$S1,$R1); # * 5 + vag ($S2,$S2,$R2); + vag ($S3,$S3,$R3); + vag ($S4,$S4,$R4); + + brasl ("%r14,__poly1305_mul"); # r^2:r^2 * r^2:r^1 + + vl ($I0,"0(%r1)"); # borrow $I0 + vperm ($R0,$R0,$H0,$I0); # r^2:r^4:r^1:r^3 + vperm ($R1,$R1,$H1,$I0); + vperm ($R2,$R2,$H2,$I0); + vperm ($R3,$R3,$H3,$I0); + vperm ($R4,$R4,$H4,$I0); + veslf ($S1,$R1,2); + veslf ($S2,$R2,2); + veslf ($S3,$R3,2); + veslf ($S4,$R4,2); + vaf ($S1,$S1,$R1); # * 5 + vaf ($S2,$S2,$R2); + vaf ($S3,$S3,$R3); + vaf ($S4,$S4,$R4); + + lg ($h0,"0($ctx)"); # load hash base 2^64 + lg ($h1,"8($ctx)"); + lg ($h2,"16($ctx)"); + + vzero ($H0); + vzero ($H1); + vzero ($H2); + vzero ($H3); + vzero ($H4); + + risbg ($d0,$h0,38,0x80+63,38); # base 2^64 -> 2^26 + srlg ($d1,$h0,52); + risbg ($h0,$h0,38,0x80+63,0); + vlvgg ($H0,$h0,0); + risbg ($d1,$h1,38,51,12); + vlvgg ($H1,$d0,0); + risbg ($d0,$h1,38,63,50); + vlvgg ($H2,$d1,0); + srlg ($d1,$h1,40); + vlvgg ($H3,$d0,0); + risbg ($d1,$h2,37,39,24); + vlvgg ($H4,$d1,0); + + lhi ("%r0",1); + st ("%r0","24($ctx)"); # set is_base2_26 + + vstm ($R0,$S4,"48($ctx)"); # save key schedule base 2^26 + + vpdi ($R0,$R0,$R0,0); # broadcast r^2:r^4 + vpdi ($R1,$R1,$R1,0); + vpdi ($S1,$S1,$S1,0); + vpdi ($R2,$R2,$R2,0); + vpdi ($S2,$S2,$S2,0); + vpdi ($R3,$R3,$R3,0); + vpdi ($S3,$S3,$S3,0); + vpdi ($R4,$R4,$R4,0); + vpdi ($S4,$S4,$S4,0); + + j (".Loaded_hash"); + +ALIGN (16); +LABEL (".Lskip_init"); + vllezf ($H0,"0($ctx)"); # load hash base 2^26 + vllezf ($H1,"4($ctx)"); + vllezf ($H2,"8($ctx)"); + vllezf ($H3,"12($ctx)"); + vllezf ($H4,"16($ctx)"); + + vlrepg ($R0,"0x30($ctx)"); # broadcast r^2:r^4 + vlrepg ($R1,"0x40($ctx)"); + vlrepg ($S1,"0x50($ctx)"); + vlrepg ($R2,"0x60($ctx)"); + vlrepg ($S2,"0x70($ctx)"); + vlrepg ($R3,"0x80($ctx)"); + vlrepg ($S3,"0x90($ctx)"); + vlrepg ($R4,"0xa0($ctx)"); + vlrepg ($S4,"0xb0($ctx)"); + +LABEL (".Loaded_hash"); + vzero ($I1); + vzero ($I3); + + vlm ($T1,$T4,"0x00($inp)"); # load first input block + la ($inp,"0x40($inp)"); + vgmg ($mask26,6,31); + vgmf ($I4,5,5); # padbit<<2 + + vperm ($I0,$T3,$T4,$bswaplo); + vperm ($I2,$T3,$T4,$bswapmi); + vperm ($T3,$T3,$T4,$bswaphi); + + verimg ($I1,$I0,$mask26,6); # >>26 + veslg ($I0,$I0,32); + veslg ($I2,$I2,28); # >>4 + verimg ($I3,$T3,$mask26,18); # >>14 + verimg ($I4,$T3,$mask26,58); # >>38 + vn ($I0,$I0,$mask26); + vn ($I2,$I2,$mask26); + vesrlf ($I4,$I4,2); # >>2 + + vgmg ($mask26,38,63); + vperm ($T3,$T1,$T2,$bswaplo); + vperm ($T4,$T1,$T2,$bswaphi); + vperm ($T2,$T1,$T2,$bswapmi); + + 
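	# The verimg/veslg/vesrlf steps below perform the usual radix-2^26
	# split of each 128-bit little-endian block m, with the pad bit landing
	# in the top limb (limb names here are only illustrative):
	#   limb0 =  m        mod 2^26    limb1 = (m >> 26) mod 2^26
	#   limb2 = (m >> 52) mod 2^26    limb3 = (m >> 78) mod 2^26
	#   limb4 = (m >> 104) + padbit*2^24
	# i.e. m + padbit*2^128 = limb0 + limb1*2^26 + ... + limb4*2^104.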
verimg ($I0,$T3,$mask26,0); + verimg ($I1,$T3,$mask26,38); # >>26 + verimg ($I2,$T2,$mask26,60); # >>4 + verimg ($I3,$T4,$mask26,50); # >>14 + vesrlg ($T4,$T4,40); + vo ($I4,$I4,$T4); + + srlg ("%r0",$len,6); +&{$z? \&aghi:\&ahi} ("%r0",-1); + +ALIGN (16); +LABEL (".Loop_vx"); + vmlef ($ACC0,$I0,$R0); + vmlef ($ACC1,$I0,$R1); + vmlef ($ACC2,$I0,$R2); + vmlef ($ACC3,$I0,$R3); + vmlef ($ACC4,$I0,$R4); + + vmalef ($ACC0,$I1,$S4,$ACC0); + vmalef ($ACC1,$I1,$R0,$ACC1); + vmalef ($ACC2,$I1,$R1,$ACC2); + vmalef ($ACC3,$I1,$R2,$ACC3); + vmalef ($ACC4,$I1,$R3,$ACC4); + + vaf ($H2,$H2,$I2); + vaf ($H0,$H0,$I0); + vaf ($H3,$H3,$I3); + vaf ($H1,$H1,$I1); + vaf ($H4,$H4,$I4); + + vmalef ($ACC0,$I2,$S3,$ACC0); + vmalef ($ACC1,$I2,$S4,$ACC1); + vmalef ($ACC2,$I2,$R0,$ACC2); + vmalef ($ACC3,$I2,$R1,$ACC3); + vmalef ($ACC4,$I2,$R2,$ACC4); + + vlm ($T1,$T4,"0x00($inp)"); # load next input block + la ($inp,"0x40($inp)"); + vgmg ($mask26,6,31); + + vmalef ($ACC0,$I3,$S2,$ACC0); + vmalef ($ACC1,$I3,$S3,$ACC1); + vmalef ($ACC2,$I3,$S4,$ACC2); + vmalef ($ACC3,$I3,$R0,$ACC3); + vmalef ($ACC4,$I3,$R1,$ACC4); + + vperm ($I0,$T3,$T4,$bswaplo); + vperm ($I2,$T3,$T4,$bswapmi); + vperm ($T3,$T3,$T4,$bswaphi); + + vmalef ($ACC0,$I4,$S1,$ACC0); + vmalef ($ACC1,$I4,$S2,$ACC1); + vmalef ($ACC2,$I4,$S3,$ACC2); + vmalef ($ACC3,$I4,$S4,$ACC3); + vmalef ($ACC4,$I4,$R0,$ACC4); + + verimg ($I1,$I0,$mask26,6); # >>26 + veslg ($I0,$I0,32); + veslg ($I2,$I2,28); # >>4 + verimg ($I3,$T3,$mask26,18); # >>14 + + vmalof ($ACC0,$H0,$R0,$ACC0); + vmalof ($ACC1,$H0,$R1,$ACC1); + vmalof ($ACC2,$H0,$R2,$ACC2); + vmalof ($ACC3,$H0,$R3,$ACC3); + vmalof ($ACC4,$H0,$R4,$ACC4); + + vgmf ($I4,5,5); # padbit<<2 + verimg ($I4,$T3,$mask26,58); # >>38 + vn ($I0,$I0,$mask26); + vn ($I2,$I2,$mask26); + vesrlf ($I4,$I4,2); # >>2 + + vmalof ($ACC0,$H1,$S4,$ACC0); + vmalof ($ACC1,$H1,$R0,$ACC1); + vmalof ($ACC2,$H1,$R1,$ACC2); + vmalof ($ACC3,$H1,$R2,$ACC3); + vmalof ($ACC4,$H1,$R3,$ACC4); + + vgmg ($mask26,38,63); + vperm ($T3,$T1,$T2,$bswaplo); + vperm ($T4,$T1,$T2,$bswaphi); + vperm ($T2,$T1,$T2,$bswapmi); + + vmalof ($ACC0,$H2,$S3,$ACC0); + vmalof ($ACC1,$H2,$S4,$ACC1); + vmalof ($ACC2,$H2,$R0,$ACC2); + vmalof ($ACC3,$H2,$R1,$ACC3); + vmalof ($ACC4,$H2,$R2,$ACC4); + + verimg ($I0,$T3,$mask26,0); + verimg ($I1,$T3,$mask26,38); # >>26 + verimg ($I2,$T2,$mask26,60); # >>4 + + vmalof ($ACC0,$H3,$S2,$ACC0); + vmalof ($ACC1,$H3,$S3,$ACC1); + vmalof ($ACC2,$H3,$S4,$ACC2); + vmalof ($ACC3,$H3,$R0,$ACC3); + vmalof ($ACC4,$H3,$R1,$ACC4); + + verimg ($I3,$T4,$mask26,50); # >>14 + vesrlg ($T4,$T4,40); + vo ($I4,$I4,$T4); + + vmalof ($ACC0,$H4,$S1,$ACC0); + vmalof ($ACC1,$H4,$S2,$ACC1); + vmalof ($ACC2,$H4,$S3,$ACC2); + vmalof ($ACC3,$H4,$S4,$ACC3); + vmalof ($ACC4,$H4,$R0,$ACC4); + + ################################################################ + # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + # and P. 
Schwabe + + vesrlg ($H4,$ACC3,26); + vesrlg ($H1,$ACC0,26); + vn ($H3,$ACC3,$mask26); + vn ($H0,$ACC0,$mask26); + vag ($H4,$H4,$ACC4); # h3 -> h4 + vag ($H1,$H1,$ACC1); # h0 -> h1 + + vesrlg ($ACC4,$H4,26); + vesrlg ($ACC1,$H1,26); + vn ($H4,$H4,$mask26); + vn ($H1,$H1,$mask26); + vag ($H0,$H0,$ACC4); + vag ($H2,$ACC2,$ACC1); # h1 -> h2 + + veslg ($ACC4,$ACC4,2); # <<2 + vesrlg ($ACC2,$H2,26); + vn ($H2,$H2,$mask26); + vag ($H0,$H0,$ACC4); # h4 -> h0 + vag ($H3,$H3,$ACC2); # h2 -> h3 + + vesrlg ($ACC0,$H0,26); + vesrlg ($ACC3,$H3,26); + vn ($H0,$H0,$mask26); + vn ($H3,$H3,$mask26); + vag ($H1,$H1,$ACC0); # h0 -> h1 + vag ($H4,$H4,$ACC3); # h3 -> h4 + +&{$z? \&brctg:\&brct} ("%r0",".Loop_vx"); + + vlm ($R0,$S4,"48($ctx)"); # load all powers + + lghi ("%r0",0x30); +&{$z? \&lcgr:\&lcr} ($len,$len); +&{$z? \&ngr:\&nr} ($len,"%r0"); +&{$z? \&slgr:\&slr} ($inp,$len); + +LABEL (".Last"); + vmlef ($ACC0,$I0,$R0); + vmlef ($ACC1,$I0,$R1); + vmlef ($ACC2,$I0,$R2); + vmlef ($ACC3,$I0,$R3); + vmlef ($ACC4,$I0,$R4); + + vmalef ($ACC0,$I1,$S4,$ACC0); + vmalef ($ACC1,$I1,$R0,$ACC1); + vmalef ($ACC2,$I1,$R1,$ACC2); + vmalef ($ACC3,$I1,$R2,$ACC3); + vmalef ($ACC4,$I1,$R3,$ACC4); + + vaf ($H0,$H0,$I0); + vaf ($H1,$H1,$I1); + vaf ($H2,$H2,$I2); + vaf ($H3,$H3,$I3); + vaf ($H4,$H4,$I4); + + vmalef ($ACC0,$I2,$S3,$ACC0); + vmalef ($ACC1,$I2,$S4,$ACC1); + vmalef ($ACC2,$I2,$R0,$ACC2); + vmalef ($ACC3,$I2,$R1,$ACC3); + vmalef ($ACC4,$I2,$R2,$ACC4); + + vmalef ($ACC0,$I3,$S2,$ACC0); + vmalef ($ACC1,$I3,$S3,$ACC1); + vmalef ($ACC2,$I3,$S4,$ACC2); + vmalef ($ACC3,$I3,$R0,$ACC3); + vmalef ($ACC4,$I3,$R1,$ACC4); + + vmalef ($ACC0,$I4,$S1,$ACC0); + vmalef ($ACC1,$I4,$S2,$ACC1); + vmalef ($ACC2,$I4,$S3,$ACC2); + vmalef ($ACC3,$I4,$S4,$ACC3); + vmalef ($ACC4,$I4,$R0,$ACC4); + + vmalof ($ACC0,$H0,$R0,$ACC0); + vmalof ($ACC1,$H0,$R1,$ACC1); + vmalof ($ACC2,$H0,$R2,$ACC2); + vmalof ($ACC3,$H0,$R3,$ACC3); + vmalof ($ACC4,$H0,$R4,$ACC4); + + vmalof ($ACC0,$H1,$S4,$ACC0); + vmalof ($ACC1,$H1,$R0,$ACC1); + vmalof ($ACC2,$H1,$R1,$ACC2); + vmalof ($ACC3,$H1,$R2,$ACC3); + vmalof ($ACC4,$H1,$R3,$ACC4); + + vmalof ($ACC0,$H2,$S3,$ACC0); + vmalof ($ACC1,$H2,$S4,$ACC1); + vmalof ($ACC2,$H2,$R0,$ACC2); + vmalof ($ACC3,$H2,$R1,$ACC3); + vmalof ($ACC4,$H2,$R2,$ACC4); + + vmalof ($ACC0,$H3,$S2,$ACC0); + vmalof ($ACC1,$H3,$S3,$ACC1); + vmalof ($ACC2,$H3,$S4,$ACC2); + vmalof ($ACC3,$H3,$R0,$ACC3); + vmalof ($ACC4,$H3,$R1,$ACC4); + + vmalof ($ACC0,$H4,$S1,$ACC0); + vmalof ($ACC1,$H4,$S2,$ACC1); + vmalof ($ACC2,$H4,$S3,$ACC2); + vmalof ($ACC3,$H4,$S4,$ACC3); + vmalof ($ACC4,$H4,$R0,$ACC4); + + ################################################################ + # horizontal addition + + vzero ($H0); + vsumqg ($ACC0,$ACC0,$H0); + vsumqg ($ACC1,$ACC1,$H0); + vsumqg ($ACC2,$ACC2,$H0); + vsumqg ($ACC3,$ACC3,$H0); + vsumqg ($ACC4,$ACC4,$H0); + + ################################################################ + # lazy reduction + + vesrlg ($H4,$ACC3,26); + vesrlg ($H1,$ACC0,26); + vn ($H3,$ACC3,$mask26); + vn ($H0,$ACC0,$mask26); + vag ($H4,$H4,$ACC4); # h3 -> h4 + vag ($H1,$H1,$ACC1); # h0 -> h1 + + vesrlg ($ACC4,$H4,26); + vesrlg ($ACC1,$H1,26); + vn ($H4,$H4,$mask26); + vn ($H1,$H1,$mask26); + vag ($H0,$H0,$ACC4); + vag ($H2,$ACC2,$ACC1); # h1 -> h2 + + veslg ($ACC4,$ACC4,2); # <<2 + vesrlg ($ACC2,$H2,26); + vn ($H2,$H2,$mask26); + vag ($H0,$H0,$ACC4); # h4 -> h0 + vag ($H3,$H3,$ACC2); # h2 -> h3 + + vesrlg ($ACC0,$H0,26); + vesrlg ($ACC3,$H3,26); + vn ($H0,$H0,$mask26); + vn ($H3,$H3,$mask26); + vag ($H1,$H1,$ACC0); # h0 -> h1 + vag 
($H4,$H4,$ACC3); # h3 -> h4 + +&{$z? \&clgfi:\&clfi} ($len,0); + je (".Ldone"); + + vlm ($T1,$T4,"0x00($inp)"); # load last partial block + vgmg ($mask26,6,31); + vgmf ($I4,5,5); # padbit<<2 + + vperm ($I0,$T3,$T4,$bswaplo); + vperm ($I2,$T3,$T4,$bswapmi); + vperm ($T3,$T3,$T4,$bswaphi); + + vl ($ACC0,"0x30($len,%r1)"); # borrow $ACC0,1 + vl ($ACC1,"0x60($len,%r1)"); + + verimg ($I1,$I0,$mask26,6); # >>26 + veslg ($I0,$I0,32); + veslg ($I2,$I2,28); # >>4 + verimg ($I3,$T3,$mask26,18); # >>14 + verimg ($I4,$T3,$mask26,58); # >>38 + vn ($I0,$I0,$mask26); + vn ($I2,$I2,$mask26); + vesrlf ($I4,$I4,2); # >>2 + + vgmg ($mask26,38,63); + vperm ($T3,$T1,$T2,$bswaplo); + vperm ($T4,$T1,$T2,$bswaphi); + vperm ($T2,$T1,$T2,$bswapmi); + + verimg ($I0,$T3,$mask26,0); + verimg ($I1,$T3,$mask26,38); # >>26 + verimg ($I2,$T2,$mask26,60); # >>4 + verimg ($I3,$T4,$mask26,50); # >>14 + vesrlg ($T4,$T4,40); + vo ($I4,$I4,$T4); + + vperm ($H0,$H0,$H0,$ACC0); # move hash to right lane + vn ($I0,$I0,$ACC1); # mask redundant lane[s] + vperm ($H1,$H1,$H1,$ACC0); + vn ($I1,$I1,$ACC1); + vperm ($H2,$H2,$H2,$ACC0); + vn ($I2,$I2,$ACC1); + vperm ($H3,$H3,$H3,$ACC0); + vn ($I3,$I3,$ACC1); + vperm ($H4,$H4,$H4,$ACC0); + vn ($I4,$I4,$ACC1); + + vaf ($I0,$I0,$H0); # accumulate hash + vzero ($H0); # wipe hash value + vaf ($I1,$I1,$H1); + vzero ($H1); + vaf ($I2,$I2,$H2); + vzero ($H2); + vaf ($I3,$I3,$H3); + vzero ($H3); + vaf ($I4,$I4,$H4); + vzero ($H4); + +&{$z? \&lghi:\&lhi} ($len,0); + j (".Last"); + # I don't bother to tell apart cases when only one multiplication + # pass is sufficient, because I argue that mispredicted branch + # penalties are comparable to overhead of sometimes redundant + # multiplication pass... + +LABEL (".Ldone"); + vstef ($H0,"0($ctx)",3); # store hash base 2^26 + vstef ($H1,"4($ctx)",3); + vstef ($H2,"8($ctx)",3); + vstef ($H3,"12($ctx)",3); + vstef ($H4,"16($ctx)",3); + +if ($z) { + ld ("%f8","$stdframe+0*8($sp)"); + ld ("%f9","$stdframe+1*8($sp)"); + ld ("%f10","$stdframe+2*8($sp)"); + ld ("%f11","$stdframe+3*8($sp)"); + ld ("%f12","$stdframe+4*8($sp)"); + ld ("%f13","$stdframe+5*8($sp)"); + ld ("%f14","$stdframe+6*8($sp)"); + ld ("%f15","$stdframe+7*8($sp)"); +&{$z? \&lmg:\&lm} ("%r10","%r15","$stdframe+8*8+10*$SIZE_T($sp)"); +} else { + ld ("%f4","$stdframe+16*$SIZE_T+2*8($sp)"); + ld ("%f6","$stdframe+16*$SIZE_T+3*8($sp)"); +&{$z? \&lmg:\&lm} ("%r10","%r15","$stdframe+10*$SIZE_T($sp)"); +} + br ("%r14"); +SIZE ("__poly1305_blocks_vx",".-__poly1305_blocks_vx"); +} + +################ +# static void poly1305_emit(void *ctx, unsigned char mac[16], +# const u32 nonce[4]) { my ($mac,$nonce)=($inp,$len); -my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9)); +my ($h0,$h1,$h2,$d0,$d1,$d2)=map("%r$_",(5..10)); -$code.=<<___; -.globl poly1305_emit -.type poly1305_emit,\@function -.align 16 -poly1305_emit: - stm${g} %r6,%r9,`6*$SIZE_T`($sp) - - lg $h0,0($ctx) - lg $h1,8($ctx) - lg $h2,16($ctx) - - lghi %r0,5 - lghi %r1,0 - lgr $d0,$h0 - lgr $d1,$h1 - - algr $h0,%r0 # compare to modulus - alcgr $h1,%r1 - alcgr $h2,%r1 - - srlg $h2,$h2,2 # did it borrow/carry? 
- slgr %r1,$h2 # 0-$h2>>2 - lg $h2,0($nonce) # load nonce - lghi %r0,-1 - lg $ctx,8($nonce) - xgr %r0,%r1 # ~%r1 - - ngr $h0,%r1 - ngr $d0,%r0 - ngr $h1,%r1 - ngr $d1,%r0 - ogr $h0,$d0 - rllg $d0,$h2,32 # flip nonce words - ogr $h1,$d1 - rllg $d1,$ctx,32 - - algr $h0,$d0 # accumulate nonce - alcgr $h1,$d1 - - strvg $h0,0($mac) # write little-endian result - strvg $h1,8($mac) - - lm${g} %r6,%r9,`6*$SIZE_T`($sp) - br %r14 -.size poly1305_emit,.-poly1305_emit - -.string "Poly1305 for s390x, CRYPTOGAMS by " -___ +GLOBL ("poly1305_emit"); +TYPE ("poly1305_emit","\@function"); +ALIGN (16); +LABEL ("poly1305_emit"); +LABEL (".Lpoly1305_emit"); +&{$z? \&stmg:\&stm} ("%r6","%r10","6*$SIZE_T($sp)"); + + lg ($d0,"0($ctx)"); + lg ($d1,"8($ctx)"); + lg ($d2,"16($ctx)"); + + llgfr ("%r0",$d0); # base 2^26 -> base 2^64 + srlg ($h0,$d0,32); + llgfr ("%r1",$d1); + srlg ($h1,$d1,32); + srlg ($h2,$d2,32); + + sllg ("%r0","%r0",26); + algr ($h0,"%r0"); + sllg ("%r0",$h1,52); + srlg ($h1,$h1,12); + sllg ("%r1","%r1",14); + algr ($h0,"%r0"); + alcgr ($h1,"%r1"); + sllg ("%r0",$h2,40); + srlg ($h2,$h2,24); + lghi ("%r1",0); + algr ($h1,"%r0"); + alcgr ($h2,"%r1"); + + llgf ("%r0","24($ctx)"); # is_base2_26 + lcgr ("%r0","%r0"); + + xgr ($h0,$d0); # choose between radixes + xgr ($h1,$d1); + xgr ($h2,$d2); + ngr ($h0,"%r0"); + ngr ($h1,"%r0"); + ngr ($h2,"%r0"); + xgr ($h0,$d0); + xgr ($h1,$d1); + xgr ($h2,$d2); + + lghi ("%r0",5); + lgr ($d0,$h0); + lgr ($d1,$h1); + + algr ($h0,"%r0"); # compare to modulus + alcgr ($h1,"%r1"); + alcgr ($h2,"%r1"); + + srlg ($h2,$h2,2); # did it borrow/carry? + slgr ("%r1",$h2); # 0-$h2>>2 + lg ($d2,"0($nonce)"); # load nonce + lg ($ctx,"8($nonce)"); + + xgr ($h0,$d0); + xgr ($h1,$d1); + ngr ($h0,"%r1"); + ngr ($h1,"%r1"); + xgr ($h0,$d0); + rllg ($d0,$d2,32); # flip nonce words + xgr ($h1,$d1); + rllg ($d1,$ctx,32); + + algr ($h0,$d0); # accumulate nonce + alcgr ($h1,$d1); + + strvg ($h0,"0($mac)"); # write little-endian result + strvg ($h1,"8($mac)"); + +&{$z? \&lmg:\&lm} ("%r6","%r10","6*$SIZE_T($sp)"); + br ("%r14"); +SIZE ("poly1305_emit",".-poly1305_emit"); } -$code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/\b(srlg\s+)(%r[0-9]+\s*,)\s*([0-9]+)/$1$2$2$3/gm; +################ + +ALIGN (16); +LABEL (".Lconst"); +LONG (0x04050607,0x14151617,0x0c0d0e0f,0x1c1d1e1f); # merge odd +LONG (0x07060504,0x03020100,0x17161514,0x13121110); # byte swap masks +LONG (0x0f0e0d0c,0x0b0a0908,0x1f1e1d1c,0x1b1a1918); +LONG (0x00000000,0x09080706,0x00000000,0x19181716); + +LONG (0x00000000,0x00000000,0x00000000,0x0c0d0e0f); # magic tail masks +LONG (0x0c0d0e0f,0x00000000,0x00000000,0x00000000); +LONG (0x00000000,0x00000000,0x0c0d0e0f,0x00000000); + +LONG (0xffffffff,0x00000000,0xffffffff,0xffffffff); +LONG (0xffffffff,0x00000000,0xffffffff,0x00000000); +LONG (0x00000000,0x00000000,0xffffffff,0x00000000); + +STRING ("\"Poly1305 for s390x, CRYPTOGAMS by \""); -print $code; -close STDOUT or die "error closing STDOUT: $!"; +PERLASM_END(); diff -up openssl-1.1.1e/crypto/poly1305/build.info.s390x-update openssl-1.1.1e/crypto/poly1305/build.info --- openssl-1.1.1e/crypto/poly1305/build.info.s390x-update 2020-03-17 15:31:17.000000000 +0100 +++ openssl-1.1.1e/crypto/poly1305/build.info 2020-03-19 16:20:22.042227342 +0100 @@ -18,6 +18,7 @@ INCLUDE[poly1305-armv8.o]=.. GENERATE[poly1305-mips.S]=asm/poly1305-mips.pl $(PERLASM_SCHEME) INCLUDE[poly1305-mips.o]=.. GENERATE[poly1305-s390x.S]=asm/poly1305-s390x.pl $(PERLASM_SCHEME) +INCLUDE[poly1305-s390x.o]=.. 
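# GENERATE names the perlasm script and scheme that produce the .S file, and
# INCLUDE[...]=.. adds the crypto directory to the include path so that the
# generated code can pull in s390x_arch.h; a module added this way would
# normally follow the same two-line pattern, roughly (module name
# illustrative):
#
#   GENERATE[foo-s390x.S]=asm/foo-s390x.pl $(PERLASM_SCHEME)
#   INCLUDE[foo-s390x.o]=..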
BEGINRAW[Makefile(unix)] {- $builddir -}/poly1305-%.S: {- $sourcedir -}/asm/poly1305-%.pl