Tree - rpms/openssl - CentOS Git server

rpms / openssl

Blame SOURCES/0067-ppc64le-Montgomery-multiply.patch

Blob History Raw

		d8c783	`From 33ffd36afa7594aeb958a925f521cb287ca850c8 Mon Sep 17 00:00:00 2001`
		d8c783	`From: Rohan McLure <rohanmclure@linux.ibm.com>`
		d8c783	`Date: Mon, 27 Jun 2022 12:14:55 +1000`
		d8c783	`Subject: [PATCH 1/2] Revert "Revert "bn: Add fixed length (n=6), unrolled PPC`
		d8c783	`Montgomery Multiplication""`
		d8c783
		d8c783	`This reverts commit 712d9cc90e355b2c98a959d4e9398610d2269c9e.`
		d8c783	`---`
		d8c783	`crypto/bn/asm/ppc64-mont-fixed.pl | 581 ++++++++++++++++++++++++++++++`
		d8c783	`crypto/bn/bn_ppc.c | 15 +`
		d8c783	`crypto/bn/build.info | 3 +-`
		d8c783	`3 files changed, 598 insertions(+), 1 deletion(-)`
		d8c783
		d8c783	`diff --git a/crypto/bn/asm/ppc64-mont-fixed.pl b/crypto/bn/asm/ppc64-mont-fixed.pl`
		d8c783	`index e69de29bb2d1..0fb397bc5f12 100755`
		d8c783	`--- a/crypto/bn/asm/ppc64-mont-fixed.pl`
		d8c783	`+++ b/crypto/bn/asm/ppc64-mont-fixed.pl`
		d8c783	`@@ -0,0 +1,581 @@`
		d8c783	`+#! /usr/bin/env perl`
		d8c783	`+# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.`
		d8c783	`+#`
		d8c783	`+# Licensed under the Apache License 2.0 (the "License"). You may not use`
		d8c783	`+# this file except in compliance with the License. You can obtain a copy`
		d8c783	`+# in the file LICENSE in the source distribution or at`
		d8c783	`+# https://www.openssl.org/source/license.html`
		d8c783	`+`
		d8c783	`+# ====================================================================`
		d8c783	`+# Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke`
		d8c783	`+# <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for`
		d8c783	`+# the OpenSSL project.`
		d8c783	`+# ====================================================================`
		d8c783	`+`
		d8c783	`+#`
		d8c783	`+# Fixed length (n=6), unrolled PPC Montgomery Multiplication`
		d8c783	`+#`
		d8c783	`+`
		d8c783	`+# 2021`
		d8c783	`+#`
		d8c783	`+# Although this is a generic implementation for unrolling Montgomery`
		d8c783	`+# Multiplication for arbitrary values of n, this is currently only`
		d8c783	`+# used for n = 6 to improve the performance of ECC p384.`
		d8c783	`+#`
		d8c783	`+# Unrolling allows intermediate results to be stored in registers,`
		d8c783	`+# rather than on the stack, improving performance by ~7% compared to`
		d8c783	`+# the existing PPC assembly code.`
		d8c783	`+#`
		d8c783	`+# The ISA 3.0 implementation uses combination multiply/add`
		d8c783	`+# instructions (maddld, maddhdu) to improve performance by an`
		d8c783	`+# additional ~10% on Power 9.`
		d8c783	`+#`
		d8c783	`+# Finally, saving non-volatile registers into volatile vector`
		d8c783	`+# registers instead of onto the stack saves a little more.`
		d8c783	`+#`
		d8c783	`+# On a Power 9 machine we see an overall improvement of ~18%.`
		d8c783	`+#`
		d8c783	`+`
		d8c783	`+use strict;`
		d8c783	`+use warnings;`
		d8c783	`+`
		d8c783	`+my ($flavour, $output, $dir, $xlate);`
		d8c783	`+`
		d8c783	`+# $output is the last argument if it looks like a file (it has an extension)`
		d8c783	`+# $flavour is the first argument if it doesn't look like a file`
		d8c783	`+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;`
		d8c783	`+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;`
		d8c783	`+`
		d8c783	`+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;`
		d8c783	`+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or`
		d8c783	`+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or`
		d8c783	`+die "can't locate ppc-xlate.pl";`
		d8c783	`+`
		d8c783	`+open STDOUT,"| $^X $xlate $flavour \"$output\""`
		d8c783	`+ or die "can't call $xlate: $!";`
		d8c783	`+`
		d8c783	`+if ($flavour !~ /64/) {`
		d8c783	`+ die "bad flavour ($flavour) - only ppc64 permitted";`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+my $SIZE_T= 8;`
		d8c783	`+`
		d8c783	`+# Registers are global so the code is remotely readable`
		d8c783	`+`
		d8c783	`+# Parameters for Montgomery multiplication`
		d8c783	`+my $sp = "r1";`
		d8c783	`+my $toc = "r2";`
		d8c783	`+my $rp = "r3";`
		d8c783	`+my $ap = "r4";`
		d8c783	`+my $bp = "r5";`
		d8c783	`+my $np = "r6";`
		d8c783	`+my $n0 = "r7";`
		d8c783	`+my $num = "r8";`
		d8c783	`+`
		d8c783	`+my $i = "r9";`
		d8c783	`+my $c0 = "r10";`
		d8c783	`+my $bp0 = "r11";`
		d8c783	`+my $bpi = "r11";`
		d8c783	`+my $bpj = "r11";`
		d8c783	`+my $tj = "r12";`
		d8c783	`+my $apj = "r12";`
		d8c783	`+my $npj = "r12";`
		d8c783	`+my $lo = "r14";`
		d8c783	`+my $c1 = "r14";`
		d8c783	`+`
		d8c783	`+# Non-volatile registers used for tp[i]`
		d8c783	`+#`
		d8c783	`+# 12 registers are available but the limit on unrolling is 10,`
		d8c783	`+# since registers from $tp[0] to $tp[$n+1] are used.`
		d8c783	`+my @tp = ("r20" .. "r31");`
		d8c783	`+`
		d8c783	`+# volatile VSRs for saving non-volatile GPRs - faster than stack`
		d8c783	`+my @vsrs = ("v32" .. "v46");`
		d8c783	`+`
		d8c783	`+package Mont;`
		d8c783	`+`
		d8c783	`+sub new($$)`
		d8c783	`+{`
		d8c783	`+ my ($class, $n) = @_;`
		d8c783	`+`
		d8c783	`+ if ($n > 10) {`
		d8c783	`+ die "Can't unroll for BN length ${n} (maximum 10)"`
		d8c783	`+ }`
		d8c783	`+`
		d8c783	`+ my $self = {`
		d8c783	`+ code => "",`
		d8c783	`+ n => $n,`
		d8c783	`+ };`
		d8c783	`+ bless $self, $class;`
		d8c783	`+`
		d8c783	`+ return $self;`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+sub add_code($$)`
		d8c783	`+{`
		d8c783	`+ my ($self, $c) = @_;`
		d8c783	`+`
		d8c783	`+ $self->{code} .= $c;`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+sub get_code($)`
		d8c783	`+{`
		d8c783	`+ my ($self) = @_;`
		d8c783	`+`
		d8c783	`+ return $self->{code};`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+sub get_function_name($)`
		d8c783	`+{`
		d8c783	`+ my ($self) = @_;`
		d8c783	`+`
		d8c783	`+ return "bn_mul_mont_fixed_n" . $self->{n};`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+sub get_label($$)`
		d8c783	`+{`
		d8c783	`+ my ($self, $l) = @_;`
		d8c783	`+`
		d8c783	`+ return "L" . $l . "_" . $self->{n};`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+sub get_labels($@)`
		d8c783	`+{`
		d8c783	`+ my ($self, @labels) = @_;`
		d8c783	`+`
		d8c783	`+ my %out = ();`
		d8c783	`+`
		d8c783	`+ foreach my $l (@labels) {`
		d8c783	`+ $out{"$l"} = $self->get_label("$l");`
		d8c783	`+ }`
		d8c783	`+`
		d8c783	`+ return \%out;`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+sub nl($)`
		d8c783	`+{`
		d8c783	`+ my ($self) = @_;`
		d8c783	`+`
		d8c783	`+ $self->add_code("\n");`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+sub copy_result($)`
		d8c783	`+{`
		d8c783	`+ my ($self) = @_;`
		d8c783	`+`
		d8c783	`+ my ($n) = $self->{n};`
		d8c783	`+`
		d8c783	`+ for (my $j = 0; $j < $n; $j++) {`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	+ std $tp[$j],`$j*$SIZE_T`($rp)
		d8c783	`+___`
		d8c783	`+ }`
		d8c783	`+`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+sub mul_mont_fixed($)`
		d8c783	`+{`
		d8c783	`+ my ($self) = @_;`
		d8c783	`+`
		d8c783	`+ my ($n) = $self->{n};`
		d8c783	`+ my $fname = $self->get_function_name();`
		d8c783	`+ my $label = $self->get_labels("outer", "enter", "sub", "copy", "end");`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+`
		d8c783	`+.globl .${fname}`
		d8c783	`+.align 5`
		d8c783	`+.${fname}:`
		d8c783	`+`
		d8c783	`+___`
		d8c783	`+`
		d8c783	`+ $self->save_registers();`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ ld $n0,0($n0)`
		d8c783	`+`
		d8c783	`+ ld $bp0,0($bp)`
		d8c783	`+`
		d8c783	`+ ld $apj,0($ap)`
		d8c783	`+___`
		d8c783	`+`
		d8c783	`+ $self->mul_c_0($tp[0], $apj, $bp0, $c0);`
		d8c783	`+`
		d8c783	`+ for (my $j = 1; $j < $n - 1; $j++) {`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	+ ld $apj,`$j*$SIZE_T`($ap)
		d8c783	`+___`
		d8c783	`+ $self->mul($tp[$j], $apj, $bp0, $c0);`
		d8c783	`+ }`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	+ ld $apj,`($n-1)*$SIZE_T`($ap)
		d8c783	`+___`
		d8c783	`+`
		d8c783	`+ $self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0);`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ li $tp[$n+1],0`
		d8c783	`+`
		d8c783	`+___`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ li $i,0`
		d8c783	`+ mtctr $num`
		d8c783	`+ b $label->{"enter"}`
		d8c783	`+`
		d8c783	`+.align 4`
		d8c783	`+$label->{"outer"}:`
		d8c783	`+ ldx $bpi,$bp,$i`
		d8c783	`+`
		d8c783	`+ ld $apj,0($ap)`
		d8c783	`+___`
		d8c783	`+`
		d8c783	`+ $self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0);`
		d8c783	`+`
		d8c783	`+ for (my $j = 1; $j < $n; $j++) {`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	+ ld $apj,`$j*$SIZE_T`($ap)
		d8c783	`+___`
		d8c783	`+ $self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0);`
		d8c783	`+ }`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ addc $tp[$n],$tp[$n],$c0`
		d8c783	`+ addze $tp[$n+1],$tp[$n+1]`
		d8c783	`+___`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+.align 4`
		d8c783	`+$label->{"enter"}:`
		d8c783	`+ mulld $bpi,$tp[0],$n0`
		d8c783	`+`
		d8c783	`+ ld $npj,0($np)`
		d8c783	`+___`
		d8c783	`+`
		d8c783	`+ $self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0);`
		d8c783	`+`
		d8c783	`+ for (my $j = 1; $j < $n; $j++) {`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	+ ld $npj,`$j*$SIZE_T`($np)
		d8c783	`+___`
		d8c783	`+ $self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0);`
		d8c783	`+ }`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ addc $tp[$n-1],$tp[$n],$c0`
		d8c783	`+ addze $tp[$n],$tp[$n+1]`
		d8c783	`+`
		d8c783	`+ addi $i,$i,$SIZE_T`
		d8c783	`+ bdnz $label->{"outer"}`
		d8c783	`+`
		d8c783	`+ and. $tp[$n],$tp[$n],$tp[$n]`
		d8c783	`+ bne $label->{"sub"}`
		d8c783	`+`
		d8c783	`+ cmpld $tp[$n-1],$npj`
		d8c783	`+ blt $label->{"copy"}`
		d8c783	`+`
		d8c783	`+$label->{"sub"}:`
		d8c783	`+___`
		d8c783	`+`
		d8c783	`+ #`
		d8c783	`+ # Reduction`
		d8c783	`+ #`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	+ ld $bpj,`0*$SIZE_T`($np)
		d8c783	`+ subfc $c1,$bpj,$tp[0]`
		d8c783	+ std $c1,`0*$SIZE_T`($rp)
		d8c783	`+`
		d8c783	`+___`
		d8c783	`+ for (my $j = 1; $j < $n - 1; $j++) {`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	+ ld $bpj,`$j*$SIZE_T`($np)
		d8c783	`+ subfe $c1,$bpj,$tp[$j]`
		d8c783	+ std $c1,`$j*$SIZE_T`($rp)
		d8c783	`+`
		d8c783	`+___`
		d8c783	`+ }`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ subfe $c1,$npj,$tp[$n-1]`
		d8c783	+ std $c1,`($n-1)*$SIZE_T`($rp)
		d8c783	`+`
		d8c783	`+___`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ addme. $tp[$n],$tp[$n]`
		d8c783	`+ beq $label->{"end"}`
		d8c783	`+`
		d8c783	`+$label->{"copy"}:`
		d8c783	`+___`
		d8c783	`+`
		d8c783	`+ $self->copy_result();`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+`
		d8c783	`+$label->{"end"}:`
		d8c783	`+___`
		d8c783	`+`
		d8c783	`+ $self->restore_registers();`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ li r3,1`
		d8c783	`+ blr`
		d8c783	`+.size .${fname},.-.${fname}`
		d8c783	`+___`
		d8c783	`+`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+package Mont::GPR;`
		d8c783	`+`
		d8c783	`+our @ISA = ('Mont');`
		d8c783	`+`
		d8c783	`+sub new($$)`
		d8c783	`+{`
		d8c783	`+ my ($class, $n) = @_;`
		d8c783	`+`
		d8c783	`+ return $class->SUPER::new($n);`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+sub save_registers($)`
		d8c783	`+{`
		d8c783	`+ my ($self) = @_;`
		d8c783	`+`
		d8c783	`+ my $n = $self->{n};`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ std $lo,-8($sp)`
		d8c783	`+___`
		d8c783	`+`
		d8c783	`+ for (my $j = 0; $j <= $n+1; $j++) {`
		d8c783	`+ $self->{code}.=<<___;`
		d8c783	+ std $tp[$j],-`($j+2)*8`($sp)
		d8c783	`+___`
		d8c783	`+ }`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+`
		d8c783	`+___`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+sub restore_registers($)`
		d8c783	`+{`
		d8c783	`+ my ($self) = @_;`
		d8c783	`+`
		d8c783	`+ my $n = $self->{n};`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ ld $lo,-8($sp)`
		d8c783	`+___`
		d8c783	`+`
		d8c783	`+ for (my $j = 0; $j <= $n+1; $j++) {`
		d8c783	`+ $self->{code}.=<<___;`
		d8c783	+ ld $tp[$j],-`($j+2)*8`($sp)
		d8c783	`+___`
		d8c783	`+ }`
		d8c783	`+`
		d8c783	`+ $self->{code} .=<<___;`
		d8c783	`+`
		d8c783	`+___`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+# Direct translation of C mul()`
		d8c783	`+sub mul($$$$$)`
		d8c783	`+{`
		d8c783	`+ my ($self, $r, $a, $w, $c) = @_;`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ mulld $lo,$a,$w`
		d8c783	`+ addc $r,$lo,$c`
		d8c783	`+ mulhdu $c,$a,$w`
		d8c783	`+ addze $c,$c`
		d8c783	`+`
		d8c783	`+___`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+# Like mul() but $c is ignored as an input - an optimisation to save a`
		d8c783	`+# preliminary instruction that would set input $c to 0`
		d8c783	`+sub mul_c_0($$$$$)`
		d8c783	`+{`
		d8c783	`+ my ($self, $r, $a, $w, $c) = @_;`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ mulld $r,$a,$w`
		d8c783	`+ mulhdu $c,$a,$w`
		d8c783	`+`
		d8c783	`+___`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+# Like mul() but does not do the final addition of CA into $c - an`
		d8c783	`+# optimisation to save an instruction`
		d8c783	`+sub mul_last($$$$$$)`
		d8c783	`+{`
		d8c783	`+ my ($self, $r1, $r2, $a, $w, $c) = @_;`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ mulld $lo,$a,$w`
		d8c783	`+ addc $r1,$lo,$c`
		d8c783	`+ mulhdu $c,$a,$w`
		d8c783	`+`
		d8c783	`+ addze $r2,$c`
		d8c783	`+___`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+# Like C mul_add() but allow $r_out and $r_in to be different`
		d8c783	`+sub mul_add($$$$$$)`
		d8c783	`+{`
		d8c783	`+ my ($self, $r_out, $r_in, $a, $w, $c) = @_;`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ mulld $lo,$a,$w`
		d8c783	`+ addc $lo,$lo,$c`
		d8c783	`+ mulhdu $c,$a,$w`
		d8c783	`+ addze $c,$c`
		d8c783	`+ addc $r_out,$r_in,$lo`
		d8c783	`+ addze $c,$c`
		d8c783	`+`
		d8c783	`+___`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+# Like mul_add() but $c is ignored as an input - an optimisation to save a`
		d8c783	`+# preliminary instruction that would set input $c to 0`
		d8c783	`+sub mul_add_c_0($$$$$$)`
		d8c783	`+{`
		d8c783	`+ my ($self, $r_out, $r_in, $a, $w, $c) = @_;`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ mulld $lo,$a,$w`
		d8c783	`+ addc $r_out,$r_in,$lo`
		d8c783	`+ mulhdu $c,$a,$w`
		d8c783	`+ addze $c,$c`
		d8c783	`+`
		d8c783	`+___`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+package Mont::GPR_300;`
		d8c783	`+`
		d8c783	`+our @ISA = ('Mont::GPR');`
		d8c783	`+`
		d8c783	`+sub new($$)`
		d8c783	`+{`
		d8c783	`+ my ($class, $n) = @_;`
		d8c783	`+`
		d8c783	`+ my $mont = $class->SUPER::new($n);`
		d8c783	`+`
		d8c783	`+ return $mont;`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+sub get_function_name($)`
		d8c783	`+{`
		d8c783	`+ my ($self) = @_;`
		d8c783	`+`
		d8c783	`+ return "bn_mul_mont_300_fixed_n" . $self->{n};`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+sub get_label($$)`
		d8c783	`+{`
		d8c783	`+ my ($self, $l) = @_;`
		d8c783	`+`
		d8c783	`+ return "L" . $l . "_300_" . $self->{n};`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+# Direct translation of C mul()`
		d8c783	`+sub mul($$$$$)`
		d8c783	`+{`
		d8c783	`+ my ($self, $r, $a, $w, $c, $last) = @_;`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ maddld $r,$a,$w,$c`
		d8c783	`+ maddhdu $c,$a,$w,$c`
		d8c783	`+`
		d8c783	`+___`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+# Save the last carry as the final entry`
		d8c783	`+sub mul_last($$$$$)`
		d8c783	`+{`
		d8c783	`+ my ($self, $r1, $r2, $a, $w, $c) = @_;`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ maddld $r1,$a,$w,$c`
		d8c783	`+ maddhdu $r2,$a,$w,$c`
		d8c783	`+`
		d8c783	`+___`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+# Like mul() but $c is ignored as an input - an optimisation to save a`
		d8c783	`+# preliminary instruction that would set input $c to 0`
		d8c783	`+sub mul_c_0($$$$$)`
		d8c783	`+{`
		d8c783	`+ my ($self, $r, $a, $w, $c) = @_;`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ mulld $r,$a,$w`
		d8c783	`+ mulhdu $c,$a,$w`
		d8c783	`+`
		d8c783	`+___`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+# Like C mul_add() but allow $r_out and $r_in to be different`
		d8c783	`+sub mul_add($$$$$$)`
		d8c783	`+{`
		d8c783	`+ my ($self, $r_out, $r_in, $a, $w, $c) = @_;`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ maddld $lo,$a,$w,$c`
		d8c783	`+ maddhdu $c,$a,$w,$c`
		d8c783	`+ addc $r_out,$r_in,$lo`
		d8c783	`+ addze $c,$c`
		d8c783	`+`
		d8c783	`+___`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+# Like mul_add() but $c is ignored as an input - an optimisation to save a`
		d8c783	`+# preliminary instruction that would set input $c to 0`
		d8c783	`+sub mul_add_c_0($$$$$$)`
		d8c783	`+{`
		d8c783	`+ my ($self, $r_out, $r_in, $a, $w, $c) = @_;`
		d8c783	`+`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ maddld $lo,$a,$w,$r_in`
		d8c783	`+ maddhdu $c,$a,$w,$r_in`
		d8c783	`+___`
		d8c783	`+`
		d8c783	`+ if ($r_out ne $lo) {`
		d8c783	`+ $self->add_code(<<___);`
		d8c783	`+ mr $r_out,$lo`
		d8c783	`+___`
		d8c783	`+ }`
		d8c783	`+`
		d8c783	`+ $self->nl();`
		d8c783	`+}`
		d8c783	`+`
		d8c783	`+`
		d8c783	`+package main;`
		d8c783	`+`
		d8c783	`+my $code;`
		d8c783	`+`
		d8c783	`+$code.=<<___;`
		d8c783	`+.machine "any"`
		d8c783	`+.text`
		d8c783	`+___`
		d8c783	`+`
		d8c783	`+my $mont;`
		d8c783	`+`
		d8c783	`+$mont = new Mont::GPR(6);`
		d8c783	`+$mont->mul_mont_fixed();`
		d8c783	`+$code .= $mont->get_code();`
		d8c783	`+`
		d8c783	`+$mont = new Mont::GPR_300(6);`
		d8c783	`+$mont->mul_mont_fixed();`
		d8c783	`+$code .= $mont->get_code();`
		d8c783	`+`
		d8c783	+$code =~ s/\`([^\`]*)\`/eval $1/gem;
		d8c783	`+`
		d8c783	`+$code.=<<___;`
		d8c783	`+.asciz "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>"`
		d8c783	`+___`
		d8c783	`+`
		d8c783	`+print $code;`
		d8c783	`+close STDOUT or die "error closing STDOUT: $!";`
		d8c783	`diff --git a/crypto/bn/bn_ppc.c b/crypto/bn/bn_ppc.c`
		d8c783	`index 3ee76ea96574..1e9421bee213 100644`
		d8c783	`--- a/crypto/bn/bn_ppc.c`
		d8c783	`+++ b/crypto/bn/bn_ppc.c`
		d8c783	`@@ -19,6 +19,12 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,`
		d8c783	`const BN_ULONG *np, const BN_ULONG *n0, int num);`
		d8c783	`int bn_mul4x_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,`
		d8c783	`const BN_ULONG *np, const BN_ULONG *n0, int num);`
		d8c783	`+ int bn_mul_mont_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,`
		d8c783	`+ const BN_ULONG *bp, const BN_ULONG *np,`
		d8c783	`+ const BN_ULONG *n0, int num);`
		d8c783	`+ int bn_mul_mont_300_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,`
		d8c783	`+ const BN_ULONG *bp, const BN_ULONG *np,`
		d8c783	`+ const BN_ULONG *n0, int num);`
		d8c783
		d8c783	`if (num < 4)`
		d8c783	`return 0;`
		d8c783	`@@ -34,5 +40,14 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,`
		d8c783	`* no opportunity to figure it out...`
		d8c783	`*/`
		d8c783
		d8c783	`+#if defined(_ARCH_PPC64) && !defined(__ILP32__)`
		d8c783	`+ if (num == 6) {`
		d8c783	`+ if (OPENSSL_ppccap_P & PPC_MADD300)`
		d8c783	`+ return bn_mul_mont_300_fixed_n6(rp, ap, bp, np, n0, num);`
		d8c783	`+ else`
		d8c783	`+ return bn_mul_mont_fixed_n6(rp, ap, bp, np, n0, num);`
		d8c783	`+ }`
		d8c783	`+#endif`
		d8c783	`+`
		d8c783	`return bn_mul_mont_int(rp, ap, bp, np, n0, num);`
		d8c783	`}`
		d8c783	`diff --git a/crypto/bn/build.info b/crypto/bn/build.info`
		d8c783	`index 4f8d0689b5ea..987a70ae263b 100644`
		d8c783	`--- a/crypto/bn/build.info`
		d8c783	`+++ b/crypto/bn/build.info`
		d8c783	`@@ -79,7 +79,7 @@ IF[{- !$disabled{asm} -}]`
		d8c783
		d8c783	`$BNASM_ppc32=bn_ppc.c bn-ppc.s ppc-mont.s`
		d8c783	`$BNDEF_ppc32=OPENSSL_BN_ASM_MONT`
		d8c783	`- $BNASM_ppc64=$BNASM_ppc32`
		d8c783	`+ $BNASM_ppc64=$BNASM_ppc32 ppc64-mont-fixed.s`
		d8c783	`$BNDEF_ppc64=$BNDEF_ppc32`
		d8c783
		d8c783	`$BNASM_c64xplus=asm/bn-c64xplus.asm`
		d8c783	`@@ -173,6 +173,7 @@ GENERATE[parisc-mont.s]=asm/parisc-mont.pl`
		d8c783	`GENERATE[bn-ppc.s]=asm/ppc.pl`
		d8c783	`GENERATE[ppc-mont.s]=asm/ppc-mont.pl`
		d8c783	`GENERATE[ppc64-mont.s]=asm/ppc64-mont.pl`
		d8c783	`+GENERATE[ppc64-mont-fixed.s]=asm/ppc64-mont-fixed.pl`
		d8c783
		d8c783	`GENERATE[alpha-mont.S]=asm/alpha-mont.pl`
		d8c783
		d8c783
		d8c783	`From 01ebad0d6e3a09bc9e32350b402901471610a3dc Mon Sep 17 00:00:00 2001`
		d8c783	`From: Rohan McLure <rohanmclure@linux.ibm.com>`
		d8c783	`Date: Thu, 30 Jun 2022 16:21:06 +1000`
		d8c783	`Subject: [PATCH 2/2] Fix unrolled montgomery multiplication for POWER9`
		d8c783
		d8c783	`In the reference C implementation in bn_asm.c, tp[num + 1] contains the`
		d8c783	`carry bit for accumulations into tp[num]. tp[num + 1] is only ever`
		d8c783	`assigned, never itself incremented.`
		d8c783	`---`
		d8c783	`crypto/bn/asm/ppc64-mont-fixed.pl | 6 ++++--`
		d8c783	`1 file changed, 4 insertions(+), 2 deletions(-)`
		d8c783
		d8c783	`diff --git a/crypto/bn/asm/ppc64-mont-fixed.pl b/crypto/bn/asm/ppc64-mont-fixed.pl`
		d8c783	`index 0fb397bc5f12..e27d0ad93d85 100755`
		d8c783	`--- a/crypto/bn/asm/ppc64-mont-fixed.pl`
		d8c783	`+++ b/crypto/bn/asm/ppc64-mont-fixed.pl`
		d8c783	`@@ -63,6 +63,7 @@`
		d8c783	`# Registers are global so the code is remotely readable`
		d8c783
		d8c783	`# Parameters for Montgomery multiplication`
		d8c783	`+my $ze = "r0";`
		d8c783	`my $sp = "r1";`
		d8c783	`my $toc = "r2";`
		d8c783	`my $rp = "r3";`
		d8c783	`@@ -192,6 +193,7 @@ ($)`
		d8c783	`$self->save_registers();`
		d8c783
		d8c783	`$self->add_code(<<___);`
		d8c783	`+ li $ze,0`
		d8c783	`ld $n0,0($n0)`
		d8c783
		d8c783	`ld $bp0,0($bp)`
		d8c783	`@@ -242,7 +244,7 @@ ($)`
		d8c783
		d8c783	`$self->add_code(<<___);`
		d8c783	`addc $tp[$n],$tp[$n],$c0`
		d8c783	`- addze $tp[$n+1],$tp[$n+1]`
		d8c783	`+ addze $tp[$n+1],$ze`
		d8c783	`___`
		d8c783
		d8c783	`$self->add_code(<<___);`
		d8c783	`@@ -272,7 +274,7 @@ ($)`
		d8c783	`and. $tp[$n],$tp[$n],$tp[$n]`
		d8c783	`bne $label->{"sub"}`
		d8c783
		d8c783	`- cmpld $tp[$n-1],$npj`
		d8c783	`+ cmpld $tp[$n-1],$npj`
		d8c783	`blt $label->{"copy"}`
		d8c783
		d8c783	`$label->{"sub"}:`

rpms / openssl

Source Code

Blame SOURCES/0067-ppc64le-Montgomery-multiply.patch