Blame SOURCES/0067-ppc64le-Montgomery-multiply.patch

1ac26c
From 33ffd36afa7594aeb958a925f521cb287ca850c8 Mon Sep 17 00:00:00 2001
1ac26c
From: Rohan McLure <rohanmclure@linux.ibm.com>
1ac26c
Date: Mon, 27 Jun 2022 12:14:55 +1000
1ac26c
Subject: [PATCH 1/2] Revert "Revert "bn: Add fixed length (n=6), unrolled PPC
1ac26c
 Montgomery Multiplication""
1ac26c
1ac26c
This reverts commit 712d9cc90e355b2c98a959d4e9398610d2269c9e.
1ac26c
---
1ac26c
 crypto/bn/asm/ppc64-mont-fixed.pl | 581 ++++++++++++++++++++++++++++++
1ac26c
 crypto/bn/bn_ppc.c                |  15 +
1ac26c
 crypto/bn/build.info              |   3 +-
1ac26c
 3 files changed, 598 insertions(+), 1 deletion(-)
1ac26c
1ac26c
diff --git a/crypto/bn/asm/ppc64-mont-fixed.pl b/crypto/bn/asm/ppc64-mont-fixed.pl
1ac26c
index e69de29bb2d1..0fb397bc5f12 100755
1ac26c
--- a/crypto/bn/asm/ppc64-mont-fixed.pl
1ac26c
+++ b/crypto/bn/asm/ppc64-mont-fixed.pl
1ac26c
@@ -0,0 +1,581 @@
1ac26c
+#! /usr/bin/env perl
1ac26c
+# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
1ac26c
+#
1ac26c
+# Licensed under the Apache License 2.0 (the "License").  You may not use
1ac26c
+# this file except in compliance with the License.  You can obtain a copy
1ac26c
+# in the file LICENSE in the source distribution or at
1ac26c
+# https://www.openssl.org/source/license.html
1ac26c
+
1ac26c
+# ====================================================================
1ac26c
+# Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke
1ac26c
+# <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for
1ac26c
+# the OpenSSL project.
1ac26c
+# ====================================================================
1ac26c
+
1ac26c
+#
1ac26c
+# Fixed length (n=6), unrolled PPC Montgomery Multiplication
1ac26c
+#
1ac26c
+
1ac26c
+# 2021
1ac26c
+#
1ac26c
+# Although this is a generic implementation for unrolling Montgomery
1ac26c
+# Multiplication for arbitrary values of n, this is currently only
1ac26c
+# used for n = 6 to improve the performance of ECC p384.
1ac26c
+#
1ac26c
+# Unrolling allows intermediate results to be stored in registers,
1ac26c
+# rather than on the stack, improving performance by ~7% compared to
1ac26c
+# the existing PPC assembly code.
1ac26c
+#
1ac26c
+# The ISA 3.0 implementation uses combination multiply/add
1ac26c
+# instructions (maddld, maddhdu) to improve performance by an
1ac26c
+# additional ~10% on Power 9.
1ac26c
+#
1ac26c
+# Finally, saving non-volatile registers into volatile vector
1ac26c
+# registers instead of onto the stack saves a little more.
1ac26c
+#
1ac26c
+# On a Power 9 machine we see an overall improvement of ~18%.
1ac26c
+#
1ac26c
+
1ac26c
+use strict;
1ac26c
+use warnings;
1ac26c
+
1ac26c
+my ($flavour, $output, $dir, $xlate);
1ac26c
+
1ac26c
+# $output is the last argument if it looks like a file (it has an extension)
1ac26c
+# $flavour is the first argument if it doesn't look like a file
1ac26c
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
1ac26c
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
1ac26c
+
1ac26c
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
1ac26c
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
1ac26c
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
1ac26c
+die "can't locate ppc-xlate.pl";
1ac26c
+
1ac26c
+# Reject a missing or non-ppc64 flavour before the translator pipe is spawned.
1ac26c
+if (!defined($flavour) || $flavour !~ /64/) {
1ac26c
+	die "bad flavour (" . (defined($flavour) ? $flavour : "") . ") - only ppc64 permitted";
1ac26c
+}
1ac26c
+open STDOUT,"| $^X $xlate $flavour \"$output\""
1ac26c
+    or die "can't call $xlate: $!";
1ac26c
+
1ac26c
+my $SIZE_T= 8;
1ac26c
+
1ac26c
+# Registers are global so the code is remotely readable
1ac26c
+
1ac26c
+# Parameters for Montgomery multiplication
1ac26c
+my $sp	= "r1";
1ac26c
+my $toc	= "r2";
1ac26c
+my $rp	= "r3";
1ac26c
+my $ap	= "r4";
1ac26c
+my $bp	= "r5";
1ac26c
+my $np	= "r6";
1ac26c
+my $n0	= "r7";
1ac26c
+my $num	= "r8";
1ac26c
+
1ac26c
+my $i	= "r9";
1ac26c
+my $c0	= "r10";
1ac26c
+my $bp0	= "r11";
1ac26c
+my $bpi	= "r11";
1ac26c
+my $bpj	= "r11";
1ac26c
+my $tj	= "r12";
1ac26c
+my $apj	= "r12";
1ac26c
+my $npj	= "r12";
1ac26c
+my $lo	= "r14";
1ac26c
+my $c1	= "r14";
1ac26c
+
1ac26c
+# Non-volatile registers used for tp[i]
1ac26c
+#
1ac26c
+# 12 registers are available but the limit on unrolling is 10,
1ac26c
+# since registers from $tp[0] to $tp[$n+1] are used.
1ac26c
+my @tp = ("r20" .. "r31");
1ac26c
+
1ac26c
+# volatile VSRs for saving non-volatile GPRs - faster than stack
1ac26c
+my @vsrs = ("v32" .. "v46");
1ac26c
+
1ac26c
+package Mont;
1ac26c
+
1ac26c
+sub new($$)
1ac26c
+{
1ac26c
+	my ($class, $n) = @_;
1ac26c
+
1ac26c
+	if ($n > 10) {
1ac26c
+		die "Can't unroll for BN length ${n} (maximum 10)"
1ac26c
+	}
1ac26c
+
1ac26c
+	my $self = {
1ac26c
+		code => "",
1ac26c
+		n => $n,
1ac26c
+	};
1ac26c
+	bless $self, $class;
1ac26c
+
1ac26c
+	return $self;
1ac26c
+}
1ac26c
+
1ac26c
+sub add_code($$)
1ac26c
+{
1ac26c
+	my ($self, $c) = @_;
1ac26c
+
1ac26c
+	$self->{code} .= $c;
1ac26c
+}
1ac26c
+
1ac26c
+sub get_code($)
1ac26c
+{
1ac26c
+	my ($self) = @_;
1ac26c
+
1ac26c
+	return $self->{code};
1ac26c
+}
1ac26c
+
1ac26c
+sub get_function_name($)
1ac26c
+{
1ac26c
+	my ($self) = @_;
1ac26c
+
1ac26c
+	return "bn_mul_mont_fixed_n" . $self->{n};
1ac26c
+}
1ac26c
+
1ac26c
+sub get_label($$)
1ac26c
+{
1ac26c
+	my ($self, $l) = @_;
1ac26c
+
1ac26c
+	return "L" . $l . "_" . $self->{n};
1ac26c
+}
1ac26c
+
1ac26c
+sub get_labels($@)
1ac26c
+{
1ac26c
+	my ($self, @labels) = @_;
1ac26c
+
1ac26c
+	my %out = ();
1ac26c
+
1ac26c
+	foreach my $l (@labels) {
1ac26c
+		$out{"$l"} = $self->get_label("$l");
1ac26c
+	}
1ac26c
+
1ac26c
+	return \%out;
1ac26c
+}
1ac26c
+
1ac26c
+sub nl($)
1ac26c
+{
1ac26c
+	my ($self) = @_;
1ac26c
+
1ac26c
+	$self->add_code("\n");
1ac26c
+}
1ac26c
+
1ac26c
+sub copy_result($)
1ac26c
+{
1ac26c
+	my ($self) = @_;
1ac26c
+
1ac26c
+	my ($n) = $self->{n};
1ac26c
+
1ac26c
+	for (my $j = 0; $j < $n; $j++) {
1ac26c
+		$self->add_code(<<___);
1ac26c
+	std		$tp[$j],`$j*$SIZE_T`($rp)
1ac26c
+___
1ac26c
+	}
1ac26c
+
1ac26c
+}
1ac26c
+
1ac26c
+sub mul_mont_fixed($)
1ac26c
+{
1ac26c
+	my ($self) = @_;
1ac26c
+
1ac26c
+	my ($n) = $self->{n};
1ac26c
+	my $fname = $self->get_function_name();
1ac26c
+	my $label = $self->get_labels("outer", "enter", "sub", "copy", "end");
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+
1ac26c
+.globl	.${fname}
1ac26c
+.align	5
1ac26c
+.${fname}:
1ac26c
+
1ac26c
+___
1ac26c
+
1ac26c
+	$self->save_registers();
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	ld		$n0,0($n0)
1ac26c
+
1ac26c
+	ld		$bp0,0($bp)
1ac26c
+
1ac26c
+	ld		$apj,0($ap)
1ac26c
+___
1ac26c
+
1ac26c
+	$self->mul_c_0($tp[0], $apj, $bp0, $c0);
1ac26c
+
1ac26c
+	for (my $j = 1; $j < $n - 1; $j++) {
1ac26c
+		$self->add_code(<<___);
1ac26c
+	ld		$apj,`$j*$SIZE_T`($ap)
1ac26c
+___
1ac26c
+		$self->mul($tp[$j], $apj, $bp0, $c0);
1ac26c
+	}
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	ld		$apj,`($n-1)*$SIZE_T`($ap)
1ac26c
+___
1ac26c
+
1ac26c
+	$self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0);
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	li		$tp[$n+1],0
1ac26c
+
1ac26c
+___
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	li		$i,0
1ac26c
+	mtctr		$num
1ac26c
+	b		$label->{"enter"}
1ac26c
+
1ac26c
+.align	4
1ac26c
+$label->{"outer"}:
1ac26c
+	ldx		$bpi,$bp,$i
1ac26c
+
1ac26c
+	ld		$apj,0($ap)
1ac26c
+___
1ac26c
+
1ac26c
+	$self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0);
1ac26c
+
1ac26c
+	for (my $j = 1; $j < $n; $j++) {
1ac26c
+		$self->add_code(<<___);
1ac26c
+	ld		$apj,`$j*$SIZE_T`($ap)
1ac26c
+___
1ac26c
+		$self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0);
1ac26c
+	}
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	addc		$tp[$n],$tp[$n],$c0
1ac26c
+	addze		$tp[$n+1],$tp[$n+1]
1ac26c
+___
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+.align	4
1ac26c
+$label->{"enter"}:
1ac26c
+	mulld		$bpi,$tp[0],$n0
1ac26c
+
1ac26c
+	ld		$npj,0($np)
1ac26c
+___
1ac26c
+
1ac26c
+	$self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0);
1ac26c
+
1ac26c
+	for (my $j = 1; $j < $n; $j++) {
1ac26c
+		$self->add_code(<<___);
1ac26c
+	ld		$npj,`$j*$SIZE_T`($np)
1ac26c
+___
1ac26c
+		$self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0);
1ac26c
+	}
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	addc		$tp[$n-1],$tp[$n],$c0
1ac26c
+	addze		$tp[$n],$tp[$n+1]
1ac26c
+
1ac26c
+	addi		$i,$i,$SIZE_T
1ac26c
+	bdnz		$label->{"outer"}
1ac26c
+
1ac26c
+	and.		$tp[$n],$tp[$n],$tp[$n]
1ac26c
+	bne		$label->{"sub"}
1ac26c
+
1ac26c
+	cmpld	$tp[$n-1],$npj
1ac26c
+	blt		$label->{"copy"}
1ac26c
+
1ac26c
+$label->{"sub"}:
1ac26c
+___
1ac26c
+
1ac26c
+	#
1ac26c
+	# Reduction
1ac26c
+	#
1ac26c
+
1ac26c
+		$self->add_code(<<___);
1ac26c
+	ld		$bpj,`0*$SIZE_T`($np)
1ac26c
+	subfc		$c1,$bpj,$tp[0]
1ac26c
+	std		$c1,`0*$SIZE_T`($rp)
1ac26c
+
1ac26c
+___
1ac26c
+	for (my $j = 1; $j < $n - 1; $j++) {
1ac26c
+		$self->add_code(<<___);
1ac26c
+	ld		$bpj,`$j*$SIZE_T`($np)
1ac26c
+	subfe		$c1,$bpj,$tp[$j]
1ac26c
+	std		$c1,`$j*$SIZE_T`($rp)
1ac26c
+
1ac26c
+___
1ac26c
+	}
1ac26c
+
1ac26c
+		$self->add_code(<<___);
1ac26c
+	subfe		$c1,$npj,$tp[$n-1]
1ac26c
+	std		$c1,`($n-1)*$SIZE_T`($rp)
1ac26c
+
1ac26c
+___
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	addme.		$tp[$n],$tp[$n]
1ac26c
+	beq		$label->{"end"}
1ac26c
+
1ac26c
+$label->{"copy"}:
1ac26c
+___
1ac26c
+
1ac26c
+	$self->copy_result();
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+
1ac26c
+$label->{"end"}:
1ac26c
+___
1ac26c
+
1ac26c
+	$self->restore_registers();
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	li		r3,1
1ac26c
+	blr
1ac26c
+.size .${fname},.-.${fname}
1ac26c
+___
1ac26c
+
1ac26c
+}
1ac26c
+
1ac26c
+package Mont::GPR;
1ac26c
+
1ac26c
+our @ISA = ('Mont');
1ac26c
+
1ac26c
+sub new($$)
1ac26c
+{
1ac26c
+    my ($class, $n) = @_;
1ac26c
+
1ac26c
+    return $class->SUPER::new($n);
1ac26c
+}
1ac26c
+
1ac26c
+sub save_registers($)
1ac26c
+{
1ac26c
+	my ($self) = @_;
1ac26c
+
1ac26c
+	my $n = $self->{n};
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	std	$lo,-8($sp)
1ac26c
+___
1ac26c
+
1ac26c
+	for (my $j = 0; $j <= $n+1; $j++) {
1ac26c
+		$self->{code}.=<<___;
1ac26c
+	std	$tp[$j],-`($j+2)*8`($sp)
1ac26c
+___
1ac26c
+	}
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+
1ac26c
+___
1ac26c
+}
1ac26c
+
1ac26c
+sub restore_registers($)
1ac26c
+{
1ac26c
+	my ($self) = @_;
1ac26c
+
1ac26c
+	my $n = $self->{n};
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	ld	$lo,-8($sp)
1ac26c
+___
1ac26c
+
1ac26c
+	for (my $j = 0; $j <= $n+1; $j++) {
1ac26c
+		$self->{code}.=<<___;
1ac26c
+	ld	$tp[$j],-`($j+2)*8`($sp)
1ac26c
+___
1ac26c
+	}
1ac26c
+
1ac26c
+	$self->{code} .=<<___;
1ac26c
+
1ac26c
+___
1ac26c
+}
1ac26c
+
1ac26c
+# Direct translation of C mul()
1ac26c
+sub mul($$$$$)
1ac26c
+{
1ac26c
+	my ($self, $r, $a, $w, $c) = @_;
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	mulld		$lo,$a,$w
1ac26c
+	addc		$r,$lo,$c
1ac26c
+	mulhdu		$c,$a,$w
1ac26c
+	addze		$c,$c
1ac26c
+
1ac26c
+___
1ac26c
+}
1ac26c
+
1ac26c
+# Like mul() but $c is ignored as an input - an optimisation to save a
1ac26c
+# preliminary instruction that would set input $c to 0
1ac26c
+sub mul_c_0($$$$$)
1ac26c
+{
1ac26c
+	my ($self, $r, $a, $w, $c) = @_;
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	mulld		$r,$a,$w
1ac26c
+	mulhdu		$c,$a,$w
1ac26c
+
1ac26c
+___
1ac26c
+}
1ac26c
+
1ac26c
+# Like mul() but does not do the final addition of CA into $c - an
1ac26c
+# optimisation to save an instruction
1ac26c
+sub mul_last($$$$$$)
1ac26c
+{
1ac26c
+	my ($self, $r1, $r2, $a, $w, $c) = @_;
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	mulld		$lo,$a,$w
1ac26c
+	addc		$r1,$lo,$c
1ac26c
+	mulhdu		$c,$a,$w
1ac26c
+
1ac26c
+	addze		$r2,$c
1ac26c
+___
1ac26c
+}
1ac26c
+
1ac26c
+# Like C mul_add() but allow $r_out and $r_in to be different
1ac26c
+sub mul_add($$$$$$)
1ac26c
+{
1ac26c
+	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	mulld		$lo,$a,$w
1ac26c
+	addc		$lo,$lo,$c
1ac26c
+	mulhdu		$c,$a,$w
1ac26c
+	addze		$c,$c
1ac26c
+	addc		$r_out,$r_in,$lo
1ac26c
+	addze		$c,$c
1ac26c
+
1ac26c
+___
1ac26c
+}
1ac26c
+
1ac26c
+# Like mul_add() but $c is ignored as an input - an optimisation to save a
1ac26c
+# preliminary instruction that would set input $c to 0
1ac26c
+sub mul_add_c_0($$$$$$)
1ac26c
+{
1ac26c
+	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	mulld		$lo,$a,$w
1ac26c
+	addc		$r_out,$r_in,$lo
1ac26c
+	mulhdu		$c,$a,$w
1ac26c
+	addze		$c,$c
1ac26c
+
1ac26c
+___
1ac26c
+}
1ac26c
+
1ac26c
+package Mont::GPR_300;
1ac26c
+
1ac26c
+our @ISA = ('Mont::GPR');
1ac26c
+
1ac26c
+sub new($$)
1ac26c
+{
1ac26c
+	my ($class, $n) = @_;
1ac26c
+
1ac26c
+	my $mont = $class->SUPER::new($n);
1ac26c
+
1ac26c
+	return $mont;
1ac26c
+}
1ac26c
+
1ac26c
+sub get_function_name($)
1ac26c
+{
1ac26c
+	my ($self) = @_;
1ac26c
+
1ac26c
+	return "bn_mul_mont_300_fixed_n" . $self->{n};
1ac26c
+}
1ac26c
+
1ac26c
+sub get_label($$)
1ac26c
+{
1ac26c
+	my ($self, $l) = @_;
1ac26c
+
1ac26c
+	return "L" . $l . "_300_" . $self->{n};
1ac26c
+}
1ac26c
+
1ac26c
+# ISA 3.0 mul(): $r = low, $c = high doubleword of $a*$w + $c (maddld/maddhdu)
1ac26c
+sub mul($$$$$)
1ac26c
+{
1ac26c
+	my ($self, $r, $a, $w, $c) = @_;
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	maddld		$r,$a,$w,$c
1ac26c
+	maddhdu		$c,$a,$w,$c
1ac26c
+
1ac26c
+___
1ac26c
+}
1ac26c
+
1ac26c
+# Like mul() but the high doubleword (the last carry) is written to $r2, not $c
1ac26c
+sub mul_last($$$$$$)
1ac26c
+{
1ac26c
+	my ($self, $r1, $r2, $a, $w, $c) = @_;
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	maddld		$r1,$a,$w,$c
1ac26c
+	maddhdu		$r2,$a,$w,$c
1ac26c
+
1ac26c
+___
1ac26c
+}
1ac26c
+
1ac26c
+# Like mul() but $c is ignored as an input - an optimisation to save a
1ac26c
+# preliminary instruction that would set input $c to 0
1ac26c
+sub mul_c_0($$$$$)
1ac26c
+{
1ac26c
+	my ($self, $r, $a, $w, $c) = @_;
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	mulld          $r,$a,$w
1ac26c
+	mulhdu          $c,$a,$w
1ac26c
+
1ac26c
+___
1ac26c
+}
1ac26c
+
1ac26c
+# Like C mul_add() but allow $r_out and $r_in to be different
1ac26c
+sub mul_add($$$$$$)
1ac26c
+{
1ac26c
+	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	maddld		$lo,$a,$w,$c
1ac26c
+	maddhdu		$c,$a,$w,$c
1ac26c
+	addc		$r_out,$r_in,$lo
1ac26c
+	addze		$c,$c
1ac26c
+
1ac26c
+___
1ac26c
+}
1ac26c
+
1ac26c
+# Like mul_add() but $c is ignored as an input - an optimisation to save a
1ac26c
+# preliminary instruction that would set input $c to 0
1ac26c
+sub mul_add_c_0($$$$$$)
1ac26c
+{
1ac26c
+	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
1ac26c
+
1ac26c
+	$self->add_code(<<___);
1ac26c
+	maddld		$lo,$a,$w,$r_in
1ac26c
+	maddhdu		$c,$a,$w,$r_in
1ac26c
+___
1ac26c
+
1ac26c
+	if ($r_out ne $lo) {
1ac26c
+		$self->add_code(<<___);
1ac26c
+	mr			$r_out,$lo
1ac26c
+___
1ac26c
+	}
1ac26c
+
1ac26c
+	$self->nl();
1ac26c
+}
1ac26c
+
1ac26c
+
1ac26c
+package main;
1ac26c
+
1ac26c
+my $code;
1ac26c
+
1ac26c
+$code.=<<___;
1ac26c
+.machine "any"
1ac26c
+.text
1ac26c
+___
1ac26c
+
1ac26c
+my $mont;
1ac26c
+# Emit the baseline n=6 routine, then the ISA 3.0 (maddld/maddhdu) n=6 routine.
1ac26c
+$mont = Mont::GPR->new(6);
1ac26c
+$mont->mul_mont_fixed();
1ac26c
+$code .= $mont->get_code();
1ac26c
+
1ac26c
+$mont = Mont::GPR_300->new(6);
1ac26c
+$mont->mul_mont_fixed();
1ac26c
+$code .= $mont->get_code();
1ac26c
+# Evaluate the backquoted arithmetic expressions embedded in the generated text.
1ac26c
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
1ac26c
+
1ac26c
+$code.=<<___;
1ac26c
+.asciz  "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>"
1ac26c
+___
1ac26c
+
1ac26c
+print $code;
1ac26c
+close STDOUT or die "error closing STDOUT: $!";
1ac26c
diff --git a/crypto/bn/bn_ppc.c b/crypto/bn/bn_ppc.c
1ac26c
index 3ee76ea96574..1e9421bee213 100644
1ac26c
--- a/crypto/bn/bn_ppc.c
1ac26c
+++ b/crypto/bn/bn_ppc.c
1ac26c
@@ -19,6 +19,12 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1ac26c
                         const BN_ULONG *np, const BN_ULONG *n0, int num);
1ac26c
     int bn_mul4x_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1ac26c
                           const BN_ULONG *np, const BN_ULONG *n0, int num);
1ac26c
+    int bn_mul_mont_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,
1ac26c
+                             const BN_ULONG *bp, const BN_ULONG *np,
1ac26c
+                             const BN_ULONG *n0, int num);
1ac26c
+    int bn_mul_mont_300_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,
1ac26c
+                                 const BN_ULONG *bp, const BN_ULONG *np,
1ac26c
+                                 const BN_ULONG *n0, int num);
1ac26c
 
1ac26c
     if (num < 4)
1ac26c
         return 0;
1ac26c
@@ -34,5 +40,14 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1ac26c
      * no opportunity to figure it out...
1ac26c
      */
1ac26c
 
1ac26c
+#if defined(_ARCH_PPC64) && !defined(__ILP32__)
1ac26c
+    if (num == 6) {
1ac26c
+        if (OPENSSL_ppccap_P & PPC_MADD300)
1ac26c
+            return bn_mul_mont_300_fixed_n6(rp, ap, bp, np, n0, num);
1ac26c
+        else
1ac26c
+            return bn_mul_mont_fixed_n6(rp, ap, bp, np, n0, num);
1ac26c
+    }
1ac26c
+#endif
1ac26c
+
1ac26c
     return bn_mul_mont_int(rp, ap, bp, np, n0, num);
1ac26c
 }
1ac26c
diff --git a/crypto/bn/build.info b/crypto/bn/build.info
1ac26c
index 4f8d0689b5ea..987a70ae263b 100644
1ac26c
--- a/crypto/bn/build.info
1ac26c
+++ b/crypto/bn/build.info
1ac26c
@@ -79,7 +79,7 @@ IF[{- !$disabled{asm} -}]
1ac26c
 
1ac26c
   $BNASM_ppc32=bn_ppc.c bn-ppc.s ppc-mont.s
1ac26c
   $BNDEF_ppc32=OPENSSL_BN_ASM_MONT
1ac26c
-  $BNASM_ppc64=$BNASM_ppc32
1ac26c
+  $BNASM_ppc64=$BNASM_ppc32 ppc64-mont-fixed.s
1ac26c
   $BNDEF_ppc64=$BNDEF_ppc32
1ac26c
 
1ac26c
   $BNASM_c64xplus=asm/bn-c64xplus.asm
1ac26c
@@ -173,6 +173,7 @@ GENERATE[parisc-mont.s]=asm/parisc-mont.pl
1ac26c
 GENERATE[bn-ppc.s]=asm/ppc.pl
1ac26c
 GENERATE[ppc-mont.s]=asm/ppc-mont.pl
1ac26c
 GENERATE[ppc64-mont.s]=asm/ppc64-mont.pl
1ac26c
+GENERATE[ppc64-mont-fixed.s]=asm/ppc64-mont-fixed.pl
1ac26c
 
1ac26c
 GENERATE[alpha-mont.S]=asm/alpha-mont.pl
1ac26c
 
1ac26c
1ac26c
From 01ebad0d6e3a09bc9e32350b402901471610a3dc Mon Sep 17 00:00:00 2001
1ac26c
From: Rohan McLure <rohanmclure@linux.ibm.com>
1ac26c
Date: Thu, 30 Jun 2022 16:21:06 +1000
1ac26c
Subject: [PATCH 2/2] Fix unrolled montgomery multiplication for POWER9
1ac26c
1ac26c
In the reference C implementation in bn_asm.c, tp[num + 1] contains the
1ac26c
carry bit for accumulations into tp[num]. tp[num + 1] is only ever
1ac26c
assigned, never itself incremented.
1ac26c
---
1ac26c
 crypto/bn/asm/ppc64-mont-fixed.pl | 6 ++++--
1ac26c
 1 file changed, 4 insertions(+), 2 deletions(-)
1ac26c
1ac26c
diff --git a/crypto/bn/asm/ppc64-mont-fixed.pl b/crypto/bn/asm/ppc64-mont-fixed.pl
1ac26c
index 0fb397bc5f12..e27d0ad93d85 100755
1ac26c
--- a/crypto/bn/asm/ppc64-mont-fixed.pl
1ac26c
+++ b/crypto/bn/asm/ppc64-mont-fixed.pl
1ac26c
@@ -63,6 +63,7 @@
1ac26c
 # Registers are global so the code is remotely readable
1ac26c
 
1ac26c
 # Parameters for Montgomery multiplication
1ac26c
+my $ze	= "r0";
1ac26c
 my $sp	= "r1";
1ac26c
 my $toc	= "r2";
1ac26c
 my $rp	= "r3";
1ac26c
@@ -192,6 +193,7 @@ ($)
1ac26c
 	$self->save_registers();
1ac26c
 
1ac26c
 	$self->add_code(<<___);
1ac26c
+	li		$ze,0
1ac26c
 	ld		$n0,0($n0)
1ac26c
 
1ac26c
 	ld		$bp0,0($bp)
1ac26c
@@ -242,7 +244,7 @@ ($)
1ac26c
 
1ac26c
 	$self->add_code(<<___);
1ac26c
 	addc		$tp[$n],$tp[$n],$c0
1ac26c
-	addze		$tp[$n+1],$tp[$n+1]
1ac26c
+	addze		$tp[$n+1],$ze
1ac26c
 ___
1ac26c
 
1ac26c
 	$self->add_code(<<___);
1ac26c
@@ -272,7 +274,7 @@ ($)
1ac26c
 	and.		$tp[$n],$tp[$n],$tp[$n]
1ac26c
 	bne		$label->{"sub"}
1ac26c
 
1ac26c
-	cmpld	$tp[$n-1],$npj
1ac26c
+	cmpld		$tp[$n-1],$npj
1ac26c
 	blt		$label->{"copy"}
1ac26c
 
1ac26c
 $label->{"sub"}: