Blame SOURCES/openssl-1.0.1e-cve-2014-3570.patch

78ef1d
From e078642ddea29bbb6ba29788a6a513796387fbbb Mon Sep 17 00:00:00 2001
78ef1d
From: Andy Polyakov <appro@openssl.org>
78ef1d
Date: Mon, 5 Jan 2015 14:52:56 +0100
78ef1d
Subject: [PATCH] Fix for CVE-2014-3570.
78ef1d
78ef1d
Reviewed-by: Emilia Kasper <emilia@openssl.org>
78ef1d
(cherry picked from commit e793809ba50c1e90ab592fb640a856168e50f3de)
78ef1d
(with 1.0.1-specific addendum)
78ef1d
---
78ef1d
 crypto/bn/asm/mips.pl      |  611 +++---------
78ef1d
 crypto/bn/asm/mips3.s      | 2201 --------------------------------------------
78ef1d
 crypto/bn/asm/x86_64-gcc.c |   34 +-
78ef1d
 crypto/bn/bn_asm.c         |   16 +-
78ef1d
 crypto/bn/bntest.c         |  102 +-
78ef1d
 5 files changed, 234 insertions(+), 2730 deletions(-)
78ef1d
 delete mode 100644 crypto/bn/asm/mips3.s
78ef1d
78ef1d
diff --git a/crypto/bn/asm/mips.pl b/crypto/bn/asm/mips.pl
78ef1d
index d2f3ef7..215c9a7 100644
78ef1d
--- a/crypto/bn/asm/mips.pl
78ef1d
+++ b/crypto/bn/asm/mips.pl
78ef1d
@@ -1872,6 +1872,41 @@ ___
78ef1d
 
78ef1d
 ($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
78ef1d
 
78ef1d
+sub add_c2 () {
78ef1d
+my ($hi,$lo,$c0,$c1,$c2,
78ef1d
+    $warm,      # !$warm denotes first call with specific sequence of
78ef1d
+                # $c_[XYZ] when there is no Z-carry to accumulate yet;
78ef1d
+    $an,$bn     # these two are arguments for multiplication whose
78ef1d
+                # result is used in *next* step [which is why it's
78ef1d
+                # commented as "forward multiplication" below];
78ef1d
+    )=@_;
78ef1d
+$code.=<<___;
78ef1d
+	mflo	$lo
78ef1d
+	mfhi	$hi
78ef1d
+	$ADDU	$c0,$lo
78ef1d
+	sltu	$at,$c0,$lo
78ef1d
+	 $MULTU	$an,$bn			# forward multiplication
78ef1d
+	$ADDU	$c0,$lo
78ef1d
+	$ADDU	$at,$hi
78ef1d
+	sltu	$lo,$c0,$lo
78ef1d
+	$ADDU	$c1,$at
78ef1d
+	$ADDU	$hi,$lo
78ef1d
+___
78ef1d
+$code.=<<___	if (!$warm);
78ef1d
+	sltu	$c2,$c1,$at
78ef1d
+	$ADDU	$c1,$hi
78ef1d
+	sltu	$hi,$c1,$hi
78ef1d
+	$ADDU	$c2,$hi
78ef1d
+___
78ef1d
+$code.=<<___	if ($warm);
78ef1d
+	sltu	$at,$c1,$at
78ef1d
+	$ADDU	$c1,$hi
78ef1d
+	$ADDU	$c2,$at
78ef1d
+	sltu	$hi,$c1,$hi
78ef1d
+	$ADDU	$c2,$hi
78ef1d
+___
78ef1d
+}
78ef1d
+
78ef1d
 $code.=<<___;
78ef1d
 
78ef1d
 .align	5
78ef1d
@@ -1920,21 +1955,10 @@ $code.=<<___;
78ef1d
 	sltu	$at,$c_2,$t_1
78ef1d
 	$ADDU	$c_3,$t_2,$at
78ef1d
 	$ST	$c_2,$BNSZ($a0)
78ef1d
-
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$c_2,$t_2,$zero
78ef1d
-	$SLL	$t_2,1
78ef1d
-	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_3,$t_1
78ef1d
-	sltu	$at,$c_3,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_1,$t_2
78ef1d
-	sltu	$at,$c_1,$t_2
78ef1d
-	$ADDU	$c_2,$at
78ef1d
+___
78ef1d
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
78ef1d
+		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
78ef1d
+$code.=<<___;
78ef1d
 	mflo	$t_1
78ef1d
 	mfhi	$t_2
78ef1d
 	$ADDU	$c_3,$t_1
78ef1d
@@ -1945,67 +1969,19 @@ $code.=<<___;
78ef1d
 	sltu	$at,$c_1,$t_2
78ef1d
 	$ADDU	$c_2,$at
78ef1d
 	$ST	$c_3,2*$BNSZ($a0)
78ef1d
-
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$c_3,$t_2,$zero
78ef1d
-	$SLL	$t_2,1
78ef1d
-	$MULTU	$a_1,$a_2		# mul_add_c2(a[1],b[2],c1,c2,c3);
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_1,$t_1
78ef1d
-	sltu	$at,$c_1,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_2,$t_2
78ef1d
-	sltu	$at,$c_2,$t_2
78ef1d
-	$ADDU	$c_3,$at
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$at,$t_2,$zero
78ef1d
-	$ADDU	$c_3,$at
78ef1d
-	 $MULTU	$a_4,$a_0		# mul_add_c2(a[4],b[0],c2,c3,c1);
78ef1d
-	$SLL	$t_2,1
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_1,$t_1
78ef1d
-	sltu	$at,$c_1,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_2,$t_2
78ef1d
-	sltu	$at,$c_2,$t_2
78ef1d
-	$ADDU	$c_3,$at
78ef1d
+___
78ef1d
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
78ef1d
+		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
78ef1d
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
78ef1d
+		$a_4,$a_0);		# mul_add_c2(a[4],b[0],c2,c3,c1);
78ef1d
+$code.=<<___;
78ef1d
 	$ST	$c_1,3*$BNSZ($a0)
78ef1d
-
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$c_1,$t_2,$zero
78ef1d
-	$SLL	$t_2,1
78ef1d
-	$MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_2,$t_1
78ef1d
-	sltu	$at,$c_2,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_3,$t_2
78ef1d
-	sltu	$at,$c_3,$t_2
78ef1d
-	$ADDU	$c_1,$at
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$at,$t_2,$zero
78ef1d
-	$ADDU	$c_1,$at
78ef1d
-	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
78ef1d
-	$SLL	$t_2,1
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_2,$t_1
78ef1d
-	sltu	$at,$c_2,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_3,$t_2
78ef1d
-	sltu	$at,$c_3,$t_2
78ef1d
-	$ADDU	$c_1,$at
78ef1d
+___
78ef1d
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
78ef1d
+		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
78ef1d
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
78ef1d
+		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
78ef1d
+$code.=<<___;
78ef1d
 	mflo	$t_1
78ef1d
 	mfhi	$t_2
78ef1d
 	$ADDU	$c_2,$t_1
78ef1d
@@ -2016,97 +1992,23 @@ $code.=<<___;
78ef1d
 	sltu	$at,$c_3,$t_2
78ef1d
 	$ADDU	$c_1,$at
78ef1d
 	$ST	$c_2,4*$BNSZ($a0)
78ef1d
-
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$c_2,$t_2,$zero
78ef1d
-	$SLL	$t_2,1
78ef1d
-	$MULTU	$a_1,$a_4		# mul_add_c2(a[1],b[4],c3,c1,c2);
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_3,$t_1
78ef1d
-	sltu	$at,$c_3,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_1,$t_2
78ef1d
-	sltu	$at,$c_1,$t_2
78ef1d
-	$ADDU	$c_2,$at
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$at,$t_2,$zero
78ef1d
-	$ADDU	$c_2,$at
78ef1d
-	$MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
78ef1d
-	$SLL	$t_2,1
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_3,$t_1
78ef1d
-	sltu	$at,$c_3,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_1,$t_2
78ef1d
-	sltu	$at,$c_1,$t_2
78ef1d
-	$ADDU	$c_2,$at
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$at,$t_2,$zero
78ef1d
-	 $MULTU	$a_6,$a_0		# mul_add_c2(a[6],b[0],c1,c2,c3);
78ef1d
-	$ADDU	$c_2,$at
78ef1d
-	$SLL	$t_2,1
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_3,$t_1
78ef1d
-	sltu	$at,$c_3,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_1,$t_2
78ef1d
-	sltu	$at,$c_1,$t_2
78ef1d
-	$ADDU	$c_2,$at
78ef1d
+___
78ef1d
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
78ef1d
+		$a_1,$a_4);		# mul_add_c2(a[1],b[4],c3,c1,c2);
78ef1d
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
78ef1d
+		$a_2,$a_3);		# mul_add_c2(a[2],b[3],c3,c1,c2);
78ef1d
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
78ef1d
+		$a_6,$a_0);		# mul_add_c2(a[6],b[0],c1,c2,c3);
78ef1d
+$code.=<<___;
78ef1d
 	$ST	$c_3,5*$BNSZ($a0)
78ef1d
-
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$c_3,$t_2,$zero
78ef1d
-	$SLL	$t_2,1
78ef1d
-	$MULTU	$a_5,$a_1		# mul_add_c2(a[5],b[1],c1,c2,c3);
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_1,$t_1
78ef1d
-	sltu	$at,$c_1,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_2,$t_2
78ef1d
-	sltu	$at,$c_2,$t_2
78ef1d
-	$ADDU	$c_3,$at
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$at,$t_2,$zero
78ef1d
-	$ADDU	$c_3,$at
78ef1d
-	$MULTU	$a_4,$a_2		# mul_add_c2(a[4],b[2],c1,c2,c3);
78ef1d
-	$SLL	$t_2,1
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_1,$t_1
78ef1d
-	sltu	$at,$c_1,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_2,$t_2
78ef1d
-	sltu	$at,$c_2,$t_2
78ef1d
-	$ADDU	$c_3,$at
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$at,$t_2,$zero
78ef1d
-	$ADDU	$c_3,$at
78ef1d
-	$MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
78ef1d
-	$SLL	$t_2,1
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_1,$t_1
78ef1d
-	sltu	$at,$c_1,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_2,$t_2
78ef1d
-	sltu	$at,$c_2,$t_2
78ef1d
-	$ADDU	$c_3,$at
78ef1d
+___
78ef1d
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
78ef1d
+		$a_5,$a_1);		# mul_add_c2(a[5],b[1],c1,c2,c3);
78ef1d
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
78ef1d
+		$a_4,$a_2);		# mul_add_c2(a[4],b[2],c1,c2,c3);
78ef1d
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
78ef1d
+		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
78ef1d
+$code.=<<___;
78ef1d
 	mflo	$t_1
78ef1d
 	mfhi	$t_2
78ef1d
 	$ADDU	$c_1,$t_1
78ef1d
@@ -2117,112 +2019,25 @@ $code.=<<___;
78ef1d
 	sltu	$at,$c_2,$t_2
78ef1d
 	$ADDU	$c_3,$at
78ef1d
 	$ST	$c_1,6*$BNSZ($a0)
78ef1d
-
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$c_1,$t_2,$zero
78ef1d
-	$SLL	$t_2,1
78ef1d
-	$MULTU	$a_1,$a_6		# mul_add_c2(a[1],b[6],c2,c3,c1);
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_2,$t_1
78ef1d
-	sltu	$at,$c_2,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_3,$t_2
78ef1d
-	sltu	$at,$c_3,$t_2
78ef1d
-	$ADDU	$c_1,$at
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$at,$t_2,$zero
78ef1d
-	$ADDU	$c_1,$at
78ef1d
-	$MULTU	$a_2,$a_5		# mul_add_c2(a[2],b[5],c2,c3,c1);
78ef1d
-	$SLL	$t_2,1
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_2,$t_1
78ef1d
-	sltu	$at,$c_2,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_3,$t_2
78ef1d
-	sltu	$at,$c_3,$t_2
78ef1d
-	$ADDU	$c_1,$at
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$at,$t_2,$zero
78ef1d
-	$ADDU	$c_1,$at
78ef1d
-	$MULTU	$a_3,$a_4		# mul_add_c2(a[3],b[4],c2,c3,c1);
78ef1d
-	$SLL	$t_2,1
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_2,$t_1
78ef1d
-	sltu	$at,$c_2,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_3,$t_2
78ef1d
-	sltu	$at,$c_3,$t_2
78ef1d
-	$ADDU	$c_1,$at
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$at,$t_2,$zero
78ef1d
-	$ADDU	$c_1,$at
78ef1d
-	 $MULTU	$a_7,$a_1		# mul_add_c2(a[7],b[1],c3,c1,c2);
78ef1d
-	$SLL	$t_2,1
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_2,$t_1
78ef1d
-	sltu	$at,$c_2,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_3,$t_2
78ef1d
-	sltu	$at,$c_3,$t_2
78ef1d
-	$ADDU	$c_1,$at
78ef1d
+___
78ef1d
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
78ef1d
+		$a_1,$a_6);		# mul_add_c2(a[1],b[6],c2,c3,c1);
78ef1d
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
78ef1d
+		$a_2,$a_5);		# mul_add_c2(a[2],b[5],c2,c3,c1);
78ef1d
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
78ef1d
+		$a_3,$a_4);		# mul_add_c2(a[3],b[4],c2,c3,c1);
78ef1d
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
78ef1d
+		$a_7,$a_1);		# mul_add_c2(a[7],b[1],c3,c1,c2);
78ef1d
+$code.=<<___;
78ef1d
 	$ST	$c_2,7*$BNSZ($a0)
78ef1d
-
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$c_2,$t_2,$zero
78ef1d
-	$SLL	$t_2,1
78ef1d
-	$MULTU	$a_6,$a_2		# mul_add_c2(a[6],b[2],c3,c1,c2);
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_3,$t_1
78ef1d
-	sltu	$at,$c_3,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_1,$t_2
78ef1d
-	sltu	$at,$c_1,$t_2
78ef1d
-	$ADDU	$c_2,$at
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$at,$t_2,$zero
78ef1d
-	$ADDU	$c_2,$at
78ef1d
-	$MULTU	$a_5,$a_3		# mul_add_c2(a[5],b[3],c3,c1,c2);
78ef1d
-	$SLL	$t_2,1
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_3,$t_1
78ef1d
-	sltu	$at,$c_3,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_1,$t_2
78ef1d
-	sltu	$at,$c_1,$t_2
78ef1d
-	$ADDU	$c_2,$at
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$at,$t_2,$zero
78ef1d
-	$ADDU	$c_2,$at
78ef1d
-	$MULTU	$a_4,$a_4		# mul_add_c(a[4],b[4],c3,c1,c2);
78ef1d
-	$SLL	$t_2,1
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_3,$t_1
78ef1d
-	sltu	$at,$c_3,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_1,$t_2
78ef1d
-	sltu	$at,$c_1,$t_2
78ef1d
-	$ADDU	$c_2,$at
78ef1d
+___
78ef1d
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
78ef1d
+		$a_6,$a_2);		# mul_add_c2(a[6],b[2],c3,c1,c2);
78ef1d
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
78ef1d
+		$a_5,$a_3);		# mul_add_c2(a[5],b[3],c3,c1,c2);
78ef1d
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
78ef1d
+		$a_4,$a_4);		# mul_add_c(a[4],b[4],c3,c1,c2);
78ef1d
+$code.=<<___;
78ef1d
 	mflo	$t_1
78ef1d
 	mfhi	$t_2
78ef1d
 	$ADDU	$c_3,$t_1
78ef1d
@@ -2233,82 +2048,21 @@ $code.=<<___;
78ef1d
 	sltu	$at,$c_1,$t_2
78ef1d
 	$ADDU	$c_2,$at
78ef1d
 	$ST	$c_3,8*$BNSZ($a0)
78ef1d
-
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$c_3,$t_2,$zero
78ef1d
-	$SLL	$t_2,1
78ef1d
-	$MULTU	$a_3,$a_6		# mul_add_c2(a[3],b[6],c1,c2,c3);
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_1,$t_1
78ef1d
-	sltu	$at,$c_1,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_2,$t_2
78ef1d
-	sltu	$at,$c_2,$t_2
78ef1d
-	$ADDU	$c_3,$at
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$at,$t_2,$zero
78ef1d
-	$ADDU	$c_3,$at
78ef1d
-	$MULTU	$a_4,$a_5		# mul_add_c2(a[4],b[5],c1,c2,c3);
78ef1d
-	$SLL	$t_2,1
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_1,$t_1
78ef1d
-	sltu	$at,$c_1,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_2,$t_2
78ef1d
-	sltu	$at,$c_2,$t_2
78ef1d
-	$ADDU	$c_3,$at
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$at,$t_2,$zero
78ef1d
-	$ADDU	$c_3,$at
78ef1d
-	 $MULTU	$a_7,$a_3		# mul_add_c2(a[7],b[3],c2,c3,c1);
78ef1d
-	$SLL	$t_2,1
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_1,$t_1
78ef1d
-	sltu	$at,$c_1,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_2,$t_2
78ef1d
-	sltu	$at,$c_2,$t_2
78ef1d
-	$ADDU	$c_3,$at
78ef1d
+___
78ef1d
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
78ef1d
+		$a_3,$a_6);		# mul_add_c2(a[3],b[6],c1,c2,c3);
78ef1d
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
78ef1d
+		$a_4,$a_5);		# mul_add_c2(a[4],b[5],c1,c2,c3);
78ef1d
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
78ef1d
+		$a_7,$a_3);		# mul_add_c2(a[7],b[3],c2,c3,c1);
78ef1d
+$code.=<<___;
78ef1d
 	$ST	$c_1,9*$BNSZ($a0)
78ef1d
-
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$c_1,$t_2,$zero
78ef1d
-	$SLL	$t_2,1
78ef1d
-	$MULTU	$a_6,$a_4		# mul_add_c2(a[6],b[4],c2,c3,c1);
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_2,$t_1
78ef1d
-	sltu	$at,$c_2,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_3,$t_2
78ef1d
-	sltu	$at,$c_3,$t_2
78ef1d
-	$ADDU	$c_1,$at
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$at,$t_2,$zero
78ef1d
-	$ADDU	$c_1,$at
78ef1d
-	$MULTU	$a_5,$a_5		# mul_add_c(a[5],b[5],c2,c3,c1);
78ef1d
-	$SLL	$t_2,1
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_2,$t_1
78ef1d
-	sltu	$at,$c_2,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_3,$t_2
78ef1d
-	sltu	$at,$c_3,$t_2
78ef1d
-	$ADDU	$c_1,$at
78ef1d
+___
78ef1d
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
78ef1d
+		$a_6,$a_4);		# mul_add_c2(a[6],b[4],c2,c3,c1);
78ef1d
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
78ef1d
+		$a_5,$a_5);		# mul_add_c(a[5],b[5],c2,c3,c1);
78ef1d
+$code.=<<___;
78ef1d
 	mflo	$t_1
78ef1d
 	mfhi	$t_2
78ef1d
 	$ADDU	$c_2,$t_1
78ef1d
@@ -2319,52 +2073,17 @@ $code.=<<___;
78ef1d
 	sltu	$at,$c_3,$t_2
78ef1d
 	$ADDU	$c_1,$at
78ef1d
 	$ST	$c_2,10*$BNSZ($a0)
78ef1d
-
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$c_2,$t_2,$zero
78ef1d
-	$SLL	$t_2,1
78ef1d
-	$MULTU	$a_5,$a_6		# mul_add_c2(a[5],b[6],c3,c1,c2);
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_3,$t_1
78ef1d
-	sltu	$at,$c_3,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_1,$t_2
78ef1d
-	sltu	$at,$c_1,$t_2
78ef1d
-	$ADDU	$c_2,$at
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$at,$t_2,$zero
78ef1d
-	$ADDU	$c_2,$at
78ef1d
-	 $MULTU	$a_7,$a_5		# mul_add_c2(a[7],b[5],c1,c2,c3);
78ef1d
-	$SLL	$t_2,1
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_3,$t_1
78ef1d
-	sltu	$at,$c_3,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_1,$t_2
78ef1d
-	sltu	$at,$c_1,$t_2
78ef1d
-	$ADDU	$c_2,$at
78ef1d
+___
78ef1d
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
78ef1d
+		$a_5,$a_6);		# mul_add_c2(a[5],b[6],c3,c1,c2);
78ef1d
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
78ef1d
+		$a_7,$a_5);		# mul_add_c2(a[7],b[5],c1,c2,c3);
78ef1d
+$code.=<<___;
78ef1d
 	$ST	$c_3,11*$BNSZ($a0)
78ef1d
-
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$c_3,$t_2,$zero
78ef1d
-	$SLL	$t_2,1
78ef1d
-	$MULTU	$a_6,$a_6		# mul_add_c(a[6],b[6],c1,c2,c3);
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_1,$t_1
78ef1d
-	sltu	$at,$c_1,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_2,$t_2
78ef1d
-	sltu	$at,$c_2,$t_2
78ef1d
-	$ADDU	$c_3,$at
78ef1d
+___
78ef1d
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
78ef1d
+		$a_6,$a_6);		# mul_add_c(a[6],b[6],c1,c2,c3);
78ef1d
+$code.=<<___;
78ef1d
 	mflo	$t_1
78ef1d
 	mfhi	$t_2
78ef1d
 	$ADDU	$c_1,$t_1
78ef1d
@@ -2375,21 +2094,10 @@ $code.=<<___;
78ef1d
 	sltu	$at,$c_2,$t_2
78ef1d
 	$ADDU	$c_3,$at
78ef1d
 	$ST	$c_1,12*$BNSZ($a0)
78ef1d
-
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$c_1,$t_2,$zero
78ef1d
-	$SLL	$t_2,1
78ef1d
-	 $MULTU	$a_7,$a_7		# mul_add_c(a[7],b[7],c3,c1,c2);
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_2,$t_1
78ef1d
-	sltu	$at,$c_2,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_3,$t_2
78ef1d
-	sltu	$at,$c_3,$t_2
78ef1d
-	$ADDU	$c_1,$at
78ef1d
+___
78ef1d
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
78ef1d
+		$a_7,$a_7);		# mul_add_c(a[7],b[7],c3,c1,c2);
78ef1d
+$code.=<<___;
78ef1d
 	$ST	$c_2,13*$BNSZ($a0)
78ef1d
 
78ef1d
 	mflo	$t_1
78ef1d
@@ -2457,21 +2165,10 @@ $code.=<<___;
78ef1d
 	sltu	$at,$c_2,$t_1
78ef1d
 	$ADDU	$c_3,$t_2,$at
78ef1d
 	$ST	$c_2,$BNSZ($a0)
78ef1d
-
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$c_2,$t_2,$zero
78ef1d
-	$SLL	$t_2,1
78ef1d
-	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_3,$t_1
78ef1d
-	sltu	$at,$c_3,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_1,$t_2
78ef1d
-	sltu	$at,$c_1,$t_2
78ef1d
-	$ADDU	$c_2,$at
78ef1d
+___
78ef1d
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
78ef1d
+		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
78ef1d
+$code.=<<___;
78ef1d
 	mflo	$t_1
78ef1d
 	mfhi	$t_2
78ef1d
 	$ADDU	$c_3,$t_1
78ef1d
@@ -2482,52 +2179,17 @@ $code.=<<___;
78ef1d
 	sltu	$at,$c_1,$t_2
78ef1d
 	$ADDU	$c_2,$at
78ef1d
 	$ST	$c_3,2*$BNSZ($a0)
78ef1d
-
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$c_3,$t_2,$zero
78ef1d
-	$SLL	$t_2,1
78ef1d
-	$MULTU	$a_1,$a_2		# mul_add_c(a2[1],b[2],c1,c2,c3);
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_1,$t_1
78ef1d
-	sltu	$at,$c_1,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_2,$t_2
78ef1d
-	sltu	$at,$c_2,$t_2
78ef1d
-	$ADDU	$c_3,$at
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$at,$t_2,$zero
78ef1d
-	$ADDU	$c_3,$at
78ef1d
-	 $MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
78ef1d
-	$SLL	$t_2,1
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_1,$t_1
78ef1d
-	sltu	$at,$c_1,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_2,$t_2
78ef1d
-	sltu	$at,$c_2,$t_2
78ef1d
-	$ADDU	$c_3,$at
78ef1d
+___
78ef1d
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
78ef1d
+		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
78ef1d
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
78ef1d
+		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
78ef1d
+$code.=<<___;
78ef1d
 	$ST	$c_1,3*$BNSZ($a0)
78ef1d
-
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$c_1,$t_2,$zero
78ef1d
-	$SLL	$t_2,1
78ef1d
-	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_2,$t_1
78ef1d
-	sltu	$at,$c_2,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_3,$t_2
78ef1d
-	sltu	$at,$c_3,$t_2
78ef1d
-	$ADDU	$c_1,$at
78ef1d
+___
78ef1d
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
78ef1d
+		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
78ef1d
+$code.=<<___;
78ef1d
 	mflo	$t_1
78ef1d
 	mfhi	$t_2
78ef1d
 	$ADDU	$c_2,$t_1
78ef1d
@@ -2538,21 +2200,10 @@ $code.=<<___;
78ef1d
 	sltu	$at,$c_3,$t_2
78ef1d
 	$ADDU	$c_1,$at
78ef1d
 	$ST	$c_2,4*$BNSZ($a0)
78ef1d
-
78ef1d
-	mflo	$t_1
78ef1d
-	mfhi	$t_2
78ef1d
-	slt	$c_2,$t_2,$zero
78ef1d
-	$SLL	$t_2,1
78ef1d
-	 $MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
78ef1d
-	slt	$a2,$t_1,$zero
78ef1d
-	$ADDU	$t_2,$a2
78ef1d
-	$SLL	$t_1,1
78ef1d
-	$ADDU	$c_3,$t_1
78ef1d
-	sltu	$at,$c_3,$t_1
78ef1d
-	$ADDU	$t_2,$at
78ef1d
-	$ADDU	$c_1,$t_2
78ef1d
-	sltu	$at,$c_1,$t_2
78ef1d
-	$ADDU	$c_2,$at
78ef1d
+___
78ef1d
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
78ef1d
+		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
78ef1d
+$code.=<<___;
78ef1d
 	$ST	$c_3,5*$BNSZ($a0)
78ef1d
 
78ef1d
 	mflo	$t_1
78ef1d
diff --git a/crypto/bn/asm/mips3.s b/crypto/bn/asm/mips3.s
78ef1d
deleted file mode 100644
78ef1d
index dca4105..0000000
78ef1d
--- a/crypto/bn/asm/mips3.s
78ef1d
+++ /dev/null
78ef1d
@@ -1,2201 +0,0 @@
78ef1d
-.rdata
78ef1d
-.asciiz	"mips3.s, Version 1.1"
78ef1d
-.asciiz	"MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
78ef1d
-
78ef1d
-/*
78ef1d
- * ====================================================================
78ef1d
- * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
78ef1d
- * project.
78ef1d
- *
78ef1d
- * Rights for redistribution and usage in source and binary forms are
78ef1d
- * granted according to the OpenSSL license. Warranty of any kind is
78ef1d
- * disclaimed.
78ef1d
- * ====================================================================
78ef1d
- */
78ef1d
-
78ef1d
-/*
78ef1d
- * This is my modest contributon to the OpenSSL project (see
78ef1d
- * http://www.openssl.org/ for more information about it) and is
78ef1d
- * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c
78ef1d
- * module. For updates see http://fy.chalmers.se/~appro/hpe/.
78ef1d
- *
78ef1d
- * The module is designed to work with either of the "new" MIPS ABI(5),
78ef1d
- * namely N32 or N64, offered by IRIX 6.x. It's not ment to work under
78ef1d
- * IRIX 5.x not only because it doesn't support new ABIs but also
78ef1d
- * because 5.x kernels put R4x00 CPU into 32-bit mode and all those
78ef1d
- * 64-bit instructions (daddu, dmultu, etc.) found below gonna only
78ef1d
- * cause illegal instruction exception:-(
78ef1d
- *
78ef1d
- * In addition the code depends on preprocessor flags set up by MIPSpro
78ef1d
- * compiler driver (either as or cc) and therefore (probably?) can't be
78ef1d
- * compiled by the GNU assembler. GNU C driver manages fine though...
78ef1d
- * I mean as long as -mmips-as is specified or is the default option,
78ef1d
- * because then it simply invokes /usr/bin/as which in turn takes
78ef1d
- * perfect care of the preprocessor definitions. Another neat feature
78ef1d
- * offered by the MIPSpro assembler is an optimization pass. This gave
78ef1d
- * me the opportunity to have the code looking more regular as all those
78ef1d
- * architecture dependent instruction rescheduling details were left to
78ef1d
- * the assembler. Cool, huh?
78ef1d
- *
78ef1d
- * Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
78ef1d
- * goes way over 3 times faster!
78ef1d
- *
78ef1d
- *					<appro@fy.chalmers.se>
78ef1d
- */
78ef1d
-#include <asm.h>
78ef1d
-#include <regdef.h>
78ef1d
-
78ef1d
-#if _MIPS_ISA>=4
78ef1d
-#define	MOVNZ(cond,dst,src)	\
78ef1d
-	movn	dst,src,cond
78ef1d
-#else
78ef1d
-#define	MOVNZ(cond,dst,src)	\
78ef1d
-	.set	noreorder;	\
78ef1d
-	bnezl	cond,.+8;	\
78ef1d
-	move	dst,src;	\
78ef1d
-	.set	reorder
78ef1d
-#endif
78ef1d
-
78ef1d
-.text
78ef1d
-
78ef1d
-.set	noat
78ef1d
-.set	reorder
78ef1d
-
78ef1d
-#define	MINUS4	v1
78ef1d
-
78ef1d
-.align	5
78ef1d
-LEAF(bn_mul_add_words)
78ef1d
-	.set	noreorder
78ef1d
-	bgtzl	a2,.L_bn_mul_add_words_proceed
78ef1d
-	ld	t0,0(a1)
78ef1d
-	jr	ra
78ef1d
-	move	v0,zero
78ef1d
-	.set	reorder
78ef1d
-
78ef1d
-.L_bn_mul_add_words_proceed:
78ef1d
-	li	MINUS4,-4
78ef1d
-	and	ta0,a2,MINUS4
78ef1d
-	move	v0,zero
78ef1d
-	beqz	ta0,.L_bn_mul_add_words_tail
78ef1d
-
78ef1d
-.L_bn_mul_add_words_loop:
78ef1d
-	dmultu	t0,a3
78ef1d
-	ld	t1,0(a0)
78ef1d
-	ld	t2,8(a1)
78ef1d
-	ld	t3,8(a0)
78ef1d
-	ld	ta0,16(a1)
78ef1d
-	ld	ta1,16(a0)
78ef1d
-	daddu	t1,v0
78ef1d
-	sltu	v0,t1,v0	/* All manuals say it "compares 32-bit
78ef1d
-				 * values", but it seems to work fine
78ef1d
-				 * even on 64-bit registers. */
78ef1d
-	mflo	AT
78ef1d
-	mfhi	t0
78ef1d
-	daddu	t1,AT
78ef1d
-	daddu	v0,t0
78ef1d
-	sltu	AT,t1,AT
78ef1d
-	sd	t1,0(a0)
78ef1d
-	daddu	v0,AT
78ef1d
-
78ef1d
-	dmultu	t2,a3
78ef1d
-	ld	ta2,24(a1)
78ef1d
-	ld	ta3,24(a0)
78ef1d
-	daddu	t3,v0
78ef1d
-	sltu	v0,t3,v0
78ef1d
-	mflo	AT
78ef1d
-	mfhi	t2
78ef1d
-	daddu	t3,AT
78ef1d
-	daddu	v0,t2
78ef1d
-	sltu	AT,t3,AT
78ef1d
-	sd	t3,8(a0)
78ef1d
-	daddu	v0,AT
78ef1d
-
78ef1d
-	dmultu	ta0,a3
78ef1d
-	subu	a2,4
78ef1d
-	PTR_ADD	a0,32
78ef1d
-	PTR_ADD	a1,32
78ef1d
-	daddu	ta1,v0
78ef1d
-	sltu	v0,ta1,v0
78ef1d
-	mflo	AT
78ef1d
-	mfhi	ta0
78ef1d
-	daddu	ta1,AT
78ef1d
-	daddu	v0,ta0
78ef1d
-	sltu	AT,ta1,AT
78ef1d
-	sd	ta1,-16(a0)
78ef1d
-	daddu	v0,AT
78ef1d
-
78ef1d
-
78ef1d
-	dmultu	ta2,a3
78ef1d
-	and	ta0,a2,MINUS4
78ef1d
-	daddu	ta3,v0
78ef1d
-	sltu	v0,ta3,v0
78ef1d
-	mflo	AT
78ef1d
-	mfhi	ta2
78ef1d
-	daddu	ta3,AT
78ef1d
-	daddu	v0,ta2
78ef1d
-	sltu	AT,ta3,AT
78ef1d
-	sd	ta3,-8(a0)
78ef1d
-	daddu	v0,AT
78ef1d
-	.set	noreorder
78ef1d
-	bgtzl	ta0,.L_bn_mul_add_words_loop
78ef1d
-	ld	t0,0(a1)
78ef1d
-
78ef1d
-	bnezl	a2,.L_bn_mul_add_words_tail
78ef1d
-	ld	t0,0(a1)
78ef1d
-	.set	reorder
78ef1d
-
78ef1d
-.L_bn_mul_add_words_return:
78ef1d
-	jr	ra
78ef1d
-
78ef1d
-.L_bn_mul_add_words_tail:
78ef1d
-	dmultu	t0,a3
78ef1d
-	ld	t1,0(a0)
78ef1d
-	subu	a2,1
78ef1d
-	daddu	t1,v0
78ef1d
-	sltu	v0,t1,v0
78ef1d
-	mflo	AT
78ef1d
-	mfhi	t0
78ef1d
-	daddu	t1,AT
78ef1d
-	daddu	v0,t0
78ef1d
-	sltu	AT,t1,AT
78ef1d
-	sd	t1,0(a0)
78ef1d
-	daddu	v0,AT
78ef1d
-	beqz	a2,.L_bn_mul_add_words_return
78ef1d
-
78ef1d
-	ld	t0,8(a1)
78ef1d
-	dmultu	t0,a3
78ef1d
-	ld	t1,8(a0)
78ef1d
-	subu	a2,1
78ef1d
-	daddu	t1,v0
78ef1d
-	sltu	v0,t1,v0
78ef1d
-	mflo	AT
78ef1d
-	mfhi	t0
78ef1d
-	daddu	t1,AT
78ef1d
-	daddu	v0,t0
78ef1d
-	sltu	AT,t1,AT
78ef1d
-	sd	t1,8(a0)
78ef1d
-	daddu	v0,AT
78ef1d
-	beqz	a2,.L_bn_mul_add_words_return
78ef1d
-
78ef1d
-	ld	t0,16(a1)
78ef1d
-	dmultu	t0,a3
78ef1d
-	ld	t1,16(a0)
78ef1d
-	daddu	t1,v0
78ef1d
-	sltu	v0,t1,v0
78ef1d
-	mflo	AT
78ef1d
-	mfhi	t0
78ef1d
-	daddu	t1,AT
78ef1d
-	daddu	v0,t0
78ef1d
-	sltu	AT,t1,AT
78ef1d
-	sd	t1,16(a0)
78ef1d
-	daddu	v0,AT
78ef1d
-	jr	ra
78ef1d
-END(bn_mul_add_words)
78ef1d
-
78ef1d
-.align	5
78ef1d
-LEAF(bn_mul_words)
78ef1d
-	.set	noreorder
78ef1d
-	bgtzl	a2,.L_bn_mul_words_proceed
78ef1d
-	ld	t0,0(a1)
78ef1d
-	jr	ra
78ef1d
-	move	v0,zero
78ef1d
-	.set	reorder
78ef1d
-
78ef1d
-.L_bn_mul_words_proceed:
78ef1d
-	li	MINUS4,-4
78ef1d
-	and	ta0,a2,MINUS4
78ef1d
-	move	v0,zero
78ef1d
-	beqz	ta0,.L_bn_mul_words_tail
78ef1d
-
78ef1d
-.L_bn_mul_words_loop:
78ef1d
-	dmultu	t0,a3
78ef1d
-	ld	t2,8(a1)
78ef1d
-	ld	ta0,16(a1)
78ef1d
-	ld	ta2,24(a1)
78ef1d
-	mflo	AT
78ef1d
-	mfhi	t0
78ef1d
-	daddu	v0,AT
78ef1d
-	sltu	t1,v0,AT
78ef1d
-	sd	v0,0(a0)
78ef1d
-	daddu	v0,t1,t0
78ef1d
-
78ef1d
-	dmultu	t2,a3
78ef1d
-	subu	a2,4
78ef1d
-	PTR_ADD	a0,32
78ef1d
-	PTR_ADD	a1,32
78ef1d
-	mflo	AT
78ef1d
-	mfhi	t2
78ef1d
-	daddu	v0,AT
78ef1d
-	sltu	t3,v0,AT
78ef1d
-	sd	v0,-24(a0)
78ef1d
-	daddu	v0,t3,t2
78ef1d
-
78ef1d
-	dmultu	ta0,a3
78ef1d
-	mflo	AT
78ef1d
-	mfhi	ta0
78ef1d
-	daddu	v0,AT
78ef1d
-	sltu	ta1,v0,AT
78ef1d
-	sd	v0,-16(a0)
78ef1d
-	daddu	v0,ta1,ta0
78ef1d
-
78ef1d
-
78ef1d
-	dmultu	ta2,a3
78ef1d
-	and	ta0,a2,MINUS4
78ef1d
-	mflo	AT
78ef1d
-	mfhi	ta2
78ef1d
-	daddu	v0,AT
78ef1d
-	sltu	ta3,v0,AT
78ef1d
-	sd	v0,-8(a0)
78ef1d
-	daddu	v0,ta3,ta2
78ef1d
-	.set	noreorder
78ef1d
-	bgtzl	ta0,.L_bn_mul_words_loop
78ef1d
-	ld	t0,0(a1)
78ef1d
-
78ef1d
-	bnezl	a2,.L_bn_mul_words_tail
78ef1d
-	ld	t0,0(a1)
78ef1d
-	.set	reorder
78ef1d
-
78ef1d
-.L_bn_mul_words_return:
78ef1d
-	jr	ra
78ef1d
-
78ef1d
-.L_bn_mul_words_tail:
78ef1d
-	dmultu	t0,a3
78ef1d
-	subu	a2,1
78ef1d
-	mflo	AT
78ef1d
-	mfhi	t0
78ef1d
-	daddu	v0,AT
78ef1d
-	sltu	t1,v0,AT
78ef1d
-	sd	v0,0(a0)
78ef1d
-	daddu	v0,t1,t0
78ef1d
-	beqz	a2,.L_bn_mul_words_return
78ef1d
-
78ef1d
-	ld	t0,8(a1)
78ef1d
-	dmultu	t0,a3
78ef1d
-	subu	a2,1
78ef1d
-	mflo	AT
78ef1d
-	mfhi	t0
78ef1d
-	daddu	v0,AT
78ef1d
-	sltu	t1,v0,AT
78ef1d
-	sd	v0,8(a0)
78ef1d
-	daddu	v0,t1,t0
78ef1d
-	beqz	a2,.L_bn_mul_words_return
78ef1d
-
78ef1d
-	ld	t0,16(a1)
78ef1d
-	dmultu	t0,a3
78ef1d
-	mflo	AT
78ef1d
-	mfhi	t0
78ef1d
-	daddu	v0,AT
78ef1d
-	sltu	t1,v0,AT
78ef1d
-	sd	v0,16(a0)
78ef1d
-	daddu	v0,t1,t0
78ef1d
-	jr	ra
78ef1d
-END(bn_mul_words)
78ef1d
-
78ef1d
-.align	5
78ef1d
-LEAF(bn_sqr_words)
78ef1d
-	.set	noreorder
78ef1d
-	bgtzl	a2,.L_bn_sqr_words_proceed
78ef1d
-	ld	t0,0(a1)
78ef1d
-	jr	ra
78ef1d
-	move	v0,zero
78ef1d
-	.set	reorder
78ef1d
-
78ef1d
-.L_bn_sqr_words_proceed:
78ef1d
-	li	MINUS4,-4
78ef1d
-	and	ta0,a2,MINUS4
78ef1d
-	move	v0,zero
78ef1d
-	beqz	ta0,.L_bn_sqr_words_tail
78ef1d
-
78ef1d
-.L_bn_sqr_words_loop:
78ef1d
-	dmultu	t0,t0
78ef1d
-	ld	t2,8(a1)
78ef1d
-	ld	ta0,16(a1)
78ef1d
-	ld	ta2,24(a1)
78ef1d
-	mflo	t1
78ef1d
-	mfhi	t0
78ef1d
-	sd	t1,0(a0)
78ef1d
-	sd	t0,8(a0)
78ef1d
-
78ef1d
-	dmultu	t2,t2
78ef1d
-	subu	a2,4
78ef1d
-	PTR_ADD	a0,64
78ef1d
-	PTR_ADD	a1,32
78ef1d
-	mflo	t3
78ef1d
-	mfhi	t2
78ef1d
-	sd	t3,-48(a0)
78ef1d
-	sd	t2,-40(a0)
78ef1d
-
78ef1d
-	dmultu	ta0,ta0
78ef1d
-	mflo	ta1
78ef1d
-	mfhi	ta0
78ef1d
-	sd	ta1,-32(a0)
78ef1d
-	sd	ta0,-24(a0)
78ef1d
-
78ef1d
-
78ef1d
-	dmultu	ta2,ta2
78ef1d
-	and	ta0,a2,MINUS4
78ef1d
-	mflo	ta3
78ef1d
-	mfhi	ta2
78ef1d
-	sd	ta3,-16(a0)
78ef1d
-	sd	ta2,-8(a0)
78ef1d
-
78ef1d
-	.set	noreorder
78ef1d
-	bgtzl	ta0,.L_bn_sqr_words_loop
78ef1d
-	ld	t0,0(a1)
78ef1d
-
78ef1d
-	bnezl	a2,.L_bn_sqr_words_tail
78ef1d
-	ld	t0,0(a1)
78ef1d
-	.set	reorder
78ef1d
-
78ef1d
-.L_bn_sqr_words_return:
78ef1d
-	move	v0,zero
78ef1d
-	jr	ra
78ef1d
-
78ef1d
-.L_bn_sqr_words_tail:
78ef1d
-	dmultu	t0,t0
78ef1d
-	subu	a2,1
78ef1d
-	mflo	t1
78ef1d
-	mfhi	t0
78ef1d
-	sd	t1,0(a0)
78ef1d
-	sd	t0,8(a0)
78ef1d
-	beqz	a2,.L_bn_sqr_words_return
78ef1d
-
78ef1d
-	ld	t0,8(a1)
78ef1d
-	dmultu	t0,t0
78ef1d
-	subu	a2,1
78ef1d
-	mflo	t1
78ef1d
-	mfhi	t0
78ef1d
-	sd	t1,16(a0)
78ef1d
-	sd	t0,24(a0)
78ef1d
-	beqz	a2,.L_bn_sqr_words_return
78ef1d
-
78ef1d
-	ld	t0,16(a1)
78ef1d
-	dmultu	t0,t0
78ef1d
-	mflo	t1
78ef1d
-	mfhi	t0
78ef1d
-	sd	t1,32(a0)
78ef1d
-	sd	t0,40(a0)
78ef1d
-	jr	ra
78ef1d
-END(bn_sqr_words)
78ef1d
-
78ef1d
-.align	5
78ef1d
-LEAF(bn_add_words)
78ef1d
-	.set	noreorder
78ef1d
-	bgtzl	a3,.L_bn_add_words_proceed
78ef1d
-	ld	t0,0(a1)
78ef1d
-	jr	ra
78ef1d
-	move	v0,zero
78ef1d
-	.set	reorder
78ef1d
-
78ef1d
-.L_bn_add_words_proceed:
78ef1d
-	li	MINUS4,-4
78ef1d
-	and	AT,a3,MINUS4
78ef1d
-	move	v0,zero
78ef1d
-	beqz	AT,.L_bn_add_words_tail
78ef1d
-
78ef1d
-.L_bn_add_words_loop:
78ef1d
-	ld	ta0,0(a2)
78ef1d
-	subu	a3,4
78ef1d
-	ld	t1,8(a1)
78ef1d
-	and	AT,a3,MINUS4
78ef1d
-	ld	t2,16(a1)
78ef1d
-	PTR_ADD	a2,32
78ef1d
-	ld	t3,24(a1)
78ef1d
-	PTR_ADD	a0,32
78ef1d
-	ld	ta1,-24(a2)
78ef1d
-	PTR_ADD	a1,32
78ef1d
-	ld	ta2,-16(a2)
78ef1d
-	ld	ta3,-8(a2)
78ef1d
-	daddu	ta0,t0
78ef1d
-	sltu	t8,ta0,t0
78ef1d
-	daddu	t0,ta0,v0
78ef1d
-	sltu	v0,t0,ta0
78ef1d
-	sd	t0,-32(a0)
78ef1d
-	daddu	v0,t8
78ef1d
-
78ef1d
-	daddu	ta1,t1
78ef1d
-	sltu	t9,ta1,t1
78ef1d
-	daddu	t1,ta1,v0
78ef1d
-	sltu	v0,t1,ta1
78ef1d
-	sd	t1,-24(a0)
78ef1d
-	daddu	v0,t9
78ef1d
-
78ef1d
-	daddu	ta2,t2
78ef1d
-	sltu	t8,ta2,t2
78ef1d
-	daddu	t2,ta2,v0
78ef1d
-	sltu	v0,t2,ta2
78ef1d
-	sd	t2,-16(a0)
78ef1d
-	daddu	v0,t8
78ef1d
-	
78ef1d
-	daddu	ta3,t3
78ef1d
-	sltu	t9,ta3,t3
78ef1d
-	daddu	t3,ta3,v0
78ef1d
-	sltu	v0,t3,ta3
78ef1d
-	sd	t3,-8(a0)
78ef1d
-	daddu	v0,t9
78ef1d
-	
78ef1d
-	.set	noreorder
78ef1d
-	bgtzl	AT,.L_bn_add_words_loop
78ef1d
-	ld	t0,0(a1)
78ef1d
-
78ef1d
-	bnezl	a3,.L_bn_add_words_tail
78ef1d
-	ld	t0,0(a1)
78ef1d
-	.set	reorder
78ef1d
-
78ef1d
-.L_bn_add_words_return:
78ef1d
-	jr	ra
78ef1d
-
78ef1d
-.L_bn_add_words_tail:
78ef1d
-	ld	ta0,0(a2)
78ef1d
-	daddu	ta0,t0
78ef1d
-	subu	a3,1
78ef1d
-	sltu	t8,ta0,t0
78ef1d
-	daddu	t0,ta0,v0
78ef1d
-	sltu	v0,t0,ta0
78ef1d
-	sd	t0,0(a0)
78ef1d
-	daddu	v0,t8
78ef1d
-	beqz	a3,.L_bn_add_words_return
78ef1d
-
78ef1d
-	ld	t1,8(a1)
78ef1d
-	ld	ta1,8(a2)
78ef1d
-	daddu	ta1,t1
78ef1d
-	subu	a3,1
78ef1d
-	sltu	t9,ta1,t1
78ef1d
-	daddu	t1,ta1,v0
78ef1d
-	sltu	v0,t1,ta1
78ef1d
-	sd	t1,8(a0)
78ef1d
-	daddu	v0,t9
78ef1d
-	beqz	a3,.L_bn_add_words_return
78ef1d
-
78ef1d
-	ld	t2,16(a1)
78ef1d
-	ld	ta2,16(a2)
78ef1d
-	daddu	ta2,t2
78ef1d
-	sltu	t8,ta2,t2
78ef1d
-	daddu	t2,ta2,v0
78ef1d
-	sltu	v0,t2,ta2
78ef1d
-	sd	t2,16(a0)
78ef1d
-	daddu	v0,t8
78ef1d
-	jr	ra
78ef1d
-END(bn_add_words)
78ef1d
-
78ef1d
-.align	5
78ef1d
-LEAF(bn_sub_words)
78ef1d
-	.set	noreorder
78ef1d
-	bgtzl	a3,.L_bn_sub_words_proceed
78ef1d
-	ld	t0,0(a1)
78ef1d
-	jr	ra
78ef1d
-	move	v0,zero
78ef1d
-	.set	reorder
78ef1d
-
78ef1d
-.L_bn_sub_words_proceed:
78ef1d
-	li	MINUS4,-4
78ef1d
-	and	AT,a3,MINUS4
78ef1d
-	move	v0,zero
78ef1d
-	beqz	AT,.L_bn_sub_words_tail
78ef1d
-
78ef1d
-.L_bn_sub_words_loop:
78ef1d
-	ld	ta0,0(a2)
78ef1d
-	subu	a3,4
78ef1d
-	ld	t1,8(a1)
78ef1d
-	and	AT,a3,MINUS4
78ef1d
-	ld	t2,16(a1)
78ef1d
-	PTR_ADD	a2,32
78ef1d
-	ld	t3,24(a1)
78ef1d
-	PTR_ADD	a0,32
78ef1d
-	ld	ta1,-24(a2)
78ef1d
-	PTR_ADD	a1,32
78ef1d
-	ld	ta2,-16(a2)
78ef1d
-	ld	ta3,-8(a2)
78ef1d
-	sltu	t8,t0,ta0
78ef1d
-	dsubu	t0,ta0
78ef1d
-	dsubu	ta0,t0,v0
78ef1d
-	sd	ta0,-32(a0)
78ef1d
-	MOVNZ	(t0,v0,t8)
78ef1d
-
78ef1d
-	sltu	t9,t1,ta1
78ef1d
-	dsubu	t1,ta1
78ef1d
-	dsubu	ta1,t1,v0
78ef1d
-	sd	ta1,-24(a0)
78ef1d
-	MOVNZ	(t1,v0,t9)
78ef1d
-
78ef1d
-
78ef1d
-	sltu	t8,t2,ta2
78ef1d
-	dsubu	t2,ta2
78ef1d
-	dsubu	ta2,t2,v0
78ef1d
-	sd	ta2,-16(a0)
78ef1d
-	MOVNZ	(t2,v0,t8)
78ef1d
-
78ef1d
-	sltu	t9,t3,ta3
78ef1d
-	dsubu	t3,ta3
78ef1d
-	dsubu	ta3,t3,v0
78ef1d
-	sd	ta3,-8(a0)
78ef1d
-	MOVNZ	(t3,v0,t9)
78ef1d
-
78ef1d
-	.set	noreorder
78ef1d
-	bgtzl	AT,.L_bn_sub_words_loop
78ef1d
-	ld	t0,0(a1)
78ef1d
-
78ef1d
-	bnezl	a3,.L_bn_sub_words_tail
78ef1d
-	ld	t0,0(a1)
78ef1d
-	.set	reorder
78ef1d
-
78ef1d
-.L_bn_sub_words_return:
78ef1d
-	jr	ra
78ef1d
-
78ef1d
-.L_bn_sub_words_tail:
78ef1d
-	ld	ta0,0(a2)
78ef1d
-	subu	a3,1
78ef1d
-	sltu	t8,t0,ta0
78ef1d
-	dsubu	t0,ta0
78ef1d
-	dsubu	ta0,t0,v0
78ef1d
-	MOVNZ	(t0,v0,t8)
78ef1d
-	sd	ta0,0(a0)
78ef1d
-	beqz	a3,.L_bn_sub_words_return
78ef1d
-
78ef1d
-	ld	t1,8(a1)
78ef1d
-	subu	a3,1
78ef1d
-	ld	ta1,8(a2)
78ef1d
-	sltu	t9,t1,ta1
78ef1d
-	dsubu	t1,ta1
78ef1d
-	dsubu	ta1,t1,v0
78ef1d
-	MOVNZ	(t1,v0,t9)
78ef1d
-	sd	ta1,8(a0)
78ef1d
-	beqz	a3,.L_bn_sub_words_return
78ef1d
-
78ef1d
-	ld	t2,16(a1)
78ef1d
-	ld	ta2,16(a2)
78ef1d
-	sltu	t8,t2,ta2
78ef1d
-	dsubu	t2,ta2
78ef1d
-	dsubu	ta2,t2,v0
78ef1d
-	MOVNZ	(t2,v0,t8)
78ef1d
-	sd	ta2,16(a0)
78ef1d
-	jr	ra
78ef1d
-END(bn_sub_words)
78ef1d
-
78ef1d
-#undef	MINUS4
78ef1d
-
78ef1d
-.align 5
78ef1d
-LEAF(bn_div_3_words)
78ef1d
-	.set	reorder
78ef1d
-	move	a3,a0		/* we know that bn_div_words doesn't
78ef1d
-				 * touch a3, ta2, ta3 and preserves a2
78ef1d
-				 * so that we can save two arguments
78ef1d
-				 * and return address in registers
78ef1d
-				 * instead of stack:-)
78ef1d
-				 */
78ef1d
-	ld	a0,(a3)
78ef1d
-	move	ta2,a1
78ef1d
-	ld	a1,-8(a3)
78ef1d
-	bne	a0,a2,.L_bn_div_3_words_proceed
78ef1d
-	li	v0,-1
78ef1d
-	jr	ra
78ef1d
-.L_bn_div_3_words_proceed:
78ef1d
-	move	ta3,ra
78ef1d
-	bal	bn_div_words
78ef1d
-	move	ra,ta3
78ef1d
-	dmultu	ta2,v0
78ef1d
-	ld	t2,-16(a3)
78ef1d
-	move	ta0,zero
78ef1d
-	mfhi	t1
78ef1d
-	mflo	t0
78ef1d
-	sltu	t8,t1,v1
78ef1d
-.L_bn_div_3_words_inner_loop:
78ef1d
-	bnez	t8,.L_bn_div_3_words_inner_loop_done
78ef1d
-	sgeu	AT,t2,t0
78ef1d
-	seq	t9,t1,v1
78ef1d
-	and	AT,t9
78ef1d
-	sltu	t3,t0,ta2
78ef1d
-	daddu	v1,a2
78ef1d
-	dsubu	t1,t3
78ef1d
-	dsubu	t0,ta2
78ef1d
-	sltu	t8,t1,v1
78ef1d
-	sltu	ta0,v1,a2
78ef1d
-	or	t8,ta0
78ef1d
-	.set	noreorder
78ef1d
-	beqzl	AT,.L_bn_div_3_words_inner_loop
78ef1d
-	dsubu	v0,1
78ef1d
-	.set	reorder
78ef1d
-.L_bn_div_3_words_inner_loop_done:
78ef1d
-	jr	ra
78ef1d
-END(bn_div_3_words)
78ef1d
-
78ef1d
-.align	5
78ef1d
-LEAF(bn_div_words)
78ef1d
-	.set	noreorder
78ef1d
-	bnezl	a2,.L_bn_div_words_proceed
78ef1d
-	move	v1,zero
78ef1d
-	jr	ra
78ef1d
-	li	v0,-1		/* I'd rather signal div-by-zero
78ef1d
-				 * which can be done with 'break 7' */
78ef1d
-
78ef1d
-.L_bn_div_words_proceed:
78ef1d
-	bltz	a2,.L_bn_div_words_body
78ef1d
-	move	t9,v1
78ef1d
-	dsll	a2,1
78ef1d
-	bgtz	a2,.-4
78ef1d
-	addu	t9,1
78ef1d
-
78ef1d
-	.set	reorder
78ef1d
-	negu	t1,t9
78ef1d
-	li	t2,-1
78ef1d
-	dsll	t2,t1
78ef1d
-	and	t2,a0
78ef1d
-	dsrl	AT,a1,t1
78ef1d
-	.set	noreorder
78ef1d
-	bnezl	t2,.+8
78ef1d
-	break	6		/* signal overflow */
78ef1d
-	.set	reorder
78ef1d
-	dsll	a0,t9
78ef1d
-	dsll	a1,t9
78ef1d
-	or	a0,AT
78ef1d
-
78ef1d
-#define	QT	ta0
78ef1d
-#define	HH	ta1
78ef1d
-#define	DH	v1
78ef1d
-.L_bn_div_words_body:
78ef1d
-	dsrl	DH,a2,32
78ef1d
-	sgeu	AT,a0,a2
78ef1d
-	.set	noreorder
78ef1d
-	bnezl	AT,.+8
78ef1d
-	dsubu	a0,a2
78ef1d
-	.set	reorder
78ef1d
-
78ef1d
-	li	QT,-1
78ef1d
-	dsrl	HH,a0,32
78ef1d
-	dsrl	QT,32	/* q=0xffffffff */
78ef1d
-	beq	DH,HH,.L_bn_div_words_skip_div1
78ef1d
-	ddivu	zero,a0,DH
78ef1d
-	mflo	QT
78ef1d
-.L_bn_div_words_skip_div1:
78ef1d
-	dmultu	a2,QT
78ef1d
-	dsll	t3,a0,32
78ef1d
-	dsrl	AT,a1,32
78ef1d
-	or	t3,AT
78ef1d
-	mflo	t0
78ef1d
-	mfhi	t1
78ef1d
-.L_bn_div_words_inner_loop1:
78ef1d
-	sltu	t2,t3,t0
78ef1d
-	seq	t8,HH,t1
78ef1d
-	sltu	AT,HH,t1
78ef1d
-	and	t2,t8
78ef1d
-	sltu	v0,t0,a2
78ef1d
-	or	AT,t2
78ef1d
-	.set	noreorder
78ef1d
-	beqz	AT,.L_bn_div_words_inner_loop1_done
78ef1d
-	dsubu	t1,v0
78ef1d
-	dsubu	t0,a2
78ef1d
-	b	.L_bn_div_words_inner_loop1
78ef1d
-	dsubu	QT,1
78ef1d
-	.set	reorder
78ef1d
-.L_bn_div_words_inner_loop1_done:
78ef1d
-
78ef1d
-	dsll	a1,32
78ef1d
-	dsubu	a0,t3,t0
78ef1d
-	dsll	v0,QT,32
78ef1d
-
78ef1d
-	li	QT,-1
78ef1d
-	dsrl	HH,a0,32
78ef1d
-	dsrl	QT,32	/* q=0xffffffff */
78ef1d
-	beq	DH,HH,.L_bn_div_words_skip_div2
78ef1d
-	ddivu	zero,a0,DH
78ef1d
-	mflo	QT
78ef1d
-.L_bn_div_words_skip_div2:
78ef1d
-#undef	DH
78ef1d
-	dmultu	a2,QT
78ef1d
-	dsll	t3,a0,32
78ef1d
-	dsrl	AT,a1,32
78ef1d
-	or	t3,AT
78ef1d
-	mflo	t0
78ef1d
-	mfhi	t1
78ef1d
-.L_bn_div_words_inner_loop2:
78ef1d
-	sltu	t2,t3,t0
78ef1d
-	seq	t8,HH,t1
78ef1d
-	sltu	AT,HH,t1
78ef1d
-	and	t2,t8
78ef1d
-	sltu	v1,t0,a2
78ef1d
-	or	AT,t2
78ef1d
-	.set	noreorder
78ef1d
-	beqz	AT,.L_bn_div_words_inner_loop2_done
78ef1d
-	dsubu	t1,v1
78ef1d
-	dsubu	t0,a2
78ef1d
-	b	.L_bn_div_words_inner_loop2
78ef1d
-	dsubu	QT,1
78ef1d
-	.set	reorder
78ef1d
-.L_bn_div_words_inner_loop2_done:	
78ef1d
-#undef	HH
78ef1d
-
78ef1d
-	dsubu	a0,t3,t0
78ef1d
-	or	v0,QT
78ef1d
-	dsrl	v1,a0,t9	/* v1 contains remainder if anybody wants it */
78ef1d
-	dsrl	a2,t9		/* restore a2 */
78ef1d
-	jr	ra
78ef1d
-#undef	QT
78ef1d
-END(bn_div_words)
78ef1d
-
78ef1d
-#define	a_0	t0
78ef1d
-#define	a_1	t1
78ef1d
-#define	a_2	t2
78ef1d
-#define	a_3	t3
78ef1d
-#define	b_0	ta0
78ef1d
-#define	b_1	ta1
78ef1d
-#define	b_2	ta2
78ef1d
-#define	b_3	ta3
78ef1d
-
78ef1d
-#define	a_4	s0
78ef1d
-#define	a_5	s2
78ef1d
-#define	a_6	s4
78ef1d
-#define	a_7	a1	/* once we load a[7] we don't need a anymore */
78ef1d
-#define	b_4	s1
78ef1d
-#define	b_5	s3
78ef1d
-#define	b_6	s5
78ef1d
-#define	b_7	a2	/* once we load b[7] we don't need b anymore */
78ef1d
-
78ef1d
-#define	t_1	t8
78ef1d
-#define	t_2	t9
78ef1d
-
78ef1d
-#define	c_1	v0
78ef1d
-#define	c_2	v1
78ef1d
-#define	c_3	a3
78ef1d
-
78ef1d
-#define	FRAME_SIZE	48
78ef1d
-
78ef1d
-.align	5
78ef1d
-LEAF(bn_mul_comba8)
78ef1d
-	.set	noreorder
78ef1d
-	PTR_SUB	sp,FRAME_SIZE
78ef1d
-	.frame	sp,64,ra
78ef1d
-	.set	reorder
78ef1d
-	ld	a_0,0(a1)	/* If compiled with -mips3 option on
78ef1d
-				 * R5000 box assembler barks on this
78ef1d
-				 * line with "shouldn't have mult/div
78ef1d
-				 * as last instruction in bb (R10K
78ef1d
-				 * bug)" warning. If anybody out there
78ef1d
-				 * has a clue about how to circumvent
78ef1d
-				 * this do send me a note.
78ef1d
-				 *		<appro@fy.chalmers.se>
78ef1d
-				 */
78ef1d
-	ld	b_0,0(a2)
78ef1d
-	ld	a_1,8(a1)
78ef1d
-	ld	a_2,16(a1)
78ef1d
-	ld	a_3,24(a1)
78ef1d
-	ld	b_1,8(a2)
78ef1d
-	ld	b_2,16(a2)
78ef1d
-	ld	b_3,24(a2)
78ef1d
-	dmultu	a_0,b_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
78ef1d
-	sd	s0,0(sp)
78ef1d
-	sd	s1,8(sp)
78ef1d
-	sd	s2,16(sp)
78ef1d
-	sd	s3,24(sp)
78ef1d
-	sd	s4,32(sp)
78ef1d
-	sd	s5,40(sp)
78ef1d
-	mflo	c_1
78ef1d
-	mfhi	c_2
78ef1d
-
78ef1d
-	dmultu	a_0,b_1		/* mul_add_c(a[0],b[1],c2,c3,c1); */
78ef1d
-	ld	a_4,32(a1)
78ef1d
-	ld	a_5,40(a1)
78ef1d
-	ld	a_6,48(a1)
78ef1d
-	ld	a_7,56(a1)
78ef1d
-	ld	b_4,32(a2)
78ef1d
-	ld	b_5,40(a2)
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	c_3,t_2,AT
78ef1d
-	dmultu	a_1,b_0		/* mul_add_c(a[1],b[0],c2,c3,c1); */
78ef1d
-	ld	b_6,48(a2)
78ef1d
-	ld	b_7,56(a2)
78ef1d
-	sd	c_1,0(a0)	/* r[0]=c1; */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	c_1,c_3,t_2
78ef1d
-	sd	c_2,8(a0)	/* r[1]=c2; */
78ef1d
-
78ef1d
-	dmultu	a_2,b_0		/* mul_add_c(a[2],b[0],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	dmultu	a_1,b_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	c_2,c_1,t_2
78ef1d
-	dmultu	a_0,b_2		/* mul_add_c(a[0],b[2],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	sd	c_3,16(a0)	/* r[2]=c3; */
78ef1d
-
78ef1d
-	dmultu	a_0,b_3		/* mul_add_c(a[0],b[3],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	c_3,c_2,t_2
78ef1d
-	dmultu	a_1,b_2		/* mul_add_c(a[1],b[2],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_2,b_1		/* mul_add_c(a[2],b[1],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_3,b_0		/* mul_add_c(a[3],b[0],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	sd	c_1,24(a0)	/* r[3]=c1; */
78ef1d
-
78ef1d
-	dmultu	a_4,b_0		/* mul_add_c(a[4],b[0],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	c_1,c_3,t_2
78ef1d
-	dmultu	a_3,b_1		/* mul_add_c(a[3],b[1],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_2,b_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_1,b_3		/* mul_add_c(a[1],b[3],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_0,b_4		/* mul_add_c(a[0],b[4],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	sd	c_2,32(a0)	/* r[4]=c2; */
78ef1d
-
78ef1d
-	dmultu	a_0,b_5		/* mul_add_c(a[0],b[5],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	c_2,c_1,t_2
78ef1d
-	dmultu	a_1,b_4		/* mul_add_c(a[1],b[4],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	dmultu	a_2,b_3		/* mul_add_c(a[2],b[3],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	dmultu	a_3,b_2		/* mul_add_c(a[3],b[2],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	dmultu	a_4,b_1		/* mul_add_c(a[4],b[1],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	dmultu	a_5,b_0		/* mul_add_c(a[5],b[0],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	sd	c_3,40(a0)	/* r[5]=c3; */
78ef1d
-
78ef1d
-	dmultu	a_6,b_0		/* mul_add_c(a[6],b[0],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	c_3,c_2,t_2
78ef1d
-	dmultu	a_5,b_1		/* mul_add_c(a[5],b[1],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_4,b_2		/* mul_add_c(a[4],b[2],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_3,b_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_2,b_4		/* mul_add_c(a[2],b[4],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_1,b_5		/* mul_add_c(a[1],b[5],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_0,b_6		/* mul_add_c(a[0],b[6],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	sd	c_1,48(a0)	/* r[6]=c1; */
78ef1d
-
78ef1d
-	dmultu	a_0,b_7		/* mul_add_c(a[0],b[7],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	c_1,c_3,t_2
78ef1d
-	dmultu	a_1,b_6		/* mul_add_c(a[1],b[6],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_2,b_5		/* mul_add_c(a[2],b[5],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_3,b_4		/* mul_add_c(a[3],b[4],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_4,b_3		/* mul_add_c(a[4],b[3],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_5,b_2		/* mul_add_c(a[5],b[2],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_6,b_1		/* mul_add_c(a[6],b[1],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_7,b_0		/* mul_add_c(a[7],b[0],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	sd	c_2,56(a0)	/* r[7]=c2; */
78ef1d
-
78ef1d
-	dmultu	a_7,b_1		/* mul_add_c(a[7],b[1],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	c_2,c_1,t_2
78ef1d
-	dmultu	a_6,b_2		/* mul_add_c(a[6],b[2],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	dmultu	a_5,b_3		/* mul_add_c(a[5],b[3],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	dmultu	a_4,b_4		/* mul_add_c(a[4],b[4],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	dmultu	a_3,b_5		/* mul_add_c(a[3],b[5],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	dmultu	a_2,b_6		/* mul_add_c(a[2],b[6],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	dmultu	a_1,b_7		/* mul_add_c(a[1],b[7],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	sd	c_3,64(a0)	/* r[8]=c3; */
78ef1d
-
78ef1d
-	dmultu	a_2,b_7		/* mul_add_c(a[2],b[7],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	c_3,c_2,t_2
78ef1d
-	dmultu	a_3,b_6		/* mul_add_c(a[3],b[6],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_4,b_5		/* mul_add_c(a[4],b[5],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_5,b_4		/* mul_add_c(a[5],b[4],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_6,b_3		/* mul_add_c(a[6],b[3],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_7,b_2		/* mul_add_c(a[7],b[2],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	sd	c_1,72(a0)	/* r[9]=c1; */
78ef1d
-
78ef1d
-	dmultu	a_7,b_3		/* mul_add_c(a[7],b[3],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	c_1,c_3,t_2
78ef1d
-	dmultu	a_6,b_4		/* mul_add_c(a[6],b[4],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_5,b_5		/* mul_add_c(a[5],b[5],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_4,b_6		/* mul_add_c(a[4],b[6],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_3,b_7		/* mul_add_c(a[3],b[7],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	sd	c_2,80(a0)	/* r[10]=c2; */
78ef1d
-
78ef1d
-	dmultu	a_4,b_7		/* mul_add_c(a[4],b[7],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	c_2,c_1,t_2
78ef1d
-	dmultu	a_5,b_6		/* mul_add_c(a[5],b[6],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	dmultu	a_6,b_5		/* mul_add_c(a[6],b[5],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	dmultu	a_7,b_4		/* mul_add_c(a[7],b[4],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	sd	c_3,88(a0)	/* r[11]=c3; */
78ef1d
-
78ef1d
-	dmultu	a_7,b_5		/* mul_add_c(a[7],b[5],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	c_3,c_2,t_2
78ef1d
-	dmultu	a_6,b_6		/* mul_add_c(a[6],b[6],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_5,b_7		/* mul_add_c(a[5],b[7],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	sd	c_1,96(a0)	/* r[12]=c1; */
78ef1d
-
78ef1d
-	dmultu	a_6,b_7		/* mul_add_c(a[6],b[7],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	c_1,c_3,t_2
78ef1d
-	dmultu	a_7,b_6		/* mul_add_c(a[7],b[6],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	sd	c_2,104(a0)	/* r[13]=c2; */
78ef1d
-
78ef1d
-	dmultu	a_7,b_7		/* mul_add_c(a[7],b[7],c3,c1,c2); */
78ef1d
-	ld	s0,0(sp)
78ef1d
-	ld	s1,8(sp)
78ef1d
-	ld	s2,16(sp)
78ef1d
-	ld	s3,24(sp)
78ef1d
-	ld	s4,32(sp)
78ef1d
-	ld	s5,40(sp)
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sd	c_3,112(a0)	/* r[14]=c3; */
78ef1d
-	sd	c_1,120(a0)	/* r[15]=c1; */
78ef1d
-
78ef1d
-	PTR_ADD	sp,FRAME_SIZE
78ef1d
-
78ef1d
-	jr	ra
78ef1d
-END(bn_mul_comba8)
78ef1d
-
78ef1d
-.align	5
78ef1d
-LEAF(bn_mul_comba4)
78ef1d
-	.set	reorder
78ef1d
-	ld	a_0,0(a1)
78ef1d
-	ld	b_0,0(a2)
78ef1d
-	ld	a_1,8(a1)
78ef1d
-	ld	a_2,16(a1)
78ef1d
-	dmultu	a_0,b_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
78ef1d
-	ld	a_3,24(a1)
78ef1d
-	ld	b_1,8(a2)
78ef1d
-	ld	b_2,16(a2)
78ef1d
-	ld	b_3,24(a2)
78ef1d
-	mflo	c_1
78ef1d
-	mfhi	c_2
78ef1d
-	sd	c_1,0(a0)
78ef1d
-
78ef1d
-	dmultu	a_0,b_1		/* mul_add_c(a[0],b[1],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	c_3,t_2,AT
78ef1d
-	dmultu	a_1,b_0		/* mul_add_c(a[1],b[0],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	c_1,c_3,t_2
78ef1d
-	sd	c_2,8(a0)
78ef1d
-
78ef1d
-	dmultu	a_2,b_0		/* mul_add_c(a[2],b[0],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	dmultu	a_1,b_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	c_2,c_1,t_2
78ef1d
-	dmultu	a_0,b_2		/* mul_add_c(a[0],b[2],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	sd	c_3,16(a0)
78ef1d
-
78ef1d
-	dmultu	a_0,b_3		/* mul_add_c(a[0],b[3],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	c_3,c_2,t_2
78ef1d
-	dmultu	a_1,b_2		/* mul_add_c(a[1],b[2],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_2,b_1		/* mul_add_c(a[2],b[1],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_3,b_0		/* mul_add_c(a[3],b[0],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	sd	c_1,24(a0)
78ef1d
-
78ef1d
-	dmultu	a_3,b_1		/* mul_add_c(a[3],b[1],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	c_1,c_3,t_2
78ef1d
-	dmultu	a_2,b_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_1,b_3		/* mul_add_c(a[1],b[3],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	sd	c_2,32(a0)
78ef1d
-
78ef1d
-	dmultu	a_2,b_3		/* mul_add_c(a[2],b[3],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	c_2,c_1,t_2
78ef1d
-	dmultu	a_3,b_2		/* mul_add_c(a[3],b[2],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	sd	c_3,40(a0)
78ef1d
-
78ef1d
-	dmultu	a_3,b_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sd	c_1,48(a0)
78ef1d
-	sd	c_2,56(a0)
78ef1d
-
78ef1d
-	jr	ra
78ef1d
-END(bn_mul_comba4)
78ef1d
-
78ef1d
-#undef	a_4
78ef1d
-#undef	a_5
78ef1d
-#undef	a_6
78ef1d
-#undef	a_7
78ef1d
-#define	a_4	b_0
78ef1d
-#define	a_5	b_1
78ef1d
-#define	a_6	b_2
78ef1d
-#define	a_7	b_3
78ef1d
-
78ef1d
-.align	5
78ef1d
-LEAF(bn_sqr_comba8)
78ef1d
-	.set	reorder
78ef1d
-	ld	a_0,0(a1)
78ef1d
-	ld	a_1,8(a1)
78ef1d
-	ld	a_2,16(a1)
78ef1d
-	ld	a_3,24(a1)
78ef1d
-
78ef1d
-	dmultu	a_0,a_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
78ef1d
-	ld	a_4,32(a1)
78ef1d
-	ld	a_5,40(a1)
78ef1d
-	ld	a_6,48(a1)
78ef1d
-	ld	a_7,56(a1)
78ef1d
-	mflo	c_1
78ef1d
-	mfhi	c_2
78ef1d
-	sd	c_1,0(a0)
78ef1d
-
78ef1d
-	dmultu	a_0,a_1		/* mul_add_c2(a[0],b[1],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	c_1,t_2,zero
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	c_3,t_2,AT
78ef1d
-	sd	c_2,8(a0)
78ef1d
-
78ef1d
-	dmultu	a_2,a_0		/* mul_add_c2(a[2],b[0],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	c_2,t_2,zero
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	dmultu	a_1,a_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	sd	c_3,16(a0)
78ef1d
-
78ef1d
-	dmultu	a_0,a_3		/* mul_add_c2(a[0],b[3],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	c_3,t_2,zero
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_1,a_2		/* mul_add_c2(a[1],b[2],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	AT,t_2,zero
78ef1d
-	daddu	c_3,AT
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	sd	c_1,24(a0)
78ef1d
-
78ef1d
-	dmultu	a_4,a_0		/* mul_add_c2(a[4],b[0],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	c_1,t_2,zero
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_3,a_1		/* mul_add_c2(a[3],b[1],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	AT,t_2,zero
78ef1d
-	daddu	c_1,AT
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_2,a_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	sd	c_2,32(a0)
78ef1d
-
78ef1d
-	dmultu	a_0,a_5		/* mul_add_c2(a[0],b[5],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	c_2,t_2,zero
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	dmultu	a_1,a_4		/* mul_add_c2(a[1],b[4],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	AT,t_2,zero
78ef1d
-	daddu	c_2,AT
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	dmultu	a_2,a_3		/* mul_add_c2(a[2],b[3],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	AT,t_2,zero
78ef1d
-	daddu	c_2,AT
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	sd	c_3,40(a0)
78ef1d
-
78ef1d
-	dmultu	a_6,a_0		/* mul_add_c2(a[6],b[0],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	c_3,t_2,zero
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_5,a_1		/* mul_add_c2(a[5],b[1],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	AT,t_2,zero
78ef1d
-	daddu	c_3,AT
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_4,a_2		/* mul_add_c2(a[4],b[2],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	AT,t_2,zero
78ef1d
-	daddu	c_3,AT
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_3,a_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	sd	c_1,48(a0)
78ef1d
-
78ef1d
-	dmultu	a_0,a_7		/* mul_add_c2(a[0],b[7],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	c_1,t_2,zero
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_1,a_6		/* mul_add_c2(a[1],b[6],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	AT,t_2,zero
78ef1d
-	daddu	c_1,AT
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_2,a_5		/* mul_add_c2(a[2],b[5],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	AT,t_2,zero
78ef1d
-	daddu	c_1,AT
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_3,a_4		/* mul_add_c2(a[3],b[4],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	AT,t_2,zero
78ef1d
-	daddu	c_1,AT
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	sd	c_2,56(a0)
78ef1d
-
78ef1d
-	dmultu	a_7,a_1		/* mul_add_c2(a[7],b[1],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	c_2,t_2,zero
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	dmultu	a_6,a_2		/* mul_add_c2(a[6],b[2],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	AT,t_2,zero
78ef1d
-	daddu	c_2,AT
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	dmultu	a_5,a_3		/* mul_add_c2(a[5],b[3],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	AT,t_2,zero
78ef1d
-	daddu	c_2,AT
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	dmultu	a_4,a_4		/* mul_add_c(a[4],b[4],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	sd	c_3,64(a0)
78ef1d
-
78ef1d
-	dmultu	a_2,a_7		/* mul_add_c2(a[2],b[7],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	c_3,t_2,zero
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_3,a_6		/* mul_add_c2(a[3],b[6],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	AT,t_2,zero
78ef1d
-	daddu	c_3,AT
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_4,a_5		/* mul_add_c2(a[4],b[5],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	AT,t_2,zero
78ef1d
-	daddu	c_3,AT
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	sd	c_1,72(a0)
78ef1d
-
78ef1d
-	dmultu	a_7,a_3		/* mul_add_c2(a[7],b[3],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	c_1,t_2,zero
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_6,a_4		/* mul_add_c2(a[6],b[4],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	AT,t_2,zero
78ef1d
-	daddu	c_1,AT
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_5,a_5		/* mul_add_c(a[5],b[5],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	sd	c_2,80(a0)
78ef1d
-
78ef1d
-	dmultu	a_4,a_7		/* mul_add_c2(a[4],b[7],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	c_2,t_2,zero
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	dmultu	a_5,a_6		/* mul_add_c2(a[5],b[6],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	AT,t_2,zero
78ef1d
-	daddu	c_2,AT
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	sd	c_3,88(a0)
78ef1d
-
78ef1d
-	dmultu	a_7,a_5		/* mul_add_c2(a[7],b[5],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	c_3,t_2,zero
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_6,a_6		/* mul_add_c(a[6],b[6],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	sd	c_1,96(a0)
78ef1d
-
78ef1d
-	dmultu	a_6,a_7		/* mul_add_c2(a[6],b[7],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	c_1,t_2,zero
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	sd	c_2,104(a0)
78ef1d
-
78ef1d
-	dmultu	a_7,a_7		/* mul_add_c(a[7],b[7],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sd	c_3,112(a0)
78ef1d
-	sd	c_1,120(a0)
78ef1d
-
78ef1d
-	jr	ra
78ef1d
-END(bn_sqr_comba8)
78ef1d
-
78ef1d
-.align	5
78ef1d
-LEAF(bn_sqr_comba4)
78ef1d
-	.set	reorder
78ef1d
-	ld	a_0,0(a1)
78ef1d
-	ld	a_1,8(a1)
78ef1d
-	ld	a_2,16(a1)
78ef1d
-	ld	a_3,24(a1)
78ef1d
-	dmultu	a_0,a_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
78ef1d
-	mflo	c_1
78ef1d
-	mfhi	c_2
78ef1d
-	sd	c_1,0(a0)
78ef1d
-
78ef1d
-	dmultu	a_0,a_1		/* mul_add_c2(a[0],b[1],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	c_1,t_2,zero
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	c_3,t_2,AT
78ef1d
-	sd	c_2,8(a0)
78ef1d
-
78ef1d
-	dmultu	a_2,a_0		/* mul_add_c2(a[2],b[0],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	c_2,t_2,zero
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	dmultu	a_1,a_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	sd	c_3,16(a0)
78ef1d
-
78ef1d
-	dmultu	a_0,a_3		/* mul_add_c2(a[0],b[3],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	c_3,t_2,zero
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	dmultu	a_1,a_2		/* mul_add_c(a2[1],b[2],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	AT,t_2,zero
78ef1d
-	daddu	c_3,AT
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sltu	AT,c_2,t_2
78ef1d
-	daddu	c_3,AT
78ef1d
-	sd	c_1,24(a0)
78ef1d
-
78ef1d
-	dmultu	a_3,a_1		/* mul_add_c2(a[3],b[1],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	c_1,t_2,zero
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	dmultu	a_2,a_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_2,t_1
78ef1d
-	sltu	AT,c_2,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_3,t_2
78ef1d
-	sltu	AT,c_3,t_2
78ef1d
-	daddu	c_1,AT
78ef1d
-	sd	c_2,32(a0)
78ef1d
-
78ef1d
-	dmultu	a_2,a_3		/* mul_add_c2(a[2],b[3],c3,c1,c2); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	slt	c_2,t_2,zero
78ef1d
-	dsll	t_2,1
78ef1d
-	slt	a2,t_1,zero
78ef1d
-	daddu	t_2,a2
78ef1d
-	dsll	t_1,1
78ef1d
-	daddu	c_3,t_1
78ef1d
-	sltu	AT,c_3,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_1,t_2
78ef1d
-	sltu	AT,c_1,t_2
78ef1d
-	daddu	c_2,AT
78ef1d
-	sd	c_3,40(a0)
78ef1d
-
78ef1d
-	dmultu	a_3,a_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
78ef1d
-	mflo	t_1
78ef1d
-	mfhi	t_2
78ef1d
-	daddu	c_1,t_1
78ef1d
-	sltu	AT,c_1,t_1
78ef1d
-	daddu	t_2,AT
78ef1d
-	daddu	c_2,t_2
78ef1d
-	sd	c_1,48(a0)
78ef1d
-	sd	c_2,56(a0)
78ef1d
-
78ef1d
-	jr	ra
78ef1d
-END(bn_sqr_comba4)
78ef1d
diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c
78ef1d
index 31476ab..2d39407 100644
78ef1d
--- a/crypto/bn/asm/x86_64-gcc.c
78ef1d
+++ b/crypto/bn/asm/x86_64-gcc.c
78ef1d
@@ -273,6 +273,10 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
78ef1d
 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
78ef1d
 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
78ef1d
 
78ef1d
+/*
78ef1d
+ * Keep in mind that carrying into high part of multiplication result
78ef1d
+ * can not overflow, because it cannot be all-ones.
78ef1d
+ */
78ef1d
 #if 0
78ef1d
 /* original macros are kept for reference purposes */
78ef1d
 #define mul_add_c(a,b,c0,c1,c2) {	\
78ef1d
@@ -287,10 +291,10 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
78ef1d
 	BN_ULONG ta=(a),tb=(b),t0;	\
78ef1d
 	t1 = BN_UMULT_HIGH(ta,tb);	\
78ef1d
 	t0 = ta * tb;			\
78ef1d
-	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
78ef1d
-	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
78ef1d
-	c0 += t1; t2 += (c0<t1)?1:0;	\
78ef1d
+	c0 += t0; t2 = t1+((c0<t0)?1:0);\
78ef1d
 	c1 += t2; c2 += (c1<t2)?1:0;	\
78ef1d
+	c0 += t0; t1 += (c0<t0)?1:0;	\
78ef1d
+	c1 += t1; c2 += (c1<t1)?1:0;	\
78ef1d
 	}
78ef1d
 #else
78ef1d
 #define mul_add_c(a,b,c0,c1,c2)	do {	\
78ef1d
@@ -328,22 +332,14 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
78ef1d
 		: "=a"(t1),"=d"(t2)	\
78ef1d
 		: "a"(a),"m"(b)		\
78ef1d
 		: "cc");		\
78ef1d
-	asm ("addq %0,%0; adcq %2,%1"	\
78ef1d
-		: "+d"(t2),"+r"(c2)	\
78ef1d
-		: "g"(0)		\
78ef1d
-		: "cc");		\
78ef1d
-	asm ("addq %0,%0; adcq %2,%1"	\
78ef1d
-		: "+a"(t1),"+d"(t2)	\
78ef1d
-		: "g"(0)		\
78ef1d
-		: "cc");		\
78ef1d
-	asm ("addq %2,%0; adcq %3,%1"	\
78ef1d
-		: "+r"(c0),"+d"(t2)	\
78ef1d
-		: "a"(t1),"g"(0)	\
78ef1d
-		: "cc");		\
78ef1d
-	asm ("addq %2,%0; adcq %3,%1"	\
78ef1d
-		: "+r"(c1),"+r"(c2)	\
78ef1d
-		: "d"(t2),"g"(0)	\
78ef1d
-		: "cc");		\
78ef1d
+	asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
78ef1d
+		: "+r"(c0),"+r"(c1),"+r"(c2)		\
78ef1d
+		: "r"(t1),"r"(t2),"g"(0)		\
78ef1d
+		: "cc");				\
78ef1d
+	asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
78ef1d
+		: "+r"(c0),"+r"(c1),"+r"(c2)		\
78ef1d
+		: "r"(t1),"r"(t2),"g"(0)		\
78ef1d
+		: "cc");				\
78ef1d
 	} while (0)
78ef1d
 #endif
78ef1d
 
78ef1d
diff --git a/crypto/bn/bn_asm.c b/crypto/bn/bn_asm.c
78ef1d
index c43c91c..a33b634 100644
78ef1d
--- a/crypto/bn/bn_asm.c
78ef1d
+++ b/crypto/bn/bn_asm.c
78ef1d
@@ -438,6 +438,10 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
78ef1d
 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
78ef1d
 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
78ef1d
 
78ef1d
+/*
78ef1d
+ * Keep in mind that carrying into high part of multiplication result
78ef1d
+ * can not overflow, because it cannot be all-ones.
78ef1d
+ */
78ef1d
 #ifdef BN_LLONG
78ef1d
 #define mul_add_c(a,b,c0,c1,c2) \
78ef1d
 	t=(BN_ULLONG)a*b; \
78ef1d
@@ -478,10 +482,10 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
78ef1d
 #define mul_add_c2(a,b,c0,c1,c2) {	\
78ef1d
 	BN_ULONG ta=(a),tb=(b),t0;	\
78ef1d
 	BN_UMULT_LOHI(t0,t1,ta,tb);	\
78ef1d
-	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
78ef1d
-	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
78ef1d
-	c0 += t1; t2 += (c0<t1)?1:0;	\
78ef1d
+	c0 += t0; t2 = t1+((c0<t0)?1:0);\
78ef1d
 	c1 += t2; c2 += (c1<t2)?1:0;	\
78ef1d
+	c0 += t0; t1 += (c0<t0)?1:0;	\
78ef1d
+	c1 += t1; c2 += (c1<t1)?1:0;	\
78ef1d
 	}
78ef1d
 
78ef1d
 #define sqr_add_c(a,i,c0,c1,c2)	{	\
78ef1d
@@ -508,10 +512,10 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
78ef1d
 	BN_ULONG ta=(a),tb=(b),t0;	\
78ef1d
 	t1 = BN_UMULT_HIGH(ta,tb);	\
78ef1d
 	t0 = ta * tb;			\
78ef1d
-	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
78ef1d
-	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
78ef1d
-	c0 += t1; t2 += (c0<t1)?1:0;	\
78ef1d
+	c0 += t0; t2 = t1+((c0<t0)?1:0);\
78ef1d
 	c1 += t2; c2 += (c1<t2)?1:0;	\
78ef1d
+	c0 += t0; t1 += (c0<t0)?1:0;	\
78ef1d
+	c1 += t1; c2 += (c1<t1)?1:0;	\
78ef1d
 	}
78ef1d
 
78ef1d
 #define sqr_add_c(a,i,c0,c1,c2)	{	\
78ef1d
diff --git a/crypto/bn/bntest.c b/crypto/bn/bntest.c
78ef1d
index 7771e92..48bc633 100644
78ef1d
--- a/crypto/bn/bntest.c
78ef1d
+++ b/crypto/bn/bntest.c
78ef1d
@@ -678,44 +678,98 @@ int test_mul(BIO *bp)
78ef1d
 
78ef1d
 int test_sqr(BIO *bp, BN_CTX *ctx)
78ef1d
 	{
78ef1d
-	BIGNUM a,c,d,e;
78ef1d
-	int i;
78ef1d
+	BIGNUM *a,*c,*d,*e;
78ef1d
+	int i, ret = 0;
78ef1d
 
78ef1d
-	BN_init(&a);
78ef1d
-	BN_init(&c);
78ef1d
-	BN_init(&d);
78ef1d
-	BN_init(&e);
78ef1d
+	a = BN_new();
78ef1d
+	c = BN_new();
78ef1d
+	d = BN_new();
78ef1d
+	e = BN_new();
78ef1d
+	if (a == NULL || c == NULL || d == NULL || e == NULL)
78ef1d
+		{
78ef1d
+		goto err;
78ef1d
+		}
78ef1d
 
78ef1d
 	for (i=0; i<num0; i++)
78ef1d
 		{
78ef1d
-		BN_bntest_rand(&a,40+i*10,0,0);
78ef1d
-		a.neg=rand_neg();
78ef1d
-		BN_sqr(&c,&a,ctx);
78ef1d
+		BN_bntest_rand(a,40+i*10,0,0);
78ef1d
+		a->neg=rand_neg();
78ef1d
+		BN_sqr(c,a,ctx);
78ef1d
 		if (bp != NULL)
78ef1d
 			{
78ef1d
 			if (!results)
78ef1d
 				{
78ef1d
-				BN_print(bp,&a);
78ef1d
+				BN_print(bp,a);
78ef1d
 				BIO_puts(bp," * ");
78ef1d
-				BN_print(bp,&a);
78ef1d
+				BN_print(bp,a);
78ef1d
 				BIO_puts(bp," - ");
78ef1d
 				}
78ef1d
-			BN_print(bp,&c);
78ef1d
+			BN_print(bp,c);
78ef1d
 			BIO_puts(bp,"\n");
78ef1d
 			}
78ef1d
-		BN_div(&d,&e,&c,&a,ctx);
78ef1d
-		BN_sub(&d,&d,&a);
78ef1d
-		if(!BN_is_zero(&d) || !BN_is_zero(&e))
78ef1d
-		    {
78ef1d
-		    fprintf(stderr,"Square test failed!\n");
78ef1d
-		    return 0;
78ef1d
-		    }
78ef1d
+		BN_div(d,e,c,a,ctx);
78ef1d
+		BN_sub(d,d,a);
78ef1d
+		if(!BN_is_zero(d) || !BN_is_zero(e))
78ef1d
+			{
78ef1d
+			fprintf(stderr,"Square test failed!\n");
78ef1d
+			goto err;
78ef1d
+			}
78ef1d
 		}
78ef1d
-	BN_free(&a);
78ef1d
-	BN_free(&c);
78ef1d
-	BN_free(&d);
78ef1d
-	BN_free(&e);
78ef1d
-	return(1);
78ef1d
+
78ef1d
+	/* Regression test for a BN_sqr overflow bug. */
78ef1d
+	BN_hex2bn(&a,
78ef1d
+		"80000000000000008000000000000001FFFFFFFFFFFFFFFE0000000000000000");
78ef1d
+	BN_sqr(c, a, ctx);
78ef1d
+	if (bp != NULL)
78ef1d
+		{
78ef1d
+		if (!results)
78ef1d
+			{
78ef1d
+			BN_print(bp,a);
78ef1d
+			BIO_puts(bp," * ");
78ef1d
+			BN_print(bp,a);
78ef1d
+			BIO_puts(bp," - ");
78ef1d
+			}
78ef1d
+		BN_print(bp,c);
78ef1d
+		BIO_puts(bp,"\n");
78ef1d
+		}
78ef1d
+	BN_mul(d, a, a, ctx);
78ef1d
+	if (BN_cmp(c, d))
78ef1d
+		{
78ef1d
+		fprintf(stderr, "Square test failed: BN_sqr and BN_mul produce "
78ef1d
+			"different results!\n");
78ef1d
+		goto err;
78ef1d
+		}
78ef1d
+
78ef1d
+	/* Regression test for a BN_sqr overflow bug. */
78ef1d
+	BN_hex2bn(&a,
78ef1d
+		"80000000000000000000000080000001FFFFFFFE000000000000000000000000");
78ef1d
+	BN_sqr(c, a, ctx);
78ef1d
+	if (bp != NULL)
78ef1d
+		{
78ef1d
+		if (!results)
78ef1d
+			{
78ef1d
+			BN_print(bp,a);
78ef1d
+			BIO_puts(bp," * ");
78ef1d
+			BN_print(bp,a);
78ef1d
+			BIO_puts(bp," - ");
78ef1d
+			}
78ef1d
+		BN_print(bp,c);
78ef1d
+		BIO_puts(bp,"\n");
78ef1d
+		}
78ef1d
+	BN_mul(d, a, a, ctx);
78ef1d
+	if (BN_cmp(c, d))
78ef1d
+		{
78ef1d
+		fprintf(stderr, "Square test failed: BN_sqr and BN_mul produce "
78ef1d
+			"different results!\n");
78ef1d
+		goto err;
78ef1d
+		}
78ef1d
+	ret = 1;
78ef1d
+err:
78ef1d
+	if (a != NULL) BN_free(a);
78ef1d
+	if (c != NULL) BN_free(c);
78ef1d
+	if (d != NULL) BN_free(d);
78ef1d
+	if (e != NULL) BN_free(e);
78ef1d
+	return ret;
78ef1d
 	}
78ef1d
 
78ef1d
 int test_mont(BIO *bp, BN_CTX *ctx)
78ef1d
-- 
78ef1d
1.8.3.1
78ef1d