|
|
b0ccf1 |
diff --git a/ecc-256.c b/ecc-256.c
|
|
|
b0ccf1 |
index 571cf73..07841b1 100644
|
|
|
b0ccf1 |
--- a/ecc-256.c
|
|
|
b0ccf1 |
+++ b/ecc-256.c
|
|
|
b0ccf1 |
@@ -108,7 +108,10 @@ ecc_256_modp (const struct ecc_curve *ecc, mp_limb_t *rp)
|
|
|
b0ccf1 |
u0 -= t;
|
|
|
b0ccf1 |
t = (u1 < cy);
|
|
|
b0ccf1 |
u1 -= cy;
|
|
|
b0ccf1 |
- u1 += cnd_add_n (t, rp + n - 4, ecc->p, 3);
|
|
|
b0ccf1 |
+
|
|
|
b0ccf1 |
+ cy = cnd_add_n (t, rp + n - 4, ecc->p, 2);
|
|
|
b0ccf1 |
+ u0 += cy;
|
|
|
b0ccf1 |
+ u1 += (u0 < cy);
|
|
|
b0ccf1 |
u1 -= (-t) & 0xffffffff;
|
|
|
b0ccf1 |
}
|
|
|
b0ccf1 |
rp[2] = u0;
|
|
|
b0ccf1 |
@@ -195,7 +198,7 @@ ecc_256_modq (const struct ecc_curve *ecc, mp_limb_t *rp)
|
|
|
b0ccf1 |
|
|
|
b0ccf1 |
/* Conditional add of p */
|
|
|
b0ccf1 |
u1 += t;
|
|
|
b0ccf1 |
- u2 += (t<<32) + (u0 < t);
|
|
|
b0ccf1 |
+ u2 += (t<<32) + (u1 < t);
|
|
|
b0ccf1 |
|
|
|
b0ccf1 |
t = cnd_add_n (t, rp + n - 4, ecc->q, 2);
|
|
|
b0ccf1 |
u1 += t;
|
|
|
b0ccf1 |
diff --git a/x86_64/ecc-384-modp.asm b/x86_64/ecc-384-modp.asm
|
|
|
b0ccf1 |
index 698838f..31b739e 100644
|
|
|
b0ccf1 |
--- a/x86_64/ecc-384-modp.asm
|
|
|
b0ccf1 |
+++ b/x86_64/ecc-384-modp.asm
|
|
|
b0ccf1 |
@@ -20,7 +20,7 @@ C MA 02111-1301, USA.
|
|
|
b0ccf1 |
.file "ecc-384-modp.asm"
|
|
|
b0ccf1 |
|
|
|
b0ccf1 |
define(<RP>, <%rsi>)
|
|
|
b0ccf1 |
-define(<D4>, <%rax>)
|
|
|
b0ccf1 |
+define(<D5>, <%rax>)
|
|
|
b0ccf1 |
define(<T0>, <%rbx>)
|
|
|
b0ccf1 |
define(<T1>, <%rcx>)
|
|
|
b0ccf1 |
define(<T2>, <%rdx>)
|
|
|
b0ccf1 |
@@ -35,8 +35,8 @@ define(<H4>, <%r13>)
|
|
|
b0ccf1 |
define(<H5>, <%r14>)
|
|
|
b0ccf1 |
define(<C2>, <%r15>)
|
|
|
b0ccf1 |
define(<C0>, H5) C Overlap
|
|
|
b0ccf1 |
-define(<D0>, RP) C Overlap
|
|
|
b0ccf1 |
-define(<TMP>, H4) C Overlap
|
|
|
b0ccf1 |
+define(<TMP>, RP) C Overlap
|
|
|
b0ccf1 |
+
|
|
|
b0ccf1 |
|
|
|
b0ccf1 |
PROLOGUE(nettle_ecc_384_modp)
|
|
|
b0ccf1 |
W64_ENTRY(2, 0)
|
|
|
b0ccf1 |
@@ -48,34 +48,38 @@ PROLOGUE(nettle_ecc_384_modp)
|
|
|
b0ccf1 |
push %r14
|
|
|
b0ccf1 |
push %r15
|
|
|
b0ccf1 |
|
|
|
b0ccf1 |
- C First get top 2 limbs, which need folding twice
|
|
|
b0ccf1 |
+ C First get top 2 limbs, which need folding twice.
|
|
|
b0ccf1 |
+ C B^10 = B^6 + B^4 + 2^32 (B-1)B^4.
|
|
|
b0ccf1 |
+ C We handle the terms as follows:
|
|
|
b0ccf1 |
C
|
|
|
b0ccf1 |
- C H5 H4
|
|
|
b0ccf1 |
- C -H5
|
|
|
b0ccf1 |
- C ------
|
|
|
b0ccf1 |
- C H0 D4
|
|
|
b0ccf1 |
+ C B^6: Folded immediately.
|
|
|
b0ccf1 |
C
|
|
|
b0ccf1 |
- C Then shift right, (H1,H0,D4) <-- (H0,D4) << 32
|
|
|
b0ccf1 |
- C and add
|
|
|
b0ccf1 |
+ C B^4: Delayed, added in during the next folding.
|
|
|
b0ccf1 |
C
|
|
|
b0ccf1 |
- C H5 H4
|
|
|
b0ccf1 |
- C H1 H0
|
|
|
b0ccf1 |
- C ----------
|
|
|
b0ccf1 |
- C C2 H1 H0
|
|
|
b0ccf1 |
-
|
|
|
b0ccf1 |
- mov 80(RP), D4
|
|
|
b0ccf1 |
- mov 88(RP), H0
|
|
|
b0ccf1 |
- mov D4, H4
|
|
|
b0ccf1 |
- mov H0, H5
|
|
|
b0ccf1 |
- sub H0, D4
|
|
|
b0ccf1 |
- sbb $0, H0
|
|
|
b0ccf1 |
-
|
|
|
b0ccf1 |
- mov D4, T2
|
|
|
b0ccf1 |
- mov H0, H1
|
|
|
b0ccf1 |
- shl $32, H0
|
|
|
b0ccf1 |
- shr $32, T2
|
|
|
b0ccf1 |
+ C 2^32(B-1) B^4: Low half limb delayed until the next
|
|
|
b0ccf1 |
+ C folding. Top 1.5 limbs subtracted and shifted now, resulting
|
|
|
b0ccf1 |
+ C in 2.5 limbs. The low limb saved in D5, high 1.5 limbs added
|
|
|
b0ccf1 |
+ C in.
|
|
|
b0ccf1 |
+
|
|
|
b0ccf1 |
+ mov 80(RP), H4
|
|
|
b0ccf1 |
+ mov 88(RP), H5
|
|
|
b0ccf1 |
+ C Shift right 32 bits, into H1, H0
|
|
|
b0ccf1 |
+ mov H4, H0
|
|
|
b0ccf1 |
+ mov H5, H1
|
|
|
b0ccf1 |
+ mov H5, D5
|
|
|
b0ccf1 |
shr $32, H1
|
|
|
b0ccf1 |
- or T2, H0
|
|
|
b0ccf1 |
+ shl $32, D5
|
|
|
b0ccf1 |
+ shr $32, H0
|
|
|
b0ccf1 |
+ or D5, H0
|
|
|
b0ccf1 |
+
|
|
|
b0ccf1 |
+ C H1 H0
|
|
|
b0ccf1 |
+ C - H1 H0
|
|
|
b0ccf1 |
+ C --------
|
|
|
b0ccf1 |
+ C H1 H0 D5
|
|
|
b0ccf1 |
+ mov H0, D5
|
|
|
b0ccf1 |
+ neg D5
|
|
|
b0ccf1 |
+ sbb H1, H0
|
|
|
b0ccf1 |
+ sbb $0, H1
|
|
|
b0ccf1 |
|
|
|
b0ccf1 |
xor C2, C2
|
|
|
b0ccf1 |
add H4, H0
|
|
|
b0ccf1 |
@@ -114,118 +118,95 @@ PROLOGUE(nettle_ecc_384_modp)
|
|
|
b0ccf1 |
adc H3, T5
|
|
|
b0ccf1 |
adc $0, C0
|
|
|
b0ccf1 |
|
|
|
b0ccf1 |
- C H3 H2 H1 H0 0
|
|
|
b0ccf1 |
- C - H4 H3 H2 H1 H0
|
|
|
b0ccf1 |
- C ---------------
|
|
|
b0ccf1 |
- C H3 H2 H1 H0 D0
|
|
|
b0ccf1 |
-
|
|
|
b0ccf1 |
- mov XREG(D4), XREG(D4)
|
|
|
b0ccf1 |
- mov H0, D0
|
|
|
b0ccf1 |
- neg D0
|
|
|
b0ccf1 |
- sbb H1, H0
|
|
|
b0ccf1 |
- sbb H2, H1
|
|
|
b0ccf1 |
- sbb H3, H2
|
|
|
b0ccf1 |
- sbb H4, H3
|
|
|
b0ccf1 |
- sbb $0, D4
|
|
|
b0ccf1 |
-
|
|
|
b0ccf1 |
- C Shift right. High bits are sign, to be added to C0.
|
|
|
b0ccf1 |
- mov D4, TMP
|
|
|
b0ccf1 |
- sar $32, TMP
|
|
|
b0ccf1 |
- shl $32, D4
|
|
|
b0ccf1 |
- add TMP, C0
|
|
|
b0ccf1 |
-
|
|
|
b0ccf1 |
+ C Shift left, including low half of H4
|
|
|
b0ccf1 |
mov H3, TMP
|
|
|
b0ccf1 |
+ shl $32, H4
|
|
|
b0ccf1 |
shr $32, TMP
|
|
|
b0ccf1 |
- shl $32, H3
|
|
|
b0ccf1 |
- or TMP, D4
|
|
|
b0ccf1 |
+ or TMP, H4
|
|
|
b0ccf1 |
|
|
|
b0ccf1 |
mov H2, TMP
|
|
|
b0ccf1 |
+ shl $32, H3
|
|
|
b0ccf1 |
shr $32, TMP
|
|
|
b0ccf1 |
- shl $32, H2
|
|
|
b0ccf1 |
or TMP, H3
|
|
|
b0ccf1 |
|
|
|
b0ccf1 |
mov H1, TMP
|
|
|
b0ccf1 |
+ shl $32, H2
|
|
|
b0ccf1 |
shr $32, TMP
|
|
|
b0ccf1 |
- shl $32, H1
|
|
|
b0ccf1 |
or TMP, H2
|
|
|
b0ccf1 |
|
|
|
b0ccf1 |
mov H0, TMP
|
|
|
b0ccf1 |
+ shl $32, H1
|
|
|
b0ccf1 |
shr $32, TMP
|
|
|
b0ccf1 |
- shl $32, H0
|
|
|
b0ccf1 |
or TMP, H1
|
|
|
b0ccf1 |
|
|
|
b0ccf1 |
- mov D0, TMP
|
|
|
b0ccf1 |
- shr $32, TMP
|
|
|
b0ccf1 |
- shl $32, D0
|
|
|
b0ccf1 |
- or TMP, H0
|
|
|
b0ccf1 |
+ shl $32, H0
|
|
|
b0ccf1 |
+
|
|
|
b0ccf1 |
+ C H4 H3 H2 H1 H0 0
|
|
|
b0ccf1 |
+ C - H4 H3 H2 H1 H0
|
|
|
b0ccf1 |
+ C ---------------
|
|
|
b0ccf1 |
+ C H4 H3 H2 H1 H0 TMP
|
|
|
b0ccf1 |
|
|
|
b0ccf1 |
- add D0, T0
|
|
|
b0ccf1 |
+ mov H0, TMP
|
|
|
b0ccf1 |
+ neg TMP
|
|
|
b0ccf1 |
+ sbb H1, H0
|
|
|
b0ccf1 |
+ sbb H2, H1
|
|
|
b0ccf1 |
+ sbb H3, H2
|
|
|
b0ccf1 |
+ sbb H4, H3
|
|
|
b0ccf1 |
+ sbb $0, H4
|
|
|
b0ccf1 |
+
|
|
|
b0ccf1 |
+ add TMP, T0
|
|
|
b0ccf1 |
adc H0, T1
|
|
|
b0ccf1 |
adc H1, T2
|
|
|
b0ccf1 |
adc H2, T3
|
|
|
b0ccf1 |
adc H3, T4
|
|
|
b0ccf1 |
- adc D4, T5
|
|
|
b0ccf1 |
+ adc H4, T5
|
|
|
b0ccf1 |
adc $0, C0
|
|
|
b0ccf1 |
|
|
|
b0ccf1 |
C Remains to add in C2 and C0
|
|
|
b0ccf1 |
- C C0 C0<<32 (-2^32+1)C0
|
|
|
b0ccf1 |
- C C2 C2<<32 (-2^32+1)C2
|
|
|
b0ccf1 |
- C where C2 is always positive, while C0 may be -1.
|
|
|
b0ccf1 |
+ C Set H1, H0 = (2^96 - 2^32 + 1) C0
|
|
|
b0ccf1 |
mov C0, H0
|
|
|
b0ccf1 |
mov C0, H1
|
|
|
b0ccf1 |
- mov C0, H2
|
|
|
b0ccf1 |
- sar $63, C0 C Get sign
|
|
|
b0ccf1 |
shl $32, H1
|
|
|
b0ccf1 |
- sub H1, H0 C Gives borrow iff C0 > 0
|
|
|
b0ccf1 |
+ sub H1, H0
|
|
|
b0ccf1 |
sbb $0, H1
|
|
|
b0ccf1 |
- add C0, H2
|
|
|
b0ccf1 |
|
|
|
b0ccf1 |
+ C Set H3, H2 = (2^96 - 2^32 + 1) C2
|
|
|
b0ccf1 |
+ mov C2, H2
|
|
|
b0ccf1 |
+ mov C2, H3
|
|
|
b0ccf1 |
+ shl $32, H3
|
|
|
b0ccf1 |
+ sub H3, H2
|
|
|
b0ccf1 |
+ sbb $0, H3
|
|
|
b0ccf1 |
+ add C0, H2 C No carry. Could use lea trick
|
|
|
b0ccf1 |
+
|
|
|
b0ccf1 |
+ xor C0, C0
|
|
|
b0ccf1 |
add H0, T0
|
|
|
b0ccf1 |
adc H1, T1
|
|
|
b0ccf1 |
- adc $0, H2
|
|
|
b0ccf1 |
- adc $0, C0
|
|
|
b0ccf1 |
-
|
|
|
b0ccf1 |
- C Set (H1 H0) <-- C2 << 96 - C2 << 32 + 1
|
|
|
b0ccf1 |
- mov C2, H0
|
|
|
b0ccf1 |
- mov C2, H1
|
|
|
b0ccf1 |
- shl $32, H1
|
|
|
b0ccf1 |
- sub H1, H0
|
|
|
b0ccf1 |
- sbb $0, H1
|
|
|
b0ccf1 |
-
|
|
|
b0ccf1 |
- add H2, H0
|
|
|
b0ccf1 |
- adc C0, H1
|
|
|
b0ccf1 |
- adc C2, C0
|
|
|
b0ccf1 |
- mov C0, H2
|
|
|
b0ccf1 |
- sar $63, C0
|
|
|
b0ccf1 |
- add H0, T2
|
|
|
b0ccf1 |
- adc H1, T3
|
|
|
b0ccf1 |
- adc H2, T4
|
|
|
b0ccf1 |
- adc C0, T5
|
|
|
b0ccf1 |
- sbb C0, C0
|
|
|
b0ccf1 |
+ adc H2, T2
|
|
|
b0ccf1 |
+ adc H3, T3
|
|
|
b0ccf1 |
+ adc C2, T4
|
|
|
b0ccf1 |
+ adc D5, T5 C Value delayed from initial folding
|
|
|
b0ccf1 |
+ adc $0, C0 C Use sbb and switch sign?
|
|
|
b0ccf1 |
|
|
|
b0ccf1 |
C Final unlikely carry
|
|
|
b0ccf1 |
mov C0, H0
|
|
|
b0ccf1 |
mov C0, H1
|
|
|
b0ccf1 |
- mov C0, H2
|
|
|
b0ccf1 |
- sar $63, C0
|
|
|
b0ccf1 |
shl $32, H1
|
|
|
b0ccf1 |
sub H1, H0
|
|
|
b0ccf1 |
sbb $0, H1
|
|
|
b0ccf1 |
- add C0, H2
|
|
|
b0ccf1 |
|
|
|
b0ccf1 |
pop RP
|
|
|
b0ccf1 |
|
|
|
b0ccf1 |
- sub H0, T0
|
|
|
b0ccf1 |
+ add H0, T0
|
|
|
b0ccf1 |
mov T0, (RP)
|
|
|
b0ccf1 |
- sbb H1, T1
|
|
|
b0ccf1 |
+ adc H1, T1
|
|
|
b0ccf1 |
mov T1, 8(RP)
|
|
|
b0ccf1 |
- sbb H2, T2
|
|
|
b0ccf1 |
+ adc C0, T2
|
|
|
b0ccf1 |
mov T2, 16(RP)
|
|
|
b0ccf1 |
- sbb C0, T3
|
|
|
b0ccf1 |
+ adc $0, T3
|
|
|
b0ccf1 |
mov T3, 24(RP)
|
|
|
b0ccf1 |
- sbb C0, T4
|
|
|
b0ccf1 |
+ adc $0, T4
|
|
|
b0ccf1 |
mov T4, 32(RP)
|
|
|
b0ccf1 |
- sbb C0, T5
|
|
|
b0ccf1 |
+ adc $0, T5
|
|
|
b0ccf1 |
mov T5, 40(RP)
|
|
|
b0ccf1 |
|
|
|
b0ccf1 |
pop %r15
|