c57f53
diff --git a/ecc-256.c b/ecc-256.c
c57f53
index 571cf73..07841b1 100644
c57f53
--- a/ecc-256.c
c57f53
+++ b/ecc-256.c
c57f53
@@ -108,7 +108,10 @@ ecc_256_modp (const struct ecc_curve *ecc, mp_limb_t *rp)
c57f53
       u0 -= t;
c57f53
       t = (u1 < cy);
c57f53
       u1 -= cy;
c57f53
-      u1 += cnd_add_n (t, rp + n - 4, ecc->p, 3);
c57f53
+
c57f53
+      cy = cnd_add_n (t, rp + n - 4, ecc->p, 2);
c57f53
+      u0 += cy;
c57f53
+      u1 += (u0 < cy);
c57f53
       u1 -= (-t) & 0xffffffff;
c57f53
     }
c57f53
   rp[2] = u0;
c57f53
@@ -195,7 +198,7 @@ ecc_256_modq (const struct ecc_curve *ecc, mp_limb_t *rp)
c57f53
 
c57f53
       /* Conditional add of p */
c57f53
       u1 += t;
c57f53
-      u2 += (t<<32) + (u0 < t);
c57f53
+      u2 += (t<<32) + (u1 < t);
c57f53
 
c57f53
       t = cnd_add_n (t, rp + n - 4, ecc->q, 2);
c57f53
       u1 += t;
c57f53
diff --git a/x86_64/ecc-384-modp.asm b/x86_64/ecc-384-modp.asm
c57f53
index 698838f..31b739e 100644
c57f53
--- a/x86_64/ecc-384-modp.asm
c57f53
+++ b/x86_64/ecc-384-modp.asm
c57f53
@@ -20,7 +20,7 @@ C MA 02111-1301, USA.
c57f53
 	.file "ecc-384-modp.asm"
c57f53
 
c57f53
 define(<RP>, <%rsi>)
c57f53
-define(<D4>, <%rax>)
c57f53
+define(<D5>, <%rax>)
c57f53
 define(<T0>, <%rbx>)
c57f53
 define(<T1>, <%rcx>)
c57f53
 define(<T2>, <%rdx>)
c57f53
@@ -35,8 +35,8 @@ define(<H4>, <%r13>)
c57f53
 define(<H5>, <%r14>)
c57f53
 define(<C2>, <%r15>)
c57f53
 define(<C0>, H5)	C Overlap
c57f53
-define(<D0>, RP)	C Overlap
c57f53
-define(<TMP>, H4)	C Overlap
c57f53
+define(<TMP>, RP)	C Overlap
c57f53
+
c57f53
 
c57f53
 PROLOGUE(nettle_ecc_384_modp)
c57f53
 	W64_ENTRY(2, 0)
c57f53
@@ -48,34 +48,38 @@ PROLOGUE(nettle_ecc_384_modp)
c57f53
 	push	%r14
c57f53
 	push	%r15
c57f53
 
c57f53
-	C First get top 2 limbs, which need folding twice
c57f53
+	C First get top 2 limbs, which need folding twice.
c57f53
+	C B^10 = B^6 + B^4 + 2^32 (B-1)B^4.
c57f53
+	C We handle the terms as follows:
c57f53
 	C
c57f53
-	C   H5 H4
c57f53
-	C     -H5
c57f53
-	C  ------
c57f53
-	C   H0 D4
c57f53
+	C B^6: Folded immediately.
c57f53
 	C
c57f53
-	C Then shift right, (H1,H0,D4)  <--  (H0,D4) << 32
c57f53
-	C and add
c57f53
+	C B^4: Delayed, added in the next folding.
c57f53
 	C
c57f53
-	C     H5 H4
c57f53
-	C     H1 H0
c57f53
-	C ----------
c57f53
-	C  C2 H1 H0
c57f53
-
c57f53
-	mov	80(RP), D4
c57f53
-	mov	88(RP), H0
c57f53
-	mov	D4, H4
c57f53
-	mov	H0, H5
c57f53
-	sub	H0, D4
c57f53
-	sbb	$0, H0
c57f53
-
c57f53
-	mov	D4, T2
c57f53
-	mov	H0, H1
c57f53
-	shl	$32, H0
c57f53
-	shr	$32, T2
c57f53
+	C 2^32(B-1) B^4: Low half limb delayed until the next
c57f53
+	C folding. Top 1.5 limbs subtracted and shifted now, resulting
c57f53
+	C in 2.5 limbs. The low limb saved in D5, high 1.5 limbs added
c57f53
+	C in.
c57f53
+
c57f53
+	mov	80(RP), H4
c57f53
+	mov	88(RP), H5
c57f53
+	C Shift right 32 bits, into H1, H0
c57f53
+	mov	H4, H0
c57f53
+	mov	H5, H1
c57f53
+	mov	H5, D5
c57f53
 	shr	$32, H1
c57f53
-	or	T2, H0
c57f53
+	shl	$32, D5
c57f53
+	shr	$32, H0
c57f53
+	or	D5, H0
c57f53
+
c57f53
+	C	H1 H0
c57f53
+	C       -  H1 H0
c57f53
+	C       --------
c57f53
+	C       H1 H0 D5
c57f53
+	mov	H0, D5
c57f53
+	neg	D5
c57f53
+	sbb	H1, H0
c57f53
+	sbb	$0, H1
c57f53
 
c57f53
 	xor	C2, C2
c57f53
 	add	H4, H0
c57f53
@@ -114,118 +118,95 @@ PROLOGUE(nettle_ecc_384_modp)
c57f53
 	adc	H3, T5
c57f53
 	adc	$0, C0
c57f53
 
c57f53
-	C   H3 H2 H1 H0  0
c57f53
-	C - H4 H3 H2 H1 H0
c57f53
-	C  ---------------
c57f53
-	C   H3 H2 H1 H0 D0
c57f53
-
c57f53
-	mov	XREG(D4), XREG(D4)
c57f53
-	mov	H0, D0
c57f53
-	neg	D0
c57f53
-	sbb	H1, H0
c57f53
-	sbb	H2, H1
c57f53
-	sbb	H3, H2
c57f53
-	sbb	H4, H3
c57f53
-	sbb	$0, D4
c57f53
-
c57f53
-	C Shift right. High bits are sign, to be added to C0.
c57f53
-	mov	D4, TMP
c57f53
-	sar	$32, TMP
c57f53
-	shl	$32, D4
c57f53
-	add	TMP, C0
c57f53
-
c57f53
+	C Shift left, including low half of H4
c57f53
 	mov	H3, TMP
c57f53
+	shl	$32, H4
c57f53
 	shr	$32, TMP
c57f53
-	shl	$32, H3
c57f53
-	or	TMP, D4
c57f53
+	or	TMP, H4
c57f53
 
c57f53
 	mov	H2, TMP
c57f53
+	shl	$32, H3
c57f53
 	shr	$32, TMP
c57f53
-	shl	$32, H2
c57f53
 	or	TMP, H3
c57f53
 
c57f53
 	mov	H1, TMP
c57f53
+	shl	$32, H2
c57f53
 	shr	$32, TMP
c57f53
-	shl	$32, H1
c57f53
 	or	TMP, H2
c57f53
 
c57f53
 	mov	H0, TMP
c57f53
+	shl	$32, H1
c57f53
 	shr	$32, TMP
c57f53
-	shl	$32, H0
c57f53
 	or	TMP, H1
c57f53
 
c57f53
-	mov	D0, TMP
c57f53
-	shr	$32, TMP
c57f53
-	shl	$32, D0
c57f53
-	or	TMP, H0
c57f53
+	shl	$32, H0
c57f53
+
c57f53
+	C   H4 H3 H2 H1 H0  0
c57f53
+	C  -   H4 H3 H2 H1 H0
c57f53
+	C  ---------------
c57f53
+	C   H4 H3 H2 H1 H0 TMP
c57f53
 
c57f53
-	add	D0, T0
c57f53
+	mov	H0, TMP
c57f53
+	neg	TMP
c57f53
+	sbb	H1, H0
c57f53
+	sbb	H2, H1
c57f53
+	sbb	H3, H2
c57f53
+	sbb	H4, H3
c57f53
+	sbb	$0, H4
c57f53
+
c57f53
+	add	TMP, T0
c57f53
 	adc	H0, T1
c57f53
 	adc	H1, T2
c57f53
 	adc	H2, T3
c57f53
 	adc	H3, T4
c57f53
-	adc	D4, T5
c57f53
+	adc	H4, T5
c57f53
 	adc	$0, C0
c57f53
 
c57f53
 	C Remains to add in C2 and C0
c57f53
-	C                         C0  C0<<32  (-2^32+1)C0
c57f53
-	C    C2  C2<<32  (-2^32+1)C2
c57f53
-	C where C2 is always positive, while C0 may be -1.
c57f53
+	C Set H1, H0 = (2^96 - 2^32 + 1) C0
c57f53
 	mov	C0, H0
c57f53
 	mov	C0, H1
c57f53
-	mov	C0, H2
c57f53
-	sar	$63, C0		C Get sign
c57f53
 	shl	$32, H1
c57f53
-	sub	H1, H0		C Gives borrow iff C0 > 0
c57f53
+	sub	H1, H0
c57f53
 	sbb	$0, H1
c57f53
-	add	C0, H2
c57f53
 
c57f53
+	C Set H3, H2 = (2^96 - 2^32 + 1) C2
c57f53
+	mov	C2, H2
c57f53
+	mov	C2, H3
c57f53
+	shl	$32, H3
c57f53
+	sub	H3, H2
c57f53
+	sbb	$0, H3
c57f53
+	add	C0, H2		C No carry. Could use lea trick
c57f53
+
c57f53
+	xor	C0, C0
c57f53
 	add	H0, T0
c57f53
 	adc	H1, T1
c57f53
-	adc	$0, H2
c57f53
-	adc	$0, C0
c57f53
-
c57f53
-	C Set (H1 H0)  <-- C2 << 96 - C2 << 32 + 1
c57f53
-	mov	C2, H0
c57f53
-	mov	C2, H1
c57f53
-	shl	$32, H1
c57f53
-	sub	H1, H0
c57f53
-	sbb	$0, H1
c57f53
-
c57f53
-	add	H2, H0
c57f53
-	adc	C0, H1
c57f53
-	adc	C2, C0
c57f53
-	mov	C0, H2
c57f53
-	sar	$63, C0
c57f53
-	add	H0, T2
c57f53
-	adc	H1, T3
c57f53
-	adc	H2, T4
c57f53
-	adc	C0, T5
c57f53
-	sbb	C0, C0
c57f53
+	adc	H2, T2
c57f53
+	adc	H3, T3
c57f53
+	adc	C2, T4
c57f53
+	adc	D5, T5		C Value delayed from initial folding
c57f53
+	adc	$0, C0		C Use sbb and switch sign?
c57f53
 
c57f53
 	C Final unlikely carry
c57f53
 	mov	C0, H0
c57f53
 	mov	C0, H1
c57f53
-	mov	C0, H2
c57f53
-	sar	$63, C0
c57f53
 	shl	$32, H1
c57f53
 	sub	H1, H0
c57f53
 	sbb	$0, H1
c57f53
-	add	C0, H2
c57f53
 
c57f53
 	pop	RP
c57f53
 
c57f53
-	sub	H0, T0
c57f53
+	add	H0, T0
c57f53
 	mov	T0, (RP)
c57f53
-	sbb	H1, T1
c57f53
+	adc	H1, T1
c57f53
 	mov	T1, 8(RP)
c57f53
-	sbb	H2, T2
c57f53
+	adc	C0, T2
c57f53
 	mov	T2, 16(RP)
c57f53
-	sbb	C0, T3
c57f53
+	adc	$0, T3
c57f53
 	mov	T3, 24(RP)
c57f53
-	sbb	C0, T4
c57f53
+	adc	$0, T4
c57f53
 	mov	T4, 32(RP)
c57f53
-	sbb	C0, T5
c57f53
+	adc	$0, T5
c57f53
 	mov	T5, 40(RP)
c57f53
 
c57f53
 	pop	%r15