b0ccf1
diff --git a/ecc-256.c b/ecc-256.c
b0ccf1
index 571cf73..07841b1 100644
b0ccf1
--- a/ecc-256.c
b0ccf1
+++ b/ecc-256.c
b0ccf1
@@ -108,7 +108,10 @@ ecc_256_modp (const struct ecc_curve *ecc, mp_limb_t *rp)
b0ccf1
       u0 -= t;
b0ccf1
       t = (u1 < cy);
b0ccf1
       u1 -= cy;
b0ccf1
-      u1 += cnd_add_n (t, rp + n - 4, ecc->p, 3);
b0ccf1
+
b0ccf1
+      cy = cnd_add_n (t, rp + n - 4, ecc->p, 2);
b0ccf1
+      u0 += cy;
b0ccf1
+      u1 += (u0 < cy);
b0ccf1
       u1 -= (-t) & 0xffffffff;
b0ccf1
     }
b0ccf1
   rp[2] = u0;
b0ccf1
@@ -195,7 +198,7 @@ ecc_256_modq (const struct ecc_curve *ecc, mp_limb_t *rp)
b0ccf1
 
b0ccf1
       /* Conditional add of p */
b0ccf1
       u1 += t;
b0ccf1
-      u2 += (t<<32) + (u0 < t);
b0ccf1
+      u2 += (t<<32) + (u1 < t);
b0ccf1
 
b0ccf1
       t = cnd_add_n (t, rp + n - 4, ecc->q, 2);
b0ccf1
       u1 += t;
b0ccf1
diff --git a/x86_64/ecc-384-modp.asm b/x86_64/ecc-384-modp.asm
b0ccf1
index 698838f..31b739e 100644
b0ccf1
--- a/x86_64/ecc-384-modp.asm
b0ccf1
+++ b/x86_64/ecc-384-modp.asm
b0ccf1
@@ -20,7 +20,7 @@ C MA 02111-1301, USA.
b0ccf1
 	.file "ecc-384-modp.asm"
b0ccf1
 
b0ccf1
 define(<RP>, <%rsi>)
b0ccf1
-define(<D4>, <%rax>)
b0ccf1
+define(<D5>, <%rax>)
b0ccf1
 define(<T0>, <%rbx>)
b0ccf1
 define(<T1>, <%rcx>)
b0ccf1
 define(<T2>, <%rdx>)
b0ccf1
@@ -35,8 +35,8 @@ define(<H4>, <%r13>)
b0ccf1
 define(<H5>, <%r14>)
b0ccf1
 define(<C2>, <%r15>)
b0ccf1
 define(<C0>, H5)	C Overlap
b0ccf1
-define(<D0>, RP)	C Overlap
b0ccf1
-define(<TMP>, H4)	C Overlap
b0ccf1
+define(<TMP>, RP)	C Overlap
b0ccf1
+
b0ccf1
 
b0ccf1
 PROLOGUE(nettle_ecc_384_modp)
b0ccf1
 	W64_ENTRY(2, 0)
b0ccf1
@@ -48,34 +48,38 @@ PROLOGUE(nettle_ecc_384_modp)
b0ccf1
 	push	%r14
b0ccf1
 	push	%r15
b0ccf1
 
b0ccf1
-	C First get top 2 limbs, which need folding twice
b0ccf1
+	C First get top 2 limbs, which need folding twice.
b0ccf1
+	C B^10 = B^6 + B^4 + 2^32 (B-1)B^4.
b0ccf1
+	C We handle the terms as follows:
b0ccf1
 	C
b0ccf1
-	C   H5 H4
b0ccf1
-	C     -H5
b0ccf1
-	C  ------
b0ccf1
-	C   H0 D4
b0ccf1
+	C B^6: Folded immediately.
b0ccf1
 	C
b0ccf1
-	C Then shift right, (H1,H0,D4)  <--  (H0,D4) << 32
b0ccf1
-	C and add
b0ccf1
+	C B^4: Delayed, added in during the next folding.
b0ccf1
 	C
b0ccf1
-	C     H5 H4
b0ccf1
-	C     H1 H0
b0ccf1
-	C ----------
b0ccf1
-	C  C2 H1 H0
b0ccf1
-
b0ccf1
-	mov	80(RP), D4
b0ccf1
-	mov	88(RP), H0
b0ccf1
-	mov	D4, H4
b0ccf1
-	mov	H0, H5
b0ccf1
-	sub	H0, D4
b0ccf1
-	sbb	$0, H0
b0ccf1
-
b0ccf1
-	mov	D4, T2
b0ccf1
-	mov	H0, H1
b0ccf1
-	shl	$32, H0
b0ccf1
-	shr	$32, T2
b0ccf1
+	C 2^32(B-1) B^4: Low half limb delayed until the next
b0ccf1
+	C folding. Top 1.5 limbs subtracted and shifted now, resulting
b0ccf1
+	C in 2.5 limbs. The low limb saved in D5, high 1.5 limbs added
b0ccf1
+	C in.
b0ccf1
+
b0ccf1
+	mov	80(RP), H4
b0ccf1
+	mov	88(RP), H5
b0ccf1
+	C Shift right 32 bits, into H1, H0
b0ccf1
+	mov	H4, H0
b0ccf1
+	mov	H5, H1
b0ccf1
+	mov	H5, D5
b0ccf1
 	shr	$32, H1
b0ccf1
-	or	T2, H0
b0ccf1
+	shl	$32, D5
b0ccf1
+	shr	$32, H0
b0ccf1
+	or	D5, H0
b0ccf1
+
b0ccf1
+	C	H1 H0
b0ccf1
+	C       -  H1 H0
b0ccf1
+	C       --------
b0ccf1
+	C       H1 H0 D5
b0ccf1
+	mov	H0, D5
b0ccf1
+	neg	D5
b0ccf1
+	sbb	H1, H0
b0ccf1
+	sbb	$0, H1
b0ccf1
 
b0ccf1
 	xor	C2, C2
b0ccf1
 	add	H4, H0
b0ccf1
@@ -114,118 +118,95 @@ PROLOGUE(nettle_ecc_384_modp)
b0ccf1
 	adc	H3, T5
b0ccf1
 	adc	$0, C0
b0ccf1
 
b0ccf1
-	C   H3 H2 H1 H0  0
b0ccf1
-	C - H4 H3 H2 H1 H0
b0ccf1
-	C  ---------------
b0ccf1
-	C   H3 H2 H1 H0 D0
b0ccf1
-
b0ccf1
-	mov	XREG(D4), XREG(D4)
b0ccf1
-	mov	H0, D0
b0ccf1
-	neg	D0
b0ccf1
-	sbb	H1, H0
b0ccf1
-	sbb	H2, H1
b0ccf1
-	sbb	H3, H2
b0ccf1
-	sbb	H4, H3
b0ccf1
-	sbb	$0, D4
b0ccf1
-
b0ccf1
-	C Shift right. High bits are sign, to be added to C0.
b0ccf1
-	mov	D4, TMP
b0ccf1
-	sar	$32, TMP
b0ccf1
-	shl	$32, D4
b0ccf1
-	add	TMP, C0
b0ccf1
-
b0ccf1
+	C Shift left, including low half of H4
b0ccf1
 	mov	H3, TMP
b0ccf1
+	shl	$32, H4
b0ccf1
 	shr	$32, TMP
b0ccf1
-	shl	$32, H3
b0ccf1
-	or	TMP, D4
b0ccf1
+	or	TMP, H4
b0ccf1
 
b0ccf1
 	mov	H2, TMP
b0ccf1
+	shl	$32, H3
b0ccf1
 	shr	$32, TMP
b0ccf1
-	shl	$32, H2
b0ccf1
 	or	TMP, H3
b0ccf1
 
b0ccf1
 	mov	H1, TMP
b0ccf1
+	shl	$32, H2
b0ccf1
 	shr	$32, TMP
b0ccf1
-	shl	$32, H1
b0ccf1
 	or	TMP, H2
b0ccf1
 
b0ccf1
 	mov	H0, TMP
b0ccf1
+	shl	$32, H1
b0ccf1
 	shr	$32, TMP
b0ccf1
-	shl	$32, H0
b0ccf1
 	or	TMP, H1
b0ccf1
 
b0ccf1
-	mov	D0, TMP
b0ccf1
-	shr	$32, TMP
b0ccf1
-	shl	$32, D0
b0ccf1
-	or	TMP, H0
b0ccf1
+	shl	$32, H0
b0ccf1
+
b0ccf1
+	C   H4 H3 H2 H1 H0  0
b0ccf1
+	C  -   H4 H3 H2 H1 H0
b0ccf1
+	C  ---------------
b0ccf1
+	C   H4 H3 H2 H1 H0 TMP
b0ccf1
 
b0ccf1
-	add	D0, T0
b0ccf1
+	mov	H0, TMP
b0ccf1
+	neg	TMP
b0ccf1
+	sbb	H1, H0
b0ccf1
+	sbb	H2, H1
b0ccf1
+	sbb	H3, H2
b0ccf1
+	sbb	H4, H3
b0ccf1
+	sbb	$0, H4
b0ccf1
+
b0ccf1
+	add	TMP, T0
b0ccf1
 	adc	H0, T1
b0ccf1
 	adc	H1, T2
b0ccf1
 	adc	H2, T3
b0ccf1
 	adc	H3, T4
b0ccf1
-	adc	D4, T5
b0ccf1
+	adc	H4, T5
b0ccf1
 	adc	$0, C0
b0ccf1
 
b0ccf1
 	C Remains to add in C2 and C0
b0ccf1
-	C                         C0  C0<<32  (-2^32+1)C0
b0ccf1
-	C    C2  C2<<32  (-2^32+1)C2
b0ccf1
-	C where C2 is always positive, while C0 may be -1.
b0ccf1
+	C Set H1, H0 = (2^96 - 2^32 + 1) C0
b0ccf1
 	mov	C0, H0
b0ccf1
 	mov	C0, H1
b0ccf1
-	mov	C0, H2
b0ccf1
-	sar	$63, C0		C Get sign
b0ccf1
 	shl	$32, H1
b0ccf1
-	sub	H1, H0		C Gives borrow iff C0 > 0
b0ccf1
+	sub	H1, H0
b0ccf1
 	sbb	$0, H1
b0ccf1
-	add	C0, H2
b0ccf1
 
b0ccf1
+	C Set H3, H2 = (2^96 - 2^32 + 1) C2
b0ccf1
+	mov	C2, H2
b0ccf1
+	mov	C2, H3
b0ccf1
+	shl	$32, H3
b0ccf1
+	sub	H3, H2
b0ccf1
+	sbb	$0, H3
b0ccf1
+	add	C0, H2		C No carry. Could use lea trick
b0ccf1
+
b0ccf1
+	xor	C0, C0
b0ccf1
 	add	H0, T0
b0ccf1
 	adc	H1, T1
b0ccf1
-	adc	$0, H2
b0ccf1
-	adc	$0, C0
b0ccf1
-
b0ccf1
-	C Set (H1 H0)  <-- C2 << 96 - C2 << 32 + 1
b0ccf1
-	mov	C2, H0
b0ccf1
-	mov	C2, H1
b0ccf1
-	shl	$32, H1
b0ccf1
-	sub	H1, H0
b0ccf1
-	sbb	$0, H1
b0ccf1
-
b0ccf1
-	add	H2, H0
b0ccf1
-	adc	C0, H1
b0ccf1
-	adc	C2, C0
b0ccf1
-	mov	C0, H2
b0ccf1
-	sar	$63, C0
b0ccf1
-	add	H0, T2
b0ccf1
-	adc	H1, T3
b0ccf1
-	adc	H2, T4
b0ccf1
-	adc	C0, T5
b0ccf1
-	sbb	C0, C0
b0ccf1
+	adc	H2, T2
b0ccf1
+	adc	H3, T3
b0ccf1
+	adc	C2, T4
b0ccf1
+	adc	D5, T5		C Value delayed from initial folding
b0ccf1
+	adc	$0, C0		C Use sbb and switch sign?
b0ccf1
 
b0ccf1
 	C Final unlikely carry
b0ccf1
 	mov	C0, H0
b0ccf1
 	mov	C0, H1
b0ccf1
-	mov	C0, H2
b0ccf1
-	sar	$63, C0
b0ccf1
 	shl	$32, H1
b0ccf1
 	sub	H1, H0
b0ccf1
 	sbb	$0, H1
b0ccf1
-	add	C0, H2
b0ccf1
 
b0ccf1
 	pop	RP
b0ccf1
 
b0ccf1
-	sub	H0, T0
b0ccf1
+	add	H0, T0
b0ccf1
 	mov	T0, (RP)
b0ccf1
-	sbb	H1, T1
b0ccf1
+	adc	H1, T1
b0ccf1
 	mov	T1, 8(RP)
b0ccf1
-	sbb	H2, T2
b0ccf1
+	adc	C0, T2
b0ccf1
 	mov	T2, 16(RP)
b0ccf1
-	sbb	C0, T3
b0ccf1
+	adc	$0, T3
b0ccf1
 	mov	T3, 24(RP)
b0ccf1
-	sbb	C0, T4
b0ccf1
+	adc	$0, T4
b0ccf1
 	mov	T4, 32(RP)
b0ccf1
-	sbb	C0, T5
b0ccf1
+	adc	$0, T5
b0ccf1
 	mov	T5, 40(RP)
b0ccf1
 
b0ccf1
 	pop	%r15