|
|
a42b25 |
# HG changeset patch
|
|
|
a42b25 |
# User mdoerr
|
|
|
a42b25 |
# Date 1507750779 -3600
|
|
|
a42b25 |
# Wed Oct 11 20:39:39 2017 +0100
|
|
|
a42b25 |
# Node ID 92f0dbe76a13992cc27188e0f68e4b1771c7004a
|
|
|
a42b25 |
# Parent 542c122b1d7d30c29189565248074aa28f21ae58
|
|
|
a42b25 |
8145913, PR3466, RH1498309: PPC64: add Montgomery multiply intrinsic
|
|
|
a42b25 |
Reviewed-by: aph, goetz
|
|
|
a42b25 |
|
|
|
a42b25 |
diff --git a/src/cpu/ppc/vm/assembler_ppc.hpp b/src/cpu/ppc/vm/assembler_ppc.hpp
|
|
|
a42b25 |
--- openjdk/hotspot/src/cpu/ppc/vm/assembler_ppc.hpp
|
|
|
a42b25 |
+++ openjdk/hotspot/src/cpu/ppc/vm/assembler_ppc.hpp
|
|
|
a42b25 |
@@ -1179,6 +1179,8 @@
|
|
|
a42b25 |
inline void mullw_( Register d, Register a, Register b);
|
|
|
a42b25 |
inline void mulhw( Register d, Register a, Register b);
|
|
|
a42b25 |
inline void mulhw_( Register d, Register a, Register b);
|
|
|
a42b25 |
+ inline void mulhwu( Register d, Register a, Register b);
|
|
|
a42b25 |
+ inline void mulhwu_(Register d, Register a, Register b);
|
|
|
a42b25 |
inline void mulhd( Register d, Register a, Register b);
|
|
|
a42b25 |
inline void mulhd_( Register d, Register a, Register b);
|
|
|
a42b25 |
inline void mulhdu( Register d, Register a, Register b);
|
|
|
a42b25 |
diff --git a/src/cpu/ppc/vm/assembler_ppc.inline.hpp b/src/cpu/ppc/vm/assembler_ppc.inline.hpp
|
|
|
a42b25 |
--- openjdk/hotspot/src/cpu/ppc/vm/assembler_ppc.inline.hpp
|
|
|
a42b25 |
+++ openjdk/hotspot/src/cpu/ppc/vm/assembler_ppc.inline.hpp
|
|
|
a42b25 |
@@ -109,6 +109,8 @@
|
|
|
a42b25 |
inline void Assembler::mullw_( Register d, Register a, Register b) { emit_int32(MULLW_OPCODE | rt(d) | ra(a) | rb(b) | oe(0) | rc(1)); }
|
|
|
a42b25 |
inline void Assembler::mulhw( Register d, Register a, Register b) { emit_int32(MULHW_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); }
|
|
|
a42b25 |
inline void Assembler::mulhw_( Register d, Register a, Register b) { emit_int32(MULHW_OPCODE | rt(d) | ra(a) | rb(b) | rc(1)); }
|
|
|
a42b25 |
+inline void Assembler::mulhwu( Register d, Register a, Register b) { emit_int32(MULHWU_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); }
|
|
|
a42b25 |
+inline void Assembler::mulhwu_(Register d, Register a, Register b) { emit_int32(MULHWU_OPCODE | rt(d) | ra(a) | rb(b) | rc(1)); }
|
|
|
a42b25 |
inline void Assembler::mulhd( Register d, Register a, Register b) { emit_int32(MULHD_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); }
|
|
|
a42b25 |
inline void Assembler::mulhd_( Register d, Register a, Register b) { emit_int32(MULHD_OPCODE | rt(d) | ra(a) | rb(b) | rc(1)); }
|
|
|
a42b25 |
inline void Assembler::mulhdu( Register d, Register a, Register b) { emit_int32(MULHDU_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); }
|
|
|
a42b25 |
diff --git a/src/cpu/ppc/vm/c2_init_ppc.cpp b/src/cpu/ppc/vm/c2_init_ppc.cpp
|
|
|
a42b25 |
--- openjdk/hotspot/src/cpu/ppc/vm/c2_init_ppc.cpp
|
|
|
a42b25 |
+++ openjdk/hotspot/src/cpu/ppc/vm/c2_init_ppc.cpp
|
|
|
a42b25 |
@@ -45,4 +45,10 @@
|
|
|
a42b25 |
FLAG_SET_ERGO(bool, InsertEndGroupPPC64, true);
|
|
|
a42b25 |
}
|
|
|
a42b25 |
}
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ if (OptimizeFill) {
|
|
|
a42b25 |
+ warning("OptimizeFill is not supported on this CPU.");
|
|
|
a42b25 |
+ FLAG_SET_DEFAULT(OptimizeFill, false);
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+
|
|
|
a42b25 |
}
|
|
|
a42b25 |
diff --git a/src/cpu/ppc/vm/sharedRuntime_ppc.cpp b/src/cpu/ppc/vm/sharedRuntime_ppc.cpp
|
|
|
a42b25 |
--- openjdk/hotspot/src/cpu/ppc/vm/sharedRuntime_ppc.cpp
|
|
|
a42b25 |
+++ openjdk/hotspot/src/cpu/ppc/vm/sharedRuntime_ppc.cpp
|
|
|
a42b25 |
@@ -42,6 +42,8 @@
|
|
|
a42b25 |
#include "opto/runtime.hpp"
|
|
|
a42b25 |
#endif
|
|
|
a42b25 |
|
|
|
a42b25 |
+#include <alloca.h>
|
|
|
a42b25 |
+
|
|
|
a42b25 |
#define __ masm->
|
|
|
a42b25 |
|
|
|
a42b25 |
#ifdef PRODUCT
|
|
|
a42b25 |
@@ -3269,3 +3271,245 @@
|
|
|
a42b25 |
return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_bytes/wordSize,
|
|
|
a42b25 |
oop_maps, true);
|
|
|
a42b25 |
}
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+//------------------------------Montgomery multiplication------------------------
|
|
|
a42b25 |
+//
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+// Subtract 0:b from carry:a. Return carry.
|
|
|
a42b25 |
+static unsigned long
|
|
|
a42b25 |
+sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
|
|
|
a42b25 |
+ long i = 0;
|
|
|
a42b25 |
+ unsigned long tmp, tmp2;
|
|
|
a42b25 |
+ __asm__ __volatile__ (
|
|
|
a42b25 |
+ "subfc %[tmp], %[tmp], %[tmp] \n" // pre-set CA
|
|
|
a42b25 |
+ "mtctr %[len] \n"
|
|
|
a42b25 |
+ "0: \n"
|
|
|
a42b25 |
+ "ldx %[tmp], %[i], %[a] \n"
|
|
|
a42b25 |
+ "ldx %[tmp2], %[i], %[b] \n"
|
|
|
a42b25 |
+ "subfe %[tmp], %[tmp2], %[tmp] \n" // subtract extended
|
|
|
a42b25 |
+ "stdx %[tmp], %[i], %[a] \n"
|
|
|
a42b25 |
+ "addi %[i], %[i], 8 \n"
|
|
|
a42b25 |
+ "bdnz 0b \n"
|
|
|
a42b25 |
+ "addme %[tmp], %[carry] \n" // carry + CA - 1
|
|
|
a42b25 |
+ : [i]"+b"(i), [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2)
|
|
|
a42b25 |
+ : [a]"r"(a), [b]"r"(b), [carry]"r"(carry), [len]"r"(len)
|
|
|
a42b25 |
+ : "ctr", "xer", "memory"
|
|
|
a42b25 |
+ );
|
|
|
a42b25 |
+ return tmp;
|
|
|
a42b25 |
+}
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+// Multiply (unsigned) Long A by Long B, accumulating the double-
|
|
|
a42b25 |
+// length result into the accumulator formed of T0, T1, and T2.
|
|
|
a42b25 |
+inline void MACC(unsigned long A, unsigned long B, unsigned long &T0, unsigned long &T1, unsigned long &T2) {
|
|
|
a42b25 |
+ unsigned long hi, lo;
|
|
|
a42b25 |
+ __asm__ __volatile__ (
|
|
|
a42b25 |
+ "mulld %[lo], %[A], %[B] \n"
|
|
|
a42b25 |
+ "mulhdu %[hi], %[A], %[B] \n"
|
|
|
a42b25 |
+ "addc %[T0], %[T0], %[lo] \n"
|
|
|
a42b25 |
+ "adde %[T1], %[T1], %[hi] \n"
|
|
|
a42b25 |
+ "addze %[T2], %[T2] \n"
|
|
|
a42b25 |
+ : [hi]"=&r"(hi), [lo]"=&r"(lo), [T0]"+r"(T0), [T1]"+r"(T1), [T2]"+r"(T2)
|
|
|
a42b25 |
+ : [A]"r"(A), [B]"r"(B)
|
|
|
a42b25 |
+ : "xer"
|
|
|
a42b25 |
+ );
|
|
|
a42b25 |
+}
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+// As above, but add twice the double-length result into the
|
|
|
a42b25 |
+// accumulator.
|
|
|
a42b25 |
+inline void MACC2(unsigned long A, unsigned long B, unsigned long &T0, unsigned long &T1, unsigned long &T2) {
|
|
|
a42b25 |
+ unsigned long hi, lo;
|
|
|
a42b25 |
+ __asm__ __volatile__ (
|
|
|
a42b25 |
+ "mulld %[lo], %[A], %[B] \n"
|
|
|
a42b25 |
+ "mulhdu %[hi], %[A], %[B] \n"
|
|
|
a42b25 |
+ "addc %[T0], %[T0], %[lo] \n"
|
|
|
a42b25 |
+ "adde %[T1], %[T1], %[hi] \n"
|
|
|
a42b25 |
+ "addze %[T2], %[T2] \n"
|
|
|
a42b25 |
+ "addc %[T0], %[T0], %[lo] \n"
|
|
|
a42b25 |
+ "adde %[T1], %[T1], %[hi] \n"
|
|
|
a42b25 |
+ "addze %[T2], %[T2] \n"
|
|
|
a42b25 |
+ : [hi]"=&r"(hi), [lo]"=&r"(lo), [T0]"+r"(T0), [T1]"+r"(T1), [T2]"+r"(T2)
|
|
|
a42b25 |
+ : [A]"r"(A), [B]"r"(B)
|
|
|
a42b25 |
+ : "xer"
|
|
|
a42b25 |
+ );
|
|
|
a42b25 |
+}
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+// Fast Montgomery multiplication. The derivation of the algorithm is
|
|
|
a42b25 |
+// in "A Cryptographic Library for the Motorola DSP56000,
|
|
|
a42b25 |
+// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237".
|
|
|
a42b25 |
+static void
|
|
|
a42b25 |
+montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
|
|
|
a42b25 |
+ unsigned long m[], unsigned long inv, int len) {
|
|
|
a42b25 |
+ unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
|
|
|
a42b25 |
+ int i;
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ for (i = 0; i < len; i++) {
|
|
|
a42b25 |
+ int j;
|
|
|
a42b25 |
+ for (j = 0; j < i; j++) {
|
|
|
a42b25 |
+ MACC(a[j], b[i-j], t0, t1, t2);
|
|
|
a42b25 |
+ MACC(m[j], n[i-j], t0, t1, t2);
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+ MACC(a[i], b[0], t0, t1, t2);
|
|
|
a42b25 |
+ m[i] = t0 * inv;
|
|
|
a42b25 |
+ MACC(m[i], n[0], t0, t1, t2);
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ assert(t0 == 0, "broken Montgomery multiply");
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ t0 = t1; t1 = t2; t2 = 0;
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ for (i = len; i < 2*len; i++) {
|
|
|
a42b25 |
+ int j;
|
|
|
a42b25 |
+ for (j = i-len+1; j < len; j++) {
|
|
|
a42b25 |
+ MACC(a[j], b[i-j], t0, t1, t2);
|
|
|
a42b25 |
+ MACC(m[j], n[i-j], t0, t1, t2);
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+ m[i-len] = t0;
|
|
|
a42b25 |
+ t0 = t1; t1 = t2; t2 = 0;
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ while (t0) {
|
|
|
a42b25 |
+ t0 = sub(m, n, t0, len);
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+}
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+// Fast Montgomery squaring. This uses asymptotically 25% fewer
|
|
|
a42b25 |
+// multiplies so it should be up to 25% faster than Montgomery
|
|
|
a42b25 |
+// multiplication. However, its loop control is more complex and it
|
|
|
a42b25 |
+// may actually run slower on some machines.
|
|
|
a42b25 |
+static void
|
|
|
a42b25 |
+montgomery_square(unsigned long a[], unsigned long n[],
|
|
|
a42b25 |
+ unsigned long m[], unsigned long inv, int len) {
|
|
|
a42b25 |
+ unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
|
|
|
a42b25 |
+ int i;
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ for (i = 0; i < len; i++) {
|
|
|
a42b25 |
+ int j;
|
|
|
a42b25 |
+ int end = (i+1)/2;
|
|
|
a42b25 |
+ for (j = 0; j < end; j++) {
|
|
|
a42b25 |
+ MACC2(a[j], a[i-j], t0, t1, t2);
|
|
|
a42b25 |
+ MACC(m[j], n[i-j], t0, t1, t2);
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+ if ((i & 1) == 0) {
|
|
|
a42b25 |
+ MACC(a[j], a[j], t0, t1, t2);
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+ for (; j < i; j++) {
|
|
|
a42b25 |
+ MACC(m[j], n[i-j], t0, t1, t2);
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+ m[i] = t0 * inv;
|
|
|
a42b25 |
+ MACC(m[i], n[0], t0, t1, t2);
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ assert(t0 == 0, "broken Montgomery square");
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ t0 = t1; t1 = t2; t2 = 0;
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ for (i = len; i < 2*len; i++) {
|
|
|
a42b25 |
+ int start = i-len+1;
|
|
|
a42b25 |
+ int end = start + (len - start)/2;
|
|
|
a42b25 |
+ int j;
|
|
|
a42b25 |
+ for (j = start; j < end; j++) {
|
|
|
a42b25 |
+ MACC2(a[j], a[i-j], t0, t1, t2);
|
|
|
a42b25 |
+ MACC(m[j], n[i-j], t0, t1, t2);
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+ if ((i & 1) == 0) {
|
|
|
a42b25 |
+ MACC(a[j], a[j], t0, t1, t2);
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+ for (; j < len; j++) {
|
|
|
a42b25 |
+ MACC(m[j], n[i-j], t0, t1, t2);
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+ m[i-len] = t0;
|
|
|
a42b25 |
+ t0 = t1; t1 = t2; t2 = 0;
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ while (t0) {
|
|
|
a42b25 |
+ t0 = sub(m, n, t0, len);
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+}
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+// The threshold at which squaring is advantageous was determined
|
|
|
a42b25 |
+// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
|
|
|
a42b25 |
+// Doesn't seem to be relevant for Power8 so we use the same value.
|
|
|
a42b25 |
+#define MONTGOMERY_SQUARING_THRESHOLD 64
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+// Copy len longwords from s to d, word-swapping as we go. The
|
|
|
a42b25 |
+// destination array is reversed.
|
|
|
a42b25 |
+static void reverse_words(unsigned long *s, unsigned long *d, int len) {
|
|
|
a42b25 |
+ d += len;
|
|
|
a42b25 |
+ while(len-- > 0) {
|
|
|
a42b25 |
+ d--;
|
|
|
a42b25 |
+ unsigned long s_val = *s;
|
|
|
a42b25 |
+ // Swap words in a longword on little endian machines.
|
|
|
a42b25 |
+#ifdef VM_LITTLE_ENDIAN
|
|
|
a42b25 |
+ s_val = (s_val << 32) | (s_val >> 32);
|
|
|
a42b25 |
+#endif
|
|
|
a42b25 |
+ *d = s_val;
|
|
|
a42b25 |
+ s++;
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+}
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
|
|
|
a42b25 |
+ jint len, jlong inv,
|
|
|
a42b25 |
+ jint *m_ints) {
|
|
|
a42b25 |
+ assert(len % 2 == 0, "array length in montgomery_multiply must be even");
|
|
|
a42b25 |
+ int longwords = len/2;
|
|
|
a42b25 |
+ assert(longwords > 0, "unsupported");
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ // Make very sure we don't use so much space that the stack might
|
|
|
a42b25 |
+ // overflow. 512 jints correspond to a 16384-bit integer and
|
|
|
a42b25 |
+ // will use here a total of 8k bytes of stack space.
|
|
|
a42b25 |
+ int total_allocation = longwords * sizeof (unsigned long) * 4;
|
|
|
a42b25 |
+ guarantee(total_allocation <= 8192, "must be");
|
|
|
a42b25 |
+ unsigned long *scratch = (unsigned long *)alloca(total_allocation);
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ // Local scratch arrays
|
|
|
a42b25 |
+ unsigned long
|
|
|
a42b25 |
+ *a = scratch + 0 * longwords,
|
|
|
a42b25 |
+ *b = scratch + 1 * longwords,
|
|
|
a42b25 |
+ *n = scratch + 2 * longwords,
|
|
|
a42b25 |
+ *m = scratch + 3 * longwords;
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ reverse_words((unsigned long *)a_ints, a, longwords);
|
|
|
a42b25 |
+ reverse_words((unsigned long *)b_ints, b, longwords);
|
|
|
a42b25 |
+ reverse_words((unsigned long *)n_ints, n, longwords);
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ reverse_words(m, (unsigned long *)m_ints, longwords);
|
|
|
a42b25 |
+}
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
|
|
|
a42b25 |
+ jint len, jlong inv,
|
|
|
a42b25 |
+ jint *m_ints) {
|
|
|
a42b25 |
+ assert(len % 2 == 0, "array length in montgomery_square must be even");
|
|
|
a42b25 |
+ int longwords = len/2;
|
|
|
a42b25 |
+ assert(longwords > 0, "unsupported");
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ // Make very sure we don't use so much space that the stack might
|
|
|
a42b25 |
+ // overflow. 512 jints correspond to a 16384-bit integer and
|
|
|
a42b25 |
+ // will use here a total of 6k bytes of stack space.
|
|
|
a42b25 |
+ int total_allocation = longwords * sizeof (unsigned long) * 3;
|
|
|
a42b25 |
+ guarantee(total_allocation <= 8192, "must be");
|
|
|
a42b25 |
+ unsigned long *scratch = (unsigned long *)alloca(total_allocation);
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ // Local scratch arrays
|
|
|
a42b25 |
+ unsigned long
|
|
|
a42b25 |
+ *a = scratch + 0 * longwords,
|
|
|
a42b25 |
+ *n = scratch + 1 * longwords,
|
|
|
a42b25 |
+ *m = scratch + 2 * longwords;
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ reverse_words((unsigned long *)a_ints, a, longwords);
|
|
|
a42b25 |
+ reverse_words((unsigned long *)n_ints, n, longwords);
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
|
|
|
a42b25 |
+ ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
|
|
|
a42b25 |
+ } else {
|
|
|
a42b25 |
+ ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+
|
|
|
a42b25 |
+ reverse_words(m, (unsigned long *)m_ints, longwords);
|
|
|
a42b25 |
+}
|
|
|
a42b25 |
diff --git a/src/cpu/ppc/vm/stubGenerator_ppc.cpp b/src/cpu/ppc/vm/stubGenerator_ppc.cpp
|
|
|
a42b25 |
--- openjdk/hotspot/src/cpu/ppc/vm/stubGenerator_ppc.cpp
|
|
|
a42b25 |
+++ openjdk/hotspot/src/cpu/ppc/vm/stubGenerator_ppc.cpp
|
|
|
a42b25 |
@@ -2094,6 +2094,14 @@
|
|
|
a42b25 |
generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
|
|
|
a42b25 |
&StubRoutines::_safefetchN_fault_pc,
|
|
|
a42b25 |
&StubRoutines::_safefetchN_continuation_pc);
|
|
|
a42b25 |
+ if (UseMontgomeryMultiplyIntrinsic) {
|
|
|
a42b25 |
+ StubRoutines::_montgomeryMultiply
|
|
|
a42b25 |
+ = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+ if (UseMontgomerySquareIntrinsic) {
|
|
|
a42b25 |
+ StubRoutines::_montgomerySquare
|
|
|
a42b25 |
+ = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
}
|
|
|
a42b25 |
|
|
|
a42b25 |
public:
|
|
|
a42b25 |
diff --git a/src/cpu/ppc/vm/templateInterpreter_ppc.cpp b/src/cpu/ppc/vm/templateInterpreter_ppc.cpp
|
|
|
a42b25 |
--- openjdk/hotspot/src/cpu/ppc/vm/templateInterpreter_ppc.cpp
|
|
|
a42b25 |
+++ openjdk/hotspot/src/cpu/ppc/vm/templateInterpreter_ppc.cpp
|
|
|
a42b25 |
@@ -265,7 +265,7 @@
|
|
|
a42b25 |
__ cmpdi(CCR0, Rmdo, 0);
|
|
|
a42b25 |
__ beq(CCR0, no_mdo);
|
|
|
a42b25 |
|
|
|
a42b25 |
- // Increment backedge counter in the MDO.
|
|
|
a42b25 |
+ // Increment invocation counter in the MDO.
|
|
|
a42b25 |
const int mdo_bc_offs = in_bytes(MethodData::backedge_counter_offset()) + in_bytes(InvocationCounter::counter_offset());
|
|
|
a42b25 |
__ lwz(Rscratch2, mdo_bc_offs, Rmdo);
|
|
|
a42b25 |
__ addi(Rscratch2, Rscratch2, increment);
|
|
|
a42b25 |
@@ -277,12 +277,12 @@
|
|
|
a42b25 |
}
|
|
|
a42b25 |
|
|
|
a42b25 |
// Increment counter in MethodCounters*.
|
|
|
a42b25 |
- const int mo_bc_offs = in_bytes(MethodCounters::backedge_counter_offset()) + in_bytes(InvocationCounter::counter_offset());
|
|
|
a42b25 |
+ const int mo_ic_offs = in_bytes(MethodCounters::invocation_counter_offset()) + in_bytes(InvocationCounter::counter_offset());
|
|
|
a42b25 |
__ bind(no_mdo);
|
|
|
a42b25 |
__ get_method_counters(R19_method, R3_counters, done);
|
|
|
a42b25 |
- __ lwz(Rscratch2, mo_bc_offs, R3_counters);
|
|
|
a42b25 |
+ __ lwz(Rscratch2, mo_ic_offs, R3_counters);
|
|
|
a42b25 |
__ addi(Rscratch2, Rscratch2, increment);
|
|
|
a42b25 |
- __ stw(Rscratch2, mo_bc_offs, R3_counters);
|
|
|
a42b25 |
+ __ stw(Rscratch2, mo_ic_offs, R3_counters);
|
|
|
a42b25 |
__ load_const_optimized(Rscratch1, mask, R0);
|
|
|
a42b25 |
__ and_(Rscratch1, Rscratch2, Rscratch1);
|
|
|
a42b25 |
__ beq(CCR0, *overflow);
|
|
|
a42b25 |
diff --git a/src/cpu/ppc/vm/vm_version_ppc.cpp b/src/cpu/ppc/vm/vm_version_ppc.cpp
|
|
|
a42b25 |
--- openjdk/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp
|
|
|
a42b25 |
+++ openjdk/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp
|
|
|
a42b25 |
@@ -177,6 +177,12 @@
|
|
|
a42b25 |
FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
|
|
|
a42b25 |
}
|
|
|
a42b25 |
|
|
|
a42b25 |
+ if (FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) {
|
|
|
a42b25 |
+ UseMontgomeryMultiplyIntrinsic = true;
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+ if (FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) {
|
|
|
a42b25 |
+ UseMontgomerySquareIntrinsic = true;
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
}
|
|
|
a42b25 |
|
|
|
a42b25 |
void VM_Version::print_features() {
|
|
|
a42b25 |
diff --git a/src/share/vm/opto/library_call.cpp b/src/share/vm/opto/library_call.cpp
|
|
|
a42b25 |
--- openjdk/hotspot/src/share/vm/opto/library_call.cpp
|
|
|
a42b25 |
+++ openjdk/hotspot/src/share/vm/opto/library_call.cpp
|
|
|
a42b25 |
@@ -6031,11 +6031,21 @@
|
|
|
a42b25 |
Node* n_start = array_element_address(n, intcon(0), n_elem);
|
|
|
a42b25 |
Node* m_start = array_element_address(m, intcon(0), m_elem);
|
|
|
a42b25 |
|
|
|
a42b25 |
- Node* call = make_runtime_call(RC_LEAF,
|
|
|
a42b25 |
- OptoRuntime::montgomeryMultiply_Type(),
|
|
|
a42b25 |
- stubAddr, stubName, TypePtr::BOTTOM,
|
|
|
a42b25 |
- a_start, b_start, n_start, len, inv, top(),
|
|
|
a42b25 |
- m_start);
|
|
|
a42b25 |
+ Node* call = NULL;
|
|
|
a42b25 |
+ if (CCallingConventionRequiresIntsAsLongs) {
|
|
|
a42b25 |
+ Node* len_I2L = ConvI2L(len);
|
|
|
a42b25 |
+ call = make_runtime_call(RC_LEAF,
|
|
|
a42b25 |
+ OptoRuntime::montgomeryMultiply_Type(),
|
|
|
a42b25 |
+ stubAddr, stubName, TypePtr::BOTTOM,
|
|
|
a42b25 |
+ a_start, b_start, n_start, len_I2L XTOP, inv,
|
|
|
a42b25 |
+ top(), m_start);
|
|
|
a42b25 |
+ } else {
|
|
|
a42b25 |
+ call = make_runtime_call(RC_LEAF,
|
|
|
a42b25 |
+ OptoRuntime::montgomeryMultiply_Type(),
|
|
|
a42b25 |
+ stubAddr, stubName, TypePtr::BOTTOM,
|
|
|
a42b25 |
+ a_start, b_start, n_start, len, inv, top(),
|
|
|
a42b25 |
+ m_start);
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
set_result(m);
|
|
|
a42b25 |
}
|
|
|
a42b25 |
|
|
|
a42b25 |
@@ -6085,11 +6095,22 @@
|
|
|
a42b25 |
Node* n_start = array_element_address(n, intcon(0), n_elem);
|
|
|
a42b25 |
Node* m_start = array_element_address(m, intcon(0), m_elem);
|
|
|
a42b25 |
|
|
|
a42b25 |
- Node* call = make_runtime_call(RC_LEAF,
|
|
|
a42b25 |
- OptoRuntime::montgomerySquare_Type(),
|
|
|
a42b25 |
- stubAddr, stubName, TypePtr::BOTTOM,
|
|
|
a42b25 |
- a_start, n_start, len, inv, top(),
|
|
|
a42b25 |
- m_start);
|
|
|
a42b25 |
+ Node* call = NULL;
|
|
|
a42b25 |
+ if (CCallingConventionRequiresIntsAsLongs) {
|
|
|
a42b25 |
+ Node* len_I2L = ConvI2L(len);
|
|
|
a42b25 |
+ call = make_runtime_call(RC_LEAF,
|
|
|
a42b25 |
+ OptoRuntime::montgomerySquare_Type(),
|
|
|
a42b25 |
+ stubAddr, stubName, TypePtr::BOTTOM,
|
|
|
a42b25 |
+ a_start, n_start, len_I2L XTOP, inv, top(),
|
|
|
a42b25 |
+ m_start);
|
|
|
a42b25 |
+ } else {
|
|
|
a42b25 |
+ call = make_runtime_call(RC_LEAF,
|
|
|
a42b25 |
+ OptoRuntime::montgomerySquare_Type(),
|
|
|
a42b25 |
+ stubAddr, stubName, TypePtr::BOTTOM,
|
|
|
a42b25 |
+ a_start, n_start, len, inv, top(),
|
|
|
a42b25 |
+ m_start);
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
+
|
|
|
a42b25 |
set_result(m);
|
|
|
a42b25 |
}
|
|
|
a42b25 |
|
|
|
a42b25 |
diff --git a/src/share/vm/opto/runtime.cpp b/src/share/vm/opto/runtime.cpp
|
|
|
a42b25 |
--- openjdk/hotspot/src/share/vm/opto/runtime.cpp
|
|
|
a42b25 |
+++ openjdk/hotspot/src/share/vm/opto/runtime.cpp
|
|
|
a42b25 |
@@ -1005,12 +1005,20 @@
|
|
|
a42b25 |
// create input type (domain)
|
|
|
a42b25 |
int num_args = 7;
|
|
|
a42b25 |
int argcnt = num_args;
|
|
|
a42b25 |
+ if (CCallingConventionRequiresIntsAsLongs) {
|
|
|
a42b25 |
+ argcnt++; // additional placeholder
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
const Type** fields = TypeTuple::fields(argcnt);
|
|
|
a42b25 |
int argp = TypeFunc::Parms;
|
|
|
a42b25 |
fields[argp++] = TypePtr::NOTNULL; // a
|
|
|
a42b25 |
fields[argp++] = TypePtr::NOTNULL; // b
|
|
|
a42b25 |
fields[argp++] = TypePtr::NOTNULL; // n
|
|
|
a42b25 |
- fields[argp++] = TypeInt::INT; // len
|
|
|
a42b25 |
+ if (CCallingConventionRequiresIntsAsLongs) {
|
|
|
a42b25 |
+ fields[argp++] = TypeLong::LONG; // len
|
|
|
a42b25 |
+ fields[argp++] = TypeLong::HALF; // placeholder
|
|
|
a42b25 |
+ } else {
|
|
|
a42b25 |
+ fields[argp++] = TypeInt::INT; // len
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
fields[argp++] = TypeLong::LONG; // inv
|
|
|
a42b25 |
fields[argp++] = Type::HALF;
|
|
|
a42b25 |
fields[argp++] = TypePtr::NOTNULL; // result
|
|
|
a42b25 |
@@ -1029,11 +1037,19 @@
|
|
|
a42b25 |
// create input type (domain)
|
|
|
a42b25 |
int num_args = 6;
|
|
|
a42b25 |
int argcnt = num_args;
|
|
|
a42b25 |
+ if (CCallingConventionRequiresIntsAsLongs) {
|
|
|
a42b25 |
+ argcnt++; // additional placeholder
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
const Type** fields = TypeTuple::fields(argcnt);
|
|
|
a42b25 |
int argp = TypeFunc::Parms;
|
|
|
a42b25 |
fields[argp++] = TypePtr::NOTNULL; // a
|
|
|
a42b25 |
fields[argp++] = TypePtr::NOTNULL; // n
|
|
|
a42b25 |
- fields[argp++] = TypeInt::INT; // len
|
|
|
a42b25 |
+ if (CCallingConventionRequiresIntsAsLongs) {
|
|
|
a42b25 |
+ fields[argp++] = TypeLong::LONG; // len
|
|
|
a42b25 |
+ fields[argp++] = TypeLong::HALF; // placeholder
|
|
|
a42b25 |
+ } else {
|
|
|
a42b25 |
+ fields[argp++] = TypeInt::INT; // len
|
|
|
a42b25 |
+ }
|
|
|
a42b25 |
fields[argp++] = TypeLong::LONG; // inv
|
|
|
a42b25 |
fields[argp++] = Type::HALF;
|
|
|
a42b25 |
fields[argp++] = TypePtr::NOTNULL; // result
|