|
|
b1dca6 |
commit 7cc65773f04e0f4252428c40dcbb784a39b58cd1
|
|
|
b1dca6 |
Author: H.J. Lu <hjl.tools@gmail.com>
|
|
|
b1dca6 |
Date: Wed Oct 24 02:19:15 2018 -0700
|
|
|
b1dca6 |
|
|
|
b1dca6 |
x86: Support RDTSCP for benchtests
|
|
|
b1dca6 |
|
|
|
b1dca6 |
RDTSCP waits until all previous instructions have executed and all
|
|
|
b1dca6 |
previous loads are globally visible before reading the counter. RDTSC
|
|
|
b1dca6 |
doesn't wait until all previous instructions have been executed before
|
|
|
b1dca6 |
reading the counter. All x86 processors since 2010 support the RDTSCP
|
|
|
b1dca6 |
instruction. This patch adds RDTSCP support to benchtests.
|
|
|
b1dca6 |
|
|
|
b1dca6 |
* benchtests/Makefile (CPPFLAGS-nonlib): Add -DUSE_RDTSCP if
|
|
|
b1dca6 |
USE_RDTSCP is defined.
|
|
|
b1dca6 |
* sysdeps/x86/hp-timing.h (HP_TIMING_NOW): Use RDTSCP if
|
|
|
b1dca6 |
USE_RDTSCP is defined.
|
|
|
b1dca6 |
|
|
|
b1dca6 |
diff --git a/benchtests/Makefile b/benchtests/Makefile
|
|
|
b1dca6 |
index 28d6b0c43f5bd390..bde0caf140e8cf17 100644
|
|
|
b1dca6 |
--- a/benchtests/Makefile
|
|
|
b1dca6 |
+++ b/benchtests/Makefile
|
|
|
b1dca6 |
@@ -131,6 +131,12 @@ CPPFLAGS-nonlib += -DDURATION=$(BENCH_DURATION) -D_ISOMAC
|
|
|
b1dca6 |
# HP_TIMING if it is available.
|
|
|
b1dca6 |
ifdef USE_CLOCK_GETTIME
|
|
|
b1dca6 |
CPPFLAGS-nonlib += -DUSE_CLOCK_GETTIME
|
|
|
b1dca6 |
+else
|
|
|
b1dca6 |
+# On x86 processors, use RDTSCP, instead of RDTSC, to measure performance
|
|
|
b1dca6 |
+# of functions. All x86 processors since 2010 support the RDTSCP instruction.
|
|
|
b1dca6 |
+ifdef USE_RDTSCP
|
|
|
b1dca6 |
+CPPFLAGS-nonlib += -DUSE_RDTSCP
|
|
|
b1dca6 |
+endif
|
|
|
b1dca6 |
endif
|
|
|
b1dca6 |
|
|
|
b1dca6 |
DETAILED_OPT :=
|
|
|
b1dca6 |
diff --git a/benchtests/README b/benchtests/README
|
|
|
b1dca6 |
index 4ddff794d136f65f..aaf0b659e2b25627 100644
|
|
|
b1dca6 |
--- a/benchtests/README
|
|
|
b1dca6 |
+++ b/benchtests/README
|
|
|
b1dca6 |
@@ -34,6 +34,15 @@ the benchmark to use clock_gettime by invoking make as follows:
|
|
|
b1dca6 |
|
|
|
b1dca6 |
Again, one must run `make bench-clean' before changing the measurement method.
|
|
|
b1dca6 |
|
|
|
b1dca6 |
+On x86 processors, the RDTSCP instruction provides more precise timing data
|
|
|
b1dca6 |
+than the RDTSC instruction. All x86 processors since 2010 support the RDTSCP
|
|
|
b1dca6 |
+instruction. One can force the benchmark to use RDTSCP by invoking make
|
|
|
b1dca6 |
+as follows:
|
|
|
b1dca6 |
+
|
|
|
b1dca6 |
+ $ make USE_RDTSCP=1 bench
|
|
|
b1dca6 |
+
|
|
|
b1dca6 |
+One must run `make bench-clean' before changing the measurement method.
|
|
|
b1dca6 |
+
|
|
|
b1dca6 |
Running benchmarks on another target:
|
|
|
b1dca6 |
====================================
|
|
|
b1dca6 |
|
|
|
b1dca6 |
diff --git a/sysdeps/x86/hp-timing.h b/sysdeps/x86/hp-timing.h
|
|
|
b1dca6 |
index 77a1360748ca4535..0aa6f5e3f83e0d34 100644
|
|
|
b1dca6 |
--- a/sysdeps/x86/hp-timing.h
|
|
|
b1dca6 |
+++ b/sysdeps/x86/hp-timing.h
|
|
|
b1dca6 |
@@ -40,7 +40,19 @@ typedef unsigned long long int hp_timing_t;
|
|
|
b1dca6 |
|
|
|
b1dca6 |
NB: Use __builtin_ia32_rdtsc directly since including <x86intrin.h>
|
|
|
b1dca6 |
makes building glibc very slow. */
|
|
|
b1dca6 |
-# define HP_TIMING_NOW(Var) ((Var) = __builtin_ia32_rdtsc ())
|
|
|
b1dca6 |
+# ifdef USE_RDTSCP
|
|
|
b1dca6 |
+/* RDTSCP waits until all previous instructions have executed and all
|
|
|
b1dca6 |
+ previous loads are globally visible before reading the counter.
|
|
|
b1dca6 |
+ RDTSC doesn't wait until all previous instructions have been executed
|
|
|
b1dca6 |
+ before reading the counter. */
|
|
|
b1dca6 |
+# define HP_TIMING_NOW(Var) \
|
|
|
b1dca6 |
+ (__extension__ ({ \
|
|
|
b1dca6 |
+ unsigned int __aux; \
|
|
|
b1dca6 |
+ (Var) = __builtin_ia32_rdtscp (&__aux); \
|
|
|
b1dca6 |
+ }))
|
|
|
b1dca6 |
+# else
|
|
|
b1dca6 |
+# define HP_TIMING_NOW(Var) ((Var) = __builtin_ia32_rdtsc ())
|
|
|
b1dca6 |
+# endif
|
|
|
b1dca6 |
|
|
|
b1dca6 |
# include <hp-timing-common.h>
|
|
|
b1dca6 |
#else
|