e354a5
commit 7cc65773f04e0f4252428c40dcbb784a39b58cd1
e354a5
Author: H.J. Lu <hjl.tools@gmail.com>
e354a5
Date:   Wed Oct 24 02:19:15 2018 -0700
e354a5
e354a5
    x86: Support RDTSCP for benchtests
e354a5
    
e354a5
    RDTSCP waits until all previous instructions have executed and all
e354a5
    previous loads are globally visible before reading the counter.  RDTSC
e354a5
    doesn't wait until all previous instructions have been executed before
e354a5
    reading the counter.  All x86 processors since 2010 support RDTSCP
e354a5
    instruction.  This patch adds RDTSCP support to benchtests.
e354a5
    
e354a5
            * benchtests/Makefile (CPPFLAGS-nonlib): Add -DUSE_RDTSCP if
e354a5
            USE_RDTSCP is defined.
e354a5
            * sysdeps/x86/hp-timing.h (HP_TIMING_NOW): Use RDTSCP if
e354a5
            USE_RDTSCP is defined.
e354a5
e354a5
diff --git a/benchtests/Makefile b/benchtests/Makefile
e354a5
index 28d6b0c43f5bd390..bde0caf140e8cf17 100644
e354a5
--- a/benchtests/Makefile
e354a5
+++ b/benchtests/Makefile
e354a5
@@ -131,6 +131,12 @@ CPPFLAGS-nonlib += -DDURATION=$(BENCH_DURATION) -D_ISOMAC
e354a5
 # HP_TIMING if it is available.
e354a5
 ifdef USE_CLOCK_GETTIME
e354a5
 CPPFLAGS-nonlib += -DUSE_CLOCK_GETTIME
e354a5
+else
e354a5
+# On x86 processors, define USE_RDTSCP to measure performance of functions
e354a5
+# with RDTSCP instead of RDTSC.  All x86 processors since 2010 support RDTSCP.
e354a5
+ifdef USE_RDTSCP
e354a5
+CPPFLAGS-nonlib += -DUSE_RDTSCP
e354a5
+endif
e354a5
 endif
e354a5
 
e354a5
 DETAILED_OPT :=
e354a5
diff --git a/benchtests/README b/benchtests/README
e354a5
index 4ddff794d136f65f..aaf0b659e2b25627 100644
e354a5
--- a/benchtests/README
e354a5
+++ b/benchtests/README
e354a5
@@ -34,6 +34,15 @@ the benchmark to use clock_gettime by invoking make as follows:
e354a5
 
e354a5
 Again, one must run `make bench-clean' before changing the measurement method.
e354a5
 
e354a5
+On x86 processors, the RDTSCP instruction provides more precise timing
e354a5
+data than the RDTSC instruction.  All x86 processors since 2010 support
e354a5
+RDTSCP.  One can force the benchmark to use RDTSCP by invoking make as
e354a5
+follows:
e354a5
+
e354a5
+  $ make USE_RDTSCP=1 bench
e354a5
+
e354a5
+One must run `make bench-clean' before changing the measurement method.
e354a5
+
e354a5
 Running benchmarks on another target:
e354a5
 ====================================
e354a5
 
e354a5
diff --git a/sysdeps/x86/hp-timing.h b/sysdeps/x86/hp-timing.h
e354a5
index 77a1360748ca4535..0aa6f5e3f83e0d34 100644
e354a5
--- a/sysdeps/x86/hp-timing.h
e354a5
+++ b/sysdeps/x86/hp-timing.h
e354a5
@@ -40,7 +40,19 @@ typedef unsigned long long int hp_timing_t;
e354a5
 
e354a5
    NB: Use __builtin_ia32_rdtsc directly since including <x86intrin.h>
e354a5
    makes building glibc very slow.  */
e354a5
-# define HP_TIMING_NOW(Var)	((Var) = __builtin_ia32_rdtsc ())
e354a5
+# ifdef USE_RDTSCP
e354a5
+/* RDTSCP waits until all previous instructions have executed and all
e354a5
+   previous loads are globally visible before reading the counter.
e354a5
+   RDTSC doesn't wait until all previous instructions have been executed
e354a5
+   before reading the counter.  */
e354a5
+#  define HP_TIMING_NOW(Var) \
e354a5
+  (__extension__ ({				\
e354a5
+    unsigned int __aux;				\
e354a5
+    (Var) = __builtin_ia32_rdtscp (&__aux);	\
e354a5
+  }))
e354a5
+# else
e354a5
+#  define HP_TIMING_NOW(Var) ((Var) = __builtin_ia32_rdtsc ())
e354a5
+# endif
e354a5
 
e354a5
 # include <hp-timing-common.h>
e354a5
 #else