b1dca6
commit 7cc65773f04e0f4252428c40dcbb784a39b58cd1
b1dca6
Author: H.J. Lu <hjl.tools@gmail.com>
b1dca6
Date:   Wed Oct 24 02:19:15 2018 -0700
b1dca6
b1dca6
    x86: Support RDTSCP for benchtests
b1dca6
    
b1dca6
    RDTSCP waits until all previous instructions have executed and all
b1dca6
    previous loads are globally visible before reading the counter.  RDTSC
b1dca6
    doesn't wait until all previous instructions have been executed before
b1dca6
    reading the counter.  All x86 processors since 2010 support the RDTSCP
b1dca6
    instruction.  This patch adds RDTSCP support to benchtests.
b1dca6
    
b1dca6
            * benchtests/Makefile (CPPFLAGS-nonlib): Add -DUSE_RDTSCP if
b1dca6
            USE_RDTSCP is defined.
b1dca6
            * sysdeps/x86/hp-timing.h (HP_TIMING_NOW): Use RDTSCP if
b1dca6
            USE_RDTSCP is defined.
b1dca6
b1dca6
diff --git a/benchtests/Makefile b/benchtests/Makefile
b1dca6
index 28d6b0c43f5bd390..bde0caf140e8cf17 100644
b1dca6
--- a/benchtests/Makefile
b1dca6
+++ b/benchtests/Makefile
b1dca6
@@ -131,6 +131,12 @@ CPPFLAGS-nonlib += -DDURATION=$(BENCH_DURATION) -D_ISOMAC
b1dca6
 # HP_TIMING if it is available.
b1dca6
 ifdef USE_CLOCK_GETTIME
b1dca6
 CPPFLAGS-nonlib += -DUSE_CLOCK_GETTIME
b1dca6
+else
b1dca6
+# On x86 processors, use RDTSCP, instead of RDTSC, to measure performance
b1dca6
+# of functions.  All x86 processors since 2010 support RDTSCP instruction.
b1dca6
+ifdef USE_RDTSCP
b1dca6
+CPPFLAGS-nonlib += -DUSE_RDTSCP
b1dca6
+endif
b1dca6
 endif
b1dca6
 
b1dca6
 DETAILED_OPT :=
b1dca6
diff --git a/benchtests/README b/benchtests/README
b1dca6
index 4ddff794d136f65f..aaf0b659e2b25627 100644
b1dca6
--- a/benchtests/README
b1dca6
+++ b/benchtests/README
b1dca6
@@ -34,6 +34,15 @@ the benchmark to use clock_gettime by invoking make as follows:
b1dca6
 
b1dca6
 Again, one must run `make bench-clean' before changing the measurement method.
b1dca6
 
b1dca6
+On x86 processors, the RDTSCP instruction provides more precise timing data
b1dca6
+than the RDTSC instruction.  All x86 processors since 2010 support the RDTSCP
b1dca6
+instruction.  One can force the benchmark to use RDTSCP by invoking make
b1dca6
+as follows:
b1dca6
+
b1dca6
+  $ make USE_RDTSCP=1 bench
b1dca6
+
b1dca6
+One must run `make bench-clean' before changing the measurement method.
b1dca6
+
b1dca6
 Running benchmarks on another target:
b1dca6
 ====================================
b1dca6
 
b1dca6
diff --git a/sysdeps/x86/hp-timing.h b/sysdeps/x86/hp-timing.h
b1dca6
index 77a1360748ca4535..0aa6f5e3f83e0d34 100644
b1dca6
--- a/sysdeps/x86/hp-timing.h
b1dca6
+++ b/sysdeps/x86/hp-timing.h
b1dca6
@@ -40,7 +40,19 @@ typedef unsigned long long int hp_timing_t;
b1dca6
 
b1dca6
    NB: Use __builtin_ia32_rdtsc directly since including <x86intrin.h>
b1dca6
    makes building glibc very slow.  */
b1dca6
-# define HP_TIMING_NOW(Var)	((Var) = __builtin_ia32_rdtsc ())
b1dca6
+# ifdef USE_RDTSCP
b1dca6
+/* RDTSCP waits until all previous instructions have executed and all
b1dca6
+   previous loads are globally visible before reading the counter.
b1dca6
+   RDTSC doesn't wait until all previous instructions have been executed
b1dca6
+   before reading the counter.  */
b1dca6
+#  define HP_TIMING_NOW(Var) \
b1dca6
+  (__extension__ ({				\
b1dca6
+    unsigned int __aux;				\
b1dca6
+    (Var) = __builtin_ia32_rdtscp (&__aux);	\
b1dca6
+  }))
b1dca6
+# else
b1dca6
+#  define HP_TIMING_NOW(Var) ((Var) = __builtin_ia32_rdtsc ())
b1dca6
+# endif
b1dca6
 
b1dca6
 # include <hp-timing-common.h>
b1dca6
 #else