446cf2
commit 7cc65773f04e0f4252428c40dcbb784a39b58cd1
446cf2
Author: H.J. Lu <hjl.tools@gmail.com>
446cf2
Date:   Wed Oct 24 02:19:15 2018 -0700
446cf2
446cf2
    x86: Support RDTSCP for benchtests
446cf2
    
446cf2
    RDTSCP waits until all previous instructions have executed and all
446cf2
    previous loads are globally visible before reading the counter.  RDTSC
446cf2
    doesn't wait until all previous instructions have been executed before
446cf2
    reading the counter.  All x86 processors since 2010 support the RDTSCP
446cf2
    instruction.  This patch adds RDTSCP support to benchtests.
446cf2
    
446cf2
            * benchtests/Makefile (CPPFLAGS-nonlib): Add -DUSE_RDTSCP if
446cf2
            USE_RDTSCP is defined.
446cf2
            * benchtests/README: Document USE_RDTSCP.
446cf2
            * sysdeps/x86/hp-timing.h (HP_TIMING_NOW): Use RDTSCP if
446cf2
            USE_RDTSCP is defined.
446cf2
446cf2
diff --git a/benchtests/Makefile b/benchtests/Makefile
446cf2
index 28d6b0c43f5bd390..bde0caf140e8cf17 100644
446cf2
--- a/benchtests/Makefile
446cf2
+++ b/benchtests/Makefile
446cf2
@@ -131,6 +131,12 @@ CPPFLAGS-nonlib += -DDURATION=$(BENCH_DURATION) -D_ISOMAC
446cf2
 # HP_TIMING if it is available.
446cf2
 ifdef USE_CLOCK_GETTIME
446cf2
 CPPFLAGS-nonlib += -DUSE_CLOCK_GETTIME
446cf2
+else
446cf2
+# On x86 processors, use RDTSCP, instead of RDTSC, to measure performance
446cf2
+# of functions.  All x86 processors since 2010 support RDTSCP instruction.
446cf2
+ifdef USE_RDTSCP
446cf2
+CPPFLAGS-nonlib += -DUSE_RDTSCP
446cf2
+endif
446cf2
 endif
446cf2
 
446cf2
 DETAILED_OPT :=
446cf2
diff --git a/benchtests/README b/benchtests/README
446cf2
index 4ddff794d136f65f..aaf0b659e2b25627 100644
446cf2
--- a/benchtests/README
446cf2
+++ b/benchtests/README
446cf2
@@ -34,6 +34,15 @@ the benchmark to use clock_gettime by invoking make as follows:
446cf2
 
446cf2
 Again, one must run `make bench-clean' before changing the measurement method.
446cf2
 
446cf2
+On x86 processors, RDTSCP instruction provides more precise timing data
446cf2
+than RDTSC instruction.  All x86 processors since 2010 support RDTSCP
446cf2
+instruction.  One can force the benchmark to use RDTSCP by invoking make
446cf2
+as follows:
446cf2
+
446cf2
+  $ make USE_RDTSCP=1 bench
446cf2
+
446cf2
+One must run `make bench-clean' before changing the measurement method.
446cf2
+
446cf2
 Running benchmarks on another target:
446cf2
 ====================================
446cf2
 
446cf2
diff --git a/sysdeps/x86/hp-timing.h b/sysdeps/x86/hp-timing.h
446cf2
index 77a1360748ca4535..0aa6f5e3f83e0d34 100644
446cf2
--- a/sysdeps/x86/hp-timing.h
446cf2
+++ b/sysdeps/x86/hp-timing.h
446cf2
@@ -40,7 +40,19 @@ typedef unsigned long long int hp_timing_t;
446cf2
 
446cf2
    NB: Use __builtin_ia32_rdtsc directly since including <x86intrin.h>
446cf2
    makes building glibc very slow.  */
446cf2
-# define HP_TIMING_NOW(Var)	((Var) = __builtin_ia32_rdtsc ())
446cf2
+# ifdef USE_RDTSCP
446cf2
+/* RDTSCP waits until all previous instructions have executed and all
446cf2
+   previous loads are globally visible before reading the counter.
446cf2
+   RDTSC doesn't wait until all previous instructions have been executed
446cf2
+   before reading the counter.  */
446cf2
+#  define HP_TIMING_NOW(Var) \
446cf2
+  (__extension__ ({				\
446cf2
+    unsigned int __aux;				\
446cf2
+    (Var) = __builtin_ia32_rdtscp (&__aux);	\
446cf2
+  }))
446cf2
+# else
446cf2
+#  define HP_TIMING_NOW(Var) ((Var) = __builtin_ia32_rdtsc ())
446cf2
+# endif
446cf2
 
446cf2
 # include <hp-timing-common.h>
446cf2
 #else