about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--  ChangeLog  7
-rw-r--r--  benchtests/Makefile  6
-rw-r--r--  benchtests/README  9
-rw-r--r--  sysdeps/x86/hp-timing.h  14
4 files changed, 35 insertions, 1 deletion
diff --git a/ChangeLog b/ChangeLog
index 6cb7d604ce..cda75db202 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2018-10-24  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* benchtests/Makefile (CPPFLAGS-nonlib): Add -DUSE_RDTSCP if
+	USE_RDTSCP is defined.
+	* sysdeps/x86/hp-timing.h (HP_TIMING_NOW): Use RDTSCP if
+	USE_RDTSCP is defined.
+
 2018-10-23  Adhemerval Zanella  <adhemerval.zanella@linaro.org>
 
 	* misc/tst-preadvwritev2-common.c (IOV_MAX): Define if not
diff --git a/benchtests/Makefile b/benchtests/Makefile
index bcd6a9c26d..45aeb5febe 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -131,6 +131,12 @@ CPPFLAGS-nonlib += -DDURATION=$(BENCH_DURATION) -D_ISOMAC
 # HP_TIMING if it is available.
 ifdef USE_CLOCK_GETTIME
 CPPFLAGS-nonlib += -DUSE_CLOCK_GETTIME
+else
+# On x86 processors, use RDTSCP instead of RDTSC to measure the performance
+# of functions.  All x86 processors since 2010 support the RDTSCP instruction.
+ifdef USE_RDTSCP
+CPPFLAGS-nonlib += -DUSE_RDTSCP
+endif
 endif
 
 DETAILED_OPT :=
diff --git a/benchtests/README b/benchtests/README
index 4ddff794d1..aaf0b659e2 100644
--- a/benchtests/README
+++ b/benchtests/README
@@ -34,6 +34,15 @@ the benchmark to use clock_gettime by invoking make as follows:
 
 Again, one must run `make bench-clean' before changing the measurement method.
 
+On x86 processors, the RDTSCP instruction provides more precise timing data
+than the RDTSC instruction.  All x86 processors since 2010 support the
+RDTSCP instruction.  One can force the benchmark to use RDTSCP by invoking
+make as follows:
+
+  $ make USE_RDTSCP=1 bench
+
+One must run `make bench-clean' before changing the measurement method.
+
 Running benchmarks on another target:
 ====================================
 
diff --git a/sysdeps/x86/hp-timing.h b/sysdeps/x86/hp-timing.h
index 77a1360748..0aa6f5e3f8 100644
--- a/sysdeps/x86/hp-timing.h
+++ b/sysdeps/x86/hp-timing.h
@@ -40,7 +40,19 @@ typedef unsigned long long int hp_timing_t;
 
    NB: Use __builtin_ia32_rdtsc directly since including <x86intrin.h>
    makes building glibc very slow.  */
-# define HP_TIMING_NOW(Var)	((Var) = __builtin_ia32_rdtsc ())
+# ifdef USE_RDTSCP
+/* RDTSCP waits until all previous instructions have executed and all
+   previous loads are globally visible before reading the counter.
+   RDTSC doesn't wait until all previous instructions have been executed
+   before reading the counter.  */
+#  define HP_TIMING_NOW(Var) \
+  (__extension__ ({				\
+    unsigned int __aux;				\
+    (Var) = __builtin_ia32_rdtscp (&__aux);	\
+  }))
+# else
+#  define HP_TIMING_NOW(Var) ((Var) = __builtin_ia32_rdtsc ())
+# endif
 
 # include <hp-timing-common.h>
 #else