-rw-r--r--  ChangeLog                    |  6
-rw-r--r--  benchtests/README            |  6
-rw-r--r--  benchtests/bench-skeleton.c  | 57
3 files changed, 51 insertions, 18 deletions
diff --git a/ChangeLog b/ChangeLog
index 2d81375287..80bb1f847e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2017-06-20  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	* benchtests/README: Describe workload feature.
+	* benchtests/bench-skeleton.c (main): Add support for
+	benchmarking traces from workloads.
+
 2017-06-20  Zack Weinberg  <zackw@panix.com>
 
 	* string/string.h (__mempcpy_inline): Delete.
diff --git a/benchtests/README b/benchtests/README
index 2c5f381135..b015acfd53 100644
--- a/benchtests/README
+++ b/benchtests/README
@@ -102,6 +102,12 @@ the same file by using the `name' directive that looks something like this:
 See the pow-inputs file for an example of what such a partitioned input file
 would look like.
 
+It is also possible to measure throughput of a (partial) trace extracted from
+a real workload.  In this case the whole trace is iterated over multiple times
+rather than repeating every input multiple times.  This can be done via:
+
+  ##name: workload-<name>
+
 Benchmark Sets:
 ==============
 
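For illustration, a workload variant in a benchmark input file might look like the sketch below.  The variant name (workload-sample) and the input values are made up for this example; real trace lines use whatever argument format the benchmark's `args' directive declares, exactly as ordinary input lines do.

  ##name: workload-sample
  0x1.0000000000001p+0
  0x1.8p+1
  0x1.fffffffffffffp-2
  0x1.4p+2

With such a variant, the timed region in bench-skeleton.c below covers complete passes over all four inputs in order, rather than timing each input in isolation.
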
diff --git a/benchtests/bench-skeleton.c b/benchtests/bench-skeleton.c
index 09eb78df1b..3c6dad7055 100644
--- a/benchtests/bench-skeleton.c
+++ b/benchtests/bench-skeleton.c
@@ -68,34 +68,50 @@ main (int argc, char **argv)
       clock_gettime (CLOCK_MONOTONIC_RAW, &runtime);
       runtime.tv_sec += DURATION;
 
+      bool is_bench = strncmp (VARIANT (v), "workload-", 9) == 0;
       double d_total_i = 0;
       timing_t total = 0, max = 0, min = 0x7fffffffffffffff;
       int64_t c = 0;
+      uint64_t cur;
       while (1)
 	{
-	  for (i = 0; i < NUM_SAMPLES (v); i++)
+	  if (is_bench)
 	    {
-	      uint64_t cur;
+	      /* Benchmark a real trace of calls - all samples are iterated
+		 over once before repeating.  This models actual use more
+		 accurately than repeating the same sample many times.  */
 	      TIMING_NOW (start);
 	      for (k = 0; k < iters; k++)
-		BENCH_FUNC (v, i);
+		for (i = 0; i < NUM_SAMPLES (v); i++)
+		  BENCH_FUNC (v, i);
 	      TIMING_NOW (end);
-
 	      TIMING_DIFF (cur, start, end);
+	      TIMING_ACCUM (total, cur);
+	      d_total_i += iters * NUM_SAMPLES (v);
+	    }
+	  else
+	    for (i = 0; i < NUM_SAMPLES (v); i++)
+	      {
+		TIMING_NOW (start);
+		for (k = 0; k < iters; k++)
+		  BENCH_FUNC (v, i);
+		TIMING_NOW (end);
 
-	      if (cur > max)
-		max = cur;
+		TIMING_DIFF (cur, start, end);
 
-	      if (cur < min)
-		min = cur;
+		if (cur > max)
+		  max = cur;
 
-	      TIMING_ACCUM (total, cur);
-	      /* Accumulate timings for the value.  In the end we will divide
-	         by the total iterations.  */
-	      RESULT_ACCUM (cur, v, i, c * iters, (c + 1) * iters);
+		if (cur < min)
+		  min = cur;
 
-	      d_total_i += iters;
-	    }
+		TIMING_ACCUM (total, cur);
+		/* Accumulate timings for the value.  In the end we will divide
+		   by the total iterations.  */
+		RESULT_ACCUM (cur, v, i, c * iters, (c + 1) * iters);
+
+		d_total_i += iters;
+	      }
 	  c++;
 	  struct timespec curtime;
 
@@ -117,11 +133,16 @@ main (int argc, char **argv)
 
       json_attr_double (&json_ctx, "duration", d_total_s);
       json_attr_double (&json_ctx, "iterations", d_total_i);
-      json_attr_double (&json_ctx, "max", max / d_iters);
-      json_attr_double (&json_ctx, "min", min / d_iters);
-      json_attr_double (&json_ctx, "mean", d_total_s / d_total_i);
+      if (is_bench)
+	json_attr_double (&json_ctx, "throughput", d_total_s / d_total_i);
+      else
+	{
+	  json_attr_double (&json_ctx, "max", max / d_iters);
+	  json_attr_double (&json_ctx, "min", min / d_iters);
+	  json_attr_double (&json_ctx, "mean", d_total_s / d_total_i);
+	}
 
-      if (detailed)
+      if (detailed && !is_bench)
 	{
 	  json_array_begin (&json_ctx, "timings");
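For a workload variant the per-variant JSON output therefore reduces to the total duration, the total number of calls and a single "throughput" figure computed as d_total_s / d_total_i, i.e. the accumulated time divided by the number of calls performed across all passes over the trace; max, min, mean and the detailed per-sample timings are only emitted for ordinary variants.  An illustrative fragment (the variant name and all numbers are placeholders, and the enclosing objects of the bench output are omitted here) might look like:

   "workload-sample": {
    "duration": 3.072e+09,
    "iterations": 2.4576e+07,
    "throughput": 125
   }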