diff options
-rw-r--r-- | ChangeLog | 6 | ||||
-rwxr-xr-x | benchtests/scripts/compare_bench.py | 184 | ||||
-rw-r--r-- | benchtests/scripts/import_bench.py | 96 |
3 files changed, 286 insertions, 0 deletions
diff --git a/ChangeLog b/ChangeLog index 624e6f0f4e..6d295e720b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,11 @@ 2015-06-01 Siddhesh Poyarekar <siddhesh@redhat.com> + * benchtests/scripts/compare_bench.py: New file. + * benchtests/scripts/import_bench.py (mean): New function. + (split_list): Likewise. + (do_for_all_timings): Likewise. + (compress_timings): Likewise. + * benchtests/scripts/import_bench.py: New file. * benchtests/scripts/validate_benchout.py: Import import_bench instead of jsonschema. diff --git a/benchtests/scripts/compare_bench.py b/benchtests/scripts/compare_bench.py new file mode 100755 index 0000000000..be5b5ca9cd --- /dev/null +++ b/benchtests/scripts/compare_bench.py @@ -0,0 +1,184 @@ +#!/usr/bin/python +# Copyright (C) 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. +"""Compare two benchmark results + +Given two benchmark result files and a threshold, this script compares the +benchmark results and flags differences in performance beyond a given +threshold. +""" +import sys +import os +import pylab +import import_bench as bench + +def do_compare(func, var, tl1, tl2, par, threshold): + """Compare one of the aggregate measurements + + Helper function to compare one of the aggregate measurements of a function + variant. 
+ + Args: + func: Function name + var: Function variant name + tl1: The first timings list + tl2: The second timings list + par: The aggregate to measure + threshold: The threshold for differences, beyond which the script should + print a warning. + """ + d = abs(tl2[par] - tl1[par]) * 100 / tl1[str(par)] + if d > threshold: + if tl1[par] > tl2[par]: + ind = '+++' + else: + ind = '---' + print('%s %s(%s)[%s]: (%.2lf%%) from %g to %g' % + (ind, func, var, par, d, tl1[par], tl2[par])) + + +def compare_runs(pts1, pts2, threshold): + """Compare two benchmark runs + + Args: + pts1: Timing data from first machine + pts2: Timing data from second machine + """ + + # XXX We assume that the two benchmarks have identical functions and + # variants. We cannot compare two benchmarks that may have different + # functions or variants. Maybe that is something for the future. + for func in pts1['functions'].keys(): + for var in pts1['functions'][func].keys(): + tl1 = pts1['functions'][func][var] + tl2 = pts2['functions'][func][var] + + # Compare the consolidated numbers + # do_compare(func, var, tl1, tl2, 'max', threshold) + do_compare(func, var, tl1, tl2, 'min', threshold) + do_compare(func, var, tl1, tl2, 'mean', threshold) + + # Skip over to the next variant or function if there is no detailed + # timing info for the function variant. + if 'timings' not in pts1['functions'][func][var].keys() or \ + 'timings' not in pts2['functions'][func][var].keys(): + return + + # If two lists do not have the same length then it is likely that + # the performance characteristics of the function have changed. + # XXX: It is also likely that there was some measurement that + # strayed outside the usual range. Such outliers should not + # happen on an idle machine with identical hardware and + # configuration, but ideal environments are hard to come by. 
+ if len(tl1['timings']) != len(tl2['timings']): + print('* %s(%s): Timing characteristics changed' % + (func, var)) + print('\tBefore: [%s]' % + ', '.join([str(x) for x in tl1['timings']])) + print('\tAfter: [%s]' % + ', '.join([str(x) for x in tl2['timings']])) + continue + + # Collect numbers whose differences cross the threshold we have + # set. + issues = [(x, y) for x, y in zip(tl1['timings'], tl2['timings']) \ + if abs(y - x) * 100 / x > threshold] + + # Now print them. + for t1, t2 in issues: + d = abs(t2 - t1) * 100 / t1 + if t2 > t1: + ind = '-' + else: + ind = '+' + + print("%s %s(%s): (%.2lf%%) from %g to %g" % + (ind, func, var, d, t1, t2)) + + +def plot_graphs(bench1, bench2): + """Plot graphs for functions + + Make scatter plots for the functions and their variants. + + Args: + bench1: Set of points from the first machine + bench2: Set of points from the second machine. + """ + for func in bench1['functions'].keys(): + for var in bench1['functions'][func].keys(): + # No point trying to print a graph if there are no detailed + # timings. 
+ if u'timings' not in bench1['functions'][func][var].keys(): + print('Skipping graph for %s(%s)' % (func, var)) + continue + + pylab.clf() + pylab.ylabel('Time (cycles)') + + # First set of points + length = len(bench1['functions'][func][var]['timings']) + X = [float(x) for x in range(length)] + lines = pylab.scatter(X, bench1['functions'][func][var]['timings'], + 1.5 + 100 / length) + pylab.setp(lines, 'color', 'r') + + # Second set of points + length = len(bench2['functions'][func][var]['timings']) + X = [float(x) for x in range(length)] + lines = pylab.scatter(X, bench2['functions'][func][var]['timings'], + 1.5 + 100 / length) + pylab.setp(lines, 'color', 'g') + + if var: + filename = "%s-%s.png" % (func, var) + else: + filename = "%s.png" % func + print('Writing out %s' % filename) + pylab.savefig(filename) + + +def main(args): + """Program Entry Point + + Take two benchmark output files and compare their timings. + """ + if len(args) > 4 or len(args) < 3: + print('Usage: %s <schema> <file1> <file2> [threshold in %%]' % sys.argv[0]) + sys.exit(os.EX_USAGE) + + bench1 = bench.parse_bench(args[1], args[0]) + bench2 = bench.parse_bench(args[2], args[0]) + if len(args) == 4: + threshold = float(args[3]) + else: + threshold = 10.0 + + if (bench1['timing_type'] != bench2['timing_type']): + print('Cannot compare benchmark outputs: timing types are different') + return + + plot_graphs(bench1, bench2) + + bench.compress_timings(bench1) + bench.compress_timings(bench2) + + compare_runs(bench1, bench2, threshold) + + +if __name__ == '__main__': + main(sys.argv[1:]) diff --git a/benchtests/scripts/import_bench.py b/benchtests/scripts/import_bench.py index 81248c2adf..d37ff62383 100644 --- a/benchtests/scripts/import_bench.py +++ b/benchtests/scripts/import_bench.py @@ -25,6 +25,102 @@ except ImportError: raise +def mean(lst): + """Compute and return mean of numbers in a list + + The numpy average function has horrible performance, so implement our + own mean function. 
+ + Args: + lst: The list of numbers to average. + Return: + The mean of members in the list. + """ + return sum(lst) / len(lst) + + +def split_list(bench, func, var): + """ Split the list into a smaller set of more distinct points + + Group together points such that the difference between the smallest + point and the mean is less than 1/3rd of the mean. This means that + the mean is at most 1.5x the smallest member of that group. + + mean - xmin < mean / 3 + i.e. 2 * mean / 3 < xmin + i.e. mean < 3 * xmin / 2 + + For an evenly distributed group, the largest member will be less than + twice the smallest member of the group. + Derivation: + + An evenly distributed series would be xmin, xmin + d, xmin + 2d... + + mean = (2 * n * xmin + n * (n - 1) * d) / (2 * n) + and max element is xmin + (n - 1) * d + + Now, mean < 3 * xmin / 2 + + 3 * xmin > 2 * mean + 3 * xmin > (2 * n * xmin + n * (n - 1) * d) / n + 3 * n * xmin > 2 * n * xmin + n * (n - 1) * d + n * xmin > n * (n - 1) * d + xmin > (n - 1) * d + 2 * xmin > xmin + (n-1) * d + 2 * xmin > xmax + + Hence, proved. + + Similarly, it is trivial to prove that for a similar aggregation by using + the maximum element, the maximum element in the group must be at most 4/3 + times the mean. + + Args: + bench: The benchmark object + func: The function name + var: The function variant name + """ + means = [] + lst = bench['functions'][func][var]['timings'] + last = len(lst) - 1 + while lst: + for i in range(last + 1): + avg = mean(lst[i:]) + if avg > 0.75 * lst[last]: + means.insert(0, avg) + lst = lst[:i] + last = i - 1 + break + bench['functions'][func][var]['timings'] = means + + +def do_for_all_timings(bench, callback): + """Call a function for all timing objects for each function and its + variants. 
+ + Args: + bench: The benchmark object + callback: The callback function + """ + for func in bench['functions'].keys(): + for k in bench['functions'][func].keys(): + if 'timings' not in bench['functions'][func][k].keys(): + continue + + callback(bench, func, k) + + +def compress_timings(points): + """Club points with close enough values into a single mean value + + See split_list for details on how the clubbing is done. + + Args: + points: The set of points. + """ + do_for_all_timings(points, split_list) + + def parse_bench(filename, schema_filename): """Parse the input file |