diff options
-rw-r--r-- | sysdeps/x86_64/fpu/Makeconfig | 35 | ||||
-rw-r--r-- | sysdeps/x86_64/fpu/Makefile | 40 | ||||
-rw-r--r-- | sysdeps/x86_64/fpu/bench-libmvec-skeleton.c | 103 | ||||
-rwxr-xr-x | sysdeps/x86_64/fpu/scripts/bench_libmvec.py | 464 |
4 files changed, 642 insertions, 0 deletions
diff --git a/sysdeps/x86_64/fpu/Makeconfig b/sysdeps/x86_64/fpu/Makeconfig index 24aaee1a43..503e9b5ffa 100644 --- a/sysdeps/x86_64/fpu/Makeconfig +++ b/sysdeps/x86_64/fpu/Makeconfig @@ -29,6 +29,23 @@ libmvec-funcs = \ sin \ sincos \ +# Define libmvec function for benchtests directory. +libmvec-bench-funcs = \ + +bench-libmvec-double = \ + $(addprefix double-vlen1-, $(libmvec-bench-funcs)) \ + $(addprefix double-vlen2-, $(libmvec-bench-funcs)) \ + $(addprefix double-vlen4-, $(libmvec-bench-funcs)) \ + $(addprefix double-vlen4-avx2-, $(libmvec-bench-funcs)) \ + $(addprefix double-vlen8-, $(libmvec-bench-funcs)) \ + +bench-libmvec-float = \ + $(addsuffix f, $(addprefix float-vlen1-, $(libmvec-bench-funcs))) \ + $(addsuffix f, $(addprefix float-vlen4-, $(libmvec-bench-funcs))) \ + $(addsuffix f, $(addprefix float-vlen8-, $(libmvec-bench-funcs))) \ + $(addsuffix f, $(addprefix float-vlen8-avx2-, $(libmvec-bench-funcs))) \ + $(addsuffix f, $(addprefix float-vlen16-, $(libmvec-bench-funcs))) \ + # The base libmvec ABI tests. libmvec-abi-func-tests = \ $(addprefix test-double-libmvec-,$(libmvec-funcs)) \ @@ -83,5 +100,23 @@ $(common-objpfx)libmvec.mk: $(common-objpfx)config.make echo " \$$(float-vlen16-arch-ext-cflags)"; \ echo; \ done; \ + echo "endif"; \ + echo "ifeq (\$$(subdir),benchtests)"; \ + for t in $(libmvec-bench-funcs); do \ + echo "CFLAGS-bench-double-vlen4-$$t.c = \\"; \ + echo " \$$(double-vlen4-arch-ext-cflags)"; \ + echo "CFLAGS-bench-double-vlen4-avx2-$$t.c = \\"; \ + echo " \$$(double-vlen4-arch-ext2-cflags)"; \ + echo "CFLAGS-bench-double-vlen8-$$t.c = \\"; \ + echo " \$$(double-vlen8-arch-ext-cflags)"; \ + echo; \ + echo "CFLAGS-bench-float-vlen8-$${t}f.c = \\"; \ + echo " \$$(float-vlen8-arch-ext-cflags)"; \ + echo "CFLAGS-bench-float-vlen8-avx2-$${t}f.c = \\"; \ + echo " \$$(float-vlen8-arch-ext2-cflags)"; \ + echo "CFLAGS-bench-float-vlen16-$${t}f.c = \\"; \ + echo " \$$(float-vlen16-arch-ext-cflags)"; \ + echo; \ + done; \ echo "endif") > $@T mv -f $@T $@ diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile index d172ae815d..9fb587cf8f 100644 --- a/sysdeps/x86_64/fpu/Makefile +++ b/sysdeps/x86_64/fpu/Makefile @@ -72,3 +72,43 @@ ifeq ($(subdir)$(config-cflags-mprefer-vector-width),mathyes) # performance of sin and cos by more than 40% on Skylake. CFLAGS-branred.c = -mprefer-vector-width=128 endif + +ifeq ($(subdir),benchtests) +double-vlen4-arch-ext-cflags = -mavx +double-vlen4-arch-ext2-cflags = -mavx2 +double-vlen8-arch-ext-cflags = -mavx512f + +float-vlen8-arch-ext-cflags = -mavx +float-vlen8-arch-ext2-cflags = -mavx2 +float-vlen16-arch-ext-cflags = -mavx512f + +bench-libmvec := $(bench-libmvec-double) $(bench-libmvec-float) + +ifeq (${BENCHSET},) +bench += $(bench-libmvec) +endif + +ifeq (${STATIC-BENCHTESTS},yes) +libmvec-benchtests = $(common-objpfx)mathvec/libmvec.a $(common-objpfx)math/libm.a +else +libmvec-benchtests = $(libmvec) $(libm) +endif + +$(addprefix $(objpfx)bench-,$(bench-libmvec-double)): $(libmvec-benchtests) +$(addprefix $(objpfx)bench-,$(bench-libmvec-float)): $(libmvec-benchtests) +bench-libmvec-deps = $(..)sysdeps/x86_64/fpu/bench-libmvec-skeleton.c bench-timing.h Makefile + +$(objpfx)bench-float-%.c: $(bench-libmvec-deps) + { if [ -n "$($*-INCLUDE)" ]; then \ + cat $($*-INCLUDE); \ + fi; \ + $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py $(basename $(@F)); } > $@-tmp + mv -f $@-tmp $@ + +$(objpfx)bench-double-%.c: $(bench-libmvec-deps) + { if [ -n "$($*-INCLUDE)" ]; then \ + cat $($*-INCLUDE); \ + fi; \ + $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py $(basename $(@F)); } > $@-tmp + mv -f $@-tmp $@ +endif diff --git a/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c new file mode 100644 index 0000000000..ee025594a5 --- /dev/null +++ b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c @@ -0,0 +1,103 @@ +/* Skeleton for libmvec benchmark programs. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <string.h> +#include <stdint.h> +#include <stdbool.h> +#include <stdio.h> +#include <time.h> +#include <inttypes.h> +#include <bench-timing.h> +#include <json-lib.h> +#include <bench-util.h> +#include <math-tests-arch.h> + +#include <bench-util.c> +#define D_ITERS 10000 + +int +main (int argc, char **argv) +{ + unsigned long i, k; + timing_t start, end; + json_ctx_t json_ctx; + +#if defined REQUIRE_AVX + if (!CPU_FEATURE_ACTIVE (AVX)) + { + printf ("AVX not supported.\n"); + return 0; + } +#elif defined REQUIRE_AVX2 + if (!CPU_FEATURE_ACTIVE (AVX2)) + { + printf ("AVX2 not supported.\n"); + return 0; + } +#elif defined REQUIRE_AVX512F + if (!CPU_FEATURE_ACTIVE (AVX512F)) + { + printf ("AVX512F not supported.\n"); + return 0; + } +#endif + + bench_start (); + +#ifdef BENCH_INIT + BENCH_INIT (); +#endif + + json_init (&json_ctx, 2, stdout); + + /* Begin function. */ + json_attr_object_begin (&json_ctx, FUNCNAME); + + for (int v = 0; v < NUM_VARIANTS; v++) + { + double d_total_time = 0; + timing_t cur; + for (k = 0; k < D_ITERS; k++) + { + TIMING_NOW (start); + for (i = 0; i < NUM_SAMPLES (v); i++) + BENCH_FUNC (v, i); + TIMING_NOW (end); + + TIMING_DIFF (cur, start, end); + + TIMING_ACCUM (d_total_time, cur); + } + double d_total_data_set = D_ITERS * NUM_SAMPLES (v) * STRIDE; + + /* Begin variant. */ + json_attr_object_begin (&json_ctx, VARIANT (v)); + + json_attr_double (&json_ctx, "duration", d_total_time); + json_attr_double (&json_ctx, "iterations", d_total_data_set); + json_attr_double (&json_ctx, "mean", d_total_time / d_total_data_set); + + /* End variant. */ + json_attr_object_end (&json_ctx); + } + + /* End function. */ + json_attr_object_end (&json_ctx); + + return 0; +} diff --git a/sysdeps/x86_64/fpu/scripts/bench_libmvec.py b/sysdeps/x86_64/fpu/scripts/bench_libmvec.py new file mode 100755 index 0000000000..762865de8f --- /dev/null +++ b/sysdeps/x86_64/fpu/scripts/bench_libmvec.py @@ -0,0 +1,464 @@ +#!/usr/bin/python3 +# Copyright (C) 2021 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <https://www.gnu.org/licenses/>. + +"""Benchmark program generator script + +This script takes a function name as input and generates a program using +an libmvec input file located in the sysdeps/x86_64/fpu directory. The +name of the input file should be of the form libmvec-foo-inputs where +'foo' is the name of the function. +""" + +from __future__ import print_function +import sys +import os +import itertools +import re + +# Macro definitions for functions that take no arguments. For functions +# that take arguments, the STRUCT_TEMPLATE, ARGS_TEMPLATE and +# VARIANTS_TEMPLATE are used instead. +DEFINES_TEMPLATE = ''' +#define CALL_BENCH_FUNC(v, i) %(func)s () +#define NUM_VARIANTS (1) +#define NUM_SAMPLES(v) (1) +#define VARIANT(v) FUNCNAME "()" +''' + +# Structures to store arguments for the function call. A function may +# have its inputs partitioned to represent distinct performance +# characteristics or distinct flavors of the function. Each such +# variant is represented by the _VARIANT structure. The ARGS structure +# represents a single set of arguments. +BENCH_VEC_TEMPLATE = ''' +#define CALL_BENCH_FUNC(v, i) (__extension__ ({ \\ + %(defs)s mx0 = %(func)s (%(func_args)s); \\ + mx0; })) +''' + +BENCH_SCALAR_TEMPLATE = ''' +#define CALL_BENCH_FUNC(v, i) %(func)s (%(func_args)s) +''' + +STRUCT_TEMPLATE = '''struct args +{ +%(args)s + double timing; +}; + +struct _variants +{ + const char *name; + int count; + struct args *in; +}; +''' + +# The actual input arguments. +ARGS_TEMPLATE = '''struct args in%(argnum)d[%(num_args)d] = { +%(args)s +}; +''' + +# The actual variants, along with macros defined to access the variants. +VARIANTS_TEMPLATE = '''struct _variants variants[%(num_variants)d] = { +%(variants)s +}; + +#define NUM_VARIANTS %(num_variants)d +#define NUM_SAMPLES(i) (variants[i].count) +#define VARIANT(i) (variants[i].name) +''' + +# Epilogue for the generated source file. +EPILOGUE = ''' +#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j);}) +#define FUNCNAME "%(func)s" +#include <bench-libmvec-skeleton.c>''' + + +def gen_source(func_types, directives, all_vals): + """Generate source for the function + + Generate the C source for the function from the values and + directives. + + Args: + func: The function name + directives: A dictionary of directives applicable to this function + all_vals: A dictionary input values + """ + # The includes go in first. + for header in directives['includes']: + print('#include <%s>' % header) + + for header in directives['include-sources']: + print('#include "%s"' % header) + + argtype_vtable = { + 2: '128', + 4: '256', + 8: '512' + } + prefix_vtable = { + 2: 'b', + 4: 'c', + 8: 'e' + } + + # Get all the function properties + funcname_argtype = '' + float_flag = False + if func_types[1] == 'float': + float_flag = True + avx_flag = False + if func_types[3] == 'avx2': + avx_flag = True + funcname_stride = int(func_types[2][4:]) + funcname_origin = func_types[-1] + if float_flag: + funcname_origin = funcname_origin[:-1] + + if funcname_stride == 1: + # Prepare for scalar functions file generation + funcname_prefix = '' + funcname_prefix_1 = '' + funcname_argtype = 'double' + if float_flag: + funcname_argtype = 'float' + else: + # Prepare for libmvec functions file generation + funcname_prefix_1 = len(directives['args']) * 'v' + '_' + aligned_stride = funcname_stride + if float_flag: + aligned_stride /= 2 + funcname_prefix = '_ZGV' + if (avx_flag and (aligned_stride == 4)): + funcname_prefix += 'd' + else: + funcname_prefix += prefix_vtable[aligned_stride] + funcname_prefix = funcname_prefix + 'N' + func_types[2][4:] + funcname_argtype = '__m' + argtype_vtable[aligned_stride] + if not float_flag: + funcname_argtype += 'd' + + # Include x86intrin.h for vector functions + if not funcname_stride == 1: + print('#include <x86intrin.h>') + if (avx_flag and (aligned_stride == 4)): + # For bench-float-vlen8-avx2* and bench-double-vlen4-avx2* + print('#define REQUIRE_AVX2') + elif aligned_stride == 8: + # For bench-float-vlen16* and bench-double-vlen8* + print('#define REQUIRE_AVX512F') + elif aligned_stride == 4: + # For bench-float-vlen8* and bench-double-vlen4* without avx2 + print('#define REQUIRE_AVX') + else: + print('#define FUNCTYPE %s' % funcname_argtype) + + print('#define STRIDE %d ' % funcname_stride) + + funcname = funcname_prefix + funcname_prefix_1 + funcname_origin + if float_flag: + funcname += 'f' + + funcname_rettype = funcname_argtype + if directives['ret'] == '': + funcname_rettype = 'void' + + funcname_inputtype = [] + for arg, i in zip(directives['args'], itertools.count()): + funcname_inputtype.append(funcname_argtype) + if arg[0] == '<' and arg[-1] == '>': + pos = arg.rfind('*') + if pos == -1: + die('Output argument must be a pointer type') + funcname_inputtype[i] += ' *' + + if not funcname_stride == 1: + if len(directives['args']) == 2: + print('extern %s %s (%s, %s);' % (funcname_rettype, funcname, funcname_inputtype[0], funcname_inputtype[1])) + elif len(directives['args']) == 3: + print('extern %s %s (%s, %s, %s);' % (funcname_rettype, funcname, funcname_inputtype[0], funcname_inputtype[1], funcname_inputtype[2])) + else: + print('extern %s %s (%s);' % (funcname_rettype, funcname, funcname_inputtype[0])) + + # Print macros. This branches out to a separate routine if + # the function takes arguments. + if not directives['args']: + print(DEFINES_TEMPLATE % {'funcname': funcname}) + outargs = [] + else: + outargs = _print_arg_data(funcname, float_flag, funcname_argtype, funcname_stride, directives, all_vals) + + # Print the output variable definitions if necessary. + for out in outargs: + print(out) + + # If we have a return value from the function, make sure it is + # assigned to prevent the compiler from optimizing out the + # call. + getret = '' + + if directives['ret']: + if funcname_argtype != '': + print('static %s volatile ret;' % funcname_argtype) + getret = 'ret =' + else: + print('static %s volatile ret;' % directives['ret']) + getret = 'ret =' + + # Test initialization. + if directives['init']: + print('#define BENCH_INIT %s' % directives['init']) + + print(EPILOGUE % {'getret': getret, 'func': funcname}) + + +def _print_arg_data(func, float_flag, funcname_argtype, funcname_stride, directives, all_vals): + """Print argument data + + This is a helper function for gen_source that prints structure and + values for arguments and their variants and returns output arguments + if any are found. + + Args: + func: Function name + float_flag: True if function is float type + funcname_argtype: Type for vector variants + funcname_stride: Vector Length + directives: A dictionary of directives applicable to this function + all_vals: A dictionary input values + + Returns: + Returns a list of definitions for function arguments that act as + output parameters. + """ + # First, all of the definitions. We process writing of + # CALL_BENCH_FUNC, struct args and also the output arguments + # together in a single traversal of the arguments list. + func_args = [] + _func_args = [] + arg_struct = [] + outargs = [] + # Conversion function for each type + vtable = { + '__m128d': '_mm_loadu_pd', + '__m256d': '_mm256_loadu_pd', + '__m512d': '_mm512_loadu_pd', + '__m128': '_mm_loadu_ps', + '__m256': '_mm256_loadu_ps', + '__m512': '_mm512_loadu_ps', + 'double': '', + 'float': '' + } + + # For double max_vlen=8, for float max_vlen=16. + if float_flag == True: + max_vlen = 16 + else: + max_vlen = 8 + + for arg, i in zip(directives['args'], itertools.count()): + if arg[0] == '<' and arg[-1] == '>': + outargs.append('static %s out%d __attribute__((used));' % (funcname_argtype, i)) + func_args.append('&out%d' % i) + _func_args.append('&out%d' % i) + else: + arg_struct.append(' %s arg%d[STRIDE];' % (arg, i)) + func_args.append('%s (variants[v].in[i].arg%d)' % + (vtable[funcname_argtype], i)) + _func_args.append('variants[v].in[i].arg%d[0]' % i) + + if funcname_stride == 1: + print(BENCH_SCALAR_TEMPLATE % {'func': func, + 'func_args': ', '.join(_func_args)}) + elif directives['ret'] == '': + print(BENCH_SCALAR_TEMPLATE % {'func': func, + 'func_args': ', '.join(func_args)}) + else: + print(BENCH_VEC_TEMPLATE % {'func': func, 'func_args': ', '.join(func_args), + 'defs': funcname_argtype}) + print(STRUCT_TEMPLATE % {'args': '\n'.join(arg_struct)}) + + # Now print the values. + variants = [] + for (k, _vals), i in zip(all_vals.items(), itertools.count()): + vals = [] + temp_vals = [] + j = 0 + temp_j = 0 + result_v = ['', '', ''] + for _v in _vals: + nums = _v.split(',') + for l in range(0, len(nums)): + result_v[l] = result_v[l] + nums[l].strip() + ',' + j += 1 + temp_j += 1 + + if temp_j == funcname_stride: + final_result = '' + for l in range(0, len(nums)): + final_result = final_result + '{' + result_v[l][:-1] + '},' + temp_vals.append(final_result[:-1]) + temp_j = 0 + result_v = ['', '', ''] + + # Make sure amount of test data is multiple of max_vlen + # to keep data size same for all vector length. + if j == max_vlen: + vals.extend(temp_vals) + temp_vals = [] + j = 0 + + out = [' {%s, 0},' % v for v in vals] + + # Members for the variants structure list that we will + # print later. + variants.append(' {"%s", %d, in%d},' % (k, len(vals), i)) + print(ARGS_TEMPLATE % {'argnum': i, 'num_args': len(vals), + 'args': '\n'.join(out)}) + + # Print the variants and the last set of macros. + print(VARIANTS_TEMPLATE % {'num_variants': len(all_vals), + 'variants': '\n'.join(variants)}) + return outargs + + +def _process_directive(d_name, d_val, func_args): + """Process a directive. + + Evaluate the directive name and value passed and return the + processed value. This is a helper function for parse_file. + + Args: + d_name: Name of the directive + d_val: The string value to process + + Returns: + The processed value, which may be the string as it is or an object + that describes the directive. + """ + # Process the directive values if necessary. name and ret don't + # need any processing. + if d_name.startswith('include'): + d_val = d_val.split(',') + elif d_name == 'args': + d_val = d_val.split(':') + # Check if args type match + if not d_val[0] == func_args: + die("Args mismatch, should be %s, but get %s" % (d_val[0], func_args)) + + # Return the values. + return d_val + + +def parse_file(func_types): + """Parse an input file + + Given a function name, open and parse an input file for the function + and get the necessary parameters for the generated code and the list + of inputs. + + Args: + func: The function name + + Returns: + A tuple of two elements, one a dictionary of directives and the + other a dictionary of all input values. + """ + all_vals = {} + # Valid directives. + directives = { + 'name': '', + 'args': [], + 'includes': [], + 'include-sources': [], + 'ret': '', + 'init': '' + } + + func = func_types[-1] + try: + with open('../sysdeps/x86_64/fpu/libmvec-%s-inputs' % func) as f: + for line in f: + # Look for directives and parse it if found. + if line.startswith('##'): + try: + d_name, d_val = line[2:].split(':', 1) + d_name = d_name.strip() + d_val = d_val.strip() + directives[d_name] = _process_directive(d_name, d_val, func_types[1]) + except (IndexError, KeyError): + die('Invalid directive: %s' % line[2:]) + + # Skip blank lines and comments. + line = line.split('#', 1)[0].rstrip() + if not line: + continue + + # Otherwise, we're an input. Add to the appropriate + # input set. + cur_name = directives['name'] + all_vals.setdefault(cur_name, []) + all_vals[cur_name].append(line) + except IOError as ex: + die("Failed to open input file (%s): %s" % (ex.filename, ex.strerror)) + + return directives, all_vals + + +def die(msg): + """Exit with an error + + Prints an error message to the standard error stream and exits with + a non-zero status. + + Args: + msg: The error message to print to standard error + """ + print('%s\n' % msg, file=sys.stderr) + sys.exit(os.EX_DATAERR) + + +def main(args): + """Main function + + Use the first command line argument as function name and parse its + input file to generate C source that calls the function repeatedly + for the input. + + Args: + args: The command line arguments with the program name dropped + + Returns: + os.EX_USAGE on error and os.EX_OK on success. + """ + if len(args) != 1: + print('Usage: %s <function>' % sys.argv[0]) + return os.EX_USAGE + + func_types = args[0].split('-') + directives, all_vals = parse_file(func_types) + gen_source(func_types, directives, all_vals) + return os.EX_OK + + +if __name__ == '__main__': + sys.exit(main(sys.argv[1:])) |