From b62ace2740a106222e124cc86956448fa07abf4d Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Sun, 6 Feb 2022 00:54:18 -0600 Subject: x86: Improve vec generation in memset-vec-unaligned-erms.S No bug. Split vec generation into multiple steps. This allows the broadcast in AVX2 to use 'xmm' registers for the L(less_vec) case. This saves an expensive lane-cross instruction and removes the need for 'vzeroupper'. For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for byte broadcast. Results for memset-avx2 small (geomean of N = 20 benchset runs). size, New Time, Old Time, New / Old 0, 4.100, 3.831, 0.934 1, 5.074, 4.399, 0.867 2, 4.433, 4.411, 0.995 4, 4.487, 4.415, 0.984 8, 4.454, 4.396, 0.987 16, 4.502, 4.443, 0.987 All relevant string/wcsmbs tests are passing. Reviewed-by: H.J. Lu --- sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S') diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S index 1af668af0a..c0bf2875d0 100644 --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S @@ -10,15 +10,18 @@ # define VMOVU vmovdqu # define VMOVA vmovdqa -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ vmovd d, %xmm0; \ - movq r, %rax; \ - vpbroadcastb %xmm0, %ymm0 + movq r, %rax; -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ - vmovd d, %xmm0; \ - movq r, %rax; \ - vpbroadcastd %xmm0, %ymm0 +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + MEMSET_SET_VEC0_AND_SET_RETURN(d, r) + +# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0 +# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0 + +# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0 +# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0 # ifndef SECTION # define SECTION(p) p##.avx @@ -30,5 +33,6 @@ # define WMEMSET_SYMBOL(p,s) p##_avx2_##s # endif +# define USE_XMM_LESS_VEC # include "memset-vec-unaligned-erms.S" #endif -- cgit 1.4.1