diff options
author    Noah Goldstein <goldstein.w.n@gmail.com>   2022-02-06 00:54:18 -0600
committer Noah Goldstein <goldstein.w.n@gmail.com>   2022-02-06 20:58:07 -0600
commit    b62ace2740a106222e124cc86956448fa07abf4d (patch)
tree      33113f4227f89fc389d82ec9a14834bb364704bb /sysdeps/x86_64/memset.S
parent    d7fca835e064ead5a46914d5f3a2eda3cad5649f (diff)
download  glibc-b62ace2740a106222e124cc86956448fa07abf4d.tar.gz
          glibc-b62ace2740a106222e124cc86956448fa07abf4d.tar.xz
          glibc-b62ace2740a106222e124cc86956448fa07abf4d.zip
x86: Improve vec generation in memset-vec-unaligned-erms.S
No bug. Split vec generation into multiple steps. This allows the
broadcast in AVX2 to use 'xmm' registers for the L(less_vec) case.
This saves an expensive lane-cross instruction and removes the need
for 'vzeroupper'.

For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
byte broadcast.

Results for memset-avx2 small (geomean of N = 20 benchset runs).

size, New Time, Old Time, New / Old
   0,    4.100,    3.831,     0.934
   1,    5.074,    4.399,     0.867
   2,    4.433,    4.411,     0.995
   4,    4.487,    4.415,     0.984
   8,    4.454,    4.396,     0.987
  16,    4.502,    4.443,     0.987

All relevant string/wcsmbs tests are passing.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Diffstat (limited to 'sysdeps/x86_64/memset.S')
-rw-r--r--  sysdeps/x86_64/memset.S  21
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 65c09bd0ac..ccf036be53 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -28,17 +28,22 @@
 #define VMOVU		movups
 #define VMOVA		movaps
 
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
-  movq r, %rax; \
-  punpcklbw %xmm0, %xmm0; \
-  punpcklwd %xmm0, %xmm0; \
-  pshufd $0, %xmm0, %xmm0
+  pxor %xmm1, %xmm1; \
+  pshufb %xmm1, %xmm0; \
+  movq r, %rax
 
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
-  movq r, %rax; \
-  pshufd $0, %xmm0, %xmm0
+  pshufd $0, %xmm0, %xmm0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 #define SECTION(p)		p