author    Noah Goldstein <goldstein.w.n@gmail.com>  2022-02-06 00:54:18 -0600
committer Noah Goldstein <goldstein.w.n@gmail.com>  2022-02-06 20:58:07 -0600
commit    b62ace2740a106222e124cc86956448fa07abf4d (patch)
tree      33113f4227f89fc389d82ec9a14834bb364704bb /sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
parent    d7fca835e064ead5a46914d5f3a2eda3cad5649f (diff)
x86: Improve vec generation in memset-vec-unaligned-erms.S
No bug.

Split vec generation into multiple steps. This allows the
broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
case. This saves an expensive lane-cross instruction and removes
the need for 'vzeroupper'.
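
Roughly, the effect on a small (L(less_vec)) AVX2 memset is the following; this is a
simplified illustrative sketch, not the literal code paths in
memset-vec-unaligned-erms.S:

        /* Before: the fill byte was always broadcast across the full
           ymm register, so even the less-than-one-VEC path paid for a
           lane-crossing broadcast and a trailing vzeroupper.  */
        vmovd   %esi, %xmm0
        vpbroadcastb %xmm0, %ymm0   /* crosses the 128-bit lane boundary */
        vmovdqu %xmm0, (%rdi)       /* the small store only needs xmm */
        vzeroupper                  /* needed because %ymm0 was dirtied */
        ret

        /* After: L(less_vec) broadcasts within xmm only, leaving the
           upper half of %ymm0 untouched, so no vzeroupper is needed.  */
        vmovd   %esi, %xmm0
        vpbroadcastb %xmm0, %xmm0   /* stays within one 128-bit lane */
        vmovdqu %xmm0, (%rdi)
        ret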

For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
byte broadcast.
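
The SSE2 change itself is outside this file's diff (this page is limited to the
AVX2 variant); for context, the 2x 'punpck' byte-broadcast sequence it refers to
is the classic SSE2 idiom, sketched below.

        /* Classic SSE2 byte broadcast of the fill byte in %esi:
           two punpck steps widen the byte to a dword, then pshufd
           splats that dword across the whole xmm register.  */
        movd    %esi, %xmm0
        punpcklbw %xmm0, %xmm0      /* c -> cc (byte to word) */
        punpcklwd %xmm0, %xmm0      /* cc -> cccc (word to dword) */
        pshufd  $0, %xmm0, %xmm0    /* splat dword to all 4 dwords */

Per the commit message, the two punpck steps are replaced by a form built around
a zero-idiom 'pxor' (xor of a register with itself is dependency-breaking and
essentially free on modern x86 cores).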

Results for memset-avx2 small (geomean of N = 20 benchset runs).

size, Old Time, New Time, New / Old
   0,    4.100,    3.831,     0.934
   1,    5.074,    4.399,     0.867
   2,    4.433,    4.411,     0.995
   4,    4.487,    4.415,     0.984
   8,    4.454,    4.396,     0.987
  16,    4.502,    4.443,     0.987

All relevant string/wcsmbs tests are passing.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Diffstat (limited to 'sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S')
-rw-r--r--   sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S   18
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 1af668af0a..c0bf2875d0 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -10,15 +10,18 @@
 # define VMOVU     vmovdqu
 # define VMOVA     vmovdqa
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastb %xmm0, %ymm0
+  movq r, %rax;
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
+
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
 
 # ifndef SECTION
 #  define SECTION(p)		p##.avx
@@ -30,5 +33,6 @@
 #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
 
+# define USE_XMM_LESS_VEC
 # include "memset-vec-unaligned-erms.S"
 #endif
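
For context, the generic memset-vec-unaligned-erms.S is expected to consume the
split macros along these lines; this is an illustrative sketch of the control
flow under that assumption, not the actual glibc source:

        /* Put the fill byte into %xmm0 and set the return value,
           without broadcasting yet.  */
        MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
        cmp     $VEC_SIZE, %rdx
        jb      L(less_vec)
        /* Large sizes: broadcast across the full vector width.  */
        MEMSET_VDUP_TO_VEC0_HIGH ()
        ...
L(less_vec):
        /* With USE_XMM_LESS_VEC, small sizes broadcast within xmm
           only, so the AVX2 path can skip vzeroupper here.  */
        MEMSET_VDUP_TO_VEC0_LOW ()
        ...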