x86: Only align destination to 1x VEC_SIZE in memset 4x loop

Current code aligns to 2x VEC_SIZE. Aligning to 2x has no effect on performance other than potentially resulting in an additional iteration of the loop. 1x maintains aligned stores (the only reason to align in this case) and doesn't incur any unnecessary loop iterations.

Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
author: Noah Goldstein <goldstein.w.n@gmail.com> 2023-11-01 15:30:26 -0500
committer: Noah Goldstein <goldstein.w.n@gmail.com> 2023-11-28 12:06:19 -0600
commit: 9469261cf1924d350feeec64d2c80cafbbdcdd4d (patch)
tree: 2d6599f49c3497e64b354a60a6ae4ee644587e05 /sysdeps/x86_64
parent: 3921c5b40f293c57cb326f58713c924b0662ef59 (diff)
download: glibc-9469261cf1924d350feeec64d2c80cafbbdcdd4d.tar.gz
glibc-9469261cf1924d350feeec64d2c80cafbbdcdd4d.tar.xz
glibc-9469261cf1924d350feeec64d2c80cafbbdcdd4d.zip
1 file changed, 1 insertion, 1 deletion
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 3d9ad49cb9..0f0636b90f 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -293,7 +293,7 @@ L(more_2x_vec):
 	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
 #endif
 	/* Align dst for loop.  */
-	andq	$(VEC_SIZE * -2), %LOOP_REG
+	andq	$(VEC_SIZE * -1), %LOOP_REG
 	.p2align 4
 L(loop):
 	VMOVA	%VMM(0), LOOP_4X_OFFSET(%LOOP_REG)
author	Noah Goldstein <goldstein.w.n@gmail.com>	2023-11-01 15:30:26 -0500
committer	Noah Goldstein <goldstein.w.n@gmail.com>	2023-11-28 12:06:19 -0600
commit	9469261cf1924d350feeec64d2c80cafbbdcdd4d (patch)
tree	2d6599f49c3497e64b354a60a6ae4ee644587e05 /sysdeps/x86_64
parent	3921c5b40f293c57cb326f58713c924b0662ef59 (diff)
download	glibc-9469261cf1924d350feeec64d2c80cafbbdcdd4d.tar.gz glibc-9469261cf1924d350feeec64d2c80cafbbdcdd4d.tar.xz glibc-9469261cf1924d350feeec64d2c80cafbbdcdd4d.zip