-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S 6
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S 6
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S 6
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S 6
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S 6
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S 6
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S 6
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S 6
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S 6
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S 4
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S 4
-rw-r--r-- sysdeps/x86_64/multiarch/strrchr-avx2.S 2
75 files changed, 158 insertions, 158 deletions
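
Every hunk below makes the same substitution: in these AVX2 and AVX-512 SVML routines (plus strrchr-avx2.S), scalar instructions using the legacy SSE encoding (movsd, movss, movd, and one blendvps) are rewritten with their VEX-encoded forms (vmovsd, vmovss, vmovd, vblendvps). The affected loads and stores sit on the special-value fallback path, directly around the call to the scalar libm routine. A minimal sketch of the hazard this plausibly targets, assuming the usual AVX-to-SSE transition-penalty rationale (the sketch is illustrative and not part of the patch): on several Intel microarchitectures, executing a legacy SSE instruction while the upper 128 bits of the YMM registers are dirty forces the core to spill and later restore that upper state, whereas the VEX encoding zeroes the upper bits of the destination and keeps no such state.

	/* A 256-bit AVX operation leaves the upper YMM halves dirty.  */
	vmovupd	(%rdi), %ymm1
	/* Legacy SSE encoding: preserves %ymm0 bits 128-255, so the CPU
	   may stall to save and later restore the dirty upper state.  */
	movsd	32(%rsp), %xmm0
	/* VEX encoding of the same scalar load: zeroes %ymm0 bits 64-255,
	   so no SSE transition state is kept and no penalty applies.  */
	vmovsd	32(%rsp), %xmm0

For these scalar temporaries the upper-bit zeroing is harmless, since each value is consumed or stored immediately, so the substitution preserves behavior.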
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
index e19bddd2e2..73025e8b0f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
@@ -210,11 +210,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	acos@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
index f4c72c3618..b8cc6dd776 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
@@ -232,11 +232,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	acos@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S
index 5d0b23b72c..126110cf17 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S
@@ -372,11 +372,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	acosh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S
index b9a1131664..db0ef3b9dd 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S
@@ -317,11 +317,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	acosh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S
index ba96089504..612a45da30 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S
@@ -202,11 +202,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	asin@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S
index 0f5b773b04..e7b41ab232 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S
@@ -224,11 +224,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	asin@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S
index 131b716c95..1fcbb245b7 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S
@@ -429,11 +429,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	asinh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S
index 5bdc6859f0..8445fc8ba4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S
@@ -343,11 +343,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	asinh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S
index 1b601576cc..a45cae79a1 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S
@@ -277,12 +277,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
-	movsd	64(%rsp, %r14, 8), %xmm1
+	vmovsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm1
 	call	atan2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 96(%rsp, %r14, 8)
+	vmovsd	%xmm0, 96(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S
index ef9581075d..c3b0f7940c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S
@@ -295,12 +295,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
-	movsd	128(%rsp, %r14, 8), %xmm1
+	vmovsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	128(%rsp, %r14, 8), %xmm1
 	call	atan2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 192(%rsp, %r14, 8)
+	vmovsd	%xmm0, 192(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S
index b5cbfd224c..c9c41ef9f4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S
@@ -339,11 +339,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	atanh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S
index 3193c026dd..de4edb3cc0 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S
@@ -274,11 +274,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	atanh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S
index 96ecbe05c1..71a25f3db8 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S
@@ -262,11 +262,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	cbrt@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S
index 25df252108..a3d9104f5e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S
@@ -282,11 +282,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	cosh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S
index 066bbc7de6..4ff0e038a3 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S
@@ -231,11 +231,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	cosh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S
index c832b65e3e..6efd2e95ba 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S
@@ -258,11 +258,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	erfc@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S
index 77228814d3..42bdfe6f18 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S
@@ -261,11 +261,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	erfc@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S
index 7271bcc1d9..f519bcce45 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S
@@ -231,11 +231,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	exp10@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S
index 40b01c3cd0..3f0c670199 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S
@@ -191,11 +191,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	exp10@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S
index ced774e89c..afa00a38bb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S
@@ -223,11 +223,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	exp2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S
index 7a85fd8b18..eee785dbf5 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S
@@ -227,11 +227,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	exp2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S
index 590341c243..4a3202750f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S
@@ -205,11 +205,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	expm1@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S
index efae1f8b66..0fa17f3a73 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S
@@ -211,11 +211,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	expm1@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S
index ae5738c1b7..5c693d132e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S
@@ -231,12 +231,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
-	movsd	64(%rsp, %r14, 8), %xmm1
+	vmovsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm1
 	call	hypot@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 96(%rsp, %r14, 8)
+	vmovsd	%xmm0, 96(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S
index 0c404fd5ee..a392252c8b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S
@@ -194,12 +194,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
-	movsd	128(%rsp, %r14, 8), %xmm1
+	vmovsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	128(%rsp, %r14, 8), %xmm1
 	call	hypot@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 192(%rsp, %r14, 8)
+	vmovsd	%xmm0, 192(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S
index 2461c6ad56..9bf45a6dc2 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S
@@ -225,11 +225,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	log10@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S
index 5d129ef4e5..101618cce9 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S
@@ -207,11 +207,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	log10@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S
index 13235793e8..39ec0024cf 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S
@@ -263,11 +263,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	log1p@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S
index dd55b5dd18..3033fcb5b3 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S
@@ -225,11 +225,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	log1p@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S
index 25d2edaae5..84bdb2090d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S
@@ -223,11 +223,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	log2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S
index bcb6736dec..b3e9bb3ca4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S
@@ -205,11 +205,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	log2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S
index ae16600579..ad2a06ad37 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S
@@ -280,11 +280,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	sinh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S
index 075665d57d..7ca915e30f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S
@@ -271,11 +271,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	sinh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S
index 01c86736e7..f26daf316b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S
@@ -267,11 +267,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	tan@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S
index 376479035e..0c90328b0a 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S
@@ -239,11 +239,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	tan@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S
index 7ddf145b25..ea41d326eb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S
@@ -110,7 +110,7 @@ ENTRY(_ZGVdN4v_tanh_avx2)
 	vpcmpgtd %xmm11, %xmm9, %xmm10
 	vpcmpgtd %xmm8, %xmm9, %xmm0
 	vpand	%xmm10, %xmm9, %xmm7
-	blendvps %xmm0, %xmm8, %xmm7
+	vblendvps %xmm0, %xmm8, %xmm7, %xmm7
 
 	/*
 	 * VSHRIMM( I, iIndex, = iIndex, (17 - 4) );
@@ -272,11 +272,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	tanh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
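
The tanh hunk above also converts one non-mov instruction. Unlike the scalar loads and stores, blendvps changes its operand count when it gains a VEX prefix: the SSE4.1 encoding reads its mask from an implicit %xmm0 and reuses the destination as the first source, while vblendvps names all four operands explicitly, which is why the replacement grows a trailing %xmm7. A side-by-side sketch reusing the registers from the hunk (illustrative only, not part of the patch):

	/* SSE4.1: %xmm7 = blend(%xmm7, %xmm8); mask is implicitly %xmm0.  */
	blendvps  %xmm0, %xmm8, %xmm7
	/* VEX form, AT&T operand order (mask, src2, src1, dest): same
	   result, every operand explicit, upper %ymm7 bits zeroed.  */
	vblendvps %xmm0, %xmm8, %xmm7, %xmm7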
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S
index 82c0119500..c995401a24 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S
@@ -286,11 +286,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	tanh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
index 26fef1f268..fd84977e95 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
@@ -205,11 +205,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	acosf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
index bf28a5dd00..078fe5a898 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
@@ -198,11 +198,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	acosf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S
index 3f44e75248..65026e647d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S
@@ -290,11 +290,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	acoshf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S
index 3a70fc1448..489dac033c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S
@@ -286,11 +286,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	acoshf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S
index 4e9984d870..2accef703e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S
@@ -198,11 +198,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	asinf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S
index 59bea9dc42..257c8da2f7 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S
@@ -187,11 +187,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	asinf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S
index 6b569ecf41..a0c27922e4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S
@@ -313,11 +313,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	asinhf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S
index 794030a481..d6f6c3d5aa 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S
@@ -361,11 +361,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	asinhf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S
index 56aa5bb917..15ffa4b6c9 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S
@@ -257,12 +257,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
-	movss	128(%rsp, %r14, 4), %xmm1
+	vmovss	64(%rsp, %r14, 4), %xmm0
+	vmovss	128(%rsp, %r14, 4), %xmm1
 	call	atan2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 192(%rsp, %r14, 4)
+	vmovss	%xmm0, 192(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S
index 29ebbb6db2..08b18c3e3f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S
@@ -238,12 +238,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
-	movss	64(%rsp, %r14, 4), %xmm1
+	vmovss	32(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm1
 	call	atan2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 96(%rsp, %r14, 4)
+	vmovss	%xmm0, 96(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
index f42462c581..94186a14cb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
@@ -222,13 +222,13 @@ L(SPECIAL_VALUES_LOOP):
 	tzcntl	%ebx, %ebp
 
 	/* Scalar math fucntion call to process special input.  */
-	movss	64(%rsp, %rbp, 4), %xmm0
+	vmovss	64(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
 
 	/* No good way to avoid the store-forwarding fault this will cause on
 	   return. `lfence` avoids the SF fault but at greater cost as it
 	   serialized stack/callee save restoration.  */
-	movss	%xmm0, (%rsp, %rbp, 4)
+	vmovss	%xmm0, (%rsp, %rbp, 4)
 
 	blsrl   %ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
index 43eb423831..49ffd7a9b2 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
@@ -231,13 +231,13 @@ L(SPECIAL_VALUES_LOOP):
 	tzcntl	%ebx, %ebp
 
 	/* Scalar math fucntion call to process special input.  */
-	movss	32(%rsp, %rbp, 4), %xmm0
+	vmovss	32(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
 
 	/* No good way to avoid the store-forwarding fault this will cause on
 	   return. `lfence` avoids the SF fault but at greater cost as it
 	   serialized stack/callee save restoration.  */
-	movss	%xmm0, (%rsp, %rbp, 4)
+	vmovss	%xmm0, (%rsp, %rbp, 4)
 
 	blsrl   %ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S
index d24d36163d..14b58c171a 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S
@@ -304,11 +304,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	cbrtf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S
index 6b740bf866..d1a5ddf5b4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S
@@ -228,11 +228,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	coshf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S
index 6f29218af1..a00650ccd6 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S
@@ -242,11 +242,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	coshf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S
index 9daaa0c06d..5fb5b2f0f7 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S
@@ -218,11 +218,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	erfcf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S
index 4cafc1bcd5..60b9fab000 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S
@@ -243,11 +243,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	erfcf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S
index eb9f3f8d8b..10f0b2cb37 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S
@@ -186,11 +186,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	exp10f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S
index 11244d5a5f..275ab42529 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S
@@ -238,11 +238,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	exp10f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S
index 5b406c6e32..8a5f1e3985 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S
@@ -209,11 +209,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	exp2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S
index f7a80a4d64..cc87e66425 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S
@@ -188,11 +188,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	exp2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S
index 71d23e632c..7fe830daa4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S
@@ -194,11 +194,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	expm1f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S
index 73f862528a..d5d7fa2791 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S
@@ -212,11 +212,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	expm1f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S
index 548936fe61..c92e3ab065 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S
@@ -202,12 +202,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
-	movss	128(%rsp, %r14, 4), %xmm1
+	vmovss	64(%rsp, %r14, 4), %xmm0
+	vmovss	128(%rsp, %r14, 4), %xmm1
 	call	hypotf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 192(%rsp, %r14, 4)
+	vmovss	%xmm0, 192(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S
index fc97828008..7a26c5accc 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S
@@ -226,12 +226,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
-	movss	64(%rsp, %r14, 4), %xmm1
+	vmovss	32(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm1
 	call	hypotf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 96(%rsp, %r14, 4)
+	vmovss	%xmm0, 96(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S
index b192dfe464..0eb9b23c4e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S
@@ -161,11 +161,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	log10f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S
index ea51c28f81..4bdc62e90e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S
@@ -174,11 +174,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	log10f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S
index 8fa5068595..2c864f0c0e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S
@@ -207,11 +207,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	log1pf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S
index 54d6a9a685..7326a2b5ad 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S
@@ -190,11 +190,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	log1pf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S
index 3b0a28fee0..02b255dde8 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S
@@ -158,11 +158,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	log2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S
index eaa5112178..2245d40f84 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S
@@ -169,11 +169,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	log2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S
index fad4847f28..89be733eb2 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S
@@ -252,11 +252,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	sinhf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S
index 8c4b46cee2..e358e2efee 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S
@@ -243,11 +243,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	sinhf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
index f2a18f0b2c..4e18cdc0ce 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
@@ -235,11 +235,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	tanf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S
index cd33fac643..d34e61ac41 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S
@@ -261,11 +261,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%ebx, %r13d
-	movss	32(%rsp, %r13, 4), %xmm0
+	vmovss	32(%rsp, %r13, 4), %xmm0
 	call	tanf@PLT
 	# LOE r13 r14 r15 ebx r12d xmm0
 
-	movss	%xmm0, 64(%rsp, %r13, 4)
+	vmovss	%xmm0, 64(%rsp, %r13, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
index 7edc74a116..84f73fdaf9 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
@@ -221,13 +221,13 @@ L(SPECIAL_VALUES_LOOP):
 	tzcntl	%ebx, %ebp
 
 	/* Scalar math fucntion call to process special input.  */
-	movss	64(%rsp, %rbp, 4), %xmm0
+	vmovss	64(%rsp, %rbp, 4), %xmm0
 	call	tanhf@PLT
 
 	/* No good way to avoid the store-forwarding fault this will cause on
 	   return. `lfence` avoids the SF fault but at greater cost as it
 	   serialized stack/callee save restoration.  */
-	movss	%xmm0, (%rsp, %rbp, 4)
+	vmovss	%xmm0, (%rsp, %rbp, 4)
 
 	blsrl   %ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
index 55df346a00..ea3e9f4210 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
@@ -240,13 +240,13 @@ L(SPECIAL_VALUES_LOOP):
 	tzcntl	%ebx, %ebp
 
 	/* Scalar math function call to process special input.  */
-	movss	32(%rsp, %rbp, 4), %xmm0
+	vmovss	32(%rsp, %rbp, 4), %xmm0
 	call	tanhf@PLT
 
 	/* No good way to avoid the store-forwarding fault this will cause on
 	   return. `lfence` avoids the SF fault but at greater cost as it
 	   serialized stack/callee save restoration.  */
-	movss	%xmm0, (%rsp, %rbp, 4)
+	vmovss	%xmm0, (%rsp, %rbp, 4)
 
 	blsrl   %ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index bd26ba80d5..eb128a2ae3 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -49,7 +49,7 @@
 
 	.section SECTION(.text), "ax", @progbits
 ENTRY(STRRCHR)
-	movd	%esi, %xmm7
+	vmovd	%esi, %xmm7
 	movl	%edi, %eax
 	/* Broadcast CHAR to YMM4.  */
 	VPBROADCAST %xmm7, %ymm7