about summary refs log tree commit diff
diff options
context:
space:
mode:
authorH.J. Lu <hjl.tools@gmail.com>2021-03-05 07:26:42 -0800
committerH.J. Lu <hjl.tools@gmail.com>2022-01-27 12:47:19 -0800
commit5f59aaddc9be84a6ce8c71cab90ee08dbbd72af5 (patch)
tree49bd6c60949e027f9f1b38436f7fb4ba0e569b73
parentd584356fe8417790d20f024c5233c2d35a615ea2 (diff)
downloadglibc-5f59aaddc9be84a6ce8c71cab90ee08dbbd72af5.tar.gz
glibc-5f59aaddc9be84a6ce8c71cab90ee08dbbd72af5.tar.xz
glibc-5f59aaddc9be84a6ce8c71cab90ee08dbbd72af5.zip
x86-64: Add AVX optimized string/memory functions for RTM
Since VZEROUPPER triggers RTM abort while VZEROALL won't, select AVX
optimized string/memory functions with

	xtest
	jz	1f
	vzeroall
	ret
1:
	vzeroupper
	ret

at function exit on processors with usable RTM, but without 256-bit EVEX
instructions to avoid VZEROUPPER inside a transactionally executing RTM
region.

(cherry picked from commit 7ebba91361badf7531d4e75050627a88d424872f)
-rw-r--r--sysdeps/x86_64/multiarch/Makefile21
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-avx2.h4
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-impl-list.c146
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-memcmp.h4
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-memmove.h12
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-memset.h12
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-wmemset.h5
-rw-r--r--sysdeps/x86_64/multiarch/memchr-avx2-rtm.S12
-rw-r--r--sysdeps/x86_64/multiarch/memchr-avx2.S45
-rw-r--r--sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S12
-rw-r--r--sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S28
-rw-r--r--sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S17
-rw-r--r--sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S33
-rw-r--r--sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S12
-rw-r--r--sysdeps/x86_64/multiarch/memrchr-avx2.S53
-rw-r--r--sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S10
-rw-r--r--sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S12
-rw-r--r--sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S41
-rw-r--r--sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S4
-rw-r--r--sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S4
-rw-r--r--sysdeps/x86_64/multiarch/strcat-avx2-rtm.S12
-rw-r--r--sysdeps/x86_64/multiarch/strchr-avx2-rtm.S12
-rw-r--r--sysdeps/x86_64/multiarch/strchr-avx2.S22
-rw-r--r--sysdeps/x86_64/multiarch/strchr.c4
-rw-r--r--sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S3
-rw-r--r--sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S12
-rw-r--r--sysdeps/x86_64/multiarch/strcmp-avx2.S55
-rw-r--r--sysdeps/x86_64/multiarch/strcmp.c4
-rw-r--r--sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S12
-rw-r--r--sysdeps/x86_64/multiarch/strlen-avx2-rtm.S12
-rw-r--r--sysdeps/x86_64/multiarch/strlen-avx2.S43
-rw-r--r--sysdeps/x86_64/multiarch/strncat-avx2-rtm.S3
-rw-r--r--sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S3
-rw-r--r--sysdeps/x86_64/multiarch/strncmp.c4
-rw-r--r--sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S3
-rw-r--r--sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S4
-rw-r--r--sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S12
-rw-r--r--sysdeps/x86_64/multiarch/strrchr-avx2.S19
-rw-r--r--sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S3
-rw-r--r--sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S4
-rw-r--r--sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S4
-rw-r--r--sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S5
-rw-r--r--sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S5
-rw-r--r--sysdeps/x86_64/multiarch/wcsnlen.c4
-rw-r--r--sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S3
-rw-r--r--sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S4
-rw-r--r--sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S4
-rw-r--r--sysdeps/x86_64/sysdep.h22
48 files changed, 594 insertions, 190 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 141585a984..49fe1aaef7 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -41,6 +41,19 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
 		   memset-sse2-unaligned-erms \
 		   memset-avx2-unaligned-erms \
 		   memset-avx512-unaligned-erms \
+		   memchr-avx2-rtm \
+		   memcmp-avx2-movbe-rtm \
+		   memmove-avx-unaligned-erms-rtm \
+		   memrchr-avx2-rtm \
+		   memset-avx2-unaligned-erms-rtm \
+		   rawmemchr-avx2-rtm \
+		   strchr-avx2-rtm \
+		   strcmp-avx2-rtm \
+		   strchrnul-avx2-rtm \
+		   strlen-avx2-rtm \
+		   strncmp-avx2-rtm \
+		   strnlen-avx2-rtm \
+		   strrchr-avx2-rtm \
 		   memchr-evex \
 		   memcmp-evex-movbe \
 		   memmove-evex-unaligned-erms \
@@ -77,6 +90,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
 		   wcsrchr-sse2 wcsrchr-avx2 \
 		   wcsnlen-sse4_1 wcsnlen-c \
 		   wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
+		   wcschr-avx2-rtm \
+		   wcscmp-avx2-rtm \
+		   wcslen-avx2-rtm \
+		   wcsncmp-avx2-rtm \
+		   wcsnlen-avx2-rtm \
+		   wcsrchr-avx2-rtm \
+		   wmemchr-avx2-rtm \
+		   wmemcmp-avx2-movbe-rtm \
 		   wcschr-evex \
 		   wcscmp-evex \
 		   wcslen-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
index d7875db6e2..348d3d0531 100644
--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
+++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
@@ -21,6 +21,7 @@
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
@@ -36,6 +37,9 @@ IFUNC_SELECTOR (void)
 	  && CPU_FEATURES_CPU_P (cpu_features, BMI2))
 	return OPTIMIZE (evex);
 
+      if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 	return OPTIMIZE (avx2);
     }
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 13b0599bcc..8bc42e7813 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -44,6 +44,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __memchr_avx2)
 	      IFUNC_IMPL_ADD (array, i, memchr,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __memchr_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, memchr,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)
 			       && HAS_CPU_FEATURE (BMI2)),
@@ -57,6 +61,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && HAS_CPU_FEATURE (MOVBE)),
 			      __memcmp_avx2_movbe)
 	      IFUNC_IMPL_ADD (array, i, memcmp,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (MOVBE)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __memcmp_avx2_movbe_rtm)
+	      IFUNC_IMPL_ADD (array, i, memcmp,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)
 			       && HAS_CPU_FEATURE (MOVBE)),
@@ -86,6 +95,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memmove_chk_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      (HAS_ARCH_FEATURE (AVX_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __memmove_chk_avx_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      (HAS_ARCH_FEATURE (AVX_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __memmove_chk_avx_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      HAS_ARCH_FEATURE (AVX512VL_Usable),
 			      __memmove_chk_evex_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
@@ -114,6 +131,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memmove_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memmove,
+			      (HAS_ARCH_FEATURE (AVX_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __memmove_avx_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      (HAS_ARCH_FEATURE (AVX_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __memmove_avx_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, memmove,
 			      HAS_ARCH_FEATURE (AVX512VL_Usable),
 			      __memmove_evex_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memmove,
@@ -144,6 +169,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __memrchr_avx2)
 	      IFUNC_IMPL_ADD (array, i, memrchr,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __memrchr_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, memrchr,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)),
 			      __memrchr_evex)
@@ -166,6 +195,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __memset_chk_avx2_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __memset_chk_avx2_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __memset_chk_avx2_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)),
 			      __memset_chk_evex_unaligned)
@@ -199,6 +236,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __memset_avx2_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memset,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __memset_avx2_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __memset_avx2_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, memset,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)),
 			      __memset_evex_unaligned)
@@ -223,6 +268,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __rawmemchr_avx2)
 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __rawmemchr_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, rawmemchr,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)
 			       && HAS_CPU_FEATURE (BMI2)),
@@ -235,6 +284,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __strlen_avx2)
 	      IFUNC_IMPL_ADD (array, i, strlen,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __strlen_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, strlen,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)),
 			      __strlen_evex)
@@ -246,6 +299,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __strnlen_avx2)
 	      IFUNC_IMPL_ADD (array, i, strnlen,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __strnlen_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, strnlen,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)),
 			      __strnlen_evex)
@@ -318,6 +375,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __strchr_avx2)
 	      IFUNC_IMPL_ADD (array, i, strchr,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __strchr_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, strchr,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)
 			       && HAS_CPU_FEATURE (BMI2)),
@@ -331,6 +392,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __strchrnul_avx2)
 	      IFUNC_IMPL_ADD (array, i, strchrnul,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __strchrnul_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, strchrnul,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)
 			       && HAS_CPU_FEATURE (BMI2)),
@@ -343,6 +408,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __strrchr_avx2)
 	      IFUNC_IMPL_ADD (array, i, strrchr,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __strrchr_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, strrchr,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)),
 			      __strrchr_evex)
@@ -354,6 +423,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __strcmp_avx2)
 	      IFUNC_IMPL_ADD (array, i, strcmp,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __strcmp_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, strcmp,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)
 			       && HAS_CPU_FEATURE (BMI2)),
@@ -458,6 +531,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __wcschr_avx2)
 	      IFUNC_IMPL_ADD (array, i, wcschr,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __wcschr_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, wcschr,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)
 			       && HAS_CPU_FEATURE (BMI2)),
@@ -470,6 +547,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __wcsrchr_avx2)
 	      IFUNC_IMPL_ADD (array, i, wcsrchr,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __wcsrchr_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, wcsrchr,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)
 			       && HAS_CPU_FEATURE (BMI2)),
@@ -482,6 +563,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __wcscmp_avx2)
 	      IFUNC_IMPL_ADD (array, i, wcscmp,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __wcscmp_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, wcscmp,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)
 			       && HAS_CPU_FEATURE (BMI2)),
@@ -494,6 +579,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __wcsncmp_avx2)
 	      IFUNC_IMPL_ADD (array, i, wcsncmp,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __wcsncmp_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, wcsncmp,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)
 			       && HAS_CPU_FEATURE (BMI2)),
@@ -512,6 +601,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __wcslen_avx2)
 	      IFUNC_IMPL_ADD (array, i, wcslen,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __wcslen_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, wcslen,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)
 			       && HAS_CPU_FEATURE (BMI2)),
@@ -524,6 +617,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __wcsnlen_avx2)
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __wcsnlen_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, wcsnlen,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)
 			       && HAS_CPU_FEATURE (BMI2)),
@@ -539,6 +636,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __wmemchr_avx2)
 	      IFUNC_IMPL_ADD (array, i, wmemchr,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __wmemchr_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, wmemchr,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)
 			       && HAS_CPU_FEATURE (BMI2)),
@@ -552,6 +653,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && HAS_CPU_FEATURE (MOVBE)),
 			      __wmemcmp_avx2_movbe)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (MOVBE)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __wmemcmp_avx2_movbe_rtm)
+	      IFUNC_IMPL_ADD (array, i, wmemcmp,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)
 			       && HAS_CPU_FEATURE (MOVBE)),
@@ -570,6 +676,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __wmemset_avx2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, wmemset,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __wmemset_avx2_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, wmemset,
 			      HAS_ARCH_FEATURE (AVX512VL_Usable),
 			      __wmemset_evex_unaligned)
 	      IFUNC_IMPL_ADD (array, i, wmemset,
@@ -595,6 +705,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memcpy_chk_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      (HAS_ARCH_FEATURE (AVX_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __memcpy_chk_avx_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      (HAS_ARCH_FEATURE (AVX_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __memcpy_chk_avx_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      HAS_ARCH_FEATURE (AVX512VL_Usable),
 			      __memcpy_chk_evex_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
@@ -623,6 +741,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memcpy_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      (HAS_ARCH_FEATURE (AVX_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __memcpy_avx_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      (HAS_ARCH_FEATURE (AVX_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __memcpy_avx_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
 			      HAS_ARCH_FEATURE (AVX512VL_Usable),
 			      __memcpy_evex_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy,
@@ -665,6 +791,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __mempcpy_chk_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      (HAS_ARCH_FEATURE (AVX_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __mempcpy_chk_avx_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      (HAS_ARCH_FEATURE (AVX_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __mempcpy_chk_avx_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      HAS_ARCH_FEATURE (AVX512VL_Usable),
 			      __mempcpy_chk_evex_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
@@ -702,6 +836,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __mempcpy_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      (HAS_ARCH_FEATURE (AVX_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __mempcpy_avx_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      (HAS_ARCH_FEATURE (AVX_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __mempcpy_avx_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
 			      HAS_ARCH_FEATURE (AVX512VL_Usable),
 			      __mempcpy_evex_unaligned)
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
@@ -723,6 +865,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __strncmp_avx2)
 	      IFUNC_IMPL_ADD (array, i, strncmp,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (RTM)),
+			      __strncmp_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, strncmp,
 			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
 			       && HAS_ARCH_FEATURE (AVX512BW_Usable)),
 			      __strncmp_evex)
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index 34e0a1295f..c12a023a19 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -23,6 +23,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
 
 static inline void *
@@ -38,6 +39,9 @@ IFUNC_SELECTOR (void)
 	  && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable))
 	return OPTIMIZE (evex_movbe);
 
+      if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_movbe_rtm);
+
       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 	return OPTIMIZE (avx2_movbe);
     }
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index 83db955826..fe003b28e1 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
   attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm)
+  attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
@@ -71,6 +75,14 @@ IFUNC_SELECTOR (void)
 	  return OPTIMIZE (evex_unaligned);
 	}
 
+      if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+	{
+	  if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx_unaligned_erms_rtm);
+
+	  return OPTIMIZE (avx_unaligned_rtm);
+	}
+
       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 	{
 	  if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index fea6c832f4..6fdf53ec1c 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
   attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms_rtm)
+  attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
@@ -69,6 +73,14 @@ IFUNC_SELECTOR (void)
 	  return OPTIMIZE (evex_unaligned);
 	}
 
+      if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+	{
+	  if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx2_unaligned_erms_rtm);
+
+	  return OPTIMIZE (avx2_unaligned_rtm);
+	}
+
       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 	{
 	  if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
index fae721cdb0..091d691dc6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
@@ -20,6 +20,8 @@
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
+  attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
 
@@ -39,6 +41,9 @@ IFUNC_SELECTOR (void)
       if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable))
 	return OPTIMIZE (evex_unaligned);
 
+      if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_unaligned_rtm);
+
       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 	return OPTIMIZE (avx2_unaligned);
     }
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
new file mode 100644
index 0000000000..87b076c7c4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef MEMCHR
+# define MEMCHR __memchr_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "memchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index c81da19bf0..cf893e77b3 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -34,9 +34,13 @@
 #  define VZEROUPPER	vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)	p##.avx
+# endif
+
 # define VEC_SIZE 32
 
-	.section .text.avx,"ax",@progbits
+	.section SECTION(.text),"ax",@progbits
 ENTRY (MEMCHR)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
@@ -107,8 +111,8 @@ L(cros_page_boundary):
 # endif
 	addq	%rdi, %rax
 	addq	%rcx, %rax
-	VZEROUPPER
-	ret
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 	.p2align 4
 L(aligned_more):
@@ -224,8 +228,7 @@ L(last_4x_vec_or_less):
 
 	jnz	L(first_vec_x3_check)
 	xorl	%eax, %eax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(last_2x_vec):
@@ -243,8 +246,7 @@ L(last_2x_vec):
 	testl	%eax, %eax
 	jnz	L(first_vec_x1_check)
 	xorl	%eax, %eax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec_x0_check):
@@ -253,8 +255,7 @@ L(first_vec_x0_check):
 	cmpq	%rax, %rdx
 	jbe	L(zero)
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec_x1_check):
@@ -264,8 +265,7 @@ L(first_vec_x1_check):
 	jbe	L(zero)
 	addq	$VEC_SIZE, %rax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec_x2_check):
@@ -275,8 +275,7 @@ L(first_vec_x2_check):
 	jbe	L(zero)
 	addq	$(VEC_SIZE * 2), %rax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec_x3_check):
@@ -286,12 +285,14 @@ L(first_vec_x3_check):
 	jbe	L(zero)
 	addq	$(VEC_SIZE * 3), %rax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(zero):
-	VZEROUPPER
+	xorl	%eax, %eax
+	jmp     L(return_vzeroupper)
+
+	.p2align 4
 L(null):
 	xorl	%eax, %eax
 	ret
@@ -301,24 +302,21 @@ L(null):
 L(first_vec_x0):
 	tzcntl	%eax, %eax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec_x1):
 	tzcntl	%eax, %eax
 	addq	$VEC_SIZE, %rax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec_x2):
 	tzcntl	%eax, %eax
 	addq	$(VEC_SIZE * 2), %rax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(4x_vec_end):
@@ -337,8 +335,7 @@ L(first_vec_x3):
 	tzcntl	%eax, %eax
 	addq	$(VEC_SIZE * 3), %rax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 END (MEMCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S
new file mode 100644
index 0000000000..cf4eff5d4a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S
@@ -0,0 +1,12 @@
+#ifndef MEMCMP
+# define MEMCMP __memcmp_avx2_movbe_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "memcmp-avx2-movbe.S"
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index e3a35b899e..9d5c9c72b3 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -47,6 +47,10 @@
 #  define VZEROUPPER	vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)	p##.avx
+# endif
+
 # define VEC_SIZE 32
 # define VEC_MASK ((1 << VEC_SIZE) - 1)
 
@@ -55,7 +59,7 @@
            memcmp has to use UNSIGNED comparison for elemnts.
 */
 
-	.section .text.avx,"ax",@progbits
+	.section SECTION(.text),"ax",@progbits
 ENTRY (MEMCMP)
 # ifdef USE_AS_WMEMCMP
 	shl	$2, %RDX_LP
@@ -123,8 +127,8 @@ ENTRY (MEMCMP)
 	vptest	%ymm0, %ymm5
 	jnc	L(4x_vec_end)
 	xorl	%eax, %eax
-	VZEROUPPER
-	ret
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 	.p2align 4
 L(last_2x_vec):
@@ -144,8 +148,7 @@ L(last_vec):
 	vpmovmskb %ymm2, %eax
 	subl    $VEC_MASK, %eax
 	jnz	L(first_vec)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec):
@@ -164,8 +167,7 @@ L(wmemcmp_return):
 	movzbl	(%rsi, %rcx), %edx
 	sub	%edx, %eax
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 # ifdef USE_AS_WMEMCMP
 	.p2align 4
@@ -367,8 +369,7 @@ L(last_4x_vec):
 	vpmovmskb %ymm2, %eax
 	subl    $VEC_MASK, %eax
 	jnz	L(first_vec)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(4x_vec_end):
@@ -394,8 +395,7 @@ L(4x_vec_end):
 	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
 	sub	%edx, %eax
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec_x1):
@@ -410,8 +410,7 @@ L(first_vec_x1):
 	movzbl	VEC_SIZE(%rsi, %rcx), %edx
 	sub	%edx, %eax
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec_x2):
@@ -426,7 +425,6 @@ L(first_vec_x2):
 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
 	sub	%edx, %eax
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 END (MEMCMP)
 #endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
new file mode 100644
index 0000000000..1ec1962e86
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -0,0 +1,17 @@
+#if IS_IN (libc)
+# define VEC_SIZE	32
+# define VEC(i)		ymm##i
+# define VMOVNT		vmovntdq
+# define VMOVU		vmovdqu
+# define VMOVA		vmovdqa
+
+# define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+# define VZEROUPPER_RETURN jmp	 L(return)
+
+# define SECTION(p)		p##.avx.rtm
+# define MEMMOVE_SYMBOL(p,s)	p##_avx_##s##_rtm
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 50fffeb5ce..386624b3c4 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -158,11 +158,12 @@ L(last_2x_vec):
 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
-	VZEROUPPER
 #if !defined USE_MULTIARCH || !IS_IN (libc)
 L(nop):
-#endif
 	ret
+#else
+	VZEROUPPER_RETURN
+#endif
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMMOVE_SYMBOL (__memmove, unaligned))
 
@@ -255,8 +256,11 @@ L(last_2x_vec):
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
 L(return):
-	VZEROUPPER
+#if VEC_SIZE > 16
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+#else
 	ret
+#endif
 
 L(movsb):
 	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
@@ -324,8 +328,7 @@ L(between_32_63):
 	VMOVU	-32(%rsi,%rdx), %YMM1
 	VMOVU	%YMM0, (%rdi)
 	VMOVU	%YMM1, -32(%rdi,%rdx)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 #endif
 #if VEC_SIZE > 16
 	/* From 16 to 31.  No branch when size == 16.  */
@@ -334,7 +337,7 @@ L(between_16_31):
 	VMOVU	-16(%rsi,%rdx), %XMM1
 	VMOVU	%XMM0, (%rdi)
 	VMOVU	%XMM1, -16(%rdi,%rdx)
-	ret
+	VZEROUPPER_RETURN
 #endif
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
@@ -387,8 +390,7 @@ L(more_2x_vec):
 	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
 	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
 	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 L(last_4x_vec):
 	/* Copy from 2 * VEC to 4 * VEC. */
 	VMOVU	(%rsi), %VEC(0)
@@ -399,8 +401,7 @@ L(last_4x_vec):
 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
 	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
 	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 L(more_8x_vec):
 	cmpq	%rsi, %rdi
@@ -456,8 +457,7 @@ L(loop_4x_vec_forward):
 	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
 	/* Store the first VEC.  */
 	VMOVU	%VEC(4), (%r11)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 L(more_8x_vec_backward):
 	/* Load the first 4 * VEC and last VEC to support overlapping
@@ -508,8 +508,7 @@ L(loop_4x_vec_backward):
 	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
 	/* Store the last VEC.  */
 	VMOVU	%VEC(8), (%r11)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 L(large_forward):
@@ -544,8 +543,7 @@ L(loop_large_forward):
 	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
 	/* Store the first VEC.  */
 	VMOVU	%VEC(4), (%r11)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 L(large_backward):
 	/* Don't use non-temporal store if there is overlap between
@@ -579,8 +577,7 @@ L(loop_large_backward):
 	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
 	/* Store the last VEC.  */
 	VMOVU	%VEC(8), (%r11)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 #endif
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
 
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
new file mode 100644
index 0000000000..cea2d2a72d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef MEMRCHR
+# define MEMRCHR __memrchr_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "memrchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
index ce488dd9e8..20efe7ac7c 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -20,14 +20,22 @@
 
 # include <sysdep.h>
 
+# ifndef MEMRCHR
+#  define MEMRCHR	__memrchr_avx2
+# endif
+
 # ifndef VZEROUPPER
 #  define VZEROUPPER	vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)	p##.avx
+# endif
+
 # define VEC_SIZE 32
 
-	.section .text.avx,"ax",@progbits
-ENTRY (__memrchr_avx2)
+	.section SECTION(.text),"ax",@progbits
+ENTRY (MEMRCHR)
 	/* Broadcast CHAR to YMM0.  */
 	vmovd	%esi, %xmm0
 	vpbroadcastb %xmm0, %ymm0
@@ -134,8 +142,8 @@ L(loop_4x_vec):
 	vpmovmskb %ymm1, %eax
 	bsrl	%eax, %eax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 	.p2align 4
 L(last_4x_vec_or_less):
@@ -169,8 +177,7 @@ L(last_4x_vec_or_less):
 	addq	%rax, %rdx
 	jl	L(zero)
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(last_2x_vec):
@@ -191,31 +198,27 @@ L(last_2x_vec):
 	jl	L(zero)
 	addl	$(VEC_SIZE * 2), %eax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(last_vec_x0):
 	bsrl	%eax, %eax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(last_vec_x1):
 	bsrl	%eax, %eax
 	addl	$VEC_SIZE, %eax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(last_vec_x2):
 	bsrl	%eax, %eax
 	addl	$(VEC_SIZE * 2), %eax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(last_vec_x3):
@@ -232,8 +235,7 @@ L(last_vec_x1_check):
 	jl	L(zero)
 	addl	$VEC_SIZE, %eax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(last_vec_x3_check):
@@ -243,12 +245,14 @@ L(last_vec_x3_check):
 	jl	L(zero)
 	addl	$(VEC_SIZE * 3), %eax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(zero):
-	VZEROUPPER
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+
+	.p2align 4
 L(null):
 	xorl	%eax, %eax
 	ret
@@ -273,8 +277,7 @@ L(last_vec_or_less_aligned):
 
 	bsrl	%eax, %eax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(last_vec_or_less):
@@ -315,8 +318,7 @@ L(last_vec_or_less):
 	bsrl	%eax, %eax
 	addq	%rdi, %rax
 	addq	%r8, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(last_vec_2x_aligned):
@@ -353,7 +355,6 @@ L(last_vec_2x_aligned):
 	bsrl	%eax, %eax
 	addq	%rdi, %rax
 	addq	%r8, %rax
-	VZEROUPPER
-	ret
-END (__memrchr_avx2)
+	VZEROUPPER_RETURN
+END (MEMRCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
new file mode 100644
index 0000000000..8ac3e479bb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
@@ -0,0 +1,10 @@
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return)
+
+#define SECTION(p) p##.avx.rtm
+#define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
+#define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
+
+#include "memset-avx2-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 7ab3d89849..ae0860f36a 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -14,9 +14,15 @@
   movq r, %rax; \
   vpbroadcastd %xmm0, %ymm0
 
-# define SECTION(p)		p##.avx
-# define MEMSET_SYMBOL(p,s)	p##_avx2_##s
-# define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
+# ifndef SECTION
+#  define SECTION(p)		p##.avx
+# endif
+# ifndef MEMSET_SYMBOL
+#  define MEMSET_SYMBOL(p,s)	p##_avx2_##s
+# endif
+# ifndef WMEMSET_SYMBOL
+#  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
+# endif
 
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 9f14e956d1..7747bc5d8b 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -45,17 +45,14 @@
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER			vzeroupper
+#  define VZEROUPPER_SHORT_RETURN	vzeroupper; ret
 # else
 #  define VZEROUPPER
 # endif
 #endif
 
 #ifndef VZEROUPPER_SHORT_RETURN
-# if VEC_SIZE > 16
-#  define VZEROUPPER_SHORT_RETURN	vzeroupper
-# else
-#  define VZEROUPPER_SHORT_RETURN	rep
-# endif
+# define VZEROUPPER_SHORT_RETURN	rep; ret
 #endif
 
 #ifndef MOVQ
@@ -127,8 +124,7 @@ L(entry_from_bzero):
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 	VMOVU	%VEC(0), (%rdi)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMSET_SYMBOL (__memset, unaligned))
 
@@ -151,14 +147,12 @@ ENTRY (__memset_erms)
 ENTRY (MEMSET_SYMBOL (__memset, erms))
 # endif
 L(stosb):
-	/* Issue vzeroupper before rep stosb.  */
-	VZEROUPPER
 	mov	%RDX_LP, %RCX_LP
 	movzbl	%sil, %eax
 	mov	%RDI_LP, %RDX_LP
 	rep stosb
 	mov	%RDX_LP, %RAX_LP
-	ret
+	VZEROUPPER_RETURN
 # if VEC_SIZE == 16
 END (__memset_erms)
 # else
@@ -185,8 +179,7 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 	VMOVU	%VEC(0), (%rdi)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 L(stosb_more_2x_vec):
 	cmpq	$REP_STOSB_THRESHOLD, %rdx
@@ -200,8 +193,11 @@ L(more_2x_vec):
 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
 L(return):
-	VZEROUPPER
+#if VEC_SIZE > 16
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+#else
 	ret
+#endif
 
 L(loop_start):
 	leaq	(VEC_SIZE * 4)(%rdi), %rcx
@@ -227,7 +223,6 @@ L(loop):
 	cmpq	%rcx, %rdx
 	jne	L(loop)
 	VZEROUPPER_SHORT_RETURN
-	ret
 L(less_vec):
 	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
@@ -251,40 +246,34 @@ L(less_vec):
 	jb	1f
 	movb	%cl, (%rdi)
 1:
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 # if VEC_SIZE > 32
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
 	VMOVU	%YMM0, -32(%rdi,%rdx)
 	VMOVU	%YMM0, (%rdi)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 # endif
 # if VEC_SIZE > 16
 	/* From 16 to 31.  No branch when size == 16.  */
 L(between_16_31):
 	VMOVU	%XMM0, -16(%rdi,%rdx)
 	VMOVU	%XMM0, (%rdi)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 # endif
 	/* From 8 to 15.  No branch when size == 8.  */
 L(between_8_15):
 	movq	%rcx, -8(%rdi,%rdx)
 	movq	%rcx, (%rdi)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
 	movl	%ecx, -4(%rdi,%rdx)
 	movl	%ecx, (%rdi)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 L(between_2_3):
 	/* From 2 to 3.  No branch when size == 2.  */
 	movw	%cx, -2(%rdi,%rdx)
 	movw	%cx, (%rdi)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 END (MEMSET_SYMBOL (__memset, unaligned_erms))
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S
new file mode 100644
index 0000000000..acc5f6e2fb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define MEMCHR __rawmemchr_avx2_rtm
+#define USE_AS_RAWMEMCHR 1
+
+#include "memchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
new file mode 100644
index 0000000000..60a2ccfe53
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_avx2_rtm
+#include "strcpy-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
new file mode 100644
index 0000000000..637fb557c4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRCAT
+# define STRCAT __strcat_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S
new file mode 100644
index 0000000000..81f20d1d8e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRCHR
+# define STRCHR __strchr_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index 47bc3c9949..da7d262065 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -38,9 +38,13 @@
 #  define VZEROUPPER	vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)	p##.avx
+# endif
+
 # define VEC_SIZE 32
 
-	.section .text.avx,"ax",@progbits
+	.section SECTION(.text),"ax",@progbits
 ENTRY (STRCHR)
 	movl	%edi, %ecx
 	/* Broadcast CHAR to YMM0.  */
@@ -93,8 +97,8 @@ L(cros_page_boundary):
 	cmp	(%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
-	VZEROUPPER
-	ret
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 	.p2align 4
 L(aligned_more):
@@ -190,8 +194,7 @@ L(first_vec_x0):
 	cmp	(%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec_x1):
@@ -205,8 +208,7 @@ L(first_vec_x1):
 	cmp	(%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec_x2):
@@ -220,8 +222,7 @@ L(first_vec_x2):
 	cmp	(%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(4x_vec_end):
@@ -247,8 +248,7 @@ L(first_vec_x3):
 	cmp	(%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 END (STRCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
index efe6584076..9bae2099d9 100644
--- a/sysdeps/x86_64/multiarch/strchr.c
+++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -29,6 +29,7 @@
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
@@ -44,6 +45,9 @@ IFUNC_SELECTOR (void)
 	  && CPU_FEATURES_CPU_P (cpu_features, BMI2))
 	return OPTIMIZE (evex);
 
+      if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 	return OPTIMIZE (avx2);
     }
diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S
new file mode 100644
index 0000000000..cdcf818b91
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define STRCHR __strchrnul_avx2_rtm
+#define USE_AS_STRCHRNUL 1
+#include "strchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S
new file mode 100644
index 0000000000..aecd30d97f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRCMP
+# define STRCMP __strcmp_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 8fb8eedcde..5d1c9d9018 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -55,6 +55,10 @@
 #  define VZEROUPPER	vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)	p##.avx
+# endif
+
 /* Warning!
            wcscmp/wcsncmp have to use SIGNED comparison for elements.
            strcmp/strncmp have to use UNSIGNED comparison for elements.
@@ -75,7 +79,7 @@
    the maximum offset is reached before a difference is found, zero is
    returned.  */
 
-	.section .text.avx,"ax",@progbits
+	.section SECTION(.text),"ax",@progbits
 ENTRY (STRCMP)
 # ifdef USE_AS_STRNCMP
 	/* Check for simple cases (0 or 1) in offset.  */
@@ -137,8 +141,8 @@ L(return):
 	movzbl	(%rsi, %rdx), %edx
 	subl	%edx, %eax
 # endif
-	VZEROUPPER
-	ret
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 	.p2align 4
 L(return_vec_size):
@@ -171,8 +175,7 @@ L(return_vec_size):
 	subl	%edx, %eax
 #  endif
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(return_2_vec_size):
@@ -205,8 +208,7 @@ L(return_2_vec_size):
 	subl	%edx, %eax
 #  endif
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(return_3_vec_size):
@@ -239,8 +241,7 @@ L(return_3_vec_size):
 	subl	%edx, %eax
 #  endif
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(next_3_vectors):
@@ -366,8 +367,7 @@ L(back_to_loop):
 	subl	%edx, %eax
 #  endif
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(test_vec):
@@ -410,8 +410,7 @@ L(test_vec):
 	subl	%edx, %eax
 #  endif
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(test_2_vec):
@@ -454,8 +453,7 @@ L(test_2_vec):
 	subl	%edx, %eax
 #  endif
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(test_3_vec):
@@ -496,8 +494,7 @@ L(test_3_vec):
 	subl	%edx, %eax
 #  endif
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(loop_cross_page):
@@ -566,8 +563,7 @@ L(loop_cross_page):
 	subl	%edx, %eax
 #  endif
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(loop_cross_page_2_vec):
@@ -641,8 +637,7 @@ L(loop_cross_page_2_vec):
 	subl	%edx, %eax
 #  endif
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 # ifdef USE_AS_STRNCMP
 L(string_nbyte_offset_check):
@@ -684,8 +679,7 @@ L(cross_page_loop):
 # ifndef USE_AS_WCSCMP
 L(different):
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 # ifdef USE_AS_WCSCMP
 	.p2align 4
@@ -695,16 +689,14 @@ L(different):
 	setl	%al
 	negl	%eax
 	orl	$1, %eax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 # endif
 
 # ifdef USE_AS_STRNCMP
 	.p2align 4
 L(zero):
 	xorl	%eax, %eax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(char0):
@@ -718,8 +710,7 @@ L(char0):
 	movzbl	(%rdi), %eax
 	subl	%ecx, %eax
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 # endif
 
 	.p2align 4
@@ -744,8 +735,7 @@ L(last_vector):
 	movzbl	(%rsi, %rdx), %edx
 	subl	%edx, %eax
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	/* Comparing on page boundary region requires special treatment:
 	   It must done one vector at the time, starting with the wider
@@ -866,7 +856,6 @@ L(cross_page_4bytes):
 	testl	%eax, %eax
 	jne	L(cross_page_loop)
 	subl	%ecx, %eax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 END (STRCMP)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
index e947cefb08..c99c08aa3f 100644
--- a/sysdeps/x86_64/multiarch/strcmp.c
+++ b/sysdeps/x86_64/multiarch/strcmp.c
@@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
@@ -46,6 +47,9 @@ IFUNC_SELECTOR (void)
 	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
 	return OPTIMIZE (evex);
 
+      if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 	return OPTIMIZE (avx2);
     }
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
new file mode 100644
index 0000000000..c2c581ecf7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRCPY
+# define STRCPY __strcpy_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S
new file mode 100644
index 0000000000..75b4b7612c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRLEN
+# define STRLEN __strlen_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strlen-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index 645e04461f..82826e1098 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -36,9 +36,13 @@
 #  define VZEROUPPER	vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)	p##.avx
+# endif
+
 # define VEC_SIZE 32
 
-	.section .text.avx,"ax",@progbits
+	.section SECTION(.text),"ax",@progbits
 ENTRY (STRLEN)
 # ifdef USE_AS_STRNLEN
 	/* Check for zero length.  */
@@ -111,8 +115,8 @@ L(cros_page_boundary):
 # ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 # endif
-	VZEROUPPER
-	ret
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 	.p2align 4
 L(aligned_more):
@@ -231,8 +235,7 @@ L(last_4x_vec_or_less):
 #  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(last_2x_vec):
@@ -253,8 +256,7 @@ L(last_2x_vec):
 #  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec_x0_check):
@@ -267,8 +269,7 @@ L(first_vec_x0_check):
 #  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec_x1_check):
@@ -282,8 +283,7 @@ L(first_vec_x1_check):
 #  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec_x2_check):
@@ -297,8 +297,7 @@ L(first_vec_x2_check):
 #  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec_x3_check):
@@ -312,8 +311,7 @@ L(first_vec_x3_check):
 #  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(max):
@@ -321,8 +319,7 @@ L(max):
 #  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(zero):
@@ -338,8 +335,7 @@ L(first_vec_x0):
 # ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec_x1):
@@ -350,8 +346,7 @@ L(first_vec_x1):
 # ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec_x2):
@@ -362,8 +357,7 @@ L(first_vec_x2):
 # ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(4x_vec_end):
@@ -389,8 +383,7 @@ L(first_vec_x3):
 # ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 END (STRLEN)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
new file mode 100644
index 0000000000..0dcea18dbb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_avx2_rtm
+#include "strcat-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
new file mode 100644
index 0000000000..37d1224bb9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define STRCMP	__strncmp_avx2_rtm
+#define USE_AS_STRNCMP 1
+#include "strcmp-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
index 4069946f80..880e39659f 100644
--- a/sysdeps/x86_64/multiarch/strncmp.c
+++ b/sysdeps/x86_64/multiarch/strncmp.c
@@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
@@ -46,6 +47,9 @@ IFUNC_SELECTOR (void)
 	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
 	return OPTIMIZE (evex);
 
+      if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 	return OPTIMIZE (avx2);
     }
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
new file mode 100644
index 0000000000..79e7083299
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_avx2_rtm
+#include "strcpy-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S
new file mode 100644
index 0000000000..04f1626a5c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_avx2_rtm
+#define USE_AS_STRNLEN 1
+
+#include "strlen-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S
new file mode 100644
index 0000000000..5def14ec1c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRRCHR
+# define STRRCHR __strrchr_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strrchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index 4381e6ab3e..9f22a15e25 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -36,9 +36,13 @@
 #  define VZEROUPPER	vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)	p##.avx
+# endif
+
 # define VEC_SIZE	32
 
-	.section .text.avx,"ax",@progbits
+	.section SECTION(.text),"ax",@progbits
 ENTRY (STRRCHR)
 	movd	%esi, %xmm4
 	movl	%edi, %ecx
@@ -166,8 +170,8 @@ L(return_value):
 # endif
 	bsrl	%eax, %eax
 	leaq	-VEC_SIZE(%rdi, %rax), %rax
-	VZEROUPPER
-	ret
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 	.p2align 4
 L(match):
@@ -198,8 +202,7 @@ L(find_nul):
 	jz	L(return_value)
 	bsrl	%eax, %eax
 	leaq	-VEC_SIZE(%rdi, %rax), %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(char_and_nul):
@@ -222,14 +225,12 @@ L(char_and_nul_in_first_vec):
 	jz	L(return_null)
 	bsrl	%eax, %eax
 	leaq	-VEC_SIZE(%rdi, %rax), %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(return_null):
 	xorl	%eax, %eax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 END (STRRCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S
new file mode 100644
index 0000000000..d49dbbf0b4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define STRCHR __wcschr_avx2_rtm
+#define USE_AS_WCSCHR 1
+#include "strchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S
new file mode 100644
index 0000000000..d6ca2b8064
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define STRCMP __wcscmp_avx2_rtm
+#define USE_AS_WCSCMP 1
+
+#include "strcmp-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S
new file mode 100644
index 0000000000..35658d7365
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_avx2_rtm
+#define USE_AS_WCSLEN 1
+
+#include "strlen-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
new file mode 100644
index 0000000000..4e88c70cc6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
@@ -0,0 +1,5 @@
+#define STRCMP __wcsncmp_avx2_rtm
+#define USE_AS_STRNCMP 1
+#define USE_AS_WCSCMP 1
+
+#include "strcmp-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S
new file mode 100644
index 0000000000..7437ebee2d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_avx2_rtm
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
index f0bcfd180d..95f3cfb2cf 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen.c
+++ b/sysdeps/x86_64/multiarch/wcsnlen.c
@@ -29,6 +29,7 @@
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
@@ -44,6 +45,9 @@ IFUNC_SELECTOR (void)
 	  && CPU_FEATURES_CPU_P (cpu_features, BMI2))
 	return OPTIMIZE (evex);
 
+      if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 	return OPTIMIZE (avx2);
     }
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S
new file mode 100644
index 0000000000..9bf760833f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define STRRCHR __wcsrchr_avx2_rtm
+#define USE_AS_WCSRCHR 1
+#include "strrchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S
new file mode 100644
index 0000000000..58ed21db01
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define MEMCHR __wmemchr_avx2_rtm
+#define USE_AS_WMEMCHR 1
+
+#include "memchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S
new file mode 100644
index 0000000000..31104d1215
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S
@@ -0,0 +1,4 @@
+#define MEMCMP __wmemcmp_avx2_movbe_rtm
+#define USE_AS_WMEMCMP 1
+
+#include "memcmp-avx2-movbe-rtm.S"
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index 1738d7f955..223f1a5949 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -95,6 +95,28 @@ lose:									      \
 #define R14_LP	r14
 #define R15_LP	r15
 
+/* Zero upper vector registers and return with xtest.  NB: Use VZEROALL
+   to avoid RTM abort triggered by VZEROUPPER inside transactionally.  */
+#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \
+	xtest;							\
+	jz	1f;						\
+	vzeroall;						\
+	ret;							\
+1:								\
+	vzeroupper;						\
+	ret
+
+/* Zero upper vector registers and return.  */
+#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN
+# define ZERO_UPPER_VEC_REGISTERS_RETURN \
+	VZEROUPPER;						\
+	ret
+#endif
+
+#ifndef VZEROUPPER_RETURN
+# define VZEROUPPER_RETURN	VZEROUPPER; ret
+#endif
+
 #else	/* __ASSEMBLER__ */
 
 /* Long and pointer size in bytes.  */