about summary refs log tree commit diff
path: root/sysdeps/aarch64/multiarch
diff options
context:
space:
mode:
authorSiddhesh Poyarekar <siddhesh@sourceware.org>2017-11-20 18:25:04 +0530
committerSiddhesh Poyarekar <siddhesh@sourceware.org>2017-11-20 18:25:04 +0530
commit5a67c4fa010abb27e704aa4ea3896f3aa2b39ed7 (patch)
treeeaff55b8981265c960c737070c8f65999e78fba9 /sysdeps/aarch64/multiarch
parenteb332f9feb7637eeefed037a683d2a6130d058b1 (diff)
downloadglibc-5a67c4fa010abb27e704aa4ea3896f3aa2b39ed7.tar.gz
glibc-5a67c4fa010abb27e704aa4ea3896f3aa2b39ed7.tar.xz
glibc-5a67c4fa010abb27e704aa4ea3896f3aa2b39ed7.zip
aarch64: Optimized memset for falkor
The generic memset reads dczid_el0 on every memset.  This has a
significant impact on falkor for a range of sizes because reading
dczid_el0 is slow.

The DZP bit in the dczid_el0 register does not change dynamically, so
it is safe to read once during program startup.  With this patch
dczid_el0 is read once during startup and zva_size is cached.  This is
used to invoke the falkor-specific memset; the generic memset routine
remains unchanged.

The gains due to this are significant for falkor, with run time
reductions as high as 48%.  Here's a sample from the falkor tests:

Function: memset
Variant: walk
                      simple_memset	__memset_falkor	__memset_generic
=====================================================================
length=256, char=0:   139.96 (-698.28%)	   9.07 ( 48.26%)  17.53
length=257, char=0:   140.50 (-699.03%)	   9.53 ( 45.80%)  17.58
length=258, char=0:   140.96 (-703.95%)	   9.58 ( 45.36%)  17.53
length=259, char=0:   141.56 (-705.16%)	   9.53 ( 45.79%)  17.58
length=260, char=0:   142.15 (-710.76%)	   9.57 ( 45.39%)  17.53
length=261, char=0:   142.50 (-710.39%)	   9.53 ( 45.78%)  17.58
length=262, char=0:   142.97 (-715.09%)	   9.57 ( 45.42%)  17.54
length=263, char=0:   143.51 (-716.18%)	   9.53 ( 45.80%)  17.58
length=264, char=0:   143.93 (-720.55%)	   9.58 ( 45.39%)  17.54
length=265, char=0:   144.56 (-722.07%)	   9.53 ( 45.80%)  17.59
length=266, char=0:   144.98 (-726.42%)	   9.58 ( 45.42%)  17.54
length=267, char=0:   145.53 (-727.53%)	   9.53 ( 45.80%)  17.59
length=268, char=0:   146.25 (-731.81%)	   9.53 ( 45.79%)  17.58
length=269, char=0:   146.52 (-735.39%)	   9.53 ( 45.66%)  17.54
length=270, char=0:   146.97 (-735.81%)	   9.53 ( 45.80%)  17.58
length=271, char=0:   147.54 (-741.08%)	   9.58 ( 45.38%)  17.54
length=512, char=0:   268.26 (-1307.85%)  12.06 ( 36.71%)  19.05
length=513, char=0:   268.73 (-1273.89%)  13.56 ( 30.68%)  19.56
length=514, char=0:   269.31 (-1276.89%)  13.56 ( 30.68%)  19.56
length=515, char=0:   269.73 (-1279.05%)  13.56 ( 30.68%)  19.56
length=516, char=0:   270.34 (-1282.24%)  13.56 ( 30.67%)  19.56
length=517, char=0:   270.83 (-1284.71%)  13.56 ( 30.66%)  19.56
length=518, char=0:   271.20 (-1286.54%)  13.56 ( 30.67%)  19.56
length=519, char=0:   271.67 (-1288.67%)  13.65 ( 30.24%)  19.56
length=520, char=0:   272.14 (-1291.04%)  13.65 ( 30.22%)  19.56
length=521, char=0:   272.66 (-1293.69%)  13.65 ( 30.23%)  19.56
length=522, char=0:   273.14 (-1296.13%)  13.65 ( 30.20%)  19.56
length=523, char=0:   273.64 (-1298.75%)  13.65 ( 30.23%)  19.56
length=524, char=0:   274.34 (-1302.16%)  13.66 ( 30.20%)  19.57
length=525, char=0:   274.64 (-1297.78%)  13.56 ( 30.99%)  19.65
length=526, char=0:   275.20 (-1300.04%)  13.56 ( 31.01%)  19.66
length=527, char=0:   275.66 (-1302.86%)  13.56 ( 30.99%)  19.65
length=1024, char=0:  524.46 (-2169.75%)  20.12 ( 12.92%)  23.11
length=1025, char=0:  525.14 (-2124.63%)  21.62 (  8.40%)  23.61
length=1026, char=0:  525.59 (-2125.36%)  21.88 (  7.37%)  23.62
length=1027, char=0:  525.98 (-2127.14%)  21.62 (  8.46%)  23.62
length=1028, char=0:  526.68 (-2131.10%)  21.62 (  8.42%)  23.61
length=1029, char=0:  527.10 (-2131.70%)  21.79 (  7.73%)  23.62
length=1030, char=0:  527.54 (-2118.51%)  21.62 (  9.10%)  23.78
length=1031, char=0:  527.98 (-2136.37%)  21.62 (  8.43%)  23.61
length=1032, char=0:  528.70 (-2139.38%)  21.62 (  8.43%)  23.61
length=1033, char=0:  529.25 (-2124.37%)  21.62 (  9.11%)  23.79
length=1034, char=0:  529.48 (-2142.95%)  21.62 (  8.43%)  23.61
length=1035, char=0:  530.11 (-2145.13%)  21.62 (  8.44%)  23.61
length=1036, char=0:  530.76 (-2147.10%)  21.79 (  7.73%)  23.62
length=1037, char=0:  531.03 (-2149.45%)  21.62 (  8.42%)  23.61
length=1038, char=0:  531.64 (-2151.87%)  21.62 (  8.42%)  23.61
length=1039, char=0:  531.99 (-2151.63%)  21.80 (  7.75%)  23.63

	* sysdeps/aarch64/memset-reg.h: New file.
	* sysdeps/aarch64/memset.S: Use it.
	(__memset): Rename to MEMSET macro.
	[ZVA_MACRO]: Use zva_macro.
	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
	Add memset_generic and memset_falkor.
	* sysdeps/aarch64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Add memset ifuncs.
	* sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New
	local variable zva_size.
	* sysdeps/aarch64/multiarch/memset.c: New file.
	* sysdeps/aarch64/multiarch/memset_generic.S: New file.
	* sysdeps/aarch64/multiarch/memset_falkor.S: New file.
	* sysdeps/aarch64/multiarch/rtld-memset.S: New file.
	* sysdeps/unix/sysv/linux/aarch64/cpu-features.c
	(DCZID_DZP_MASK): New macro.
	(DCZID_BS_MASK): Likewise.
	(init_cpu_features): Read and set zva_size.
	* sysdeps/unix/sysv/linux/aarch64/cpu-features.h
	(struct cpu_features): New member zva_size.
Diffstat (limited to 'sysdeps/aarch64/multiarch')
-rw-r--r--sysdeps/aarch64/multiarch/Makefile2
-rw-r--r--sysdeps/aarch64/multiarch/ifunc-impl-list.c5
-rw-r--r--sysdeps/aarch64/multiarch/init-arch.h8
-rw-r--r--sysdeps/aarch64/multiarch/memset.c40
-rw-r--r--sysdeps/aarch64/multiarch/memset_falkor.S53
-rw-r--r--sysdeps/aarch64/multiarch/memset_generic.S27
-rw-r--r--sysdeps/aarch64/multiarch/rtld-memset.S23
7 files changed, 154 insertions, 4 deletions
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 9aa1e79a80..aa179c499e 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,4 +1,4 @@
 ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
-		   memmove_falkor
+		   memmove_falkor memset_generic memset_falkor
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 2cb74d5b43..b19fcdb689 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -46,6 +46,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+  IFUNC_IMPL (i, name, memset,
+	      /* Enable this on non-falkor processors too so that other cores
+		 can do a comparative analysis with __memset_generic.  */
+	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
 
   return i;
 }
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
index 3af442c538..a756dada10 100644
--- a/sysdeps/aarch64/multiarch/init-arch.h
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -18,6 +18,8 @@
 
 #include <ldsodefs.h>
 
-#define INIT_ARCH()				\
-  uint64_t __attribute__((unused)) midr =	\
-    GLRO(dl_aarch64_cpu_features).midr_el1;
+#define INIT_ARCH()							      \
+  uint64_t __attribute__((unused)) midr =				      \
+    GLRO(dl_aarch64_cpu_features).midr_el1;				      \
+  unsigned __attribute__((unused)) zva_size =				      \
+    GLRO(dl_aarch64_cpu_features).zva_size;
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
new file mode 100644
index 0000000000..66e00eee00
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -0,0 +1,40 @@
+/* Multiple versions of memset. AARCH64 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+
+#if IS_IN (libc)
+/* Redefine memset so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memset
+# define memset __redirect_memset
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memset) __libc_memset;
+
+extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
+extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
+
+libc_ifunc (__libc_memset, (IS_FALKOR (midr) && zva_size == 64
+			    ? __memset_falkor
+			    : __memset_generic));
+
+# undef memset
+strong_alias (__libc_memset, memset);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memset_falkor.S b/sysdeps/aarch64/multiarch/memset_falkor.S
new file mode 100644
index 0000000000..4fcfb2ff50
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_falkor.S
@@ -0,0 +1,53 @@
+/* Memset for falkor.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <memset-reg.h>
+
+/* Reading dczid_el0 is expensive on falkor so move it into the ifunc
+   resolver and assume ZVA size of 64 bytes.  The IFUNC resolver takes care to
+   use this function only when ZVA is enabled.  */
+
+#if IS_IN (libc)
+.macro zva_macro
+	.p2align 4
+	/* Write the first and last 64 byte aligned block using stp rather
+	   than using DC ZVA.  This is faster on some cores.  */
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	bic	dst, dst, 63
+	stp	q0, q0, [dst, 64]
+	stp	q0, q0, [dst, 96]
+	sub	count, dstend, dst	/* Count is now 128 too large.	*/
+	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+1:	dc	zva, dst
+	add	dst, dst, 64
+	subs	count, count, 64
+	b.hi	1b
+	stp	q0, q0, [dst, 0]
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+.endm
+
+# define ZVA_MACRO zva_macro
+# define MEMSET __memset_falkor
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S
new file mode 100644
index 0000000000..55134ed4ea
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_generic.S
@@ -0,0 +1,27 @@
+/* Memset for aarch64, default version for internal use.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# define MEMSET __memset_generic
+/* Add a hidden definition for use within libc.so.  */
+# ifdef SHARED
+	.globl __GI_memset; __GI_memset = __memset_generic
+# endif
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S
new file mode 100644
index 0000000000..44373930bb
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/rtld-memset.S
@@ -0,0 +1,23 @@
+/* Memset for aarch64, for the dynamic linker.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (rtld)
+# define MEMSET memset
+# include <sysdeps/aarch64/memset.S>
+#endif