about summary refs log tree commit diff
path: root/sysdeps/x86_64/multiarch/strspn-sse4.c
diff options
context:
space:
mode:
authorNoah Goldstein <goldstein.w.n@gmail.com>2022-06-16 15:07:12 -0700
committerNoah Goldstein <goldstein.w.n@gmail.com>2022-06-16 20:17:45 -0700
commitc22eb807b0c8125101f6a274795425be2bbd0386 (patch)
tree12f3b90aa183a801d36ac26efbd0b21dd4ec6f8e /sysdeps/x86_64/multiarch/strspn-sse4.c
parent8da9f346cb2051844348785b8a932ec44489e0b7 (diff)
downloadglibc-c22eb807b0c8125101f6a274795425be2bbd0386.tar.gz
glibc-c22eb807b0c8125101f6a274795425be2bbd0386.tar.xz
glibc-c22eb807b0c8125101f6a274795425be2bbd0386.zip
x86: Rename generic functions with unique postfix for clarity
No functions are changed. It just renames generic implementations from
'{func}_sse2' to '{func}_generic'. This is just because the postfix
"_sse2" was overloaded and was used for files that had hand-optimized
sse2 assembly implementations and files that just redirected back
to the generic implementation.

Full xcheck passed on x86_64.
Diffstat (limited to 'sysdeps/x86_64/multiarch/strspn-sse4.c')
-rw-r--r--sysdeps/x86_64/multiarch/strspn-sse4.c136
1 files changed, 136 insertions, 0 deletions
diff --git a/sysdeps/x86_64/multiarch/strspn-sse4.c b/sysdeps/x86_64/multiarch/strspn-sse4.c
new file mode 100644
index 0000000000..d044916688
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strspn-sse4.c
@@ -0,0 +1,136 @@
+/* strspn with SSE4.2 intrinsics
+   Copyright (C) 2009-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <nmmintrin.h>
+#include <string.h>
+#include "varshift.h"
+
+/* We use 0x12:
+	_SIDD_SBYTE_OPS
+	| _SIDD_CMP_EQUAL_ANY
+	| _SIDD_NEGATIVE_POLARITY
+	| _SIDD_LEAST_SIGNIFICANT
+   on pcmpistri to compare xmm/mem128
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   X X X X X X X X X X X X X X X X
+
+   against xmm
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   A A A A A A A A A A A A A A A A
+
+   to find out if the first 16byte data element has any non-A byte and
+   the offset of the first byte.  There are 2 cases:
+
+   1. The first 16byte data element has the non-A byte, including
+      EOS, at the offset X.
+   2. The first 16byte data element is valid and doesn't have the non-A
+      byte.
+
+   Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
+
+   case		ECX	CFlag	ZFlag	SFlag
+    1		 X	  1	 0/1	  0
+    2		16	  0	  0	  0
+
+   We exit from the loop for case 1.  */
+
+extern size_t __strspn_generic (const char *, const char *) attribute_hidden;
+
+
+size_t
+__attribute__ ((section (".text.sse4.2")))
+__strspn_sse42 (const char *s, const char *a)
+{
+  if (*a == 0)
+    return 0;
+
+  const char *aligned;
+  __m128i mask, maskz, zero;
+  unsigned int maskz_bits;
+  unsigned int offset = (int) ((size_t) a & 15);
+  zero = _mm_set1_epi8 (0);
+  if (offset != 0)
+    {
+      /* Load masks.  */
+      aligned = (const char *) ((size_t) a & -16L);
+      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
+      maskz = _mm_cmpeq_epi8 (mask0, zero);
+
+      /* Find where the NULL terminator is.  */
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+      if (maskz_bits != 0)
+        {
+          mask = __m128i_shift_right (mask0, offset);
+          offset = (unsigned int) ((size_t) s & 15);
+          if (offset)
+            goto start_unaligned;
+
+          aligned = s;
+          goto start_loop;
+        }
+    }
+
+  /* A is aligned.  */
+  mask = _mm_loadu_si128 ((__m128i *) a);
+
+  /* Find where the NULL terminator is.  */
+  maskz = _mm_cmpeq_epi8 (mask, zero);
+  maskz_bits = _mm_movemask_epi8 (maskz);
+  if (maskz_bits == 0)
+    {
+      /* There is no NULL terminator.  Don't use SSE4.2 if the length
+         of A > 16.  */
+      if (a[16] != 0)
+        return __strspn_generic (s, a);
+    }
+  aligned = s;
+  offset = (unsigned int) ((size_t) s & 15);
+
+  if (offset != 0)
+    {
+    start_unaligned:
+      /* Check partial string.  */
+      aligned = (const char *) ((size_t) s & -16L);
+      __m128i value = _mm_load_si128 ((__m128i *) aligned);
+      __m128i adj_value = __m128i_shift_right (value, offset);
+
+      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
+      /* No need to check CFlag since it is always 1.  */
+      if (length < 16 - offset)
+	return length;
+      /* Find where the NULL terminator is.  */
+      maskz = _mm_cmpeq_epi8 (value, zero);
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+      if (maskz_bits != 0)
+	return length;
+      aligned += 16;
+    }
+
+start_loop:
+  while (1)
+    {
+      __m128i value = _mm_load_si128 ((__m128i *) aligned);
+      unsigned int index = _mm_cmpistri (mask, value, 0x12);
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
+      if (cflag)
+	return (size_t) (aligned + index - s);
+      aligned += 16;
+    }
+}