diff options
author | Noah Goldstein <goldstein.w.n@gmail.com> | 2022-06-16 15:07:12 -0700 |
---|---|---|
committer | Noah Goldstein <goldstein.w.n@gmail.com> | 2022-06-16 20:17:45 -0700 |
commit | c22eb807b0c8125101f6a274795425be2bbd0386 (patch) | |
tree | 12f3b90aa183a801d36ac26efbd0b21dd4ec6f8e /sysdeps/x86_64/multiarch/strspn-sse4.c | |
parent | 8da9f346cb2051844348785b8a932ec44489e0b7 (diff) | |
download | glibc-c22eb807b0c8125101f6a274795425be2bbd0386.tar.gz glibc-c22eb807b0c8125101f6a274795425be2bbd0386.tar.xz glibc-c22eb807b0c8125101f6a274795425be2bbd0386.zip |
x86: Rename generic functions with unique postfix for clarity
No functions are changed. It just renames generic implementations from '{func}_sse2' to '{func}_generic'. This is just because the postfix "_sse2" was overloaded and was used for files that had hand-optimized sse2 assembly implementations and files that just redirected back to the generic implementation. Full xcheck passed on x86_64.
Diffstat (limited to 'sysdeps/x86_64/multiarch/strspn-sse4.c')
-rw-r--r-- | sysdeps/x86_64/multiarch/strspn-sse4.c | 136 |
1 files changed, 136 insertions, 0 deletions
diff --git a/sysdeps/x86_64/multiarch/strspn-sse4.c b/sysdeps/x86_64/multiarch/strspn-sse4.c new file mode 100644 index 0000000000..d044916688 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strspn-sse4.c @@ -0,0 +1,136 @@ +/* strspn with SSE4.2 intrinsics + Copyright (C) 2009-2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <nmmintrin.h> +#include <string.h> +#include "varshift.h" + +/* We use 0x12: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_NEGATIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to compare xmm/mem128 + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + X X X X X X X X X X X X X X X X + + against xmm + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + A A A A A A A A A A A A A A A A + + to find out if the first 16byte data element has any non-A byte and + the offset of the first byte. There are 2 cases: + + 1. The first 16byte data element has the non-A byte, including + EOS, at the offset X. + 2. The first 16byte data element is valid and doesn't have the non-A + byte. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: + + case ECX CFlag ZFlag SFlag + 1 X 1 0/1 0 + 2 16 0 0 0 + + We exit from the loop for case 1. */ + +extern size_t __strspn_generic (const char *, const char *) attribute_hidden; + + +size_t +__attribute__ ((section (".text.sse4.2"))) +__strspn_sse42 (const char *s, const char *a) +{ + if (*a == 0) + return 0; + + const char *aligned; + __m128i mask, maskz, zero; + unsigned int maskz_bits; + unsigned int offset = (int) ((size_t) a & 15); + zero = _mm_set1_epi8 (0); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & -16L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); + maskz = _mm_cmpeq_epi8 (mask0, zero); + + /* Find where the NULL terminator is. */ + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; + if (maskz_bits != 0) + { + mask = __m128i_shift_right (mask0, offset); + offset = (unsigned int) ((size_t) s & 15); + if (offset) + goto start_unaligned; + + aligned = s; + goto start_loop; + } + } + + /* A is aligned. */ + mask = _mm_loadu_si128 ((__m128i *) a); + + /* Find where the NULL terminator is. */ + maskz = _mm_cmpeq_epi8 (mask, zero); + maskz_bits = _mm_movemask_epi8 (maskz); + if (maskz_bits == 0) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return __strspn_generic (s, a); + } + aligned = s; + offset = (unsigned int) ((size_t) s & 15); + + if (offset != 0) + { + start_unaligned: + /* Check partial string. */ + aligned = (const char *) ((size_t) s & -16L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + __m128i adj_value = __m128i_shift_right (value, offset); + + unsigned int length = _mm_cmpistri (mask, adj_value, 0x12); + /* No need to check CFlag since it is always 1. */ + if (length < 16 - offset) + return length; + /* Find where the NULL terminator is. */ + maskz = _mm_cmpeq_epi8 (value, zero); + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; + if (maskz_bits != 0) + return length; + aligned += 16; + } + +start_loop: + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); + unsigned int index = _mm_cmpistri (mask, value, 0x12); + unsigned int cflag = _mm_cmpistrc (mask, value, 0x12); + if (cflag) + return (size_t) (aligned + index - s); + aligned += 16; + } +} |