From cc9f2e47a0a1b4ab0d78ff1d036ec7f8ebc74294 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 16 Jul 2010 15:37:38 -0700 Subject: Speed up SSE4.2 strcasestr by avoiding indirect function call. --- sysdeps/x86_64/multiarch/Makefile | 3 +- sysdeps/x86_64/multiarch/strcasestr-nonascii.c | 50 +++++++++++++++++++ sysdeps/x86_64/multiarch/strcasestr.c | 4 ++ sysdeps/x86_64/multiarch/strstr.c | 68 ++++++++------------------ 4 files changed, 76 insertions(+), 49 deletions(-) create mode 100644 sysdeps/x86_64/multiarch/strcasestr-nonascii.c (limited to 'sysdeps') diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 0ca914a377..f1251a0a50 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -7,7 +7,7 @@ ifeq ($(subdir),string) sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ - memmove-ssse3-back + memmove-ssse3-back strcasestr-nonascii ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c CFLAGS-strcspn-c.c += -msse4 @@ -15,5 +15,6 @@ CFLAGS-strpbrk-c.c += -msse4 CFLAGS-strspn-c.c += -msse4 CFLAGS-strstr.c += -msse4 CFLAGS-strcasestr.c += -msse4 +CFLAGS-strcasestr-nonascii.c += -msse4 endif endif diff --git a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c b/sysdeps/x86_64/multiarch/strcasestr-nonascii.c new file mode 100644 index 0000000000..0804e96de7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcasestr-nonascii.c @@ -0,0 +1,50 @@ +/* strstr with SSE4.2 intrinsics + Copyright (C) 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +# include <ctype.h> + + +/* Similar to __m128i_strloadu. Convert to lower case for non-POSIX/C + locale. */ +static inline __m128i +__m128i_strloadu_tolower (const unsigned char * p) +{ + union + { + char b[16]; + __m128i x; + } u; + + for (int i = 0; i < 16; ++i) + if (p[i] == 0) + { + u.b[i] = 0; + break; + } + else + u.b[i] = tolower (p[i]); + + return u.x; +} + + +#define STRCASESTR_NONASCII +#define USE_AS_STRCASESTR +#define STRSTR_SSE42 attribute_hidden __strcasestr_sse42_nonascii +#include "strstr.c" diff --git a/sysdeps/x86_64/multiarch/strcasestr.c b/sysdeps/x86_64/multiarch/strcasestr.c index 064e3ef4fd..d1cfb3b264 100644 --- a/sysdeps/x86_64/multiarch/strcasestr.c +++ b/sysdeps/x86_64/multiarch/strcasestr.c @@ -1,3 +1,7 @@ +extern char *__strcasestr_sse42_nonascii (const unsigned char *s1, + const unsigned char *s2) + attribute_hidden; + #define USE_AS_STRCASESTR #define STRSTR_SSE42 __strcasestr_sse42 #include "strstr.c" diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c index 76d5ad16df..f647354971 100644 --- a/sysdeps/x86_64/multiarch/strstr.c +++ b/sysdeps/x86_64/multiarch/strstr.c @@ -1,5 +1,5 @@ /* strstr with SSE4.2 intrinsics - Copyright (C) 2009 Free Software Foundation, Inc. + Copyright (C) 2009, 2010 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. 
@@ -67,10 +67,10 @@ case ECX CFlag ZFlag SFlag 3 X 1 0 0/1 - 4a 0 1 0 0 - 4b 0 1 0 1 - 4c 0 < X 1 0 0/1 - 5 16 0 1 0 + 4a 0 1 0 0 + 4b 0 1 0 1 + 4c 0 < X 1 0 0/1 + 5 16 0 1 0 3. An initial ordered-comparison fragment match, we fix up to do subsequent string comparison @@ -147,8 +147,7 @@ __m128i_shift_right (__m128i value, int offset) If EOS occurs within less than 16B before 4KB boundary, we don't cross to next page. */ -static __m128i -__attribute__ ((section (".text.sse4.2"))) +static inline __m128i __m128i_strloadu (const unsigned char * p) { int offset = ((size_t) p & (16 - 1)); @@ -164,14 +163,12 @@ __m128i_strloadu (const unsigned char * p) return _mm_loadu_si128 ((__m128i *) p); } -#ifdef USE_AS_STRCASESTR +#if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII /* Similar to __m128i_strloadu. Convert to lower case for POSIX/C locale. */ - -static __m128i -__attribute__ ((section (".text.sse4.2"))) -__m128i_strloadu_tolower_posix (const unsigned char * p) +static inline __m128i +__m128i_strloadu_tolower (const unsigned char * p) { __m128i frag = __m128i_strloadu (p); @@ -184,39 +181,13 @@ __m128i_strloadu_tolower_posix (const unsigned char * p) return _mm_blendv_epi8 (frag, mask2, mask1); } -/* Similar to __m128i_strloadu. Convert to lower case for none-POSIX/C - locale. */ - -static __m128i -__attribute__ ((section (".text.sse4.2"))) -__m128i_strloadu_tolower (const unsigned char * p) -{ - union - { - char b[16]; - __m128i x; - } u; - - for (int i = 0; i < 16; i++) - if (p[i] == 0) - { - u.b[i] = 0; - break; - } - else - u.b[i] = tolower (p[i]); - - return u.x; -} #endif /* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP algorithm) overlap for a fully populated 16B vector. Input parameter: 1st 16Byte loaded from the reference string of a strstr function. - We don't use KMP algorithm if reference string is less than 16B. - */ - + We don't use KMP algorithm if reference string is less than 16B. 
*/ static int __inline__ __attribute__ ((__always_inline__,)) KMP16Bovrlap (__m128i s2) @@ -236,7 +207,7 @@ KMP16Bovrlap (__m128i s2) return 1; else if (!k1) { - /* There are al least two ditinct char in s2. If byte 0 and 1 are + /* There are at least two distinct chars in s2. If byte 0 and 1 are idential and the distinct value lies farther down, we can deduce the next byte offset to restart full compare is least no earlier than byte 3. */ @@ -256,23 +227,24 @@ STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2) #define p1 s1 const unsigned char *p2 = s2; - if (p2[0] == '\0') +#ifndef STRCASESTR_NONASCII + if (__builtin_expect (p2[0] == '\0', 0)) return (char *) p1; - if (p1[0] == '\0') + if (__builtin_expect (p1[0] == '\0', 0)) return NULL; /* Check if p1 length is 1 byte long. */ - if (p1[1] == '\0') + if (__builtin_expect (p1[1] == '\0', 0)) return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL; +#endif #ifdef USE_AS_STRCASESTR - __m128i (*strloadu) (const unsigned char *); + if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE) + != 0, 0)) + return __strcasestr_sse42_nonascii (s1, s2); - if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE) == 0) - strloadu = __m128i_strloadu_tolower_posix; - else - strloadu = __m128i_strloadu_tolower; +# define strloadu __m128i_strloadu_tolower #else # define strloadu __m128i_strloadu #endif -- cgit 1.4.1