From 42e08a5438ddbd9d550d914733c0bc5ba96d79ec Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 30 Jul 2010 00:14:04 -0700 Subject: Implement optimized strcasecmp for x86-64. --- ChangeLog | 11 ++ NEWS | 2 +- string/Makefile | 2 +- string/test-strcasecmp.c | 276 +++++++++++++++++++++++++++++++++ sysdeps/x86_64/Makefile | 3 +- sysdeps/x86_64/locale-defines.sym | 11 ++ sysdeps/x86_64/strcasecmp.S | 1 + sysdeps/x86_64/strcasecmp_l-nonascii.c | 5 + sysdeps/x86_64/strcasecmp_l.S | 6 + sysdeps/x86_64/strcmp.S | 136 +++++++++++++++- 10 files changed, 449 insertions(+), 4 deletions(-) create mode 100644 string/test-strcasecmp.c create mode 100644 sysdeps/x86_64/locale-defines.sym create mode 100644 sysdeps/x86_64/strcasecmp.S create mode 100644 sysdeps/x86_64/strcasecmp_l-nonascii.c create mode 100644 sysdeps/x86_64/strcasecmp_l.S diff --git a/ChangeLog b/ChangeLog index f19b63b908..7b8c416f0c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,16 @@ 2010-07-30 Ulrich Drepper + * string/Makefile (strop-tests): Add strcasecmp. + * sysdeps/x86_64/Makefile [subdir=string] (sysdep_routines): Add + strcasecmp_l-nonascii. + (gen-as-const-headers): Add locale-defines.sym. + * sysdeps/x86_64/strcmp.S: Add support for strcasecmp implementation. + * sysdeps/x86_64/strcasecmp.S: New file. + * sysdeps/x86_64/strcasecmp_l.S: New file. + * sysdeps/x86_64/strcasecmp_l-nonascii.c: New file. + * sysdeps/x86_64/locale-defines.sym: New file. + * string/test-strcasecmp.c: New file. + * string/test-strcasestr.c: Test both ends of the range of characters. * sysdeps/x86_64/multiarch/strstr.c: Fix UCHIGH definition. diff --git a/NEWS b/NEWS index 8358f62efe..8d9bb43ec3 100644 --- a/NEWS +++ b/NEWS @@ -13,7 +13,7 @@ Version 2.13 * POWER7 optimizations: memset, memcmp, strncmp -* New optimized string functions for x86-64: strnlen +* New optimized string functions for x86-64: strnlen, strcasecmp Implemented by Ulrich Drepper. 
Version 2.12 diff --git a/string/Makefile b/string/Makefile index e8eb514155..4c160e9d2d 100644 --- a/string/Makefile +++ b/string/Makefile @@ -49,7 +49,7 @@ o-objects.ob := memcpy.o memset.o memchr.o strop-tests := memchr memcmp memcpy memmove mempcpy memset memccpy \ stpcpy stpncpy strcat strchr strcmp strcpy strcspn \ strlen strncmp strncpy strpbrk strrchr strspn memmem \ - strstr strcasestr strnlen + strstr strcasestr strnlen strcasecmp tests := tester inl-tester noinl-tester testcopy test-ffs \ tst-strlen stratcliff tst-svc tst-inlcall \ bug-strncat1 bug-strspn1 bug-strpbrk1 tst-bswap \ diff --git a/string/test-strcasecmp.c b/string/test-strcasecmp.c new file mode 100644 index 0000000000..7d1d110148 --- /dev/null +++ b/string/test-strcasecmp.c @@ -0,0 +1,276 @@ +/* Test and measure strcasecmp functions. + Copyright (C) 1999, 2002, 2003, 2005, 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Written by Jakub Jelinek , 1999. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#include <ctype.h> +#define TEST_MAIN +#include "test-string.h" + +typedef int (*proto_t) (const char *, const char *); +static int simple_strcasecmp (const char *, const char *); +static int stupid_strcasecmp (const char *, const char *); + +IMPL (stupid_strcasecmp, 0) +IMPL (simple_strcasecmp, 0) +IMPL (strcasecmp, 1) + +static int +simple_strcasecmp (const char *s1, const char *s2) +{ + int ret; + + while ((ret = ((unsigned char) tolower (*s1) + - (unsigned char) tolower (*s2))) == 0 + && *s1++) + ++s2; + return ret; +} + +static int +stupid_strcasecmp (const char *s1, const char *s2) +{ + size_t ns1 = strlen (s1) + 1, ns2 = strlen (s2) + 1; + size_t n = ns1 < ns2 ? ns1 : ns2; + int ret = 0; + + while (n--) + { + if ((ret = ((unsigned char) tolower (*s1) + - (unsigned char) tolower (*s2))) != 0) + break; + ++s1; + ++s2; + } + return ret; +} + +static void +do_one_test (impl_t *impl, const char *s1, const char *s2, int exp_result) +{ + int result = CALL (impl, s1, s2); + if ((exp_result == 0 && result != 0) + || (exp_result < 0 && result >= 0) + || (exp_result > 0 && result <= 0)) + { + error (0, 0, "Wrong result in function %s %d %d", impl->name, + result, exp_result); + ret = 1; + return; + } + + if (HP_TIMING_AVAIL) + { + hp_timing_t start __attribute ((unused)); + hp_timing_t stop __attribute ((unused)); + hp_timing_t best_time = ~ (hp_timing_t) 0; + size_t i; + + for (i = 0; i < 32; ++i) + { + HP_TIMING_NOW (start); + CALL (impl, s1, s2); + HP_TIMING_NOW (stop); + HP_TIMING_BEST (best_time, start, stop); + } + + printf ("\t%zd", (size_t) best_time); + } +} + +static void +do_test (size_t align1, size_t align2, size_t len, int max_char, + int exp_result) +{ + size_t i; + char *s1, *s2; + + if (len == 0) + return; + + align1 &= 7; + if (align1 + len + 1 >= page_size) + return; + + align2 &= 7; + if (align2 + len + 1 >= page_size) + return; + + s1 = (char *) (buf1 + align1); + s2 = (char *) (buf2 + align2); + + for (i = 0; i < len; i++) + { + s1[i] = toupper (1 + 
23 * i % max_char); + s2[i] = tolower (s1[i]); + } + + s1[len] = s2[len] = 0; + s1[len + 1] = 23; + s2[len + 1] = 24 + exp_result; + if ((s2[len - 1] == 'z' && exp_result == -1) + || (s2[len - 1] == 'a' && exp_result == 1)) + s1[len - 1] += exp_result; + else + s2[len - 1] -= exp_result; + + if (HP_TIMING_AVAIL) + printf ("Length %4zd, alignment %2zd/%2zd:", len, align1, align2); + + FOR_EACH_IMPL (impl, 0) + do_one_test (impl, s1, s2, exp_result); + + if (HP_TIMING_AVAIL) + putchar ('\n'); +} + +static void +do_random_tests (void) +{ + size_t i, j, n, align1, align2, pos, len1, len2; + int result; + long r; + unsigned char *p1 = buf1 + page_size - 512; + unsigned char *p2 = buf2 + page_size - 512; + + for (n = 0; n < ITERATIONS; n++) + { + align1 = random () & 31; + if (random () & 1) + align2 = random () & 31; + else + align2 = align1 + (random () & 24); + pos = random () & 511; + j = align1 > align2 ? align1 : align2; + if (pos + j >= 511) + pos = 510 - j - (random () & 7); + len1 = random () & 511; + if (pos >= len1 && (random () & 1)) + len1 = pos + (random () & 7); + if (len1 + j >= 512) + len1 = 511 - j - (random () & 7); + if (pos >= len1) + len2 = len1; + else + len2 = len1 + (len1 != 511 - j ? random () % (511 - j - len1) : 0); + j = (pos > len2 ? 
pos : len2) + align1 + 64; + if (j > 512) + j = 512; + for (i = 0; i < j; ++i) + { + p1[i] = tolower (random () & 255); + if (i < len1 + align1 && !p1[i]) + { + p1[i] = tolower (random () & 255); + if (!p1[i]) + p1[i] = tolower (1 + (random () & 127)); + } + } + for (i = 0; i < j; ++i) + { + p2[i] = toupper (random () & 255); + if (i < len2 + align2 && !p2[i]) + { + p2[i] = toupper (random () & 255); + if (!p2[i]) + p2[i] = toupper (1 + (random () & 127)); + } + } + + result = 0; + memcpy (p2 + align2, p1 + align1, pos); + if (pos < len1) + { + if (tolower (p2[align2 + pos]) == p1[align1 + pos]) + { + p2[align2 + pos] = toupper (random () & 255); + if (tolower (p2[align2 + pos]) == p1[align1 + pos]) + p2[align2 + pos] = toupper (p1[align1 + pos] + + 3 + (random () & 127)); + } + + if (p1[align1 + pos] < tolower (p2[align2 + pos])) + result = -1; + else + result = 1; + } + p1[len1 + align1] = 0; + p2[len2 + align2] = 0; + + FOR_EACH_IMPL (impl, 1) + { + r = CALL (impl, (char *) (p1 + align1), (char *) (p2 + align2)); + /* On 64-bit architectures whose ABI requires the callee to promote + the result, check that the promotion has been done. 
*/ + asm ("" : "=g" (r) : "0" (r)); + if ((r == 0 && result) + || (r < 0 && result >= 0) + || (r > 0 && result <= 0)) + { + error (0, 0, "Iteration %zd - wrong result in function %s (%zd, %zd, %zd, %zd, %zd) %ld != %d, p1 %p p2 %p", + n, impl->name, align1, align2, len1, len2, pos, r, result, p1, p2); + ret = 1; + } + } + } +} + +int +test_main (void) +{ + size_t i; + + test_init (); + + printf ("%23s", ""); + FOR_EACH_IMPL (impl, 0) + printf ("\t%s", impl->name); + putchar ('\n'); + + for (i = 1; i < 16; ++i) + { + do_test (i, i, i, 127, 0); + do_test (i, i, i, 127, 1); + do_test (i, i, i, 127, -1); + } + + for (i = 1; i < 10; ++i) + { + do_test (0, 0, 2 << i, 127, 0); + do_test (0, 0, 2 << i, 254, 0); + do_test (0, 0, 2 << i, 127, 1); + do_test (0, 0, 2 << i, 254, 1); + do_test (0, 0, 2 << i, 127, -1); + do_test (0, 0, 2 << i, 254, -1); + } + + for (i = 1; i < 8; ++i) + { + do_test (i, 2 * i, 8 << i, 127, 0); + do_test (2 * i, i, 8 << i, 254, 0); + do_test (i, 2 * i, 8 << i, 127, 1); + do_test (2 * i, i, 8 << i, 254, 1); + do_test (i, 2 * i, 8 << i, 127, -1); + do_test (2 * i, i, 8 << i, 254, -1); + } + + do_random_tests (); + return ret; +} + +#include "../test-skeleton.c" diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile index e8d0285e26..f7eeb155ed 100644 --- a/sysdeps/x86_64/Makefile +++ b/sysdeps/x86_64/Makefile @@ -12,7 +12,8 @@ sysdep_routines += _mcount endif ifeq ($(subdir),string) -sysdep_routines += cacheinfo +sysdep_routines += cacheinfo strcasecmp_l-nonascii +gen-as-const-headers += locale-defines.sym endif ifeq ($(subdir),elf) diff --git a/sysdeps/x86_64/locale-defines.sym b/sysdeps/x86_64/locale-defines.sym new file mode 100644 index 0000000000..aebff9a4f9 --- /dev/null +++ b/sysdeps/x86_64/locale-defines.sym @@ -0,0 +1,11 @@ +#include +#include +#include + +-- + +LOCALE_T___LOCALES offsetof (struct __locale_struct, __locales) +LC_CTYPE +_NL_CTYPE_NONASCII_CASE +LOCALE_DATA_VALUES offsetof (struct __locale_data, values) +SIZEOF_VALUES 
sizeof (((struct __locale_data *) 0)->values[0]) diff --git a/sysdeps/x86_64/strcasecmp.S b/sysdeps/x86_64/strcasecmp.S new file mode 100644 index 0000000000..fe49e820f2 --- /dev/null +++ b/sysdeps/x86_64/strcasecmp.S @@ -0,0 +1 @@ +/* In strcasecmp_l.S. */ diff --git a/sysdeps/x86_64/strcasecmp_l-nonascii.c b/sysdeps/x86_64/strcasecmp_l-nonascii.c new file mode 100644 index 0000000000..7a0a04f345 --- /dev/null +++ b/sysdeps/x86_64/strcasecmp_l-nonascii.c @@ -0,0 +1,5 @@ +#include + +#define __strcasecmp_l __strcasecmp_l_nonascii +#define USE_IN_EXTENDED_LOCALE_MODEL 1 +#include diff --git a/sysdeps/x86_64/strcasecmp_l.S b/sysdeps/x86_64/strcasecmp_l.S new file mode 100644 index 0000000000..5456b3a49e --- /dev/null +++ b/sysdeps/x86_64/strcasecmp_l.S @@ -0,0 +1,6 @@ +#define STRCMP __strcasecmp_l +#define USE_AS_STRCASECMP_L +#include "strcmp.S" + +weak_alias (__strcasecmp_l, strcasecmp_l) +libc_hidden_def (strcasecmp_l) diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S index ac3fe14679..7b2b246866 100644 --- a/sysdeps/x86_64/strcmp.S +++ b/sysdeps/x86_64/strcmp.S @@ -51,6 +51,15 @@ je LABEL(strcmp_exitz); \ mov %r9, %r11 +#elif defined USE_AS_STRCASECMP_L +# include "locale-defines.h" + +/* No support for strcasecmp outside libc so far since it is not needed. */ +# ifdef NOT_IN_libc +# error "strcasecmp_l not implemented so far" +# endif + +# define UPDATE_STRNCMP_COUNTER #else # define UPDATE_STRNCMP_COUNTER # ifndef STRCMP @@ -64,6 +73,19 @@ .section .text.ssse3,"ax",@progbits #endif +#ifdef USE_AS_STRCASECMP_L +ENTRY (__strcasecmp) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + movq %fs:(%rax),%rdx + + /* 5-byte NOP. */ + .byte 0x0f,0x1f,0x44,0x00,0x00 +END (__strcasecmp) +weak_alias (__strcasecmp, strcasecmp) +libc_hidden_def (__strcasecmp) + /* FALLTHROUGH to strcasecmp_l. */ +#endif + ENTRY (BP_SYM (STRCMP)) #ifdef NOT_IN_libc /* Simple version since we can't use SSE registers in ld.so. 
*/ @@ -84,6 +106,18 @@ L(neq): movl $1, %eax ret END (BP_SYM (STRCMP)) #else /* NOT_IN_libc */ +# ifdef USE_AS_STRCASECMP_L + /* We have to fall back on the C implementation for locales + with encodings not matching ASCII for single bytes. */ +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + movq LOCALE_T___LOCALES+LC_CTYPE*8(%rdx), %rax +# else + movq (%rdx), %rax +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) + jne __strcasecmp_l_nonascii +# endif + /* * This implementation uses SSE to compare up to 16 bytes at a time. */ @@ -99,6 +133,26 @@ END (BP_SYM (STRCMP)) /* Use 64bit AND here to avoid long NOP padding. */ and $0x3f, %rcx /* rsi alignment in cache line */ and $0x3f, %rax /* rdi alignment in cache line */ +# ifdef USE_AS_STRCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +.Lbelowupper: + .quad 0x4040404040404040 + .quad 0x4040404040404040 +.Ltopupper: + .quad 0x5b5b5b5b5b5b5b5b + .quad 0x5b5b5b5b5b5b5b5b +.Ltouppermask: + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous + movdqa .Lbelowupper(%rip), %xmm5 +# define UCLOW_reg %xmm5 + movdqa .Ltopupper(%rip), %xmm6 +# define UCHIGH_reg %xmm6 + movdqa .Ltouppermask(%rip), %xmm7 +# define LCQWORD_reg %xmm7 +# endif cmp $0x30, %ecx ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ cmp $0x30, %eax @@ -107,6 +161,26 @@ END (BP_SYM (STRCMP)) movlpd (%rsi), %xmm2 movhpd 8(%rdi), %xmm1 movhpd 8(%rsi), %xmm2 +# ifdef USE_AS_STRCASECMP_L +# define TOLOWER(reg1, reg2) \ + movdqa reg1, %xmm8; \ + movdqa UCHIGH_reg, %xmm9; \ + movdqa reg2, %xmm10; \ + movdqa UCHIGH_reg, %xmm11; \ + pcmpgtb UCLOW_reg, %xmm8; \ + pcmpgtb reg1, %xmm9; \ + pcmpgtb UCLOW_reg, %xmm10; \ + pcmpgtb reg2, %xmm11; \ + pand %xmm9, %xmm8; \ + pand %xmm11, %xmm10; \ + pand LCQWORD_reg, %xmm8; \ + pand LCQWORD_reg, %xmm10; \ + por %xmm8, reg1; \ + por %xmm10, reg2 + TOLOWER (%xmm1, %xmm2) +# else +# define TOLOWER(reg1, reg2) +# endif pxor %xmm0, %xmm0 /* clear %xmm0 
for null char checks */ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ @@ -159,7 +233,13 @@ LABEL(ashr_0): movdqa (%rsi), %xmm1 pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ +# ifndef USE_AS_STRCASECMP_L pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ +# else + movdqa (%rdi), %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ +# endif psubb %xmm0, %xmm1 /* packed sub of comparison results*/ pmovmskb %xmm1, %r9d shr %cl, %edx /* adjust 0xffff for offset */ @@ -183,6 +263,7 @@ LABEL(ashr_0): LABEL(loop_ashr_0): movdqa (%rsi, %rcx), %xmm1 movdqa (%rdi, %rcx), %xmm2 + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -198,6 +279,7 @@ LABEL(loop_ashr_0): add $16, %rcx movdqa (%rsi, %rcx), %xmm1 movdqa (%rdi, %rcx), %xmm2 + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -214,7 +296,7 @@ LABEL(loop_ashr_0): /* * The following cases will be handled by ashr_1 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case * n(15) n -15 0(15 +(n-15) - n) ashr_1 */ .p2align 4 @@ -224,6 +306,7 @@ LABEL(ashr_1): movdqa (%rsi), %xmm1 pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ pslldq $15, %xmm2 /* shift first string to align with second */ + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ psubb %xmm0, %xmm2 /* packed sub of comparison results*/ pmovmskb %xmm2, %r9d @@ -263,6 +346,7 @@ LABEL(gobble_ashr_1): # else palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -292,6 +376,7 @@ LABEL(gobble_ashr_1): # else palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -351,6 +436,7 @@ LABEL(ashr_2): movdqa (%rsi), %xmm1 pcmpeqb %xmm1, %xmm0 pslldq $14, %xmm2 + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm2 psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d @@ -390,6 +476,7 @@ LABEL(gobble_ashr_2): # else palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -420,6 +507,7 @@ LABEL(gobble_ashr_2): # else palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -472,6 +560,7 @@ LABEL(ashr_3): movdqa (%rsi), %xmm1 pcmpeqb %xmm1, %xmm0 pslldq $13, %xmm2 + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm2 psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d @@ -512,6 +601,7 @@ LABEL(gobble_ashr_3): # else palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -542,6 +632,7 @@ LABEL(gobble_ashr_3): # else palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -594,6 +685,7 @@ LABEL(ashr_4): movdqa (%rsi), %xmm1 pcmpeqb %xmm1, %xmm0 pslldq $12, %xmm2 + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm2 psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d @@ -634,6 +726,7 @@ LABEL(gobble_ashr_4): # else palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER 
(%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -664,6 +757,7 @@ LABEL(gobble_ashr_4): # else palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -716,6 +810,7 @@ LABEL(ashr_5): movdqa (%rsi), %xmm1 pcmpeqb %xmm1, %xmm0 pslldq $11, %xmm2 + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm2 psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d @@ -756,6 +851,7 @@ LABEL(gobble_ashr_5): # else palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -786,6 +882,7 @@ LABEL(gobble_ashr_5): # else palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -838,6 +935,7 @@ LABEL(ashr_6): movdqa (%rsi), %xmm1 pcmpeqb %xmm1, %xmm0 pslldq $10, %xmm2 + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm2 psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d @@ -878,6 +976,7 @@ LABEL(gobble_ashr_6): # else palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -908,6 +1007,7 @@ LABEL(gobble_ashr_6): # else palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -960,6 +1060,7 @@ LABEL(ashr_7): movdqa (%rsi), %xmm1 pcmpeqb %xmm1, %xmm0 pslldq $9, %xmm2 + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm2 psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d @@ -1000,6 +1101,7 @@ LABEL(gobble_ashr_7): # else palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1030,6 +1132,7 @@ LABEL(gobble_ashr_7): # else palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1082,6 +1185,7 @@ LABEL(ashr_8): movdqa (%rsi), %xmm1 pcmpeqb %xmm1, %xmm0 pslldq $8, %xmm2 + TOLOWER (%xmm1, 
%xmm2) pcmpeqb %xmm1, %xmm2 psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d @@ -1122,6 +1226,7 @@ LABEL(gobble_ashr_8): # else palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1152,6 +1257,7 @@ LABEL(gobble_ashr_8): # else palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1204,6 +1310,7 @@ LABEL(ashr_9): movdqa (%rsi), %xmm1 pcmpeqb %xmm1, %xmm0 pslldq $7, %xmm2 + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm2 psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d @@ -1244,6 +1351,7 @@ LABEL(gobble_ashr_9): # else palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1274,6 +1382,7 @@ LABEL(gobble_ashr_9): # else palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1326,6 +1435,7 @@ LABEL(ashr_10): movdqa (%rsi), %xmm1 pcmpeqb %xmm1, %xmm0 pslldq $6, %xmm2 + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm2 psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d @@ -1366,6 +1476,7 @@ LABEL(gobble_ashr_10): # else palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1396,6 +1507,7 @@ LABEL(gobble_ashr_10): # else palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1448,6 +1560,7 @@ LABEL(ashr_11): movdqa (%rsi), %xmm1 pcmpeqb %xmm1, %xmm0 pslldq $5, %xmm2 + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm2 psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d @@ -1488,6 +1601,7 @@ LABEL(gobble_ashr_11): # else palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1518,6 +1632,7 @@ LABEL(gobble_ashr_11): # else palignr $11, %xmm3, %xmm2 /* 
merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1570,6 +1685,7 @@ LABEL(ashr_12): movdqa (%rsi), %xmm1 pcmpeqb %xmm1, %xmm0 pslldq $4, %xmm2 + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm2 psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d @@ -1610,6 +1726,7 @@ LABEL(gobble_ashr_12): # else palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1640,6 +1757,7 @@ LABEL(gobble_ashr_12): # else palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1692,6 +1810,7 @@ LABEL(ashr_13): movdqa (%rsi), %xmm1 pcmpeqb %xmm1, %xmm0 pslldq $3, %xmm2 + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm2 psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d @@ -1732,6 +1851,7 @@ LABEL(gobble_ashr_13): # else palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1762,6 +1882,7 @@ LABEL(gobble_ashr_13): # else palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1814,6 +1935,7 @@ LABEL(ashr_14): movdqa (%rsi), %xmm1 pcmpeqb %xmm1, %xmm0 pslldq $2, %xmm2 + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm2 psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d @@ -1854,6 +1976,7 @@ LABEL(gobble_ashr_14): # else palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1884,6 +2007,7 @@ LABEL(gobble_ashr_14): # else palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1936,6 +2060,7 @@ LABEL(ashr_15): movdqa (%rsi), %xmm1 pcmpeqb %xmm1, %xmm0 pslldq $1, %xmm2 + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm2 psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d @@ -1978,6 +2103,7 @@ 
LABEL(gobble_ashr_15): # else palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -2008,6 +2134,7 @@ LABEL(gobble_ashr_15): # else palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ # endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -2049,6 +2176,7 @@ LABEL(ashr_15_exittail): .p2align 4 LABEL(aftertail): + TOLOWER (%xmm1, %xmm3) pcmpeqb %xmm3, %xmm1 psubb %xmm0, %xmm1 pmovmskb %xmm1, %edx @@ -2076,6 +2204,12 @@ LABEL(less16bytes): movzbl (%rsi, %rdx), %ecx movzbl (%rdi, %rdx), %eax +# ifdef USE_AS_STRCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx + movl (%rdx,%rcx,4), %ecx + movl (%rdx,%rax,4), %eax +# endif + sub %ecx, %eax ret -- cgit 1.4.1