diff options
author | H.J. Lu <hongjiu.lu@intel.com> | 2010-02-15 11:17:50 -0800 |
---|---|---|
committer | Ulrich Drepper <drepper@redhat.com> | 2010-02-15 11:17:50 -0800 |
commit | 904057bc17fb3e3127a35ebf35fcac8d5bc8269b (patch) | |
tree | de5ec58dcca85fcc063a43a92e0d1f957eecebdb /sysdeps/i386/i686/multiarch/strcmp-sse4.S | |
parent | 0ab85ce4298875d0dce8bfd4fe2cecd9cda840e3 (diff) | |
download | glibc-904057bc17fb3e3127a35ebf35fcac8d5bc8269b.tar.gz glibc-904057bc17fb3e3127a35ebf35fcac8d5bc8269b.tar.xz glibc-904057bc17fb3e3127a35ebf35fcac8d5bc8269b.zip |
32-bit memcmp/strcmp/strncmp optimized for SSSE3/SSE4.2
Diffstat (limited to 'sysdeps/i386/i686/multiarch/strcmp-sse4.S')
-rw-r--r-- | sysdeps/i386/i686/multiarch/strcmp-sse4.S | 378 |
1 files changed, 378 insertions, 0 deletions
diff --git a/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
new file mode 100644
index 0000000000..977647203f
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
@@ -0,0 +1,378 @@
+/* strcmp with SSE4.2
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/* CFI annotations that accompany a 4-byte push / pop of REG.  */
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+/* Offsets of the stack arguments from %esp as used right after entry.
+   The strncmp variant pushes %ebp before reading them, which is why
+   its offsets are 4 bytes larger.  */
+#ifndef USE_AS_STRNCMP
+# ifndef STRCMP
+#  define STRCMP	__strcmp_sse4_2
+# endif
+# define STR1		4
+# define STR2		STR1+4
+#else
+# ifndef STRCMP
+#  define STRCMP	__strncmp_sse4_2
+# endif
+# define STR1		8
+# define STR2		STR1+4
+# define CNT		STR2+4
+#endif
+
+/* STRCMP: compare the string at STR1(%esp) with the one at STR2(%esp);
+   with USE_AS_STRNCMP at most CNT(%esp) bytes are compared.
+
+   Result in %eax: 0 when equal.  The byte-wise paths return +1 / -1
+   (unsigned byte order); the 16-byte loop returns the difference of
+   the two zero-extended bytes.
+
+   Register roles
+     entry and short paths:
+	%edx = first string, %eax = second string, %ecx = scratch,
+	%ebp = bytes still allowed to be compared (strncmp only).
+     16-byte loop (after %ebx/%edi/%esi have been saved):
+	%edi = first string, %esi = second string, both biased so that
+	the current position is (%edi,%edx) / (%esi,%edx);
+	%edx = index, <= 0 while a 16-byte load is known not to cross
+	a page boundary; %ebx, %ecx = scratch; %eax = result.  */
+
+	.section .text.sse4.2,"ax",@progbits
+ENTRY (STRCMP)
+#ifdef USE_AS_STRNCMP
+	PUSH	(%ebp)
+#endif
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %eax
+#ifdef USE_AS_STRNCMP
+	movl	CNT(%esp), %ebp
+	test	%ebp, %ebp
+	je	L(eq)			/* n == 0: equal by definition.  */
+#endif
+	/* A 16-byte load is only issued when the low 12 bits of the
+	   address are <= 0xff0, so the load cannot run into the next
+	   4 KiB page.  32-bit operations are used here: %ecx is always
+	   rewritten before its next use, and this avoids 16-bit
+	   immediates and partial-register writes.  */
+	mov	%edx, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(first4bytes)
+	movdqu	(%edx), %xmm2
+	mov	%eax, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(first4bytes)
+	/* Quick reject on the first dword before the full 16-byte test.  */
+	movd	%xmm2, %ecx
+	cmp	(%eax), %ecx
+	jne	L(less4bytes)
+	movdqu	(%eax), %xmm1
+	pxor	%xmm2, %xmm1		/* Non-zero where the strings differ.  */
+	pxor	%xmm0, %xmm0
+	ptest	%xmm1, %xmm0		/* CF = 1 iff %xmm1 is all zero.  */
+	jnc	L(less16bytes)
+	pcmpeqb	%xmm0, %xmm2		/* 0xff in every NUL byte position.  */
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %ebp
+	jbe	L(eq)
+#endif
+	add	$16, %edx
+	add	$16, %eax
+	/* Compare eight bytes one at a time, then enter the 16-byte loop.  */
+L(first4bytes):
+	movzbl	(%eax), %ecx
+	cmpb	%cl, (%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$1, %ebp
+	je	L(eq)
+#endif
+
+	movzbl	1(%eax), %ecx
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$2, %ebp
+	je	L(eq)
+#endif
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$3, %ebp
+	je	L(eq)
+#endif
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %ebp
+	je	L(eq)
+#endif
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$5, %ebp
+	je	L(eq)
+#endif
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$6, %ebp
+	je	L(eq)
+#endif
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %ebp
+	je	L(eq)
+#endif
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	sub	$8, %ebp
+	je	L(eq)
+#endif
+	add	$8, %eax
+	add	$8, %edx
+
+	PUSH	(%ebx)
+	PUSH	(%edi)
+	PUSH	(%esi)
+	mov	%edx, %edi
+	mov	%eax, %esi
+	xorl	%eax, %eax		/* Result stays 0 until a difference.  */
+	/* %ecx = larger of the two page offsets; %edx = that offset
+	   minus 0xff0.  Both cursors are moved back by %edx so that
+	   (%edi,%edx) / (%esi,%edx) still address the current bytes.  */
+L(check_offset):
+	movl	%edi, %ebx
+	movl	%esi, %ecx
+	andl	$0xfff, %ebx
+	andl	$0xfff, %ecx
+	cmpl	%ebx, %ecx
+	cmovl	%ebx, %ecx
+	lea	-0xff0(%ecx), %edx
+	sub	%edx, %edi
+	sub	%edx, %esi
+	testl	%edx, %edx
+	jg	L(crosspage)
+L(loop):
+	movdqu	(%esi,%edx), %xmm2
+	movdqu	(%edi,%edx), %xmm1
+	/* Equal-each compare with negated result: CF = 1 on a mismatch
+	   (its index is left in %ecx), ZF = 1 when the %xmm2 chunk
+	   contains a NUL.  */
+	pcmpistri	$0x1a, %xmm2, %xmm1
+	jbe	L(end)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %ebp
+	jbe	L(more16byteseq)
+#endif
+
+	add	$16, %edx
+	jle	L(loop)
+	/* Byte-wise walk up to the page boundary.  %edx is 1..15 when
+	   coming from L(check_offset) and may be 16 when falling out of
+	   L(loop).  */
+L(crosspage):
+	movzbl	(%edi,%edx), %eax
+	movzbl	(%esi,%edx), %ebx
+	subl	%ebx, %eax
+	jne	L(ret)
+	testl	%ebx, %ebx
+	je	L(ret)
+#ifdef USE_AS_STRNCMP
+	sub	$1, %ebp
+	jbe	L(more16byteseq)
+#endif
+	inc	%edx
+	cmp	$15, %edx
+	jle	L(crosspage)
+	/* Advance by the number of bytes really consumed.  %edx is 17
+	   here when L(crosspage) was entered with %edx == 16; adding a
+	   fixed 16 would compare that byte a second time and, for
+	   strncmp, subtract it from the remaining count twice.  */
+	add	%edx, %edi
+	add	%edx, %esi
+	jmp	L(check_offset)
+
+L(end):
+	/* CF clear: no mismatch, so only the NUL was seen; %eax is 0.  */
+	jnc	L(ret)
+#ifdef USE_AS_STRNCMP
+	sub	%ecx, %ebp		/* Mismatch at or past the limit?  */
+	jbe	L(more16byteseq)
+#endif
+	lea	(%ecx,%edx), %ebx
+	movzbl	(%edi,%ebx), %eax
+	movzbl	(%esi,%ebx), %ecx
+	subl	%ecx, %eax
+L(ret):
+	POP	(%esi)
+	POP	(%edi)
+	POP	(%ebx)
+#ifdef USE_AS_STRNCMP
+	POP	(%ebp)
+#endif
+	ret
+
+#ifdef USE_AS_STRNCMP
+	/* Count exhausted inside the loop: undo the loop's pushes and
+	   return 0 through L(eq).  */
+L(more16byteseq):
+	POP	(%esi)
+	POP	(%edi)
+	POP	(%ebx)
+#endif
+L(eq):
+	xorl	%eax, %eax
+#ifdef USE_AS_STRNCMP
+	POP	(%ebp)
+#endif
+	ret
+	/* Reached with the flags of `cmpb %cl, k(%edx)' still live
+	   (mov does not modify flags): above means the first string's
+	   byte is the larger one.  */
+L(neq):
+	mov	$1, %eax
+	ja	L(neq_bigger)
+	neg	%eax
+L(neq_bigger):
+#ifdef USE_AS_STRNCMP
+	POP	(%ebp)
+#endif
+	ret
+	.p2align 4
+	/* A difference or a NUL lies within the first 16 bytes.  %ecx
+	   holds the first dword, which is equal in both strings.  The
+	   add / xor / or / add sequence is a carry-chain test for a zero
+	   byte inside a dword; when it fires, the bytes are examined
+	   individually.  */
+L(less16bytes):
+	add	$0xfefefeff, %ecx
+	jnc	L(less4bytes)
+	xor	(%edx), %ecx
+	or	$0xfefefeff, %ecx
+	add	$1, %ecx
+	jnz	L(less4bytes)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %ebp
+	jbe	L(eq)
+#endif
+	mov	4(%edx), %ecx
+	cmp	4(%eax), %ecx
+	jne	L(more4bytes)
+	add	$0xfefefeff, %ecx
+	jnc	L(more4bytes)
+	xor	4(%edx), %ecx
+	or	$0xfefefeff, %ecx
+	add	$1, %ecx
+	jnz	L(more4bytes)
+
+#ifdef USE_AS_STRNCMP
+	sub	$8, %ebp
+	jbe	L(eq)
+#endif
+
+	add	$8, %edx
+	add	$8, %eax
+L(less4bytes):
+
+	movzbl	(%eax), %ecx
+	cmpb	%cl, (%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$1, %ebp
+	je	L(eq)
+#endif
+	movzbl	1(%eax), %ecx
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$2, %ebp
+	je	L(eq)
+#endif
+
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$3, %ebp
+	je	L(eq)
+#endif
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	je	L(eq)
+
+L(more4bytes):
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %ebp
+	je	L(eq)
+#endif
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	je	L(eq)
+
+
+#ifdef USE_AS_STRNCMP
+	cmp	$5, %ebp
+	je	L(eq)
+#endif
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$6, %ebp
+	je	L(eq)
+#endif
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %ebp
+	je	L(eq)
+#endif
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	je	L(eq)
+	/* Not reached: every path into L(less4bytes) / L(more4bytes) has
+	   already established that a difference or NUL lies within the
+	   bytes checked above, so one of those branches is always
+	   taken.  */
+
+END (STRCMP)
+
+#endif