From 693fb948841c7564ce3cd1ec4b31c0130abc8c42 Mon Sep 17 00:00:00 2001 From: Liubov Dmitrieva Date: Mon, 5 Sep 2011 17:11:11 -0400 Subject: Optimized strchr and strrchr with SSE2 on x86-32 --- sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S | 284 +++++++++++++++++++++++++ 1 file changed, 284 insertions(+) create mode 100644 sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S (limited to 'sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S') diff --git a/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S b/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S new file mode 100644 index 0000000000..f1bcb78819 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S @@ -0,0 +1,284 @@ +/* strrchr with SSE2 with bsf and bsr + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef NOT_IN_libc + +# include + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 + + .text +ENTRY (__strrchr_sse2_bsf) + + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + PUSH (%edi) + pxor %xmm2, %xmm2 + mov %ecx, %edi + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + /* ECX has OFFSET. */ + and $63, %ecx + cmp $48, %ecx + pshufd $0, %xmm1, %xmm1 + ja L(crosscashe) + +/* unaligned string. */ + movdqu (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm2, %edx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + + test %eax, %eax + jnz L(unaligned_match1) + + test %edx, %edx + jnz L(return_null) + + and $-16, %edi + add $16, %edi + + PUSH (%esi) + PUSH (%ebx) + + xor %ebx, %ebx + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 +L(unaligned_return_value1): + bsf %edx, %ecx + mov $2, %edx + shl %cl, %edx + sub $1, %edx + and %edx, %eax + jz L(return_null) + bsr %eax, %eax + add %edi, %eax + POP (%edi) + ret + CFI_PUSH (%edi) + + .p2align 4 +L(unaligned_match1): + test %edx, %edx + jnz L(unaligned_return_value1) + + PUSH (%esi) + PUSH (%ebx) + + mov %eax, %ebx + lea 16(%edi), %esi + and $-16, %edi + add $16, %edi + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 + L(crosscashe): +/* Hancle unaligned string. */ + and $15, %ecx + and $-16, %edi + pxor %xmm3, %xmm3 + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm3 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm3, %edx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + /* Remove the leading bytes. */ + shr %cl, %edx + shr %cl, %eax + + test %eax, %eax + jnz L(unaligned_match) + + test %edx, %edx + jnz L(return_null) + + add $16, %edi + + PUSH (%esi) + PUSH (%ebx) + + xor %ebx, %ebx + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 +L(unaligned_return_value): + add %ecx, %edi + bsf %edx, %ecx + mov $2, %edx + shl %cl, %edx + sub $1, %edx + and %edx, %eax + jz L(return_null) + bsr %eax, %eax + add %edi, %eax + POP (%edi) + ret + CFI_PUSH (%edi) + + .p2align 4 +L(unaligned_match): + test %edx, %edx + jnz L(unaligned_return_value) + + PUSH (%esi) + PUSH (%ebx) + + mov %eax, %ebx + add $16, %edi + lea (%edi, %ecx), %esi + +/* Loop start on aligned string. */ + .p2align 4 +L(loop): + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jz L(loop) + +L(matches): + test %eax, %eax + jnz L(match) +L(return_value): + test %ebx, %ebx + jz L(return_null_1) + bsr %ebx, %eax + add %esi, %eax + + POP (%ebx) + POP (%esi) + + sub $16, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(match): + pmovmskb %xmm2, %ecx + test %ecx, %ecx + jnz L(return_value_1) + mov %eax, %ebx + mov %edi, %esi + jmp L(loop) + + .p2align 4 +L(return_value_1): + bsf %ecx, %ecx + mov $2, %edx + shl %cl, %edx + sub $1, %edx + and %edx, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + + bsr %eax, %eax + add %edi, %eax + sub $16, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) +/* Return NULL. */ + .p2align 4 +L(return_null): + xor %eax, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + CFI_PUSH (%ebx) + CFI_PUSH (%esi) +/* Return NULL. */ + .p2align 4 +L(return_null_1): + POP (%ebx) + POP (%esi) + POP (%edi) + xor %eax, %eax + ret + +END (__strrchr_sse2_bsf) +#endif + -- cgit 1.4.1