diff options
Diffstat (limited to 'sysdeps/i386/multiarch/memrchr-sse2-bsf.S')
-rw-r--r-- | sysdeps/i386/multiarch/memrchr-sse2-bsf.S | 417 |
1 files changed, 417 insertions, 0 deletions
diff --git a/sysdeps/i386/multiarch/memrchr-sse2-bsf.S b/sysdeps/i386/multiarch/memrchr-sse2-bsf.S new file mode 100644 index 0000000000..043e1bbd23 --- /dev/null +++ b/sysdeps/i386/multiarch/memrchr-sse2-bsf.S @@ -0,0 +1,417 @@ +/* Optimized memrchr with sse2 + Copyright (C) 2011-2015 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + +# define MEMCHR __memrchr_sse2_bsf + + .text +ENTRY (MEMCHR) + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + mov LEN(%esp), %edx + + sub $16, %edx + jbe L(length_less16) + + punpcklbw %xmm1, %xmm1 + add %edx, %ecx + punpcklbw %xmm1, %xmm1 + + movdqu (%ecx), %xmm0 + pshufd $0, %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 + +/* Check if there is a match. */ + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches0) + + sub $64, %ecx + mov %ecx, %eax + and $15, %eax + jz L(loop_prolog) + + add $16, %ecx + add $16, %edx + sub %eax, %ecx + sub %eax, %edx + + .p2align 4 +/* Loop start on aligned string. */ +L(loop_prolog): + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm4 + pcmpeqb %xmm1, %xmm4 + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches0) + + mov %ecx, %eax + and $63, %eax + test %eax, %eax + jz L(align64_loop) + + add $64, %ecx + add $64, %edx + sub %eax, %ecx + sub %eax, %edx + + .p2align 4 +L(align64_loop): + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa (%ecx), %xmm0 + movdqa 16(%ecx), %xmm2 + movdqa 32(%ecx), %xmm3 + movdqa 48(%ecx), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm3, %xmm0 + pmaxub %xmm4, %xmm2 + pmaxub %xmm0, %xmm2 + pmovmskb %xmm2, %eax + + test %eax, %eax + jz L(align64_loop) + + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches48) + + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm2 + + pcmpeqb %xmm1, %xmm2 + pcmpeqb (%ecx), %xmm1 + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + pmovmskb %xmm1, %eax + bsr %eax, %eax + + add %ecx, %eax + ret + + .p2align 4 +L(exit_loop): + add $64, %edx + cmp $32, %edx + jbe L(exit_loop_32) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16_1) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb (%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches0_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48_1) + cmp $16, %edx + jbe L(return_null) + + pcmpeqb 32(%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches32_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches0): + bsr %eax, %eax + add %ecx, %eax + ret + + .p2align 4 +L(matches16): + bsr %eax, %eax + lea 16(%eax, %ecx), %eax + ret + + .p2align 4 +L(matches32): + bsr %eax, %eax + lea 32(%eax, %ecx), %eax + ret + + .p2align 4 +L(matches48): + bsr %eax, %eax + lea 48(%eax, %ecx), %eax + ret + + .p2align 4 +L(matches0_1): + bsr %eax, %eax + sub $64, %edx + add %eax, %edx + jl L(return_null) + add %ecx, %eax + ret + + .p2align 4 +L(matches16_1): + bsr %eax, %eax + sub $48, %edx + add %eax, %edx + jl L(return_null) + lea 16(%ecx, %eax), %eax + ret + + .p2align 4 +L(matches32_1): + bsr %eax, %eax + sub $32, %edx + add %eax, %edx + jl L(return_null) + lea 32(%ecx, %eax), %eax + ret + + .p2align 4 +L(matches48_1): + bsr %eax, %eax + sub $16, %edx + add %eax, %edx + jl L(return_null) + lea 48(%ecx, %eax), %eax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret + + .p2align 4 +L(length_less16_offset0): + mov %dl, %cl + pcmpeqb (%eax), %xmm1 + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + mov %edx, %ecx + + pmovmskb %xmm1, %edx + + and %ecx, %edx + test %edx, %edx + jz L(return_null) + + bsr %edx, %ecx + add %ecx, %eax + ret + + .p2align 4 +L(length_less16): + punpcklbw %xmm1, %xmm1 + mov %ecx, %eax + punpcklbw %xmm1, %xmm1 + add $16, %edx + jz L(return_null) + + pshufd $0, %xmm1, %xmm1 + and $15, %ecx + jz L(length_less16_offset0) + + PUSH (%edi) + mov %cl, %dh + add %dl, %dh + and $-16, %eax + + sub $16, %dh + ja L(length_less16_part2) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + sar %cl, %edi + add %ecx, %eax + mov %dl, %cl + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2): + movdqa 16(%eax), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %edi + + mov %cl, %ch + + mov %dh, %cl + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + + test %edi, %edi + jnz L(length_less16_part2_return) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + mov %ch, %cl + sar %cl, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + xor %ch, %ch + add %ecx, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2_return): + bsr %edi, %edi + lea 16(%eax, %edi), %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(ret_null): + xor %eax, %eax + POP (%edi) + ret + +END (MEMCHR) +#endif |