From 093ecf92998de275820296058ad5648e354b9e0d Mon Sep 17 00:00:00 2001
From: Liubov Dmitrieva
Date: Fri, 7 Oct 2011 11:49:10 -0400
Subject: Improve 64 bit memchr, memrchr, rawmemchr with SSE2

---
 sysdeps/x86_64/memrchr.S | 380 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 380 insertions(+)
 create mode 100644 sysdeps/x86_64/memrchr.S

(limited to 'sysdeps/x86_64/memrchr.S')

diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
new file mode 100644
index 0000000000..a85dc6b03b
--- /dev/null
+++ b/sysdeps/x86_64/memrchr.S
@@ -0,0 +1,380 @@
+/* fast SSE2 memrchr with 64 byte loop and pmaxub instruction using
+
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+	.text
+ENTRY (memrchr)
+	movd	%rsi, %xmm1
+
+	sub	$16, %rdx
+	jbe	L(length_less16)
+
+	punpcklbw	%xmm1, %xmm1
+	punpcklbw	%xmm1, %xmm1
+
+	add	%rdx, %rdi
+	pshufd	$0, %xmm1, %xmm1
+
+	movdqu	(%rdi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+
+/* Check if there is a match.  */
+	pmovmskb	%xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+	sub	$64, %rdi
+	mov	%rdi, %rcx
+	and	$15, %rcx
+	jz	L(loop_prolog)
+
+	add	$16, %rdi
+	add	$16, %rdx
+	and	$-16, %rdi
+	sub	%rcx, %rdx
+
+	.p2align 4
+L(loop_prolog):
+	sub	$64, %rdx
+	jbe	L(exit_loop)
+
+	movdqa	48(%rdi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb	%xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%rdi), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb	%xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%rdi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb	%xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%rdi), %xmm4
+	pcmpeqb	%xmm1, %xmm4
+	pmovmskb	%xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+	sub	$64, %rdi
+	sub	$64, %rdx
+	jbe	L(exit_loop)
+
+	movdqa	48(%rdi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb	%xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%rdi), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb	%xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%rdi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb	%xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%rdi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb	%xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+	mov	%rdi, %rcx
+	and	$63, %rcx
+	jz	L(align64_loop)
+
+	add	$64, %rdi
+	add	$64, %rdx
+	and	$-64, %rdi
+	sub	%rcx, %rdx
+
+	.p2align 4
+L(align64_loop):
+	sub	$64, %rdi
+	sub	$64, %rdx
+	jbe	L(exit_loop)
+
+	movdqa	(%rdi), %xmm0
+	movdqa	16(%rdi), %xmm2
+	movdqa	32(%rdi), %xmm3
+	movdqa	48(%rdi), %xmm4
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm3, %xmm0
+	pmaxub	%xmm4, %xmm2
+	pmaxub	%xmm0, %xmm2
+	pmovmskb	%xmm2, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	pmovmskb	%xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	pmovmskb	%xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%rdi), %xmm2
+
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	(%rdi), %xmm1
+
+	pmovmskb	%xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	pmovmskb	%xmm1, %eax
+	bsr	%eax, %eax
+
+	add	%rdi, %rax
+	ret
+
+	.p2align 4
+L(exit_loop):
+	add	$64, %rdx
+	cmp	$32, %rdx
+	jbe	L(exit_loop_32)
+
+	movdqa	48(%rdi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb	%xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%rdi), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb	%xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%rdi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb	%xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)
+	cmp	$48, %rdx
+	jbe	L(return_null)
+
+	pcmpeqb	(%rdi), %xmm1
+	pmovmskb	%xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches0_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(exit_loop_32):
+	movdqa	48(%rdi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb	%xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	cmp	$16, %rdx
+	jbe	L(return_null)
+
+	pcmpeqb	32(%rdi), %xmm1
+	pmovmskb	%xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(matches0):
+	bsr	%eax, %eax
+	add	%rdi, %rax
+	ret
+
+	.p2align 4
+L(matches16):
+	bsr	%eax, %eax
+	lea	16(%rax, %rdi), %rax
+	ret
+
+	.p2align 4
+L(matches32):
+	bsr	%eax, %eax
+	lea	32(%rax, %rdi), %rax
+	ret
+
+	.p2align 4
+L(matches48):
+	bsr	%eax, %eax
+	lea	48(%rax, %rdi), %rax
+	ret
+
+	.p2align 4
+L(matches0_1):
+	bsr	%eax, %eax
+	sub	$64, %rdx
+	add	%rax, %rdx
+	jl	L(return_null)
+	add	%rdi, %rax
+	ret
+
+	.p2align 4
+L(matches16_1):
+	bsr	%eax, %eax
+	sub	$48, %rdx
+	add	%rax, %rdx
+	jl	L(return_null)
+	lea	16(%rdi, %rax), %rax
+	ret
+
+	.p2align 4
+L(matches32_1):
+	bsr	%eax, %eax
+	sub	$32, %rdx
+	add	%rax, %rdx
+	jl	L(return_null)
+	lea	32(%rdi, %rax), %rax
+	ret
+
+	.p2align 4
+L(matches48_1):
+	bsr	%eax, %eax
+	sub	$16, %rdx
+	add	%rax, %rdx
+	jl	L(return_null)
+	lea	48(%rdi, %rax), %rax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%rax, %rax
+	ret
+
+	.p2align 4
+L(length_less16_offset0):
+	mov	%dl, %cl
+	pcmpeqb	(%rdi), %xmm1
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	pmovmskb	%xmm1, %eax
+
+	and	%edx, %eax
+	test	%eax, %eax
+	jz	L(return_null)
+
+	bsr	%eax, %eax
+	add	%rdi, %rax
+	ret
+
+	.p2align 4
+L(length_less16):
+	punpcklbw	%xmm1, %xmm1
+	punpcklbw	%xmm1, %xmm1
+
+	add	$16, %rdx
+
+	pshufd	$0, %xmm1, %xmm1
+
+	mov	%rdi, %rcx
+	and	$15, %rcx
+	jz	L(length_less16_offset0)
+
+	mov	%rdi, %rcx
+	and	$15, %rcx
+	mov	%cl, %dh
+	mov	%rcx, %r8
+	add	%dl, %dh
+	and	$-16, %rdi
+
+	sub	$16, %dh
+	ja	L(length_less16_part2)
+
+	pcmpeqb	(%rdi), %xmm1
+	pmovmskb	%xmm1, %eax
+
+	sar	%cl, %eax
+	mov	%dl, %cl
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	and	%edx, %eax
+	test	%eax, %eax
+	jz	L(return_null)
+
+	bsr	%eax, %eax
+	add	%rdi, %rax
+	add	%r8, %rax
+	ret
+
+	.p2align 4
+L(length_less16_part2):
+	movdqa	16(%rdi), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb	%xmm2, %eax
+
+	mov	%dh, %cl
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	and	%edx, %eax
+
+	test	%eax, %eax
+	jnz	L(length_less16_part2_return)
+
+	pcmpeqb	(%rdi), %xmm1
+	pmovmskb	%xmm1, %eax
+
+	mov	%r8, %rcx
+	sar	%cl, %eax
+	test	%eax, %eax
+	jz	L(return_null)
+
+	bsr	%eax, %eax
+	add	%rdi, %rax
+	add	%r8, %rax
+	ret
+
+	.p2align 4
+L(length_less16_part2_return):
+	bsr	%eax, %eax
+	lea	16(%rax, %rdi), %rax
+	ret
+
+END (memrchr)
+strong_alias (memrchr, __memrchr)
--
cgit 1.4.1
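
For readers of this patch, a minimal C sketch of the semantics the assembly implements follows. It assumes only the documented behavior of the GNU memrchr extension (search backwards through the first n bytes of s for the byte c); the name memrchr_ref is illustrative and is not part of the patch or of glibc. The SSE2 routine above computes the same result, but compares 16 bytes at a time with pcmpeqb and folds four comparison vectors with pmaxub, so a single pmovmskb/test decides whether an entire aligned 64-byte block contains any match before the individual vectors are re-examined.

#include <stddef.h>

/* Reference-only sketch: return a pointer to the last occurrence of the
   byte C within the first N bytes of S, or NULL if it does not occur.  */
static void *
memrchr_ref (const void *s, int c, size_t n)
{
  const unsigned char *p = (const unsigned char *) s + n;
  const unsigned char ch = (unsigned char) c;

  while (n--)
    if (*--p == ch)
      return (void *) p;

  return NULL;
}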