author    | Liubov Dmitrieva <liubov.dmitrieva@gmail.com> | 2011-10-15 11:10:08 -0400
committer | Ulrich Drepper <drepper@gmail.com> | 2011-10-15 11:10:08 -0400
commit    | be13f7bff66e1850f9057dd813d6e7be022d9516 (patch)
tree      | d918a146db9072ad120f0010481c53d9b450c9a5 /sysdeps/x86_64/multiarch/memcmp-ssse3.S
parent    | 556a2007974ed39a68c87a8b5181f8057ecd0d6f (diff)
download  | glibc-be13f7bff66e1850f9057dd813d6e7be022d9516.tar.gz glibc-be13f7bff66e1850f9057dd813d6e7be022d9516.tar.xz glibc-be13f7bff66e1850f9057dd813d6e7be022d9516.zip
Optimized memcmp and wmemcmp for x86-64 and x86-32
Diffstat (limited to 'sysdeps/x86_64/multiarch/memcmp-ssse3.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1997
1 file changed, 1997 insertions, 0 deletions
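Before the diff itself, it helps to recall the semantic contract the new file's warning comment calls out: wmemcmp compares wchar_t elements as signed values and takes its length in wide characters (hence the `shl $2, %rdx` at the top of the wmemcmp build), while memcmp compares bytes as unsigned char. The following plain-C reference sketch is only illustrative (the helper names are not part of this commit), but it is the behaviour the optimized SSSE3 code has to reproduce:

```c
#include <stddef.h>
#include <wchar.h>

/* memcmp semantics: bytes compared as unsigned char, length in bytes.  */
static int ref_memcmp(const void *a, const void *b, size_t n)
{
    const unsigned char *p = a, *q = b;
    for (size_t i = 0; i < n; i++)
        if (p[i] != q[i])
            return p[i] < q[i] ? -1 : 1;
    return 0;
}

/* wmemcmp semantics: wchar_t elements compared as (signed) wchar_t values,
   length counted in wide characters, not bytes.  */
static int ref_wmemcmp(const wchar_t *a, const wchar_t *b, size_t n)
{
    for (size_t i = 0; i < n; i++)
        if (a[i] != b[i])
            return a[i] < b[i] ? -1 : 1;
    return 0;
}
```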
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
new file mode 100644
index 0000000000..b3a2ca1edd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
@@ -0,0 +1,1997 @@
+/* memcmp with SSSE3, wmemcmp with SSSE3
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+# define MEMCMP __memcmp_ssse3
+# endif
+
+# ifndef ALIGN
+# define ALIGN(n) .p2align n
+# endif
+
+/* Warning!
+   wmemcmp has to use SIGNED comparison for elements.
+   memcmp has to use UNSIGNED comparison for elements.
+*/
+
+	atom_text_section
+ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %rdx
+	test	%rdx, %rdx
+	jz	L(equal)
+# endif
+	mov	%rdx, %rcx
+	mov	%rdi, %rdx
+	cmp	$48, %rcx
+	jae	L(48bytesormore)	/* LEN >= 48  */
+
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN (4)
+/* RCX >= 48.  */
+L(48bytesormore):
+	movdqu	(%rdi), %xmm3
+	movdqu	(%rsi), %xmm0
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb	%xmm3, %edx
+	lea	16(%rdi), %rdi
+	lea	16(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(less16bytes)
+	mov	%edi, %edx
+	and	$0xf, %edx
+	xor	%rdx, %rdi
+	sub	%rdx, %rsi
+	add	%rdx, %rcx
+	mov	%esi, %edx
+	and	$0xf, %edx
+	jz	L(shr_0)
+	xor	%rdx, %rsi
+
+# ifndef USE_AS_WMEMCMP
+	cmp	$8, %edx
+	jae	L(next_unaligned_table)
+	cmp	$0, %edx
+	je	L(shr_0)
+	cmp	$1, %edx
+	je	L(shr_1)
+	cmp	$2, %edx
+	je	L(shr_2)
+	cmp	$3, %edx
+	je	L(shr_3)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$5, %edx
+	je	L(shr_5)
+	cmp	$6, %edx
+	je	L(shr_6)
+	jmp	L(shr_7)
+
+	ALIGN (2)
+L(next_unaligned_table):
+	cmp	$8, %edx
+	je	L(shr_8)
+	cmp	$9, %edx
+	je	L(shr_9)
+	cmp	$10, %edx
+	je	L(shr_10)
+	cmp	$11, %edx
+	je	L(shr_11)
+	cmp	$12, %edx
+	je	L(shr_12)
+	cmp	$13, %edx
+	je	L(shr_13)
+	cmp	$14, %edx
+	je	L(shr_14)
+	jmp	L(shr_15)
+# else
+	cmp	$0, %edx
+	je	L(shr_0)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$8, %edx
+	je	L(shr_8)
+	jmp	L(shr_12)
+# endif
+
+	ALIGN (4)
+L(shr_0):
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx
+	jae	L(shr_0_gobble)
+	xor	%eax, %eax
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	(%rdi), %xmm1
+	movdqa	16(%rsi), %xmm2
+	pcmpeqb	16(%rdi), %xmm2
+	pand	%xmm1, %xmm2
+	pmovmskb	%xmm2, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN (4)
+L(shr_0_gobble):
+	movdqa	(%rsi), %xmm0
+	xor	%eax, %eax
+	pcmpeqb	(%rdi), %xmm0
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm2
+	pcmpeqb	16(%rdi), %xmm2
+L(shr_0_gobble_loop):
+	pand	%xmm0, %xmm2
+	sub	$32, %rcx
+	pmovmskb	%xmm2, %edx
+	movdqa	%xmm0, %xmm1
+	movdqa	32(%rsi), %xmm0
+	movdqa	48(%rsi), %xmm2
+	sbb	$0xffff, %edx
+	pcmpeqb	32(%rdi), %xmm0
+	pcmpeqb	48(%rdi), %xmm2
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+
jz L(shr_0_gobble_loop) + + pand %xmm0, %xmm2 + cmp $0, %rcx + jge L(next) + inc %edx + add $32, %rcx +L(next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + ALIGN (4) +L(shr_1): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_1_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $1, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $1, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $1, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_1_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $1, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $1, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_1_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $1, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $1, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_1_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_1_gobble_next) + inc %edx + add $32, %rcx +L(shr_1_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 1(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + + ALIGN (4) +L(shr_2): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_2_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $2, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $2, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $2, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_2_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $2, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $2, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_2_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $2, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $2, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_2_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_2_gobble_next) + inc %edx + add $32, %rcx +L(shr_2_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 2(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_3): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_3_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $3, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $3, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $3, %rsi + add 
%rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_3_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $3, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $3, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_3_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $3, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $3, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_3_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_3_gobble_next) + inc %edx + add $32, %rcx +L(shr_3_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 3(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + ALIGN (4) +L(shr_4): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_4_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $4, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $4, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $4, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_4_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $4, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $4, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_4_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $4, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $4, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_4_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_4_gobble_next) + inc %edx + add $32, %rcx +L(shr_4_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 4(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + ALIGN (4) +L(shr_5): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_5_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $5, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $5, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $5, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_5_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $5, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $5, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_5_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $5, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $5, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_5_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_5_gobble_next) + inc %edx + add $32, %rcx +L(shr_5_gobble_next): + test 
%edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 5(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_6): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_6_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $6, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $6, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $6, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_6_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $6, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $6, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_6_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $6, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $6, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_6_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_6_gobble_next) + inc %edx + add $32, %rcx +L(shr_6_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 6(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_7): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_7_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $7, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $7, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $7, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_7_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $7, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $7, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_7_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $7, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $7, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_7_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_7_gobble_next) + inc %edx + add $32, %rcx +L(shr_7_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 7(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + ALIGN (4) +L(shr_8): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_8_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $8, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $8, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $8, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_8_gobble): + sub $32, %rcx + movdqa 16(%rsi), 
%xmm0 + palignr $8, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $8, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_8_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $8, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $8, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_8_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_8_gobble_next) + inc %edx + add $32, %rcx +L(shr_8_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 8(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + ALIGN (4) +L(shr_9): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_9_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $9, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $9, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $9, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_9_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $9, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $9, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_9_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $9, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $9, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_9_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_9_gobble_next) + inc %edx + add $32, %rcx +L(shr_9_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 9(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_10): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_10_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $10, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $10, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $10, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_10_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $10, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $10, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_10_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $10, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $10, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_10_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_10_gobble_next) + inc %edx + add $32, %rcx +L(shr_10_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + 
sub $0xffff, %edx + jnz L(exit) + + lea 10(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_11): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_11_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $11, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $11, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $11, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_11_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $11, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $11, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_11_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $11, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $11, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_11_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_11_gobble_next) + inc %edx + add $32, %rcx +L(shr_11_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 11(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + ALIGN (4) +L(shr_12): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_12_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $12, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $12, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $12, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_12_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $12, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $12, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_12_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $12, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $12, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_12_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_12_gobble_next) + inc %edx + add $32, %rcx +L(shr_12_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 12(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + ALIGN (4) +L(shr_13): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_13_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $13, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $13, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $13, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_13_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $13, (%rsi), %xmm0 + pcmpeqb (%rdi), 
%xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $13, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_13_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $13, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $13, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_13_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_13_gobble_next) + inc %edx + add $32, %rcx +L(shr_13_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 13(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_14): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_14_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $14, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $14, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $14, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_14_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $14, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $14, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_14_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $14, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $14, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_14_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_14_gobble_next) + inc %edx + add $32, %rcx +L(shr_14_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 14(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_15): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_15_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $15, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $15, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $15, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_15_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $15, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $15, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_15_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $15, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $15, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_15_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_15_gobble_next) + inc %edx + add $32, %rcx +L(shr_15_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 15(%rsi), %rsi 
+ add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) +# endif + ALIGN (4) +L(exit): + pmovmskb %xmm1, %r8d + sub $0xffff, %r8d + jz L(first16bytes) + lea -16(%rsi), %rsi + lea -16(%rdi), %rdi + mov %r8d, %edx +L(first16bytes): + add %rax, %rsi +L(less16bytes): +# ifndef USE_AS_WMEMCMP + test %dl, %dl + jz L(next_24_bytes) + + test $0x01, %dl + jnz L(Byte16) + + test $0x02, %dl + jnz L(Byte17) + + test $0x04, %dl + jnz L(Byte18) + + test $0x08, %dl + jnz L(Byte19) + + test $0x10, %dl + jnz L(Byte20) + + test $0x20, %dl + jnz L(Byte21) + + test $0x40, %dl + jnz L(Byte22) + + movzbl -9(%rdi), %eax + movzbl -9(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte16): + movzbl -16(%rdi), %eax + movzbl -16(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte17): + movzbl -15(%rdi), %eax + movzbl -15(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte18): + movzbl -14(%rdi), %eax + movzbl -14(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte19): + movzbl -13(%rdi), %eax + movzbl -13(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte20): + movzbl -12(%rdi), %eax + movzbl -12(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte21): + movzbl -11(%rdi), %eax + movzbl -11(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte22): + movzbl -10(%rdi), %eax + movzbl -10(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(next_24_bytes): + lea 8(%rdi), %rdi + lea 8(%rsi), %rsi + test $0x01, %dh + jnz L(Byte16) + + test $0x02, %dh + jnz L(Byte17) + + test $0x04, %dh + jnz L(Byte18) + + test $0x08, %dh + jnz L(Byte19) + + test $0x10, %dh + jnz L(Byte20) + + test $0x20, %dh + jnz L(Byte21) + + test $0x40, %dh + jnz L(Byte22) + + mov -9(%rdi), %eax + and $0xff, %eax + mov -9(%rsi), %edx + and $0xff, %edx + sub %edx, %eax + ret +# else +/* special for wmemcmp */ + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov -16(%rdi), %eax + cmp -16(%rsi), %eax + jne L(find_diff) + ret + + ALIGN (4) +L(second_double_word): + mov -12(%rdi), %eax + cmp -12(%rsi), %eax + jne L(find_diff) + ret + + ALIGN (4) +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov -8(%rdi), %eax + cmp -8(%rsi), %eax + jne L(find_diff) + ret + + ALIGN (4) +L(fourth_double_word): + mov -4(%rdi), %eax + cmp -4(%rsi), %eax + jne L(find_diff) + ret +# endif + + ALIGN (4) +L(less48bytes): + cmp $8, %ecx + jae L(more8bytes) + cmp $0, %ecx + je L(0bytes) +# ifndef USE_AS_WMEMCMP + cmp $1, %ecx + je L(1bytes) + cmp $2, %ecx + je L(2bytes) + cmp $3, %ecx + je L(3bytes) + cmp $4, %ecx + je L(4bytes) + cmp $5, %ecx + je L(5bytes) + cmp $6, %ecx + je L(6bytes) + jmp L(7bytes) +# else + jmp L(4bytes) +# endif + + ALIGN (4) +L(more8bytes): + cmp $16, %ecx + jae L(more16bytes) + cmp $8, %ecx + je L(8bytes) +# ifndef USE_AS_WMEMCMP + cmp $9, %ecx + je L(9bytes) + cmp $10, %ecx + je L(10bytes) + cmp $11, %ecx + je L(11bytes) + cmp $12, %ecx + je L(12bytes) + cmp $13, %ecx + je L(13bytes) + cmp $14, %ecx + je L(14bytes) + jmp L(15bytes) +# else + jmp L(12bytes) +# endif + + ALIGN (4) +L(more16bytes): + cmp $24, %ecx + jae L(more24bytes) + cmp $16, %ecx + je L(16bytes) +# ifndef USE_AS_WMEMCMP + cmp $17, %ecx + je L(17bytes) + cmp $18, %ecx + je L(18bytes) + cmp $19, %ecx + je L(19bytes) + cmp $20, %ecx + je L(20bytes) + cmp $21, %ecx + je L(21bytes) + cmp $22, %ecx + je L(22bytes) + jmp L(23bytes) +# else + jmp L(20bytes) +# endif + + ALIGN (4) +L(more24bytes): + cmp $32, %ecx + jae L(more32bytes) + cmp $24, %ecx + je L(24bytes) +# ifndef 
USE_AS_WMEMCMP + cmp $25, %ecx + je L(25bytes) + cmp $26, %ecx + je L(26bytes) + cmp $27, %ecx + je L(27bytes) + cmp $28, %ecx + je L(28bytes) + cmp $29, %ecx + je L(29bytes) + cmp $30, %ecx + je L(30bytes) + jmp L(31bytes) +# else + jmp L(28bytes) +# endif + + ALIGN (4) +L(more32bytes): + cmp $40, %ecx + jae L(more40bytes) + cmp $32, %ecx + je L(32bytes) +# ifndef USE_AS_WMEMCMP + cmp $33, %ecx + je L(33bytes) + cmp $34, %ecx + je L(34bytes) + cmp $35, %ecx + je L(35bytes) + cmp $36, %ecx + je L(36bytes) + cmp $37, %ecx + je L(37bytes) + cmp $38, %ecx + je L(38bytes) + jmp L(39bytes) +# else + jmp L(36bytes) +# endif + + ALIGN (4) +L(more40bytes): + cmp $40, %ecx + je L(40bytes) +# ifndef USE_AS_WMEMCMP + cmp $41, %ecx + je L(41bytes) + cmp $42, %ecx + je L(42bytes) + cmp $43, %ecx + je L(43bytes) + cmp $44, %ecx + je L(44bytes) + cmp $45, %ecx + je L(45bytes) + cmp $46, %ecx + je L(46bytes) + jmp L(47bytes) + + ALIGN (4) +L(44bytes): + movl -44(%rdi), %eax + movl -44(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(40bytes): + movl -40(%rdi), %eax + movl -40(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(36bytes): + movl -36(%rdi), %eax + movl -36(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(32bytes): + movl -32(%rdi), %eax + movl -32(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(28bytes): + movl -28(%rdi), %eax + movl -28(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(24bytes): + movl -24(%rdi), %eax + movl -24(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(20bytes): + movl -20(%rdi), %eax + movl -20(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(16bytes): + movl -16(%rdi), %eax + movl -16(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(12bytes): + movl -12(%rdi), %eax + movl -12(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(8bytes): + movl -8(%rdi), %eax + movl -8(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(4bytes): + movl -4(%rdi), %eax + movl -4(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(0bytes): + xor %eax, %eax + ret +# else + ALIGN (4) +L(44bytes): + movl -44(%rdi), %eax + cmp -44(%rsi), %eax + jne L(find_diff) +L(40bytes): + movl -40(%rdi), %eax + cmp -40(%rsi), %eax + jne L(find_diff) +L(36bytes): + movl -36(%rdi), %eax + cmp -36(%rsi), %eax + jne L(find_diff) +L(32bytes): + movl -32(%rdi), %eax + cmp -32(%rsi), %eax + jne L(find_diff) +L(28bytes): + movl -28(%rdi), %eax + cmp -28(%rsi), %eax + jne L(find_diff) +L(24bytes): + movl -24(%rdi), %eax + cmp -24(%rsi), %eax + jne L(find_diff) +L(20bytes): + movl -20(%rdi), %eax + cmp -20(%rsi), %eax + jne L(find_diff) +L(16bytes): + movl -16(%rdi), %eax + cmp -16(%rsi), %eax + jne L(find_diff) +L(12bytes): + movl -12(%rdi), %eax + cmp -12(%rsi), %eax + jne L(find_diff) +L(8bytes): + movl -8(%rdi), %eax + cmp -8(%rsi), %eax + jne L(find_diff) +L(4bytes): + movl -4(%rdi), %eax + cmp -4(%rsi), %eax + jne L(find_diff) +L(0bytes): + xor %eax, %eax + ret +# endif + +# ifndef USE_AS_WMEMCMP + ALIGN (4) +L(45bytes): + movl -45(%rdi), %eax + movl -45(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(41bytes): + movl -41(%rdi), %eax + movl -41(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(37bytes): + movl -37(%rdi), %eax + movl -37(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(33bytes): + movl -33(%rdi), %eax + movl -33(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(29bytes): + movl -29(%rdi), %eax + movl -29(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(25bytes): + movl -25(%rdi), %eax + movl -25(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(21bytes): + movl 
-21(%rdi), %eax + movl -21(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(17bytes): + movl -17(%rdi), %eax + movl -17(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(13bytes): + movl -13(%rdi), %eax + movl -13(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(9bytes): + movl -9(%rdi), %eax + movl -9(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(5bytes): + movl -5(%rdi), %eax + movl -5(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(1bytes): + movzbl -1(%rdi), %eax + cmpb -1(%rsi), %al + jne L(set) + xor %eax, %eax + ret + + ALIGN (4) +L(46bytes): + movl -46(%rdi), %eax + movl -46(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(42bytes): + movl -42(%rdi), %eax + movl -42(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(38bytes): + movl -38(%rdi), %eax + movl -38(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(34bytes): + movl -34(%rdi), %eax + movl -34(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(30bytes): + movl -30(%rdi), %eax + movl -30(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(26bytes): + movl -26(%rdi), %eax + movl -26(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(22bytes): + movl -22(%rdi), %eax + movl -22(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(18bytes): + movl -18(%rdi), %eax + movl -18(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(14bytes): + movl -14(%rdi), %eax + movl -14(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(10bytes): + movl -10(%rdi), %eax + movl -10(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(6bytes): + movl -6(%rdi), %eax + movl -6(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(2bytes): + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + cmpb %cl, %al + jne L(set) + cmp %ecx, %eax + jne L(set) + xor %eax, %eax + ret + + ALIGN (4) +L(47bytes): + movl -47(%rdi), %eax + movl -47(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(43bytes): + movl -43(%rdi), %eax + movl -43(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(39bytes): + movl -39(%rdi), %eax + movl -39(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(35bytes): + movl -35(%rdi), %eax + movl -35(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(31bytes): + movl -31(%rdi), %eax + movl -31(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(27bytes): + movl -27(%rdi), %eax + movl -27(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(23bytes): + movl -23(%rdi), %eax + movl -23(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(19bytes): + movl -19(%rdi), %eax + movl -19(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(15bytes): + movl -15(%rdi), %eax + movl -15(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(11bytes): + movl -11(%rdi), %eax + movl -11(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(7bytes): + movl -7(%rdi), %eax + movl -7(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(3bytes): + movzwl -3(%rdi), %eax + movzwl -3(%rsi), %ecx + cmpb %cl, %al + jne L(set) + cmp %ecx, %eax + jne L(set) + movzbl -1(%rdi), %eax + cmpb -1(%rsi), %al + jne L(set) + xor %eax, %eax + ret + + ALIGN (4) +L(find_diff): + cmpb %cl, %al + jne L(set) + cmpw %cx, %ax + jne L(set) + shr $16, %eax + shr $16, %ecx + cmpb %cl, %al + jne L(set) + +/* We get there only if we already know there is a +difference. */ + + cmp %ecx, %eax +L(set): + sbb %eax, %eax + sbb $-1, %eax + ret +# else + +/* for wmemcmp */ + ALIGN (4) +L(find_diff): + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + + ALIGN (4) +L(find_diff_bigger): + ret +# endif + + ALIGN (4) +L(equal): + xor %eax, %eax + ret + +END (MEMCMP) +#endif |
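Every L(shr_N) path above follows the same core pattern: load two 16-byte blocks, compare them with pcmpeqb, collapse the per-byte result into a 16-bit mask with pmovmskb, and check the mask against 0xffff (all bytes equal); the palignr $N instructions only exist so that a source misaligned by N bytes can still be fed from aligned movdqa loads. The sketch below is a rough, hedged C translation of that block-compare idea using SSE2 intrinsics; the function name is illustrative and it deliberately omits the alignment/palignr machinery and the "gobble" loop unrolling of the real code:

```c
#include <emmintrin.h>   /* SSE2: _mm_loadu_si128, _mm_cmpeq_epi8, _mm_movemask_epi8 */
#include <stddef.h>

/* Compare 16 bytes at a time; on the first mismatching block, fall back
   to a scalar scan to produce the ordered result, as the assembly does
   in its L(exit)/L(less16bytes) tail.  */
static int memcmp_16byte_blocks(const unsigned char *a,
                                const unsigned char *b, size_t n)
{
    while (n >= 16) {
        __m128i va = _mm_loadu_si128((const __m128i *)a);
        __m128i vb = _mm_loadu_si128((const __m128i *)b);
        /* One mask bit per byte; 0xffff means all 16 bytes matched.  */
        int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, vb));
        if (mask != 0xffff) {
            for (size_t i = 0; i < 16; i++)
                if (a[i] != b[i])
                    return a[i] < b[i] ? -1 : 1;
        }
        a += 16;
        b += 16;
        n -= 16;
    }
    /* Scalar tail for the remaining 0..15 bytes.  */
    for (size_t i = 0; i < n; i++)
        if (a[i] != b[i])
            return a[i] < b[i] ? -1 : 1;
    return 0;
}
```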