From be13f7bff66e1850f9057dd813d6e7be022d9516 Mon Sep 17 00:00:00 2001 From: Liubov Dmitrieva Date: Sat, 15 Oct 2011 11:10:08 -0400 Subject: Optimized memcmp and wmemcmp for x86-64 and x86-32 --- sysdeps/x86_64/multiarch/Makefile | 3 +- sysdeps/x86_64/multiarch/memcmp-sse4.S | 192 ++- sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1997 ++++++++++++++++++++++++++++++ sysdeps/x86_64/multiarch/memcmp.S | 19 +- sysdeps/x86_64/multiarch/wmemcmp-c.c | 5 + sysdeps/x86_64/multiarch/wmemcmp-sse4.S | 4 + sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4 + sysdeps/x86_64/multiarch/wmemcmp.S | 47 + 8 files changed, 2242 insertions(+), 29 deletions(-) create mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-c.c create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-sse4.S create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S create mode 100644 sysdeps/x86_64/multiarch/wmemcmp.S (limited to 'sysdeps/x86_64/multiarch') diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index a5254dc93c..e0bb9847a8 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -15,7 +15,8 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ strcat-sse2-unaligned strncat-sse2-unaligned \ strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \ - strrchr-sse2-no-bsf strchr-sse2-no-bsf + strrchr-sse2-no-bsf strchr-sse2-no-bsf \ + memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift CFLAGS-varshift.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S index fc439bb013..28dd505d99 100644 --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S +++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S @@ -1,5 +1,5 @@ -/* memcmp with SSE4.1 - Copyright (C) 2010 Free Software Foundation, Inc. 
+/* memcmp with SSE4.1, wmemcmp with SSE4.1 + Copyright (C) 2010, 2011 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -20,43 +20,54 @@ #ifndef NOT_IN_libc -#include -#include "asm-syntax.h" +# include -#ifndef MEMCMP -# define MEMCMP __memcmp_sse4_1 -#endif +# ifndef MEMCMP +# define MEMCMP __memcmp_sse4_1 +# endif -#ifndef ALIGN -# define ALIGN(n) .p2align n -#endif +# ifndef ALIGN +# define ALIGN(n) .p2align n +# endif -#define JMPTBL(I, B) (I - B) +# define JMPTBL(I, B) (I - B) -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ lea TABLE(%rip), %r11; \ movslq (%r11, INDEX, SCALE), %rcx; \ add %r11, %rcx; \ jmp *%rcx; \ ud2 +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elements. +*/ + .section .text.sse4.1,"ax",@progbits ENTRY (MEMCMP) +# ifdef USE_AS_WMEMCMP + shl $2, %rdx +# endif pxor %xmm0, %xmm0 cmp $79, %rdx ja L(79bytesormore) +# ifndef USE_AS_WMEMCMP cmp $1, %rdx je L(firstbyte) +# endif add %rdx, %rsi add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +# ifndef USE_AS_WMEMCMP ALIGN (4) L(firstbyte): movzbl (%rdi), %eax movzbl (%rsi), %ecx sub %ecx, %eax ret +# endif ALIGN (4) L(79bytesormore): @@ -308,11 +319,11 @@ L(less32bytesin256): ALIGN (4) L(512bytesormore): -#ifdef DATA_CACHE_SIZE_HALF +# ifdef DATA_CACHE_SIZE_HALF mov $DATA_CACHE_SIZE_HALF, %r8 -#else +# else mov __x86_64_data_cache_size_half(%rip), %r8 -#endif +# endif mov %r8, %r9 shr $1, %r8 add %r9, %r8 @@ -624,11 +635,11 @@ L(less32bytesin256in2alinged): ALIGN (4) L(512bytesormorein2aligned): -#ifdef DATA_CACHE_SIZE_HALF +# ifdef DATA_CACHE_SIZE_HALF mov $DATA_CACHE_SIZE_HALF, %r8 -#else +# else mov __x86_64_data_cache_size_half(%rip), %r8 -#endif +# endif mov %r8, %r9 shr $1, %r8 add %r9, %r8 @@ -667,6 +678,7 @@ L(64bytesormore_loopin2aligned): BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) 
L(L2_L3_cache_aglined): sub $64, %rdx + ALIGN (4) L(L2_L3_aligned_128bytes_loop): prefetchnta 0x1c0(%rdi) @@ -803,13 +815,19 @@ L(12bytes): jne L(diffin8bytes) L(4bytes): mov -4(%rsi), %ecx +# ifndef USE_AS_WMEMCMP mov -4(%rdi), %eax cmp %eax, %ecx +# else + cmp -4(%rdi), %ecx +# endif jne L(diffin4bytes) L(0bytes): xor %eax, %eax ret +# ifndef USE_AS_WMEMCMP +/* unreal case for wmemcmp */ ALIGN (4) L(65bytes): movdqu -65(%rdi), %xmm1 @@ -1017,6 +1035,7 @@ L(1bytes): movzbl -1(%rsi), %ecx sub %ecx, %eax ret +# endif ALIGN (4) L(68bytes): @@ -1047,13 +1066,20 @@ L(20bytes): pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 jnc L(less16bytes) - mov -4(%rdi), %eax mov -4(%rsi), %ecx + +# ifndef USE_AS_WMEMCMP + mov -4(%rdi), %eax cmp %eax, %ecx +# else + cmp -4(%rdi), %ecx +# endif jne L(diffin4bytes) xor %eax, %eax ret +# ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ ALIGN (4) L(69bytes): movdqu -69(%rsi), %xmm1 @@ -1161,6 +1187,7 @@ L(23bytes): jne L(diffin8bytes) xor %eax, %eax ret +# endif ALIGN (4) L(72bytes): @@ -1191,13 +1218,16 @@ L(24bytes): pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 jnc L(less16bytes) - mov -8(%rdi), %rax + mov -8(%rsi), %rcx + mov -8(%rdi), %rax cmp %rax, %rcx jne L(diffin8bytes) xor %eax, %eax ret +# ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ ALIGN (4) L(73bytes): movdqu -73(%rsi), %xmm1 @@ -1312,7 +1342,7 @@ L(27bytes): jne L(diffin4bytes) xor %eax, %eax ret - +# endif ALIGN (4) L(76bytes): movdqu -76(%rsi), %xmm1 @@ -1346,13 +1376,19 @@ L(28bytes): mov -12(%rsi), %rcx cmp %rax, %rcx jne L(diffin8bytes) - mov -4(%rdi), %eax mov -4(%rsi), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%rdi), %eax cmp %eax, %ecx +# else + cmp -4(%rdi), %ecx +# endif jne L(diffin4bytes) xor %eax, %eax ret +# ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ ALIGN (4) L(77bytes): movdqu -77(%rsi), %xmm1 @@ -1474,7 +1510,7 @@ L(31bytes): jne L(diffin8bytes) xor %eax, %eax ret - +# endif ALIGN (4) L(64bytes): movdqu -64(%rdi), %xmm2 @@ -1527,7 +1563,17 @@ 
L(diffin8bytes): jne L(diffin4bytes) shr $32, %rcx shr $32, %rax + +# ifdef USE_AS_WMEMCMP +/* for wmemcmp */ + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret +# endif + L(diffin4bytes): +# ifndef USE_AS_WMEMCMP cmp %cx, %ax jne L(diffin2bytes) shr $16, %ecx @@ -1546,11 +1592,28 @@ L(end): and $0xff, %ecx sub %ecx, %eax ret +# else + +/* for wmemcmp */ + mov $1, %eax + jl L(nequal_bigger) + neg %eax + ret + + ALIGN (4) +L(nequal_bigger): + ret + +L(unreal_case): + xor %eax, %eax + ret +# endif END (MEMCMP) .section .rodata.sse4.1,"a",@progbits ALIGN (3) +# ifndef USE_AS_WMEMCMP L(table_64bytes): .int JMPTBL (L(0bytes), L(table_64bytes)) .int JMPTBL (L(1bytes), L(table_64bytes)) @@ -1632,4 +1695,87 @@ L(table_64bytes): .int JMPTBL (L(77bytes), L(table_64bytes)) .int JMPTBL (L(78bytes), L(table_64bytes)) .int JMPTBL (L(79bytes), L(table_64bytes)) +# else +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + 
.int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(64bytes), 
L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(68bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(72bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(76bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) +# endif #endif diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S new file mode 100644 index 0000000000..b3a2ca1edd --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S @@ -0,0 +1,1997 @@ +/* memcmp with SSSE3, wmemcmp with SSSE3 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#ifndef NOT_IN_libc + +# include + +# ifndef MEMCMP +# define MEMCMP __memcmp_ssse3 +# endif + +# ifndef ALIGN +# define ALIGN(n) .p2align n +# endif + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elements. +*/ + + atom_text_section +ENTRY (MEMCMP) +# ifdef USE_AS_WMEMCMP + shl $2, %rdx + test %rdx, %rdx + jz L(equal) +# endif + mov %rdx, %rcx + mov %rdi, %rdx + cmp $48, %rcx; + jae L(48bytesormore) /* LEN >= 48 */ + + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +/* RCX >= 48. */ +L(48bytesormore): + movdqu (%rdi), %xmm3 + movdqu (%rsi), %xmm0 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + lea 16(%rdi), %rdi + lea 16(%rsi), %rsi + sub $0xffff, %edx + jnz L(less16bytes) + mov %edi, %edx + and $0xf, %edx + xor %rdx, %rdi + sub %rdx, %rsi + add %rdx, %rcx + mov %esi, %edx + and $0xf, %edx + jz L(shr_0) + xor %rdx, %rsi + +# ifndef USE_AS_WMEMCMP + cmp $8, %edx + jae L(next_unaligned_table) + cmp $0, %edx + je L(shr_0) + cmp $1, %edx + je L(shr_1) + cmp $2, %edx + je L(shr_2) + cmp $3, %edx + je L(shr_3) + cmp $4, %edx + je L(shr_4) + cmp $5, %edx + je L(shr_5) + cmp $6, %edx + je L(shr_6) + jmp L(shr_7) + + ALIGN (2) +L(next_unaligned_table): + cmp $8, %edx + je L(shr_8) + cmp $9, %edx + je L(shr_9) + cmp $10, %edx + je L(shr_10) + cmp $11, %edx + je L(shr_11) + cmp $12, %edx + je L(shr_12) + cmp $13, %edx + je L(shr_13) + cmp $14, %edx + je L(shr_14) + jmp L(shr_15) +# else + cmp $0, %edx + je L(shr_0) + cmp $4, %edx + je L(shr_4) + cmp $8, %edx + je L(shr_8) + jmp L(shr_12) +# endif + + ALIGN (4) +L(shr_0): + cmp $80, %rcx + lea -48(%rcx), %rcx + jae L(shr_0_gobble) + xor %eax, %eax + movdqa (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + movdqa 16(%rsi), %xmm2 + pcmpeqb 16(%rdi), %xmm2 + pand %xmm1, %xmm2 + pmovmskb %xmm2, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) 
+L(shr_0_gobble): + movdqa (%rsi), %xmm0 + xor %eax, %eax + pcmpeqb (%rdi), %xmm0 + sub $32, %rcx + movdqa 16(%rsi), %xmm2 + pcmpeqb 16(%rdi), %xmm2 +L(shr_0_gobble_loop): + pand %xmm0, %xmm2 + sub $32, %rcx + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + movdqa 32(%rsi), %xmm0 + movdqa 48(%rsi), %xmm2 + sbb $0xffff, %edx + pcmpeqb 32(%rdi), %xmm0 + pcmpeqb 48(%rdi), %xmm2 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + jz L(shr_0_gobble_loop) + + pand %xmm0, %xmm2 + cmp $0, %rcx + jge L(next) + inc %edx + add $32, %rcx +L(next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + ALIGN (4) +L(shr_1): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_1_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $1, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $1, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $1, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_1_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $1, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $1, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_1_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $1, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $1, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_1_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_1_gobble_next) + inc %edx + add $32, %rcx +L(shr_1_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, 
%xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 1(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + + ALIGN (4) +L(shr_2): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_2_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $2, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $2, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $2, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_2_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $2, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $2, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_2_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $2, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $2, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_2_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_2_gobble_next) + inc %edx + add $32, %rcx +L(shr_2_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 2(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_3): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_3_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $3, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $3, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $3, %rsi + add %rcx, %rsi + add %rcx, 
%rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_3_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $3, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $3, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_3_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $3, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $3, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_3_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_3_gobble_next) + inc %edx + add $32, %rcx +L(shr_3_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 3(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + ALIGN (4) +L(shr_4): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_4_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $4, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $4, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $4, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_4_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $4, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $4, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_4_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $4, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $4, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_4_gobble_loop) + pand 
%xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_4_gobble_next) + inc %edx + add $32, %rcx +L(shr_4_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 4(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + ALIGN (4) +L(shr_5): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_5_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $5, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $5, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $5, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_5_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $5, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $5, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_5_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $5, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $5, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_5_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_5_gobble_next) + inc %edx + add $32, %rcx +L(shr_5_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 5(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_6): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_6_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $6, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $6, 
%xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $6, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_6_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $6, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $6, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_6_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $6, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $6, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_6_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_6_gobble_next) + inc %edx + add $32, %rcx +L(shr_6_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 6(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_7): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_7_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $7, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $7, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $7, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_7_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $7, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $7, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_7_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $7, 48(%rsi), %xmm3 + sbb 
$0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $7, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_7_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_7_gobble_next) + inc %edx + add $32, %rcx +L(shr_7_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 7(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + ALIGN (4) +L(shr_8): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_8_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $8, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $8, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $8, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_8_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $8, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $8, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_8_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $8, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $8, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_8_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_8_gobble_next) + inc %edx + add $32, %rcx +L(shr_8_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 8(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + ALIGN (4) +L(shr_9): + cmp $80, 
%rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_9_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $9, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $9, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $9, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_9_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $9, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $9, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_9_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $9, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $9, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_9_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_9_gobble_next) + inc %edx + add $32, %rcx +L(shr_9_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 9(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_10): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_10_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $10, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $10, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $10, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_10_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $10, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $10, 
16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_10_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $10, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $10, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_10_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_10_gobble_next) + inc %edx + add $32, %rcx +L(shr_10_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 10(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_11): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_11_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $11, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $11, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $11, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_11_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $11, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $11, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_11_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $11, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $11, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_11_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_11_gobble_next) + inc %edx + add $32, %rcx +L(shr_11_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, 
%xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 11(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + ALIGN (4) +L(shr_12): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_12_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $12, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $12, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $12, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_12_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $12, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $12, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_12_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $12, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $12, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_12_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_12_gobble_next) + inc %edx + add $32, %rcx +L(shr_12_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 12(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + ALIGN (4) +L(shr_13): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_13_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $13, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $13, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + 
jnz L(exit) + add $13, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_13_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $13, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $13, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_13_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $13, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $13, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_13_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_13_gobble_next) + inc %edx + add $32, %rcx +L(shr_13_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 13(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_14): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_14_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $14, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $14, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $14, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_14_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $14, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $14, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_14_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $14, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $14, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 
48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_14_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_14_gobble_next) + inc %edx + add $32, %rcx +L(shr_14_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 14(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_15): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_15_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $15, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $15, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $15, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_15_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $15, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $15, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_15_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $15, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $15, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_15_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_15_gobble_next) + inc %edx + add $32, %rcx +L(shr_15_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 15(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) +# endif + ALIGN (4) +L(exit): + pmovmskb %xmm1, %r8d + sub $0xffff, %r8d + jz L(first16bytes) + lea -16(%rsi), %rsi + lea -16(%rdi), %rdi + mov %r8d, %edx +L(first16bytes): + add 
%rax, %rsi +L(less16bytes): +# ifndef USE_AS_WMEMCMP + test %dl, %dl + jz L(next_24_bytes) + + test $0x01, %dl + jnz L(Byte16) + + test $0x02, %dl + jnz L(Byte17) + + test $0x04, %dl + jnz L(Byte18) + + test $0x08, %dl + jnz L(Byte19) + + test $0x10, %dl + jnz L(Byte20) + + test $0x20, %dl + jnz L(Byte21) + + test $0x40, %dl + jnz L(Byte22) + + movzbl -9(%rdi), %eax + movzbl -9(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte16): + movzbl -16(%rdi), %eax + movzbl -16(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte17): + movzbl -15(%rdi), %eax + movzbl -15(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte18): + movzbl -14(%rdi), %eax + movzbl -14(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte19): + movzbl -13(%rdi), %eax + movzbl -13(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte20): + movzbl -12(%rdi), %eax + movzbl -12(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte21): + movzbl -11(%rdi), %eax + movzbl -11(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte22): + movzbl -10(%rdi), %eax + movzbl -10(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(next_24_bytes): + lea 8(%rdi), %rdi + lea 8(%rsi), %rsi + test $0x01, %dh + jnz L(Byte16) + + test $0x02, %dh + jnz L(Byte17) + + test $0x04, %dh + jnz L(Byte18) + + test $0x08, %dh + jnz L(Byte19) + + test $0x10, %dh + jnz L(Byte20) + + test $0x20, %dh + jnz L(Byte21) + + test $0x40, %dh + jnz L(Byte22) + + mov -9(%rdi), %eax + and $0xff, %eax + mov -9(%rsi), %edx + and $0xff, %edx + sub %edx, %eax + ret +# else +/* special for wmemcmp */ + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov -16(%rdi), %eax + cmp -16(%rsi), %eax + jne L(find_diff) + ret + + ALIGN (4) +L(second_double_word): + mov -12(%rdi), %eax + cmp -12(%rsi), %eax + jne L(find_diff) + ret + + ALIGN (4) +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov -8(%rdi), %eax + cmp -8(%rsi), %eax + jne L(find_diff) + 
ret + + ALIGN (4) +L(fourth_double_word): + mov -4(%rdi), %eax + cmp -4(%rsi), %eax + jne L(find_diff) + ret +# endif + + ALIGN (4) +L(less48bytes): + cmp $8, %ecx + jae L(more8bytes) + cmp $0, %ecx + je L(0bytes) +# ifndef USE_AS_WMEMCMP + cmp $1, %ecx + je L(1bytes) + cmp $2, %ecx + je L(2bytes) + cmp $3, %ecx + je L(3bytes) + cmp $4, %ecx + je L(4bytes) + cmp $5, %ecx + je L(5bytes) + cmp $6, %ecx + je L(6bytes) + jmp L(7bytes) +# else + jmp L(4bytes) +# endif + + ALIGN (4) +L(more8bytes): + cmp $16, %ecx + jae L(more16bytes) + cmp $8, %ecx + je L(8bytes) +# ifndef USE_AS_WMEMCMP + cmp $9, %ecx + je L(9bytes) + cmp $10, %ecx + je L(10bytes) + cmp $11, %ecx + je L(11bytes) + cmp $12, %ecx + je L(12bytes) + cmp $13, %ecx + je L(13bytes) + cmp $14, %ecx + je L(14bytes) + jmp L(15bytes) +# else + jmp L(12bytes) +# endif + + ALIGN (4) +L(more16bytes): + cmp $24, %ecx + jae L(more24bytes) + cmp $16, %ecx + je L(16bytes) +# ifndef USE_AS_WMEMCMP + cmp $17, %ecx + je L(17bytes) + cmp $18, %ecx + je L(18bytes) + cmp $19, %ecx + je L(19bytes) + cmp $20, %ecx + je L(20bytes) + cmp $21, %ecx + je L(21bytes) + cmp $22, %ecx + je L(22bytes) + jmp L(23bytes) +# else + jmp L(20bytes) +# endif + + ALIGN (4) +L(more24bytes): + cmp $32, %ecx + jae L(more32bytes) + cmp $24, %ecx + je L(24bytes) +# ifndef USE_AS_WMEMCMP + cmp $25, %ecx + je L(25bytes) + cmp $26, %ecx + je L(26bytes) + cmp $27, %ecx + je L(27bytes) + cmp $28, %ecx + je L(28bytes) + cmp $29, %ecx + je L(29bytes) + cmp $30, %ecx + je L(30bytes) + jmp L(31bytes) +# else + jmp L(28bytes) +# endif + + ALIGN (4) +L(more32bytes): + cmp $40, %ecx + jae L(more40bytes) + cmp $32, %ecx + je L(32bytes) +# ifndef USE_AS_WMEMCMP + cmp $33, %ecx + je L(33bytes) + cmp $34, %ecx + je L(34bytes) + cmp $35, %ecx + je L(35bytes) + cmp $36, %ecx + je L(36bytes) + cmp $37, %ecx + je L(37bytes) + cmp $38, %ecx + je L(38bytes) + jmp L(39bytes) +# else + jmp L(36bytes) +# endif + + ALIGN (4) +L(more40bytes): + cmp $40, %ecx + je L(40bytes) 
+# ifndef USE_AS_WMEMCMP + cmp $41, %ecx + je L(41bytes) + cmp $42, %ecx + je L(42bytes) + cmp $43, %ecx + je L(43bytes) + cmp $44, %ecx + je L(44bytes) + cmp $45, %ecx + je L(45bytes) + cmp $46, %ecx + je L(46bytes) + jmp L(47bytes) + + ALIGN (4) +L(44bytes): + movl -44(%rdi), %eax + movl -44(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(40bytes): + movl -40(%rdi), %eax + movl -40(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(36bytes): + movl -36(%rdi), %eax + movl -36(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(32bytes): + movl -32(%rdi), %eax + movl -32(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(28bytes): + movl -28(%rdi), %eax + movl -28(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(24bytes): + movl -24(%rdi), %eax + movl -24(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(20bytes): + movl -20(%rdi), %eax + movl -20(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(16bytes): + movl -16(%rdi), %eax + movl -16(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(12bytes): + movl -12(%rdi), %eax + movl -12(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(8bytes): + movl -8(%rdi), %eax + movl -8(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(4bytes): + movl -4(%rdi), %eax + movl -4(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(0bytes): + xor %eax, %eax + ret +# else + ALIGN (4) +L(44bytes): + movl -44(%rdi), %eax + cmp -44(%rsi), %eax + jne L(find_diff) +L(40bytes): + movl -40(%rdi), %eax + cmp -40(%rsi), %eax + jne L(find_diff) +L(36bytes): + movl -36(%rdi), %eax + cmp -36(%rsi), %eax + jne L(find_diff) +L(32bytes): + movl -32(%rdi), %eax + cmp -32(%rsi), %eax + jne L(find_diff) +L(28bytes): + movl -28(%rdi), %eax + cmp -28(%rsi), %eax + jne L(find_diff) +L(24bytes): + movl -24(%rdi), %eax + cmp -24(%rsi), %eax + jne L(find_diff) +L(20bytes): + movl -20(%rdi), %eax + cmp -20(%rsi), %eax + jne L(find_diff) +L(16bytes): + movl -16(%rdi), %eax + cmp -16(%rsi), %eax + jne L(find_diff) +L(12bytes): + movl -12(%rdi), %eax + cmp 
-12(%rsi), %eax + jne L(find_diff) +L(8bytes): + movl -8(%rdi), %eax + cmp -8(%rsi), %eax + jne L(find_diff) +L(4bytes): + movl -4(%rdi), %eax + cmp -4(%rsi), %eax + jne L(find_diff) +L(0bytes): + xor %eax, %eax + ret +# endif + +# ifndef USE_AS_WMEMCMP + ALIGN (4) +L(45bytes): + movl -45(%rdi), %eax + movl -45(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(41bytes): + movl -41(%rdi), %eax + movl -41(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(37bytes): + movl -37(%rdi), %eax + movl -37(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(33bytes): + movl -33(%rdi), %eax + movl -33(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(29bytes): + movl -29(%rdi), %eax + movl -29(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(25bytes): + movl -25(%rdi), %eax + movl -25(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(21bytes): + movl -21(%rdi), %eax + movl -21(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(17bytes): + movl -17(%rdi), %eax + movl -17(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(13bytes): + movl -13(%rdi), %eax + movl -13(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(9bytes): + movl -9(%rdi), %eax + movl -9(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(5bytes): + movl -5(%rdi), %eax + movl -5(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(1bytes): + movzbl -1(%rdi), %eax + cmpb -1(%rsi), %al + jne L(set) + xor %eax, %eax + ret + + ALIGN (4) +L(46bytes): + movl -46(%rdi), %eax + movl -46(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(42bytes): + movl -42(%rdi), %eax + movl -42(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(38bytes): + movl -38(%rdi), %eax + movl -38(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(34bytes): + movl -34(%rdi), %eax + movl -34(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(30bytes): + movl -30(%rdi), %eax + movl -30(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(26bytes): + movl -26(%rdi), %eax + movl -26(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(22bytes): + 
movl -22(%rdi), %eax + movl -22(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(18bytes): + movl -18(%rdi), %eax + movl -18(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(14bytes): + movl -14(%rdi), %eax + movl -14(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(10bytes): + movl -10(%rdi), %eax + movl -10(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(6bytes): + movl -6(%rdi), %eax + movl -6(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(2bytes): + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + cmpb %cl, %al + jne L(set) + cmp %ecx, %eax + jne L(set) + xor %eax, %eax + ret + + ALIGN (4) +L(47bytes): + movl -47(%rdi), %eax + movl -47(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(43bytes): + movl -43(%rdi), %eax + movl -43(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(39bytes): + movl -39(%rdi), %eax + movl -39(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(35bytes): + movl -35(%rdi), %eax + movl -35(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(31bytes): + movl -31(%rdi), %eax + movl -31(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(27bytes): + movl -27(%rdi), %eax + movl -27(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(23bytes): + movl -23(%rdi), %eax + movl -23(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(19bytes): + movl -19(%rdi), %eax + movl -19(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(15bytes): + movl -15(%rdi), %eax + movl -15(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(11bytes): + movl -11(%rdi), %eax + movl -11(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(7bytes): + movl -7(%rdi), %eax + movl -7(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(3bytes): + movzwl -3(%rdi), %eax + movzwl -3(%rsi), %ecx + cmpb %cl, %al + jne L(set) + cmp %ecx, %eax + jne L(set) + movzbl -1(%rdi), %eax + cmpb -1(%rsi), %al + jne L(set) + xor %eax, %eax + ret + + ALIGN (4) +L(find_diff): + cmpb %cl, %al + jne L(set) + cmpw %cx, %ax + jne L(set) + shr $16, %eax + shr $16, %ecx + cmpb %cl, %al + jne 
L(set) + +/* We get there only if we already know there is a +difference. */ + + cmp %ecx, %eax +L(set): + sbb %eax, %eax + sbb $-1, %eax + ret +# else + +/* for wmemcmp */ + ALIGN (4) +L(find_diff): + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + + ALIGN (4) +L(find_diff_bigger): + ret +# endif + + ALIGN (4) +L(equal): + xor %eax, %eax + ret + +END (MEMCMP) +#endif diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S index 301ab287f5..8bf8f3a417 100644 --- a/sysdeps/x86_64/multiarch/memcmp.S +++ b/sysdeps/x86_64/multiarch/memcmp.S @@ -1,5 +1,5 @@ /* Multiple versions of memcmp - Copyright (C) 2010 Free Software Foundation, Inc. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -29,11 +29,20 @@ ENTRY(memcmp) cmpl $0, KIND_OFFSET+__cpu_features(%rip) jne 1f call __init_cpu_features -1: leaq __memcmp_sse2(%rip), %rax - testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) - jz 2f + +1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + jnz 2f + leaq __memcmp_sse2(%rip), %rax + ret + +2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) + jz 3f leaq __memcmp_sse4_1(%rip), %rax -2: ret + ret + +3: leaq __memcmp_ssse3(%rip), %rax + ret + END(memcmp) # undef ENTRY diff --git a/sysdeps/x86_64/multiarch/wmemcmp-c.c b/sysdeps/x86_64/multiarch/wmemcmp-c.c new file mode 100644 index 0000000000..793f059aff --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-c.c @@ -0,0 +1,5 @@ +#ifndef NOT_IN_libc +# define WMEMCMP __wmemcmp_sse2 +#endif + +#include "wcsmbs/wmemcmp.c" diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S new file mode 100644 index 0000000000..b07973a4f6 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_sse4_1 + +#include "memcmp-sse4.S" diff --git 
a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S new file mode 100644 index 0000000000..a41ef95fc1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_ssse3 + +#include "memcmp-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S new file mode 100644 index 0000000000..7c3b7ed178 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp.S @@ -0,0 +1,47 @@ +/* Multiple versions of wmemcmp + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ +#ifndef NOT_IN_libc + .text +ENTRY(wmemcmp) + .type wmemcmp, @gnu_indirect_function + cmpl $0, KIND_OFFSET+__cpu_features(%rip) + jne 1f + call __init_cpu_features + +1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + jnz 2f + leaq __wmemcmp_sse2(%rip), %rax + ret + +2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) + jz 3f + leaq __wmemcmp_sse4_1(%rip), %rax + ret + +3: leaq __wmemcmp_ssse3(%rip), %rax + ret + +END(wmemcmp) +#endif -- cgit 1.4.1