From a51db847c9ca5926c22c9bf2505c3d69886967b8 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 20 Aug 2015 08:20:41 -0700 Subject: Add i386 memcmp multiarch functions --- sysdeps/i386/i686/multiarch/Makefile | 2 +- sysdeps/i386/i686/multiarch/memcmp-i386.S | 1 + sysdeps/i386/i686/multiarch/memcmp-i686.S | 7 + sysdeps/i386/i686/multiarch/memcmp-sse4.S | 1225 ---------------- sysdeps/i386/i686/multiarch/memcmp-ssse3.S | 2157 ---------------------------- sysdeps/i386/i686/multiarch/memcmp.S | 62 - sysdeps/i386/i686/multiarch/memcmp.c | 1 + sysdeps/i386/i686/multiarch/rtld-memcmp.S | 19 + sysdeps/i386/multiarch/Makefile | 3 +- sysdeps/i386/multiarch/ifunc-impl-list.c | 6 +- sysdeps/i386/multiarch/memcmp-i386.S | 12 + sysdeps/i386/multiarch/memcmp-i686.S | 6 + sysdeps/i386/multiarch/memcmp-sse4.S | 1225 ++++++++++++++++ sysdeps/i386/multiarch/memcmp-ssse3.S | 2157 ++++++++++++++++++++++++++++ sysdeps/i386/multiarch/memcmp.c | 57 + sysdeps/i386/multiarch/rtld-memcmp.S | 19 + 16 files changed, 3511 insertions(+), 3448 deletions(-) create mode 100644 sysdeps/i386/i686/multiarch/memcmp-i386.S create mode 100644 sysdeps/i386/i686/multiarch/memcmp-i686.S delete mode 100644 sysdeps/i386/i686/multiarch/memcmp-sse4.S delete mode 100644 sysdeps/i386/i686/multiarch/memcmp-ssse3.S delete mode 100644 sysdeps/i386/i686/multiarch/memcmp.S create mode 100644 sysdeps/i386/i686/multiarch/memcmp.c create mode 100644 sysdeps/i386/i686/multiarch/rtld-memcmp.S create mode 100644 sysdeps/i386/multiarch/memcmp-i386.S create mode 100644 sysdeps/i386/multiarch/memcmp-i686.S create mode 100644 sysdeps/i386/multiarch/memcmp-sse4.S create mode 100644 sysdeps/i386/multiarch/memcmp-ssse3.S create mode 100644 sysdeps/i386/multiarch/memcmp.c create mode 100644 sysdeps/i386/multiarch/rtld-memcmp.S diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index c088a39bea..ec128d7047 100644 --- a/sysdeps/i386/i686/multiarch/Makefile +++ 
b/sysdeps/i386/i686/multiarch/Makefile @@ -1,7 +1,7 @@ ifeq ($(subdir),string) sysdep_routines += strcmp-ssse3 \ strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \ - memcmp-ssse3 memcmp-sse4 varshift \ + varshift \ strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \ strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \ strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \ diff --git a/sysdeps/i386/i686/multiarch/memcmp-i386.S b/sysdeps/i386/i686/multiarch/memcmp-i386.S new file mode 100644 index 0000000000..9d841c9fd1 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/memcmp-i386.S @@ -0,0 +1 @@ +/* Dummy file. */ diff --git a/sysdeps/i386/i686/multiarch/memcmp-i686.S b/sysdeps/i386/i686/multiarch/memcmp-i686.S new file mode 100644 index 0000000000..7aaf48b505 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/memcmp-i686.S @@ -0,0 +1,7 @@ +#include + +#ifdef SHARED + .globl __GI_memcmp + .hidden __GI_memcmp + __GI_memcmp = __memcmp_i686 +#endif diff --git a/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/sysdeps/i386/i686/multiarch/memcmp-sse4.S deleted file mode 100644 index b3756f4a00..0000000000 --- a/sysdeps/i386/i686/multiarch/memcmp-sse4.S +++ /dev/null @@ -1,1225 +0,0 @@ -/* memcmp with SSE4.2, wmemcmp with SSE4.2 - Copyright (C) 2010-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#if IS_IN (libc) - -# include - -# ifndef MEMCMP -# define MEMCMP __memcmp_sse4_2 -# endif - -# define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) - -# define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) - -# define PUSH(REG) pushl REG; CFI_PUSH (REG) -# define POP(REG) popl REG; CFI_POP (REG) - -# define PARMS 4 -# define BLK1 PARMS -# define BLK2 BLK1 + 4 -# define LEN BLK2 + 4 -# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx) - - -# ifdef SHARED -# define JMPTBL(I, B) I - B - -/* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. INDEX is a register contains the - index into the jump table. SCALE is the scale of INDEX. */ - -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ -/* We first load PC into EBX. */ \ - SETUP_PIC_REG(bx); \ -/* Get the address of the jump table. */ \ - addl $(TABLE - .), %ebx; \ -/* Get the entry and convert the relative offset to the \ - absolute address. */ \ - addl (%ebx,INDEX,SCALE), %ebx; \ -/* We loaded the jump table and adjusted EDX/ESI. Go. */ \ - jmp *%ebx -# else -# define JMPTBL(I, B) I - -/* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. INDEX is a register contains the - index into the jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - jmp *TABLE(,INDEX,SCALE) -# endif - - -/* Warning! - wmemcmp has to use SIGNED comparison for elements. - memcmp has to use UNSIGNED comparison for elemnts. 
-*/ - - .section .text.sse4.2,"ax",@progbits -ENTRY (MEMCMP) - movl BLK1(%esp), %eax - movl BLK2(%esp), %edx - movl LEN(%esp), %ecx - -# ifdef USE_AS_WMEMCMP - shl $2, %ecx - test %ecx, %ecx - jz L(return0) -# else - cmp $1, %ecx - jbe L(less1bytes) -# endif - - pxor %xmm0, %xmm0 - cmp $64, %ecx - ja L(64bytesormore) - cmp $8, %ecx - -# ifndef USE_AS_WMEMCMP - PUSH (%ebx) - jb L(less8bytes) -# else - jb L(less8bytes) - PUSH (%ebx) -# endif - - add %ecx, %edx - add %ecx, %eax - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) - -# ifndef USE_AS_WMEMCMP - .p2align 4 -L(less8bytes): - mov (%eax), %bl - cmpb (%edx), %bl - jne L(nonzero) - - mov 1(%eax), %bl - cmpb 1(%edx), %bl - jne L(nonzero) - - cmp $2, %ecx - jz L(0bytes) - - mov 2(%eax), %bl - cmpb 2(%edx), %bl - jne L(nonzero) - - cmp $3, %ecx - jz L(0bytes) - - mov 3(%eax), %bl - cmpb 3(%edx), %bl - jne L(nonzero) - - cmp $4, %ecx - jz L(0bytes) - - mov 4(%eax), %bl - cmpb 4(%edx), %bl - jne L(nonzero) - - cmp $5, %ecx - jz L(0bytes) - - mov 5(%eax), %bl - cmpb 5(%edx), %bl - jne L(nonzero) - - cmp $6, %ecx - jz L(0bytes) - - mov 6(%eax), %bl - cmpb 6(%edx), %bl - je L(0bytes) - -L(nonzero): - POP (%ebx) - mov $1, %eax - ja L(above) - neg %eax -L(above): - ret - CFI_PUSH (%ebx) -# endif - - .p2align 4 -L(0bytes): - POP (%ebx) - xor %eax, %eax - ret - -# ifdef USE_AS_WMEMCMP - -/* for wmemcmp, case N == 1 */ - - .p2align 4 -L(less8bytes): - mov (%eax), %ecx - cmp (%edx), %ecx - je L(return0) - mov $1, %eax - jg L(find_diff_bigger) - neg %eax - ret - - .p2align 4 -L(find_diff_bigger): - ret - - .p2align 4 -L(return0): - xor %eax, %eax - ret -# endif - -# ifndef USE_AS_WMEMCMP - .p2align 4 -L(less1bytes): - jb L(0bytesend) - movzbl (%eax), %eax - movzbl (%edx), %edx - sub %edx, %eax - ret - - .p2align 4 -L(0bytesend): - xor %eax, %eax - ret -# endif - .p2align 4 -L(64bytesormore): - PUSH (%ebx) - mov %ecx, %ebx - mov $64, %ecx - sub $64, %ebx -L(64bytesormore_loop): - movdqu (%eax), %xmm1 - movdqu (%edx), %xmm2 - 
pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(find_16diff) - - movdqu 16(%eax), %xmm1 - movdqu 16(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(find_32diff) - - movdqu 32(%eax), %xmm1 - movdqu 32(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(find_48diff) - - movdqu 48(%eax), %xmm1 - movdqu 48(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(find_64diff) - add %ecx, %eax - add %ecx, %edx - sub %ecx, %ebx - jae L(64bytesormore_loop) - add %ebx, %ecx - add %ecx, %edx - add %ecx, %eax - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) - -# ifdef USE_AS_WMEMCMP - -/* Label needs only for table_64bytes filling */ -L(unreal_case): -/* no code here */ - -# endif - .p2align 4 -L(find_16diff): - sub $16, %ecx -L(find_32diff): - sub $16, %ecx -L(find_48diff): - sub $16, %ecx -L(find_64diff): - add %ecx, %edx - add %ecx, %eax - -# ifndef USE_AS_WMEMCMP - .p2align 4 -L(16bytes): - mov -16(%eax), %ecx - mov -16(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(12bytes): - mov -12(%eax), %ecx - mov -12(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(8bytes): - mov -8(%eax), %ecx - mov -8(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(4bytes): - mov -4(%eax), %ecx - mov -4(%edx), %ebx - cmp %ebx, %ecx - mov $0, %eax - jne L(find_diff) - RETURN -# else - .p2align 4 -L(16bytes): - mov -16(%eax), %ecx - cmp -16(%edx), %ecx - jne L(find_diff) -L(12bytes): - mov -12(%eax), %ecx - cmp -12(%edx), %ecx - jne L(find_diff) -L(8bytes): - mov -8(%eax), %ecx - cmp -8(%edx), %ecx - jne L(find_diff) -L(4bytes): - mov -4(%eax), %ecx - cmp -4(%edx), %ecx - mov $0, %eax - jne L(find_diff) - RETURN -# endif - -# ifndef USE_AS_WMEMCMP - .p2align 4 -L(49bytes): - movdqu -49(%eax), %xmm1 - movdqu -49(%edx), %xmm2 - mov $-49, %ebx - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(33bytes): - movdqu -33(%eax), %xmm1 - movdqu -33(%edx), %xmm2 - mov $-33, %ebx - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(17bytes): - 
mov -17(%eax), %ecx - mov -17(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(13bytes): - mov -13(%eax), %ecx - mov -13(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(9bytes): - mov -9(%eax), %ecx - mov -9(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(5bytes): - mov -5(%eax), %ecx - mov -5(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - movzbl -1(%eax), %ecx - cmp -1(%edx), %cl - mov $0, %eax - jne L(end) - RETURN - - .p2align 4 -L(50bytes): - mov $-50, %ebx - movdqu -50(%eax), %xmm1 - movdqu -50(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(34bytes): - mov $-34, %ebx - movdqu -34(%eax), %xmm1 - movdqu -34(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(18bytes): - mov -18(%eax), %ecx - mov -18(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(14bytes): - mov -14(%eax), %ecx - mov -14(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(10bytes): - mov -10(%eax), %ecx - mov -10(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(6bytes): - mov -6(%eax), %ecx - mov -6(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(2bytes): - movzwl -2(%eax), %ecx - movzwl -2(%edx), %ebx - cmp %bl, %cl - jne L(end) - cmp %bh, %ch - mov $0, %eax - jne L(end) - RETURN - - .p2align 4 -L(51bytes): - mov $-51, %ebx - movdqu -51(%eax), %xmm1 - movdqu -51(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(35bytes): - mov $-35, %ebx - movdqu -35(%eax), %xmm1 - movdqu -35(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(19bytes): - movl -19(%eax), %ecx - movl -19(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(15bytes): - movl -15(%eax), %ecx - movl -15(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(11bytes): - movl -11(%eax), %ecx - movl -11(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(7bytes): - movl -7(%eax), %ecx - movl -7(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(3bytes): - movzwl -3(%eax), %ecx - movzwl -3(%edx), %ebx - cmpb %bl, %cl 
- jne L(end) - cmp %bx, %cx - jne L(end) -L(1bytes): - movzbl -1(%eax), %eax - cmpb -1(%edx), %al - mov $0, %eax - jne L(end) - RETURN -# endif - .p2align 4 -L(52bytes): - movdqu -52(%eax), %xmm1 - movdqu -52(%edx), %xmm2 - mov $-52, %ebx - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(36bytes): - movdqu -36(%eax), %xmm1 - movdqu -36(%edx), %xmm2 - mov $-36, %ebx - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(20bytes): - movdqu -20(%eax), %xmm1 - movdqu -20(%edx), %xmm2 - mov $-20, %ebx - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - mov -4(%eax), %ecx -# ifndef USE_AS_WMEMCMP - mov -4(%edx), %ebx - cmp %ebx, %ecx -# else - cmp -4(%edx), %ecx -# endif - mov $0, %eax - jne L(find_diff) - RETURN - -# ifndef USE_AS_WMEMCMP - .p2align 4 -L(53bytes): - movdqu -53(%eax), %xmm1 - movdqu -53(%edx), %xmm2 - mov $-53, %ebx - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(37bytes): - mov $-37, %ebx - movdqu -37(%eax), %xmm1 - movdqu -37(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(21bytes): - mov $-21, %ebx - movdqu -21(%eax), %xmm1 - movdqu -21(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - mov -5(%eax), %ecx - mov -5(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - movzbl -1(%eax), %ecx - cmp -1(%edx), %cl - mov $0, %eax - jne L(end) - RETURN - - .p2align 4 -L(54bytes): - movdqu -54(%eax), %xmm1 - movdqu -54(%edx), %xmm2 - mov $-54, %ebx - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(38bytes): - mov $-38, %ebx - movdqu -38(%eax), %xmm1 - movdqu -38(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(22bytes): - mov $-22, %ebx - movdqu -22(%eax), %xmm1 - movdqu -22(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - - mov -6(%eax), %ecx - mov -6(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - movzwl -2(%eax), %ecx - movzwl -2(%edx), %ebx - cmp %bl, %cl - jne L(end) - cmp %bh, 
%ch - mov $0, %eax - jne L(end) - RETURN - - .p2align 4 -L(55bytes): - movdqu -55(%eax), %xmm1 - movdqu -55(%edx), %xmm2 - mov $-55, %ebx - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(39bytes): - mov $-39, %ebx - movdqu -39(%eax), %xmm1 - movdqu -39(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(23bytes): - mov $-23, %ebx - movdqu -23(%eax), %xmm1 - movdqu -23(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - movl -7(%eax), %ecx - movl -7(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - movzwl -3(%eax), %ecx - movzwl -3(%edx), %ebx - cmpb %bl, %cl - jne L(end) - cmp %bx, %cx - jne L(end) - movzbl -1(%eax), %eax - cmpb -1(%edx), %al - mov $0, %eax - jne L(end) - RETURN -# endif - .p2align 4 -L(56bytes): - movdqu -56(%eax), %xmm1 - movdqu -56(%edx), %xmm2 - mov $-56, %ebx - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(40bytes): - mov $-40, %ebx - movdqu -40(%eax), %xmm1 - movdqu -40(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(24bytes): - mov $-24, %ebx - movdqu -24(%eax), %xmm1 - movdqu -24(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - - mov -8(%eax), %ecx -# ifndef USE_AS_WMEMCMP - mov -8(%edx), %ebx - cmp %ebx, %ecx -# else - cmp -8(%edx), %ecx -# endif - jne L(find_diff) - - mov -4(%eax), %ecx -# ifndef USE_AS_WMEMCMP - mov -4(%edx), %ebx - cmp %ebx, %ecx -# else - cmp -4(%edx), %ecx -# endif - mov $0, %eax - jne L(find_diff) - RETURN - -# ifndef USE_AS_WMEMCMP - .p2align 4 -L(57bytes): - movdqu -57(%eax), %xmm1 - movdqu -57(%edx), %xmm2 - mov $-57, %ebx - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(41bytes): - mov $-41, %ebx - movdqu -41(%eax), %xmm1 - movdqu -41(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(25bytes): - mov $-25, %ebx - movdqu -25(%eax), %xmm1 - movdqu -25(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - mov 
-9(%eax), %ecx - mov -9(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - mov -5(%eax), %ecx - mov -5(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - movzbl -1(%eax), %ecx - cmp -1(%edx), %cl - mov $0, %eax - jne L(end) - RETURN - - .p2align 4 -L(58bytes): - movdqu -58(%eax), %xmm1 - movdqu -58(%edx), %xmm2 - mov $-58, %ebx - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(42bytes): - mov $-42, %ebx - movdqu -42(%eax), %xmm1 - movdqu -42(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(26bytes): - mov $-26, %ebx - movdqu -26(%eax), %xmm1 - movdqu -26(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - - mov -10(%eax), %ecx - mov -10(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - - mov -6(%eax), %ecx - mov -6(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - - movzwl -2(%eax), %ecx - movzwl -2(%edx), %ebx - cmp %bl, %cl - jne L(end) - cmp %bh, %ch - mov $0, %eax - jne L(end) - RETURN - - .p2align 4 -L(59bytes): - movdqu -59(%eax), %xmm1 - movdqu -59(%edx), %xmm2 - mov $-59, %ebx - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(43bytes): - mov $-43, %ebx - movdqu -43(%eax), %xmm1 - movdqu -43(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(27bytes): - mov $-27, %ebx - movdqu -27(%eax), %xmm1 - movdqu -27(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - movl -11(%eax), %ecx - movl -11(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - movl -7(%eax), %ecx - movl -7(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - movzwl -3(%eax), %ecx - movzwl -3(%edx), %ebx - cmpb %bl, %cl - jne L(end) - cmp %bx, %cx - jne L(end) - movzbl -1(%eax), %eax - cmpb -1(%edx), %al - mov $0, %eax - jne L(end) - RETURN -# endif - .p2align 4 -L(60bytes): - movdqu -60(%eax), %xmm1 - movdqu -60(%edx), %xmm2 - mov $-60, %ebx - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(44bytes): - mov $-44, %ebx - movdqu -44(%eax), %xmm1 - movdqu 
-44(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(28bytes): - mov $-28, %ebx - movdqu -28(%eax), %xmm1 - movdqu -28(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - - mov -12(%eax), %ecx -# ifndef USE_AS_WMEMCMP - mov -12(%edx), %ebx - cmp %ebx, %ecx -# else - cmp -12(%edx), %ecx -# endif - jne L(find_diff) - - mov -8(%eax), %ecx -# ifndef USE_AS_WMEMCMP - mov -8(%edx), %ebx - cmp %ebx, %ecx -# else - cmp -8(%edx), %ecx -# endif - jne L(find_diff) - - mov -4(%eax), %ecx -# ifndef USE_AS_WMEMCMP - mov -4(%edx), %ebx - cmp %ebx, %ecx -# else - cmp -4(%edx), %ecx -# endif - mov $0, %eax - jne L(find_diff) - RETURN - -# ifndef USE_AS_WMEMCMP - .p2align 4 -L(61bytes): - movdqu -61(%eax), %xmm1 - movdqu -61(%edx), %xmm2 - mov $-61, %ebx - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(45bytes): - mov $-45, %ebx - movdqu -45(%eax), %xmm1 - movdqu -45(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(29bytes): - mov $-29, %ebx - movdqu -29(%eax), %xmm1 - movdqu -29(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - - mov -13(%eax), %ecx - mov -13(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - - mov -9(%eax), %ecx - mov -9(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - - mov -5(%eax), %ecx - mov -5(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - movzbl -1(%eax), %ecx - cmp -1(%edx), %cl - mov $0, %eax - jne L(end) - RETURN - - .p2align 4 -L(62bytes): - movdqu -62(%eax), %xmm1 - movdqu -62(%edx), %xmm2 - mov $-62, %ebx - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(46bytes): - mov $-46, %ebx - movdqu -46(%eax), %xmm1 - movdqu -46(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(30bytes): - mov $-30, %ebx - movdqu -30(%eax), %xmm1 - movdqu -30(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - mov -14(%eax), %ecx - mov -14(%edx), %ebx - cmp %ebx, %ecx - jne 
L(find_diff) - mov -10(%eax), %ecx - mov -10(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - mov -6(%eax), %ecx - mov -6(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - movzwl -2(%eax), %ecx - movzwl -2(%edx), %ebx - cmp %bl, %cl - jne L(end) - cmp %bh, %ch - mov $0, %eax - jne L(end) - RETURN - - .p2align 4 -L(63bytes): - movdqu -63(%eax), %xmm1 - movdqu -63(%edx), %xmm2 - mov $-63, %ebx - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(47bytes): - mov $-47, %ebx - movdqu -47(%eax), %xmm1 - movdqu -47(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(31bytes): - mov $-31, %ebx - movdqu -31(%eax), %xmm1 - movdqu -31(%edx), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - - movl -15(%eax), %ecx - movl -15(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - movl -11(%eax), %ecx - movl -11(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - movl -7(%eax), %ecx - movl -7(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - movzwl -3(%eax), %ecx - movzwl -3(%edx), %ebx - cmpb %bl, %cl - jne L(end) - cmp %bx, %cx - jne L(end) - movzbl -1(%eax), %eax - cmpb -1(%edx), %al - mov $0, %eax - jne L(end) - RETURN -# endif - - .p2align 4 -L(64bytes): - movdqu -64(%eax), %xmm1 - movdqu -64(%edx), %xmm2 - mov $-64, %ebx - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(48bytes): - movdqu -48(%eax), %xmm1 - movdqu -48(%edx), %xmm2 - mov $-48, %ebx - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(32bytes): - movdqu -32(%eax), %xmm1 - movdqu -32(%edx), %xmm2 - mov $-32, %ebx - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - - mov -16(%eax), %ecx -# ifndef USE_AS_WMEMCMP - mov -16(%edx), %ebx - cmp %ebx, %ecx -# else - cmp -16(%edx), %ecx -# endif - jne L(find_diff) - - mov -12(%eax), %ecx -# ifndef USE_AS_WMEMCMP - mov -12(%edx), %ebx - cmp %ebx, %ecx -# else - cmp -12(%edx), %ecx -# endif - jne L(find_diff) - - mov -8(%eax), %ecx -# ifndef USE_AS_WMEMCMP - mov -8(%edx), %ebx - 
cmp %ebx, %ecx -# else - cmp -8(%edx), %ecx -# endif - jne L(find_diff) - - mov -4(%eax), %ecx -# ifndef USE_AS_WMEMCMP - mov -4(%edx), %ebx - cmp %ebx, %ecx -# else - cmp -4(%edx), %ecx -# endif - mov $0, %eax - jne L(find_diff) - RETURN - -# ifndef USE_AS_WMEMCMP - .p2align 4 -L(less16bytes): - add %ebx, %eax - add %ebx, %edx - - mov (%eax), %ecx - mov (%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - - mov 4(%eax), %ecx - mov 4(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - - mov 8(%eax), %ecx - mov 8(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - - mov 12(%eax), %ecx - mov 12(%edx), %ebx - cmp %ebx, %ecx - mov $0, %eax - jne L(find_diff) - RETURN -# else - .p2align 4 -L(less16bytes): - add %ebx, %eax - add %ebx, %edx - - mov (%eax), %ecx - cmp (%edx), %ecx - jne L(find_diff) - - mov 4(%eax), %ecx - cmp 4(%edx), %ecx - jne L(find_diff) - - mov 8(%eax), %ecx - cmp 8(%edx), %ecx - jne L(find_diff) - - mov 12(%eax), %ecx - cmp 12(%edx), %ecx - - mov $0, %eax - jne L(find_diff) - RETURN -# endif - - .p2align 4 -L(find_diff): -# ifndef USE_AS_WMEMCMP - cmpb %bl, %cl - jne L(end) - cmp %bx, %cx - jne L(end) - shr $16,%ecx - shr $16,%ebx - cmp %bl, %cl - jne L(end) - cmp %bx, %cx -L(end): - POP (%ebx) - mov $1, %eax - ja L(bigger) - neg %eax -L(bigger): - ret -# else - POP (%ebx) - mov $1, %eax - jg L(bigger) - neg %eax - ret - - .p2align 4 -L(bigger): - ret -# endif -END (MEMCMP) - - .section .rodata.sse4.2,"a",@progbits - .p2align 2 - .type L(table_64bytes), @object -# ifndef USE_AS_WMEMCMP -L(table_64bytes): - .int JMPTBL (L(0bytes), L(table_64bytes)) - .int JMPTBL (L(1bytes), L(table_64bytes)) - .int JMPTBL (L(2bytes), L(table_64bytes)) - .int JMPTBL (L(3bytes), L(table_64bytes)) - .int JMPTBL (L(4bytes), L(table_64bytes)) - .int JMPTBL (L(5bytes), L(table_64bytes)) - .int JMPTBL (L(6bytes), L(table_64bytes)) - .int JMPTBL (L(7bytes), L(table_64bytes)) - .int JMPTBL (L(8bytes), L(table_64bytes)) - .int JMPTBL (L(9bytes), L(table_64bytes)) - .int JMPTBL 
(L(10bytes), L(table_64bytes)) - .int JMPTBL (L(11bytes), L(table_64bytes)) - .int JMPTBL (L(12bytes), L(table_64bytes)) - .int JMPTBL (L(13bytes), L(table_64bytes)) - .int JMPTBL (L(14bytes), L(table_64bytes)) - .int JMPTBL (L(15bytes), L(table_64bytes)) - .int JMPTBL (L(16bytes), L(table_64bytes)) - .int JMPTBL (L(17bytes), L(table_64bytes)) - .int JMPTBL (L(18bytes), L(table_64bytes)) - .int JMPTBL (L(19bytes), L(table_64bytes)) - .int JMPTBL (L(20bytes), L(table_64bytes)) - .int JMPTBL (L(21bytes), L(table_64bytes)) - .int JMPTBL (L(22bytes), L(table_64bytes)) - .int JMPTBL (L(23bytes), L(table_64bytes)) - .int JMPTBL (L(24bytes), L(table_64bytes)) - .int JMPTBL (L(25bytes), L(table_64bytes)) - .int JMPTBL (L(26bytes), L(table_64bytes)) - .int JMPTBL (L(27bytes), L(table_64bytes)) - .int JMPTBL (L(28bytes), L(table_64bytes)) - .int JMPTBL (L(29bytes), L(table_64bytes)) - .int JMPTBL (L(30bytes), L(table_64bytes)) - .int JMPTBL (L(31bytes), L(table_64bytes)) - .int JMPTBL (L(32bytes), L(table_64bytes)) - .int JMPTBL (L(33bytes), L(table_64bytes)) - .int JMPTBL (L(34bytes), L(table_64bytes)) - .int JMPTBL (L(35bytes), L(table_64bytes)) - .int JMPTBL (L(36bytes), L(table_64bytes)) - .int JMPTBL (L(37bytes), L(table_64bytes)) - .int JMPTBL (L(38bytes), L(table_64bytes)) - .int JMPTBL (L(39bytes), L(table_64bytes)) - .int JMPTBL (L(40bytes), L(table_64bytes)) - .int JMPTBL (L(41bytes), L(table_64bytes)) - .int JMPTBL (L(42bytes), L(table_64bytes)) - .int JMPTBL (L(43bytes), L(table_64bytes)) - .int JMPTBL (L(44bytes), L(table_64bytes)) - .int JMPTBL (L(45bytes), L(table_64bytes)) - .int JMPTBL (L(46bytes), L(table_64bytes)) - .int JMPTBL (L(47bytes), L(table_64bytes)) - .int JMPTBL (L(48bytes), L(table_64bytes)) - .int JMPTBL (L(49bytes), L(table_64bytes)) - .int JMPTBL (L(50bytes), L(table_64bytes)) - .int JMPTBL (L(51bytes), L(table_64bytes)) - .int JMPTBL (L(52bytes), L(table_64bytes)) - .int JMPTBL (L(53bytes), L(table_64bytes)) - .int JMPTBL (L(54bytes), 
L(table_64bytes)) - .int JMPTBL (L(55bytes), L(table_64bytes)) - .int JMPTBL (L(56bytes), L(table_64bytes)) - .int JMPTBL (L(57bytes), L(table_64bytes)) - .int JMPTBL (L(58bytes), L(table_64bytes)) - .int JMPTBL (L(59bytes), L(table_64bytes)) - .int JMPTBL (L(60bytes), L(table_64bytes)) - .int JMPTBL (L(61bytes), L(table_64bytes)) - .int JMPTBL (L(62bytes), L(table_64bytes)) - .int JMPTBL (L(63bytes), L(table_64bytes)) - .int JMPTBL (L(64bytes), L(table_64bytes)) -# else -L(table_64bytes): - .int JMPTBL (L(0bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(4bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(8bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(12bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(16bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(20bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(24bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(28bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL 
(L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(32bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(36bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(40bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(44bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(48bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(52bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(56bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(60bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(64bytes), L(table_64bytes)) -# endif -#endif diff --git a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S deleted file mode 100644 index ea2a25b216..0000000000 --- a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S +++ /dev/null @@ -1,2157 +0,0 @@ -/* memcmp with SSSE3, wmemcmp with SSSE3 - Copyright (C) 2010-2015 Free Software Foundation, Inc. 
- Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#if IS_IN (libc) - -# include - -# ifndef MEMCMP -# define MEMCMP __memcmp_ssse3 -# endif - -# define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) - -# define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) - -# define PUSH(REG) pushl REG; CFI_PUSH (REG) -# define POP(REG) popl REG; CFI_POP (REG) - -# define PARMS 4 -# define BLK1 PARMS -# define BLK2 BLK1+4 -# define LEN BLK2+4 -# define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret -# define RETURN RETURN_END; cfi_restore_state; cfi_remember_state - -/* Warning! - wmemcmp has to use SIGNED comparison for elements. - memcmp has to use UNSIGNED comparison for elemnts. 
-*/ - - atom_text_section -ENTRY (MEMCMP) - movl LEN(%esp), %ecx - -# ifdef USE_AS_WMEMCMP - shl $2, %ecx - test %ecx, %ecx - jz L(zero) -# endif - - movl BLK1(%esp), %eax - cmp $48, %ecx - movl BLK2(%esp), %edx - jae L(48bytesormore) - -# ifndef USE_AS_WMEMCMP - cmp $1, %ecx - jbe L(less1bytes) -# endif - - PUSH (%ebx) - add %ecx, %edx - add %ecx, %eax - jmp L(less48bytes) - - CFI_POP (%ebx) - -# ifndef USE_AS_WMEMCMP - .p2align 4 -L(less1bytes): - jb L(zero) - movb (%eax), %cl - cmp (%edx), %cl - je L(zero) - mov $1, %eax - ja L(1bytesend) - neg %eax -L(1bytesend): - ret -# endif - - .p2align 4 -L(zero): - xor %eax, %eax - ret - - .p2align 4 -L(48bytesormore): - PUSH (%ebx) - PUSH (%esi) - PUSH (%edi) - cfi_remember_state - movdqu (%eax), %xmm3 - movdqu (%edx), %xmm0 - movl %eax, %edi - movl %edx, %esi - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %edx - lea 16(%edi), %edi - - sub $0xffff, %edx - lea 16(%esi), %esi - jnz L(less16bytes) - mov %edi, %edx - and $0xf, %edx - xor %edx, %edi - sub %edx, %esi - add %edx, %ecx - mov %esi, %edx - and $0xf, %edx - jz L(shr_0) - xor %edx, %esi - -# ifndef USE_AS_WMEMCMP - cmp $8, %edx - jae L(next_unaligned_table) - cmp $0, %edx - je L(shr_0) - cmp $1, %edx - je L(shr_1) - cmp $2, %edx - je L(shr_2) - cmp $3, %edx - je L(shr_3) - cmp $4, %edx - je L(shr_4) - cmp $5, %edx - je L(shr_5) - cmp $6, %edx - je L(shr_6) - jmp L(shr_7) - - .p2align 2 -L(next_unaligned_table): - cmp $8, %edx - je L(shr_8) - cmp $9, %edx - je L(shr_9) - cmp $10, %edx - je L(shr_10) - cmp $11, %edx - je L(shr_11) - cmp $12, %edx - je L(shr_12) - cmp $13, %edx - je L(shr_13) - cmp $14, %edx - je L(shr_14) - jmp L(shr_15) -# else - cmp $0, %edx - je L(shr_0) - cmp $4, %edx - je L(shr_4) - cmp $8, %edx - je L(shr_8) - jmp L(shr_12) -# endif - - .p2align 4 -L(shr_0): - cmp $80, %ecx - jae L(shr_0_gobble) - lea -48(%ecx), %ecx - xor %eax, %eax - movaps (%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - movaps 16(%esi), %xmm2 - pcmpeqb 16(%edi), %xmm2 - pand %xmm1, %xmm2 - 
pmovmskb %xmm2, %edx - add $32, %edi - add $32, %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea (%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_0_gobble): - lea -48(%ecx), %ecx - movdqa (%esi), %xmm0 - xor %eax, %eax - pcmpeqb (%edi), %xmm0 - sub $32, %ecx - movdqa 16(%esi), %xmm2 - pcmpeqb 16(%edi), %xmm2 -L(shr_0_gobble_loop): - pand %xmm0, %xmm2 - sub $32, %ecx - pmovmskb %xmm2, %edx - movdqa %xmm0, %xmm1 - movdqa 32(%esi), %xmm0 - movdqa 48(%esi), %xmm2 - sbb $0xffff, %edx - pcmpeqb 32(%edi), %xmm0 - pcmpeqb 48(%edi), %xmm2 - lea 32(%edi), %edi - lea 32(%esi), %esi - jz L(shr_0_gobble_loop) - - pand %xmm0, %xmm2 - cmp $0, %ecx - jge L(shr_0_gobble_loop_next) - inc %edx - add $32, %ecx -L(shr_0_gobble_loop_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm2, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea (%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_1): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_1_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $1,(%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $1,%xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 1(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_1_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $1,(%esi), %xmm0 - pcmpeqb (%edi), %xmm0 - - movdqa 32(%esi), %xmm3 - palignr $1,16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 - -L(shr_1_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - 
movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $1,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $1,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_1_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_1_gobble_next) - inc %edx - add $32, %ecx -L(shr_1_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 1(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_2): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_2_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $2,(%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $2,%xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 2(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_2_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $2,(%esi), %xmm0 - pcmpeqb (%edi), %xmm0 - - movdqa 32(%esi), %xmm3 - palignr $2,16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 - -L(shr_2_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $2,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $2,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_2_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_2_gobble_next) - inc %edx - add $32, %ecx -L(shr_2_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 
32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 2(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_3): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_3_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $3,(%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $3,%xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 3(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_3_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $3,(%esi), %xmm0 - pcmpeqb (%edi), %xmm0 - - movdqa 32(%esi), %xmm3 - palignr $3,16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 - -L(shr_3_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $3,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $3,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_3_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_3_gobble_next) - inc %edx - add $32, %ecx -L(shr_3_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 3(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) -# endif - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_4): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_4_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $4,(%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $4,%xmm2, 
%xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 4(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_4_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $4,(%esi), %xmm0 - pcmpeqb (%edi), %xmm0 - - movdqa 32(%esi), %xmm3 - palignr $4,16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 - -L(shr_4_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $4,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $4,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_4_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_4_gobble_next) - inc %edx - add $32, %ecx -L(shr_4_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 4(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_5): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_5_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $5,(%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $5,%xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 5(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_5_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $5,(%esi), %xmm0 - pcmpeqb (%edi), %xmm0 - - movdqa 32(%esi), %xmm3 - 
palignr $5,16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 - -L(shr_5_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $5,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $5,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_5_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_5_gobble_next) - inc %edx - add $32, %ecx -L(shr_5_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 5(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_6): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_6_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $6,(%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $6,%xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 6(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_6_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $6,(%esi), %xmm0 - pcmpeqb (%edi), %xmm0 - - movdqa 32(%esi), %xmm3 - palignr $6,16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 - -L(shr_6_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $6,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $6,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_6_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_6_gobble_next) 
- inc %edx - add $32, %ecx -L(shr_6_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 6(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_7): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_7_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $7,(%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $7,%xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 7(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_7_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $7,(%esi), %xmm0 - pcmpeqb (%edi), %xmm0 - - movdqa 32(%esi), %xmm3 - palignr $7,16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 - -L(shr_7_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $7,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $7,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_7_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_7_gobble_next) - inc %edx - add $32, %ecx -L(shr_7_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 7(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) -# endif - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_8): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_8_gobble) - - movdqa 
16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $8,(%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $8,%xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 8(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_8_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $8,(%esi), %xmm0 - pcmpeqb (%edi), %xmm0 - - movdqa 32(%esi), %xmm3 - palignr $8,16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 - -L(shr_8_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $8,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $8,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_8_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_8_gobble_next) - inc %edx - add $32, %ecx -L(shr_8_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 8(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_9): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_9_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $9,(%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $9,%xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 9(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 
-L(shr_9_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $9,(%esi), %xmm0 - pcmpeqb (%edi), %xmm0 - - movdqa 32(%esi), %xmm3 - palignr $9,16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 - -L(shr_9_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $9,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $9,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_9_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_9_gobble_next) - inc %edx - add $32, %ecx -L(shr_9_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 9(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_10): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_10_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $10, (%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $10,%xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 10(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_10_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $10, (%esi), %xmm0 - pcmpeqb (%edi), %xmm0 - - movdqa 32(%esi), %xmm3 - palignr $10, 16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 - -L(shr_10_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $10,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $10,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 
32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_10_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_10_gobble_next) - inc %edx - add $32, %ecx -L(shr_10_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 10(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_11): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_11_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $11, (%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $11, %xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 11(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_11_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $11, (%esi), %xmm0 - pcmpeqb (%edi), %xmm0 - - movdqa 32(%esi), %xmm3 - palignr $11, 16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 - -L(shr_11_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $11,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $11,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_11_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_11_gobble_next) - inc %edx - add $32, %ecx -L(shr_11_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 11(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp 
L(less48bytes) -# endif - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_12): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_12_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $12, (%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $12, %xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 12(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_12_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $12, (%esi), %xmm0 - pcmpeqb (%edi), %xmm0 - - movdqa 32(%esi), %xmm3 - palignr $12, 16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 - -L(shr_12_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $12,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $12,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_12_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_12_gobble_next) - inc %edx - add $32, %ecx -L(shr_12_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 12(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_13): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_13_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $13, (%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $13, %xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 
32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 13(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_13_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $13, (%esi), %xmm0 - pcmpeqb (%edi), %xmm0 - - movdqa 32(%esi), %xmm3 - palignr $13, 16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 - -L(shr_13_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $13,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $13,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_13_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_13_gobble_next) - inc %edx - add $32, %ecx -L(shr_13_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 13(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_14): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_14_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $14, (%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $14, %xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 14(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_14_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $14, (%esi), %xmm0 - pcmpeqb (%edi), %xmm0 - - movdqa 32(%esi), %xmm3 - palignr $14, 16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 - -L(shr_14_gobble_loop): - pand %xmm0, %xmm3 - 
sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $14,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $14,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_14_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_14_gobble_next) - inc %edx - add $32, %ecx -L(shr_14_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 14(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_15): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_15_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $15, (%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $15, %xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 15(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(shr_15_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $15, (%esi), %xmm0 - pcmpeqb (%edi), %xmm0 - - movdqa 32(%esi), %xmm3 - palignr $15, 16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 - -L(shr_15_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $15,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $15,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_15_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_15_gobble_next) - inc %edx - add $32, %ecx -L(shr_15_gobble_next): - test %edx, %edx - jnz 
L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 15(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) -# endif - - cfi_restore_state - cfi_remember_state - .p2align 4 -L(exit): - pmovmskb %xmm1, %ebx - sub $0xffff, %ebx - jz L(first16bytes) - lea -16(%esi), %esi - lea -16(%edi), %edi - mov %ebx, %edx - -L(first16bytes): - add %eax, %esi -L(less16bytes): - -# ifndef USE_AS_WMEMCMP - test %dl, %dl - jz L(next_24_bytes) - - test $0x01, %dl - jnz L(Byte16) - - test $0x02, %dl - jnz L(Byte17) - - test $0x04, %dl - jnz L(Byte18) - - test $0x08, %dl - jnz L(Byte19) - - test $0x10, %dl - jnz L(Byte20) - - test $0x20, %dl - jnz L(Byte21) - - test $0x40, %dl - jnz L(Byte22) -L(Byte23): - movzbl -9(%edi), %eax - movzbl -9(%esi), %edx - sub %edx, %eax - RETURN - - .p2align 4 -L(Byte16): - movzbl -16(%edi), %eax - movzbl -16(%esi), %edx - sub %edx, %eax - RETURN - - .p2align 4 -L(Byte17): - movzbl -15(%edi), %eax - movzbl -15(%esi), %edx - sub %edx, %eax - RETURN - - .p2align 4 -L(Byte18): - movzbl -14(%edi), %eax - movzbl -14(%esi), %edx - sub %edx, %eax - RETURN - - .p2align 4 -L(Byte19): - movzbl -13(%edi), %eax - movzbl -13(%esi), %edx - sub %edx, %eax - RETURN - - .p2align 4 -L(Byte20): - movzbl -12(%edi), %eax - movzbl -12(%esi), %edx - sub %edx, %eax - RETURN - - .p2align 4 -L(Byte21): - movzbl -11(%edi), %eax - movzbl -11(%esi), %edx - sub %edx, %eax - RETURN - - .p2align 4 -L(Byte22): - movzbl -10(%edi), %eax - movzbl -10(%esi), %edx - sub %edx, %eax - RETURN - - .p2align 4 -L(next_24_bytes): - lea 8(%edi), %edi - lea 8(%esi), %esi - test $0x01, %dh - jnz L(Byte16) - - test $0x02, %dh - jnz L(Byte17) - - test $0x04, %dh - jnz L(Byte18) - - test $0x08, %dh - jnz L(Byte19) - - test $0x10, %dh - jnz L(Byte20) - - test $0x20, %dh - jnz L(Byte21) - - test $0x40, %dh - jnz L(Byte22) - - .p2align 4 -L(Byte31): - movzbl -9(%edi), %eax - movzbl 
-9(%esi), %edx - sub %edx, %eax - RETURN_END -# else - -/* special for wmemcmp */ - xor %eax, %eax - test %dl, %dl - jz L(next_two_double_words) - and $15, %dl - jz L(second_double_word) - mov -16(%edi), %eax - cmp -16(%esi), %eax - jne L(nequal) - RETURN - - .p2align 4 -L(second_double_word): - mov -12(%edi), %eax - cmp -12(%esi), %eax - jne L(nequal) - RETURN - - .p2align 4 -L(next_two_double_words): - and $15, %dh - jz L(fourth_double_word) - mov -8(%edi), %eax - cmp -8(%esi), %eax - jne L(nequal) - RETURN - - .p2align 4 -L(fourth_double_word): - mov -4(%edi), %eax - cmp -4(%esi), %eax - jne L(nequal) - RETURN - - .p2align 4 -L(nequal): - mov $1, %eax - jg L(nequal_bigger) - neg %eax - RETURN - - .p2align 4 -L(nequal_bigger): - RETURN_END -# endif - - CFI_PUSH (%ebx) - - .p2align 4 -L(more8bytes): - cmp $16, %ecx - jae L(more16bytes) - cmp $8, %ecx - je L(8bytes) -# ifndef USE_AS_WMEMCMP - cmp $9, %ecx - je L(9bytes) - cmp $10, %ecx - je L(10bytes) - cmp $11, %ecx - je L(11bytes) - cmp $12, %ecx - je L(12bytes) - cmp $13, %ecx - je L(13bytes) - cmp $14, %ecx - je L(14bytes) - jmp L(15bytes) -# else - jmp L(12bytes) -# endif - - .p2align 4 -L(more16bytes): - cmp $24, %ecx - jae L(more24bytes) - cmp $16, %ecx - je L(16bytes) -# ifndef USE_AS_WMEMCMP - cmp $17, %ecx - je L(17bytes) - cmp $18, %ecx - je L(18bytes) - cmp $19, %ecx - je L(19bytes) - cmp $20, %ecx - je L(20bytes) - cmp $21, %ecx - je L(21bytes) - cmp $22, %ecx - je L(22bytes) - jmp L(23bytes) -# else - jmp L(20bytes) -# endif - - .p2align 4 -L(more24bytes): - cmp $32, %ecx - jae L(more32bytes) - cmp $24, %ecx - je L(24bytes) -# ifndef USE_AS_WMEMCMP - cmp $25, %ecx - je L(25bytes) - cmp $26, %ecx - je L(26bytes) - cmp $27, %ecx - je L(27bytes) - cmp $28, %ecx - je L(28bytes) - cmp $29, %ecx - je L(29bytes) - cmp $30, %ecx - je L(30bytes) - jmp L(31bytes) -# else - jmp L(28bytes) -# endif - - .p2align 4 -L(more32bytes): - cmp $40, %ecx - jae L(more40bytes) - cmp $32, %ecx - je L(32bytes) -# ifndef 
USE_AS_WMEMCMP - cmp $33, %ecx - je L(33bytes) - cmp $34, %ecx - je L(34bytes) - cmp $35, %ecx - je L(35bytes) - cmp $36, %ecx - je L(36bytes) - cmp $37, %ecx - je L(37bytes) - cmp $38, %ecx - je L(38bytes) - jmp L(39bytes) -# else - jmp L(36bytes) -# endif - - .p2align 4 -L(less48bytes): - cmp $8, %ecx - jae L(more8bytes) -# ifndef USE_AS_WMEMCMP - cmp $2, %ecx - je L(2bytes) - cmp $3, %ecx - je L(3bytes) - cmp $4, %ecx - je L(4bytes) - cmp $5, %ecx - je L(5bytes) - cmp $6, %ecx - je L(6bytes) - jmp L(7bytes) -# else - jmp L(4bytes) -# endif - - .p2align 4 -L(more40bytes): - cmp $40, %ecx - je L(40bytes) -# ifndef USE_AS_WMEMCMP - cmp $41, %ecx - je L(41bytes) - cmp $42, %ecx - je L(42bytes) - cmp $43, %ecx - je L(43bytes) - cmp $44, %ecx - je L(44bytes) - cmp $45, %ecx - je L(45bytes) - cmp $46, %ecx - je L(46bytes) - jmp L(47bytes) - - .p2align 4 -L(44bytes): - mov -44(%eax), %ecx - mov -44(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(40bytes): - mov -40(%eax), %ecx - mov -40(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(36bytes): - mov -36(%eax), %ecx - mov -36(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(32bytes): - mov -32(%eax), %ecx - mov -32(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(28bytes): - mov -28(%eax), %ecx - mov -28(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(24bytes): - mov -24(%eax), %ecx - mov -24(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(20bytes): - mov -20(%eax), %ecx - mov -20(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(16bytes): - mov -16(%eax), %ecx - mov -16(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(12bytes): - mov -12(%eax), %ecx - mov -12(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(8bytes): - mov -8(%eax), %ecx - mov -8(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(4bytes): - mov -4(%eax), %ecx - mov -4(%edx), %ebx - cmp %ebx, %ecx - mov $0, %eax - jne L(find_diff) - POP (%ebx) - ret - CFI_PUSH (%ebx) -# else - .p2align 4 -L(44bytes): - mov -44(%eax), %ecx - cmp 
-44(%edx), %ecx - jne L(find_diff) -L(40bytes): - mov -40(%eax), %ecx - cmp -40(%edx), %ecx - jne L(find_diff) -L(36bytes): - mov -36(%eax), %ecx - cmp -36(%edx), %ecx - jne L(find_diff) -L(32bytes): - mov -32(%eax), %ecx - cmp -32(%edx), %ecx - jne L(find_diff) -L(28bytes): - mov -28(%eax), %ecx - cmp -28(%edx), %ecx - jne L(find_diff) -L(24bytes): - mov -24(%eax), %ecx - cmp -24(%edx), %ecx - jne L(find_diff) -L(20bytes): - mov -20(%eax), %ecx - cmp -20(%edx), %ecx - jne L(find_diff) -L(16bytes): - mov -16(%eax), %ecx - cmp -16(%edx), %ecx - jne L(find_diff) -L(12bytes): - mov -12(%eax), %ecx - cmp -12(%edx), %ecx - jne L(find_diff) -L(8bytes): - mov -8(%eax), %ecx - cmp -8(%edx), %ecx - jne L(find_diff) -L(4bytes): - mov -4(%eax), %ecx - xor %eax, %eax - cmp -4(%edx), %ecx - jne L(find_diff) - POP (%ebx) - ret - CFI_PUSH (%ebx) -# endif - -# ifndef USE_AS_WMEMCMP - - .p2align 4 -L(45bytes): - mov -45(%eax), %ecx - mov -45(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(41bytes): - mov -41(%eax), %ecx - mov -41(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(37bytes): - mov -37(%eax), %ecx - mov -37(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(33bytes): - mov -33(%eax), %ecx - mov -33(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(29bytes): - mov -29(%eax), %ecx - mov -29(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(25bytes): - mov -25(%eax), %ecx - mov -25(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(21bytes): - mov -21(%eax), %ecx - mov -21(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(17bytes): - mov -17(%eax), %ecx - mov -17(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(13bytes): - mov -13(%eax), %ecx - mov -13(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(9bytes): - mov -9(%eax), %ecx - mov -9(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(5bytes): - mov -5(%eax), %ecx - mov -5(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) - movzbl -1(%eax), %ecx - cmp -1(%edx), %cl - mov $0, %eax - jne L(end) - POP (%ebx) - 
ret - CFI_PUSH (%ebx) - - .p2align 4 -L(46bytes): - mov -46(%eax), %ecx - mov -46(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(42bytes): - mov -42(%eax), %ecx - mov -42(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(38bytes): - mov -38(%eax), %ecx - mov -38(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(34bytes): - mov -34(%eax), %ecx - mov -34(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(30bytes): - mov -30(%eax), %ecx - mov -30(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(26bytes): - mov -26(%eax), %ecx - mov -26(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(22bytes): - mov -22(%eax), %ecx - mov -22(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(18bytes): - mov -18(%eax), %ecx - mov -18(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(14bytes): - mov -14(%eax), %ecx - mov -14(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(10bytes): - mov -10(%eax), %ecx - mov -10(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(6bytes): - mov -6(%eax), %ecx - mov -6(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(2bytes): - movzwl -2(%eax), %ecx - movzwl -2(%edx), %ebx - cmp %bl, %cl - jne L(end) - cmp %bh, %ch - mov $0, %eax - jne L(end) - POP (%ebx) - ret - CFI_PUSH (%ebx) - - .p2align 4 -L(47bytes): - movl -47(%eax), %ecx - movl -47(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(43bytes): - movl -43(%eax), %ecx - movl -43(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(39bytes): - movl -39(%eax), %ecx - movl -39(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(35bytes): - movl -35(%eax), %ecx - movl -35(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(31bytes): - movl -31(%eax), %ecx - movl -31(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(27bytes): - movl -27(%eax), %ecx - movl -27(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(23bytes): - movl -23(%eax), %ecx - movl -23(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(19bytes): - movl -19(%eax), %ecx - movl -19(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) 
-L(15bytes): - movl -15(%eax), %ecx - movl -15(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(11bytes): - movl -11(%eax), %ecx - movl -11(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(7bytes): - movl -7(%eax), %ecx - movl -7(%edx), %ebx - cmp %ebx, %ecx - jne L(find_diff) -L(3bytes): - movzwl -3(%eax), %ecx - movzwl -3(%edx), %ebx - cmpb %bl, %cl - jne L(end) - cmp %bx, %cx - jne L(end) - movzbl -1(%eax), %eax - cmpb -1(%edx), %al - mov $0, %eax - jne L(end) - POP (%ebx) - ret - CFI_PUSH (%ebx) - - .p2align 4 -L(find_diff): - cmpb %bl, %cl - jne L(end) - cmp %bx, %cx - jne L(end) - shr $16,%ecx - shr $16,%ebx - cmp %bl, %cl - jne L(end) - cmp %bx, %cx - - .p2align 4 -L(end): - POP (%ebx) - mov $1, %eax - ja L(bigger) - neg %eax -L(bigger): - ret -# else - -/* for wmemcmp */ - .p2align 4 -L(find_diff): - POP (%ebx) - mov $1, %eax - jg L(find_diff_bigger) - neg %eax - ret - - .p2align 4 -L(find_diff_bigger): - ret - -# endif -END (MEMCMP) -#endif diff --git a/sysdeps/i386/i686/multiarch/memcmp.S b/sysdeps/i386/i686/multiarch/memcmp.S deleted file mode 100644 index d4d7d2e91d..0000000000 --- a/sysdeps/i386/i686/multiarch/memcmp.S +++ /dev/null @@ -1,62 +0,0 @@ -/* Multiple versions of memcmp - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include -#include - -/* Define multiple versions only for the definition in libc. */ -#if IS_IN (libc) - .text -ENTRY(memcmp) - .type memcmp, @gnu_indirect_function - LOAD_GOT_AND_RTLD_GLOBAL_RO - LOAD_FUNC_GOT_EAX (__memcmp_ia32) - HAS_CPU_FEATURE (SSSE3) - jz 2f - LOAD_FUNC_GOT_EAX (__memcmp_ssse3) - HAS_CPU_FEATURE (SSE4_2) - jz 2f - LOAD_FUNC_GOT_EAX (__memcmp_sse4_2) -2: ret -END(memcmp) - -# undef ENTRY -# define ENTRY(name) \ - .type __memcmp_ia32, @function; \ - .p2align 4; \ - .globl __memcmp_ia32; \ - .hidden __memcmp_ia32; \ - __memcmp_ia32: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __memcmp_ia32, .-__memcmp_ia32 - -# ifdef SHARED -# undef libc_hidden_builtin_def -/* IFUNC doesn't work with the hidden functions in shared library since - they will be called without setting up EBX needed for PLT which is - used by IFUNC. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_memcmp; __GI_memcmp = __memcmp_ia32 -# endif -#endif - -#include "../memcmp.S" diff --git a/sysdeps/i386/i686/multiarch/memcmp.c b/sysdeps/i386/i686/multiarch/memcmp.c new file mode 100644 index 0000000000..63103a072d --- /dev/null +++ b/sysdeps/i386/i686/multiarch/memcmp.c @@ -0,0 +1 @@ +#include diff --git a/sysdeps/i386/i686/multiarch/rtld-memcmp.S b/sysdeps/i386/i686/multiarch/rtld-memcmp.S new file mode 100644 index 0000000000..85ec290bd5 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/rtld-memcmp.S @@ -0,0 +1,19 @@ +/* memcmp for ld.so + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include diff --git a/sysdeps/i386/multiarch/Makefile b/sysdeps/i386/multiarch/Makefile index 937cea9414..577bbb6c65 100644 --- a/sysdeps/i386/multiarch/Makefile +++ b/sysdeps/i386/multiarch/Makefile @@ -18,5 +18,6 @@ sysdep_routines += bcopy-i386 bcopy-i686 bcopy-sse2-unaligned \ bzero-sse2 bzero-sse2-rep \ memset-i386 memset-i586 memset-i686 \ memset-sse2 memset-sse2-rep \ - memchr-sse2-bsf memchr-sse2 + memchr-sse2-bsf memchr-sse2 \ + memcmp-i386 memcmp-i686 memcmp-ssse3 memcmp-sse4 endif diff --git a/sysdeps/i386/multiarch/ifunc-impl-list.c b/sysdeps/i386/multiarch/ifunc-impl-list.c index 420efb2874..e57134d340 100644 --- a/sysdeps/i386/multiarch/ifunc-impl-list.c +++ b/sysdeps/i386/multiarch/ifunc-impl-list.c @@ -71,15 +71,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memchr_sse2) IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_i386)) -#if 0 /* Support sysdeps/i386/i686/multiarch/memcmp.S. */ IFUNC_IMPL (i, name, memcmp, IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_2), __memcmp_sse4_2) IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3), __memcmp_ssse3) - IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_ia32)) + IFUNC_IMPL_ADD (array, i, memcmp, HAS_I686, __memcmp_i686) +#if MINIMUM_ISA < 686 + IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_i386) #endif + ) /* Support sysdeps/i386/i686/multiarch/memmove_chk.S. 
*/ IFUNC_IMPL (i, name, __memmove_chk, diff --git a/sysdeps/i386/multiarch/memcmp-i386.S b/sysdeps/i386/multiarch/memcmp-i386.S new file mode 100644 index 0000000000..18e7424966 --- /dev/null +++ b/sysdeps/i386/multiarch/memcmp-i386.S @@ -0,0 +1,12 @@ +#define memcmp __memcmp_i386 +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) +#undef weak_alias +#define weak_alias(name, aliasname) +#include + +#ifdef SHARED + .globl __GI_memcmp + .hidden __GI_memcmp + __GI_memcmp = __memcmp_i386 +#endif diff --git a/sysdeps/i386/multiarch/memcmp-i686.S b/sysdeps/i386/multiarch/memcmp-i686.S new file mode 100644 index 0000000000..32d1a4da6c --- /dev/null +++ b/sysdeps/i386/multiarch/memcmp-i686.S @@ -0,0 +1,6 @@ +#define memcmp __memcmp_i686 +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) +#undef weak_alias +#define weak_alias(name, aliasname) +#include diff --git a/sysdeps/i386/multiarch/memcmp-sse4.S b/sysdeps/i386/multiarch/memcmp-sse4.S new file mode 100644 index 0000000000..b3756f4a00 --- /dev/null +++ b/sysdeps/i386/multiarch/memcmp-sse4.S @@ -0,0 +1,1225 @@ +/* memcmp with SSE4.2, wmemcmp with SSE4.2 + Copyright (C) 2010-2015 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef MEMCMP +# define MEMCMP __memcmp_sse4_2 +# endif + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define BLK1 PARMS +# define BLK2 BLK1 + 4 +# define LEN BLK2 + 4 +# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx) + + +# ifdef SHARED +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register that contains the + index into the jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ +/* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ +/* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ +/* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx,INDEX,SCALE), %ebx; \ +/* We loaded the jump table and adjusted EDX/ESI. Go. */ \ + jmp *%ebx +# else +# define JMPTBL(I, B) I + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register that contains the + index into the jump table. SCALE is the scale of INDEX. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +# endif + + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elements.
+*/ + + .section .text.sse4.2,"ax",@progbits +ENTRY (MEMCMP) + movl BLK1(%esp), %eax + movl BLK2(%esp), %edx + movl LEN(%esp), %ecx + +# ifdef USE_AS_WMEMCMP + shl $2, %ecx + test %ecx, %ecx + jz L(return0) +# else + cmp $1, %ecx + jbe L(less1bytes) +# endif + + pxor %xmm0, %xmm0 + cmp $64, %ecx + ja L(64bytesormore) + cmp $8, %ecx + +# ifndef USE_AS_WMEMCMP + PUSH (%ebx) + jb L(less8bytes) +# else + jb L(less8bytes) + PUSH (%ebx) +# endif + + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(less8bytes): + mov (%eax), %bl + cmpb (%edx), %bl + jne L(nonzero) + + mov 1(%eax), %bl + cmpb 1(%edx), %bl + jne L(nonzero) + + cmp $2, %ecx + jz L(0bytes) + + mov 2(%eax), %bl + cmpb 2(%edx), %bl + jne L(nonzero) + + cmp $3, %ecx + jz L(0bytes) + + mov 3(%eax), %bl + cmpb 3(%edx), %bl + jne L(nonzero) + + cmp $4, %ecx + jz L(0bytes) + + mov 4(%eax), %bl + cmpb 4(%edx), %bl + jne L(nonzero) + + cmp $5, %ecx + jz L(0bytes) + + mov 5(%eax), %bl + cmpb 5(%edx), %bl + jne L(nonzero) + + cmp $6, %ecx + jz L(0bytes) + + mov 6(%eax), %bl + cmpb 6(%edx), %bl + je L(0bytes) + +L(nonzero): + POP (%ebx) + mov $1, %eax + ja L(above) + neg %eax +L(above): + ret + CFI_PUSH (%ebx) +# endif + + .p2align 4 +L(0bytes): + POP (%ebx) + xor %eax, %eax + ret + +# ifdef USE_AS_WMEMCMP + +/* for wmemcmp, case N == 1 */ + + .p2align 4 +L(less8bytes): + mov (%eax), %ecx + cmp (%edx), %ecx + je L(return0) + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + + .p2align 4 +L(find_diff_bigger): + ret + + .p2align 4 +L(return0): + xor %eax, %eax + ret +# endif + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(less1bytes): + jb L(0bytesend) + movzbl (%eax), %eax + movzbl (%edx), %edx + sub %edx, %eax + ret + + .p2align 4 +L(0bytesend): + xor %eax, %eax + ret +# endif + .p2align 4 +L(64bytesormore): + PUSH (%ebx) + mov %ecx, %ebx + mov $64, %ecx + sub $64, %ebx +L(64bytesormore_loop): + movdqu (%eax), %xmm1 + movdqu (%edx), %xmm2 + 
pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_16diff) + + movdqu 16(%eax), %xmm1 + movdqu 16(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_32diff) + + movdqu 32(%eax), %xmm1 + movdqu 32(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_48diff) + + movdqu 48(%eax), %xmm1 + movdqu 48(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_64diff) + add %ecx, %eax + add %ecx, %edx + sub %ecx, %ebx + jae L(64bytesormore_loop) + add %ebx, %ecx + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) + +# ifdef USE_AS_WMEMCMP + +/* Label needs only for table_64bytes filling */ +L(unreal_case): +/* no code here */ + +# endif + .p2align 4 +L(find_16diff): + sub $16, %ecx +L(find_32diff): + sub $16, %ecx +L(find_48diff): + sub $16, %ecx +L(find_64diff): + add %ecx, %edx + add %ecx, %eax + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(16bytes): + mov -16(%eax), %ecx + mov -16(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + mov -12(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + mov -8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN +# else + .p2align 4 +L(16bytes): + mov -16(%eax), %ecx + cmp -16(%edx), %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + cmp -12(%edx), %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + cmp -8(%edx), %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + cmp -4(%edx), %ecx + mov $0, %eax + jne L(find_diff) + RETURN +# endif + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(49bytes): + movdqu -49(%eax), %xmm1 + movdqu -49(%edx), %xmm2 + mov $-49, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(33bytes): + movdqu -33(%eax), %xmm1 + movdqu -33(%edx), %xmm2 + mov $-33, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(17bytes): + 
mov -17(%eax), %ecx + mov -17(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(13bytes): + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(9bytes): + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(5bytes): + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(50bytes): + mov $-50, %ebx + movdqu -50(%eax), %xmm1 + movdqu -50(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(34bytes): + mov $-34, %ebx + movdqu -34(%eax), %xmm1 + movdqu -34(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(18bytes): + mov -18(%eax), %ecx + mov -18(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(14bytes): + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(10bytes): + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(6bytes): + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(2bytes): + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(51bytes): + mov $-51, %ebx + movdqu -51(%eax), %xmm1 + movdqu -51(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(35bytes): + mov $-35, %ebx + movdqu -35(%eax), %xmm1 + movdqu -35(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(19bytes): + movl -19(%eax), %ecx + movl -19(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(15bytes): + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(11bytes): + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(7bytes): + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(3bytes): + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl 
+ jne L(end) + cmp %bx, %cx + jne L(end) +L(1bytes): + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +# endif + .p2align 4 +L(52bytes): + movdqu -52(%eax), %xmm1 + movdqu -52(%edx), %xmm2 + mov $-52, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(36bytes): + movdqu -36(%eax), %xmm1 + movdqu -36(%edx), %xmm2 + mov $-36, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(20bytes): + movdqu -20(%eax), %xmm1 + movdqu -20(%edx), %xmm2 + mov $-20, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif + mov $0, %eax + jne L(find_diff) + RETURN + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(53bytes): + movdqu -53(%eax), %xmm1 + movdqu -53(%edx), %xmm2 + mov $-53, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(37bytes): + mov $-37, %ebx + movdqu -37(%eax), %xmm1 + movdqu -37(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(21bytes): + mov $-21, %ebx + movdqu -21(%eax), %xmm1 + movdqu -21(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(54bytes): + movdqu -54(%eax), %xmm1 + movdqu -54(%edx), %xmm2 + mov $-54, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(38bytes): + mov $-38, %ebx + movdqu -38(%eax), %xmm1 + movdqu -38(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(22bytes): + mov $-22, %ebx + movdqu -22(%eax), %xmm1 + movdqu -22(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, 
%ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(55bytes): + movdqu -55(%eax), %xmm1 + movdqu -55(%edx), %xmm2 + mov $-55, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(39bytes): + mov $-39, %ebx + movdqu -39(%eax), %xmm1 + movdqu -39(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(23bytes): + mov $-23, %ebx + movdqu -23(%eax), %xmm1 + movdqu -23(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +# endif + .p2align 4 +L(56bytes): + movdqu -56(%eax), %xmm1 + movdqu -56(%edx), %xmm2 + mov $-56, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(40bytes): + mov $-40, %ebx + movdqu -40(%eax), %xmm1 + movdqu -40(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(24bytes): + mov $-24, %ebx + movdqu -24(%eax), %xmm1 + movdqu -24(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -8(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif + jne L(find_diff) + + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif + mov $0, %eax + jne L(find_diff) + RETURN + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(57bytes): + movdqu -57(%eax), %xmm1 + movdqu -57(%edx), %xmm2 + mov $-57, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(41bytes): + mov $-41, %ebx + movdqu -41(%eax), %xmm1 + movdqu -41(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(25bytes): + mov $-25, %ebx + movdqu -25(%eax), %xmm1 + movdqu -25(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov 
-9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(58bytes): + movdqu -58(%eax), %xmm1 + movdqu -58(%edx), %xmm2 + mov $-58, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(42bytes): + mov $-42, %ebx + movdqu -42(%eax), %xmm1 + movdqu -42(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(26bytes): + mov $-26, %ebx + movdqu -26(%eax), %xmm1 + movdqu -26(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(59bytes): + movdqu -59(%eax), %xmm1 + movdqu -59(%edx), %xmm2 + mov $-59, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(43bytes): + mov $-43, %ebx + movdqu -43(%eax), %xmm1 + movdqu -43(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(27bytes): + mov $-27, %ebx + movdqu -27(%eax), %xmm1 + movdqu -27(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +# endif + .p2align 4 +L(60bytes): + movdqu -60(%eax), %xmm1 + movdqu -60(%edx), %xmm2 + mov $-60, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(44bytes): + mov $-44, %ebx + movdqu -44(%eax), %xmm1 + movdqu 
-44(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(28bytes): + mov $-28, %ebx + movdqu -28(%eax), %xmm1 + movdqu -28(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -12(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -12(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -12(%edx), %ecx +# endif + jne L(find_diff) + + mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -8(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif + jne L(find_diff) + + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif + mov $0, %eax + jne L(find_diff) + RETURN + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(61bytes): + movdqu -61(%eax), %xmm1 + movdqu -61(%edx), %xmm2 + mov $-61, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(45bytes): + mov $-45, %ebx + movdqu -45(%eax), %xmm1 + movdqu -45(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(29bytes): + mov $-29, %ebx + movdqu -29(%eax), %xmm1 + movdqu -29(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(62bytes): + movdqu -62(%eax), %xmm1 + movdqu -62(%edx), %xmm2 + mov $-62, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(46bytes): + mov $-46, %ebx + movdqu -46(%eax), %xmm1 + movdqu -46(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(30bytes): + mov $-30, %ebx + movdqu -30(%eax), %xmm1 + movdqu -30(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne 
L(find_diff) + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(63bytes): + movdqu -63(%eax), %xmm1 + movdqu -63(%edx), %xmm2 + mov $-63, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(47bytes): + mov $-47, %ebx + movdqu -47(%eax), %xmm1 + movdqu -47(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(31bytes): + mov $-31, %ebx + movdqu -31(%eax), %xmm1 + movdqu -31(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +# endif + + .p2align 4 +L(64bytes): + movdqu -64(%eax), %xmm1 + movdqu -64(%edx), %xmm2 + mov $-64, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(48bytes): + movdqu -48(%eax), %xmm1 + movdqu -48(%edx), %xmm2 + mov $-48, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(32bytes): + movdqu -32(%eax), %xmm1 + movdqu -32(%edx), %xmm2 + mov $-32, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -16(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -16(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -16(%edx), %ecx +# endif + jne L(find_diff) + + mov -12(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -12(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -12(%edx), %ecx +# endif + jne L(find_diff) + + mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -8(%edx), %ebx + 
cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif + jne L(find_diff) + + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif + mov $0, %eax + jne L(find_diff) + RETURN + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(less16bytes): + add %ebx, %eax + add %ebx, %edx + + mov (%eax), %ecx + mov (%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 4(%eax), %ecx + mov 4(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 8(%eax), %ecx + mov 8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 12(%eax), %ecx + mov 12(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN +# else + .p2align 4 +L(less16bytes): + add %ebx, %eax + add %ebx, %edx + + mov (%eax), %ecx + cmp (%edx), %ecx + jne L(find_diff) + + mov 4(%eax), %ecx + cmp 4(%edx), %ecx + jne L(find_diff) + + mov 8(%eax), %ecx + cmp 8(%edx), %ecx + jne L(find_diff) + + mov 12(%eax), %ecx + cmp 12(%edx), %ecx + + mov $0, %eax + jne L(find_diff) + RETURN +# endif + + .p2align 4 +L(find_diff): +# ifndef USE_AS_WMEMCMP + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + shr $16,%ecx + shr $16,%ebx + cmp %bl, %cl + jne L(end) + cmp %bx, %cx +L(end): + POP (%ebx) + mov $1, %eax + ja L(bigger) + neg %eax +L(bigger): + ret +# else + POP (%ebx) + mov $1, %eax + jg L(bigger) + neg %eax + ret + + .p2align 4 +L(bigger): + ret +# endif +END (MEMCMP) + + .section .rodata.sse4.2,"a",@progbits + .p2align 2 + .type L(table_64bytes), @object +# ifndef USE_AS_WMEMCMP +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(1bytes), L(table_64bytes)) + .int JMPTBL (L(2bytes), L(table_64bytes)) + .int JMPTBL (L(3bytes), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(5bytes), L(table_64bytes)) + .int JMPTBL (L(6bytes), L(table_64bytes)) + .int JMPTBL (L(7bytes), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(9bytes), L(table_64bytes)) + .int JMPTBL 
(L(10bytes), L(table_64bytes)) + .int JMPTBL (L(11bytes), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(13bytes), L(table_64bytes)) + .int JMPTBL (L(14bytes), L(table_64bytes)) + .int JMPTBL (L(15bytes), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(17bytes), L(table_64bytes)) + .int JMPTBL (L(18bytes), L(table_64bytes)) + .int JMPTBL (L(19bytes), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(21bytes), L(table_64bytes)) + .int JMPTBL (L(22bytes), L(table_64bytes)) + .int JMPTBL (L(23bytes), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(25bytes), L(table_64bytes)) + .int JMPTBL (L(26bytes), L(table_64bytes)) + .int JMPTBL (L(27bytes), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(29bytes), L(table_64bytes)) + .int JMPTBL (L(30bytes), L(table_64bytes)) + .int JMPTBL (L(31bytes), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(33bytes), L(table_64bytes)) + .int JMPTBL (L(34bytes), L(table_64bytes)) + .int JMPTBL (L(35bytes), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(37bytes), L(table_64bytes)) + .int JMPTBL (L(38bytes), L(table_64bytes)) + .int JMPTBL (L(39bytes), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(41bytes), L(table_64bytes)) + .int JMPTBL (L(42bytes), L(table_64bytes)) + .int JMPTBL (L(43bytes), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(45bytes), L(table_64bytes)) + .int JMPTBL (L(46bytes), L(table_64bytes)) + .int JMPTBL (L(47bytes), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(49bytes), L(table_64bytes)) + .int JMPTBL (L(50bytes), L(table_64bytes)) + .int JMPTBL (L(51bytes), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(53bytes), L(table_64bytes)) + .int JMPTBL (L(54bytes), 
L(table_64bytes)) + .int JMPTBL (L(55bytes), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(57bytes), L(table_64bytes)) + .int JMPTBL (L(58bytes), L(table_64bytes)) + .int JMPTBL (L(59bytes), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(61bytes), L(table_64bytes)) + .int JMPTBL (L(62bytes), L(table_64bytes)) + .int JMPTBL (L(63bytes), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) +# else +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL 
(L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) +# endif +#endif diff --git a/sysdeps/i386/multiarch/memcmp-ssse3.S b/sysdeps/i386/multiarch/memcmp-ssse3.S new file mode 100644 index 0000000000..ea2a25b216 --- /dev/null +++ b/sysdeps/i386/multiarch/memcmp-ssse3.S @@ -0,0 +1,2157 @@ +/* memcmp with SSSE3, wmemcmp with SSSE3 + Copyright (C) 2010-2015 Free Software Foundation, Inc. + Contributed by Intel Corporation. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef MEMCMP +# define MEMCMP __memcmp_ssse3 +# endif + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define BLK1 PARMS +# define BLK2 BLK1+4 +# define LEN BLK2+4 +# define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret +# define RETURN RETURN_END; cfi_restore_state; cfi_remember_state + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elements.
+*/ + + atom_text_section +ENTRY (MEMCMP) + movl LEN(%esp), %ecx + +# ifdef USE_AS_WMEMCMP + shl $2, %ecx + test %ecx, %ecx + jz L(zero) +# endif + + movl BLK1(%esp), %eax + cmp $48, %ecx + movl BLK2(%esp), %edx + jae L(48bytesormore) + +# ifndef USE_AS_WMEMCMP + cmp $1, %ecx + jbe L(less1bytes) +# endif + + PUSH (%ebx) + add %ecx, %edx + add %ecx, %eax + jmp L(less48bytes) + + CFI_POP (%ebx) + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(less1bytes): + jb L(zero) + movb (%eax), %cl + cmp (%edx), %cl + je L(zero) + mov $1, %eax + ja L(1bytesend) + neg %eax +L(1bytesend): + ret +# endif + + .p2align 4 +L(zero): + xor %eax, %eax + ret + + .p2align 4 +L(48bytesormore): + PUSH (%ebx) + PUSH (%esi) + PUSH (%edi) + cfi_remember_state + movdqu (%eax), %xmm3 + movdqu (%edx), %xmm0 + movl %eax, %edi + movl %edx, %esi + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + lea 16(%edi), %edi + + sub $0xffff, %edx + lea 16(%esi), %esi + jnz L(less16bytes) + mov %edi, %edx + and $0xf, %edx + xor %edx, %edi + sub %edx, %esi + add %edx, %ecx + mov %esi, %edx + and $0xf, %edx + jz L(shr_0) + xor %edx, %esi + +# ifndef USE_AS_WMEMCMP + cmp $8, %edx + jae L(next_unaligned_table) + cmp $0, %edx + je L(shr_0) + cmp $1, %edx + je L(shr_1) + cmp $2, %edx + je L(shr_2) + cmp $3, %edx + je L(shr_3) + cmp $4, %edx + je L(shr_4) + cmp $5, %edx + je L(shr_5) + cmp $6, %edx + je L(shr_6) + jmp L(shr_7) + + .p2align 2 +L(next_unaligned_table): + cmp $8, %edx + je L(shr_8) + cmp $9, %edx + je L(shr_9) + cmp $10, %edx + je L(shr_10) + cmp $11, %edx + je L(shr_11) + cmp $12, %edx + je L(shr_12) + cmp $13, %edx + je L(shr_13) + cmp $14, %edx + je L(shr_14) + jmp L(shr_15) +# else + cmp $0, %edx + je L(shr_0) + cmp $4, %edx + je L(shr_4) + cmp $8, %edx + je L(shr_8) + jmp L(shr_12) +# endif + + .p2align 4 +L(shr_0): + cmp $80, %ecx + jae L(shr_0_gobble) + lea -48(%ecx), %ecx + xor %eax, %eax + movaps (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + movaps 16(%esi), %xmm2 + pcmpeqb 16(%edi), %xmm2 + pand %xmm1, %xmm2 + 
pmovmskb %xmm2, %edx + add $32, %edi + add $32, %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea (%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_0_gobble): + lea -48(%ecx), %ecx + movdqa (%esi), %xmm0 + xor %eax, %eax + pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm2 + pcmpeqb 16(%edi), %xmm2 +L(shr_0_gobble_loop): + pand %xmm0, %xmm2 + sub $32, %ecx + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + movdqa 32(%esi), %xmm0 + movdqa 48(%esi), %xmm2 + sbb $0xffff, %edx + pcmpeqb 32(%edi), %xmm0 + pcmpeqb 48(%edi), %xmm2 + lea 32(%edi), %edi + lea 32(%esi), %esi + jz L(shr_0_gobble_loop) + + pand %xmm0, %xmm2 + cmp $0, %ecx + jge L(shr_0_gobble_loop_next) + inc %edx + add $32, %ecx +L(shr_0_gobble_loop_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea (%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_1): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_1_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $1,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $1,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 1(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_1_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $1,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $1,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_1_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + 
movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $1,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $1,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_1_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_1_gobble_next) + inc %edx + add $32, %ecx +L(shr_1_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 1(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_2): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_2_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $2,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $2,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 2(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_2_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $2,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $2,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_2_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $2,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $2,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_2_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_2_gobble_next) + inc %edx + add $32, %ecx +L(shr_2_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 
32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 2(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_3): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_3_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $3,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $3,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 3(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_3_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $3,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $3,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_3_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $3,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $3,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_3_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_3_gobble_next) + inc %edx + add $32, %ecx +L(shr_3_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 3(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) +# endif + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_4): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_4_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $4,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $4,%xmm2, 
%xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 4(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_4_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $4,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $4,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_4_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $4,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $4,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_4_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_4_gobble_next) + inc %edx + add $32, %ecx +L(shr_4_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 4(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_5): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_5_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $5,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $5,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 5(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_5_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $5,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + 
palignr $5,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_5_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $5,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $5,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_5_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_5_gobble_next) + inc %edx + add $32, %ecx +L(shr_5_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 5(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_6): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_6_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $6,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $6,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 6(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_6_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $6,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $6,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_6_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $6,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $6,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_6_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_6_gobble_next) 
+ inc %edx + add $32, %ecx +L(shr_6_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 6(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_7): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_7_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $7,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $7,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 7(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_7_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $7,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $7,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_7_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $7,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $7,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_7_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_7_gobble_next) + inc %edx + add $32, %ecx +L(shr_7_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 7(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) +# endif + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_8): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_8_gobble) + + movdqa 
16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $8,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $8,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 8(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_8_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $8,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $8,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_8_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $8,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $8,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_8_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_8_gobble_next) + inc %edx + add $32, %ecx +L(shr_8_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 8(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_9): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_9_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $9,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $9,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 9(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 
+L(shr_9_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $9,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $9,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_9_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $9,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $9,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_9_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_9_gobble_next) + inc %edx + add $32, %ecx +L(shr_9_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 9(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_10): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_10_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $10, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $10,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 10(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_10_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $10, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $10, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_10_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $10,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $10,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 
32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_10_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_10_gobble_next) + inc %edx + add $32, %ecx +L(shr_10_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 10(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_11): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_11_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $11, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $11, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 11(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_11_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $11, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $11, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_11_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $11,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $11,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_11_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_11_gobble_next) + inc %edx + add $32, %ecx +L(shr_11_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 11(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp 
L(less48bytes) +# endif + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_12): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_12_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $12, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $12, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 12(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_12_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $12, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $12, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_12_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $12,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $12,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_12_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_12_gobble_next) + inc %edx + add $32, %ecx +L(shr_12_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 12(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_13): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_13_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $13, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $13, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 
32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 13(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_13_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $13, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $13, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_13_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $13,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $13,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_13_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_13_gobble_next) + inc %edx + add $32, %ecx +L(shr_13_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 13(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_14): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_14_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $14, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $14, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 14(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_14_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $14, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $14, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_14_gobble_loop): + pand %xmm0, %xmm3 + 
sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $14,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $14,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_14_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_14_gobble_next) + inc %edx + add $32, %ecx +L(shr_14_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 14(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_15): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_15_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $15, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $15, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 15(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_15_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $15, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $15, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_15_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $15,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $15,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_15_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_15_gobble_next) + inc %edx + add $32, %ecx +L(shr_15_gobble_next): + test %edx, %edx + jnz 
L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 15(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) +# endif + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(exit): + pmovmskb %xmm1, %ebx + sub $0xffff, %ebx + jz L(first16bytes) + lea -16(%esi), %esi + lea -16(%edi), %edi + mov %ebx, %edx + +L(first16bytes): + add %eax, %esi +L(less16bytes): + +# ifndef USE_AS_WMEMCMP + test %dl, %dl + jz L(next_24_bytes) + + test $0x01, %dl + jnz L(Byte16) + + test $0x02, %dl + jnz L(Byte17) + + test $0x04, %dl + jnz L(Byte18) + + test $0x08, %dl + jnz L(Byte19) + + test $0x10, %dl + jnz L(Byte20) + + test $0x20, %dl + jnz L(Byte21) + + test $0x40, %dl + jnz L(Byte22) +L(Byte23): + movzbl -9(%edi), %eax + movzbl -9(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte16): + movzbl -16(%edi), %eax + movzbl -16(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte17): + movzbl -15(%edi), %eax + movzbl -15(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte18): + movzbl -14(%edi), %eax + movzbl -14(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte19): + movzbl -13(%edi), %eax + movzbl -13(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte20): + movzbl -12(%edi), %eax + movzbl -12(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte21): + movzbl -11(%edi), %eax + movzbl -11(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte22): + movzbl -10(%edi), %eax + movzbl -10(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(next_24_bytes): + lea 8(%edi), %edi + lea 8(%esi), %esi + test $0x01, %dh + jnz L(Byte16) + + test $0x02, %dh + jnz L(Byte17) + + test $0x04, %dh + jnz L(Byte18) + + test $0x08, %dh + jnz L(Byte19) + + test $0x10, %dh + jnz L(Byte20) + + test $0x20, %dh + jnz L(Byte21) + + test $0x40, %dh + jnz L(Byte22) + + .p2align 4 +L(Byte31): + movzbl -9(%edi), %eax + movzbl 
-9(%esi), %edx + sub %edx, %eax + RETURN_END +# else + +/* special for wmemcmp */ + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov -16(%edi), %eax + cmp -16(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word): + mov -12(%edi), %eax + cmp -12(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov -8(%edi), %eax + cmp -8(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word): + mov -4(%edi), %eax + cmp -4(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(nequal): + mov $1, %eax + jg L(nequal_bigger) + neg %eax + RETURN + + .p2align 4 +L(nequal_bigger): + RETURN_END +# endif + + CFI_PUSH (%ebx) + + .p2align 4 +L(more8bytes): + cmp $16, %ecx + jae L(more16bytes) + cmp $8, %ecx + je L(8bytes) +# ifndef USE_AS_WMEMCMP + cmp $9, %ecx + je L(9bytes) + cmp $10, %ecx + je L(10bytes) + cmp $11, %ecx + je L(11bytes) + cmp $12, %ecx + je L(12bytes) + cmp $13, %ecx + je L(13bytes) + cmp $14, %ecx + je L(14bytes) + jmp L(15bytes) +# else + jmp L(12bytes) +# endif + + .p2align 4 +L(more16bytes): + cmp $24, %ecx + jae L(more24bytes) + cmp $16, %ecx + je L(16bytes) +# ifndef USE_AS_WMEMCMP + cmp $17, %ecx + je L(17bytes) + cmp $18, %ecx + je L(18bytes) + cmp $19, %ecx + je L(19bytes) + cmp $20, %ecx + je L(20bytes) + cmp $21, %ecx + je L(21bytes) + cmp $22, %ecx + je L(22bytes) + jmp L(23bytes) +# else + jmp L(20bytes) +# endif + + .p2align 4 +L(more24bytes): + cmp $32, %ecx + jae L(more32bytes) + cmp $24, %ecx + je L(24bytes) +# ifndef USE_AS_WMEMCMP + cmp $25, %ecx + je L(25bytes) + cmp $26, %ecx + je L(26bytes) + cmp $27, %ecx + je L(27bytes) + cmp $28, %ecx + je L(28bytes) + cmp $29, %ecx + je L(29bytes) + cmp $30, %ecx + je L(30bytes) + jmp L(31bytes) +# else + jmp L(28bytes) +# endif + + .p2align 4 +L(more32bytes): + cmp $40, %ecx + jae L(more40bytes) + cmp $32, %ecx + je L(32bytes) +# ifndef 
USE_AS_WMEMCMP + cmp $33, %ecx + je L(33bytes) + cmp $34, %ecx + je L(34bytes) + cmp $35, %ecx + je L(35bytes) + cmp $36, %ecx + je L(36bytes) + cmp $37, %ecx + je L(37bytes) + cmp $38, %ecx + je L(38bytes) + jmp L(39bytes) +# else + jmp L(36bytes) +# endif + + .p2align 4 +L(less48bytes): + cmp $8, %ecx + jae L(more8bytes) +# ifndef USE_AS_WMEMCMP + cmp $2, %ecx + je L(2bytes) + cmp $3, %ecx + je L(3bytes) + cmp $4, %ecx + je L(4bytes) + cmp $5, %ecx + je L(5bytes) + cmp $6, %ecx + je L(6bytes) + jmp L(7bytes) +# else + jmp L(4bytes) +# endif + + .p2align 4 +L(more40bytes): + cmp $40, %ecx + je L(40bytes) +# ifndef USE_AS_WMEMCMP + cmp $41, %ecx + je L(41bytes) + cmp $42, %ecx + je L(42bytes) + cmp $43, %ecx + je L(43bytes) + cmp $44, %ecx + je L(44bytes) + cmp $45, %ecx + je L(45bytes) + cmp $46, %ecx + je L(46bytes) + jmp L(47bytes) + + .p2align 4 +L(44bytes): + mov -44(%eax), %ecx + mov -44(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(40bytes): + mov -40(%eax), %ecx + mov -40(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(36bytes): + mov -36(%eax), %ecx + mov -36(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(32bytes): + mov -32(%eax), %ecx + mov -32(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(28bytes): + mov -28(%eax), %ecx + mov -28(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(24bytes): + mov -24(%eax), %ecx + mov -24(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(20bytes): + mov -20(%eax), %ecx + mov -20(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(16bytes): + mov -16(%eax), %ecx + mov -16(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + mov -12(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + mov -8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + POP (%ebx) + ret + CFI_PUSH (%ebx) +# else + .p2align 4 +L(44bytes): + mov -44(%eax), %ecx + cmp 
-44(%edx), %ecx + jne L(find_diff) +L(40bytes): + mov -40(%eax), %ecx + cmp -40(%edx), %ecx + jne L(find_diff) +L(36bytes): + mov -36(%eax), %ecx + cmp -36(%edx), %ecx + jne L(find_diff) +L(32bytes): + mov -32(%eax), %ecx + cmp -32(%edx), %ecx + jne L(find_diff) +L(28bytes): + mov -28(%eax), %ecx + cmp -28(%edx), %ecx + jne L(find_diff) +L(24bytes): + mov -24(%eax), %ecx + cmp -24(%edx), %ecx + jne L(find_diff) +L(20bytes): + mov -20(%eax), %ecx + cmp -20(%edx), %ecx + jne L(find_diff) +L(16bytes): + mov -16(%eax), %ecx + cmp -16(%edx), %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + cmp -12(%edx), %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + cmp -8(%edx), %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + xor %eax, %eax + cmp -4(%edx), %ecx + jne L(find_diff) + POP (%ebx) + ret + CFI_PUSH (%ebx) +# endif + +# ifndef USE_AS_WMEMCMP + + .p2align 4 +L(45bytes): + mov -45(%eax), %ecx + mov -45(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(41bytes): + mov -41(%eax), %ecx + mov -41(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(37bytes): + mov -37(%eax), %ecx + mov -37(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(33bytes): + mov -33(%eax), %ecx + mov -33(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(29bytes): + mov -29(%eax), %ecx + mov -29(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(25bytes): + mov -25(%eax), %ecx + mov -25(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(21bytes): + mov -21(%eax), %ecx + mov -21(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(17bytes): + mov -17(%eax), %ecx + mov -17(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(13bytes): + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(9bytes): + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(5bytes): + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + POP (%ebx) + 
ret + CFI_PUSH (%ebx) + + .p2align 4 +L(46bytes): + mov -46(%eax), %ecx + mov -46(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(42bytes): + mov -42(%eax), %ecx + mov -42(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(38bytes): + mov -38(%eax), %ecx + mov -38(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(34bytes): + mov -34(%eax), %ecx + mov -34(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(30bytes): + mov -30(%eax), %ecx + mov -30(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(26bytes): + mov -26(%eax), %ecx + mov -26(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(22bytes): + mov -22(%eax), %ecx + mov -22(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(18bytes): + mov -18(%eax), %ecx + mov -18(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(14bytes): + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(10bytes): + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(6bytes): + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(2bytes): + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + POP (%ebx) + ret + CFI_PUSH (%ebx) + + .p2align 4 +L(47bytes): + movl -47(%eax), %ecx + movl -47(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(43bytes): + movl -43(%eax), %ecx + movl -43(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(39bytes): + movl -39(%eax), %ecx + movl -39(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(35bytes): + movl -35(%eax), %ecx + movl -35(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(31bytes): + movl -31(%eax), %ecx + movl -31(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(27bytes): + movl -27(%eax), %ecx + movl -27(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(23bytes): + movl -23(%eax), %ecx + movl -23(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(19bytes): + movl -19(%eax), %ecx + movl -19(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) 
+L(15bytes): + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(11bytes): + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(7bytes): + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(3bytes): + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + POP (%ebx) + ret + CFI_PUSH (%ebx) + + .p2align 4 +L(find_diff): + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + shr $16,%ecx + shr $16,%ebx + cmp %bl, %cl + jne L(end) + cmp %bx, %cx + + .p2align 4 +L(end): + POP (%ebx) + mov $1, %eax + ja L(bigger) + neg %eax +L(bigger): + ret +# else + +/* for wmemcmp */ + .p2align 4 +L(find_diff): + POP (%ebx) + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + + .p2align 4 +L(find_diff_bigger): + ret + +# endif +END (MEMCMP) +#endif diff --git a/sysdeps/i386/multiarch/memcmp.c b/sysdeps/i386/multiarch/memcmp.c new file mode 100644 index 0000000000..36ca7da67e --- /dev/null +++ b/sysdeps/i386/multiarch/memcmp.c @@ -0,0 +1,57 @@ +/* Multiple versions of memcmp. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) +/* Redefine memcmp so that the compiler won't complain about the type + mismatch with the IFUNC selector in strong_alias, below. */ +# undef memcmp +# define memcmp __redirect_memcmp +# include +# undef memcmp + +# include + +extern __typeof (__redirect_memcmp) __memcmp_i386 attribute_hidden; +extern __typeof (__redirect_memcmp) __memcmp_i686 attribute_hidden; +extern __typeof (__redirect_memcmp) __memcmp_ssse3 attribute_hidden; +extern __typeof (__redirect_memcmp) __memcmp_sse4_2 attribute_hidden; + +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle + ifunc symbol properly. */ +extern __typeof (__redirect_memcmp) memcmp; +extern void *memcmp_ifunc (void) __asm__ ("memcmp"); + +void * +memcmp_ifunc (void) +{ + if (HAS_CPU_FEATURE (SSE4_2)) + return __memcmp_sse4_2; + else if (HAS_CPU_FEATURE (SSSE3)) + return __memcmp_ssse3; + + if (USE_I686) + return __memcmp_i686; + else + return __memcmp_i386; +} +__asm__ (".type memcmp, %gnu_indirect_function"); + +weak_alias (memcmp, bcmp) +#endif diff --git a/sysdeps/i386/multiarch/rtld-memcmp.S b/sysdeps/i386/multiarch/rtld-memcmp.S new file mode 100644 index 0000000000..d3a07fe73a --- /dev/null +++ b/sysdeps/i386/multiarch/rtld-memcmp.S @@ -0,0 +1,19 @@ +/* memcmp for ld.so + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include -- cgit 1.4.1