From d5496eb9a14df0fe463c211f5fe05cc73e8e770c Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 20 Aug 2015 15:20:58 -0700 Subject: Add i386 strcat multiarch functions --- sysdeps/i386/i686/multiarch/Makefile | 2 - sysdeps/i386/i686/multiarch/strcat-sse2.S | 1243 --------------------------- sysdeps/i386/i686/multiarch/strcat-ssse3.S | 572 ------------ sysdeps/i386/i686/multiarch/strcat.S | 92 -- sysdeps/i386/i686/multiarch/strlen-sse2.S | 695 --------------- sysdeps/i386/i686/multiarch/strncat-c.c | 8 - sysdeps/i386/i686/multiarch/strncat-sse2.S | 4 - sysdeps/i386/i686/multiarch/strncat-ssse3.S | 4 - sysdeps/i386/i686/multiarch/strncat.S | 5 - sysdeps/i386/multiarch/Makefile | 4 +- sysdeps/i386/multiarch/ifunc-impl-list.c | 8 +- sysdeps/i386/multiarch/strcat-i386.S | 10 + sysdeps/i386/multiarch/strcat-sse2.S | 1243 +++++++++++++++++++++++++++ sysdeps/i386/multiarch/strcat-ssse3.S | 572 ++++++++++++ sysdeps/i386/multiarch/strcat.c | 51 ++ sysdeps/i386/multiarch/strlen-sse2.S | 695 +++++++++++++++ sysdeps/i386/multiarch/strncat-i386.c | 8 + sysdeps/i386/multiarch/strncat-sse2.S | 4 + sysdeps/i386/multiarch/strncat-ssse3.S | 4 + sysdeps/i386/multiarch/strncat.c | 54 ++ 20 files changed, 2647 insertions(+), 2631 deletions(-) delete mode 100644 sysdeps/i386/i686/multiarch/strcat-sse2.S delete mode 100644 sysdeps/i386/i686/multiarch/strcat-ssse3.S delete mode 100644 sysdeps/i386/i686/multiarch/strcat.S delete mode 100644 sysdeps/i386/i686/multiarch/strlen-sse2.S delete mode 100644 sysdeps/i386/i686/multiarch/strncat-c.c delete mode 100644 sysdeps/i386/i686/multiarch/strncat-sse2.S delete mode 100644 sysdeps/i386/i686/multiarch/strncat-ssse3.S delete mode 100644 sysdeps/i386/i686/multiarch/strncat.S create mode 100644 sysdeps/i386/multiarch/strcat-i386.S create mode 100644 sysdeps/i386/multiarch/strcat-sse2.S create mode 100644 sysdeps/i386/multiarch/strcat-ssse3.S create mode 100644 sysdeps/i386/multiarch/strcat.c create mode 100644 sysdeps/i386/multiarch/strlen-sse2.S 
create mode 100644 sysdeps/i386/multiarch/strncat-i386.c create mode 100644 sysdeps/i386/multiarch/strncat-sse2.S create mode 100644 sysdeps/i386/multiarch/strncat-ssse3.S create mode 100644 sysdeps/i386/multiarch/strncat.c (limited to 'sysdeps/i386') diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index d38a051305..830c77d951 100644 --- a/sysdeps/i386/i686/multiarch/Makefile +++ b/sysdeps/i386/i686/multiarch/Makefile @@ -1,8 +1,6 @@ ifeq ($(subdir),string) sysdep_routines += varshift \ strlen-sse2 strlen-sse2-bsf \ - strcat-ssse3 \ - strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \ strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \ strnlen-sse2 strnlen-c ifeq (yes,$(config-cflags-sse4)) diff --git a/sysdeps/i386/i686/multiarch/strcat-sse2.S b/sysdeps/i386/i686/multiarch/strcat-sse2.S deleted file mode 100644 index 0f9e13c6d3..0000000000 --- a/sysdeps/i386/i686/multiarch/strcat-sse2.S +++ /dev/null @@ -1,1243 +0,0 @@ -/* strcat with SSE2 - Copyright (C) 2011-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - - -#if IS_IN (libc) - -# include <sysdep.h> - - -# define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) - -# define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) - -# define PUSH(REG) pushl REG; CFI_PUSH (REG) -# define POP(REG) popl REG; CFI_POP (REG) - -# ifdef SHARED -# define JMPTBL(I, B) I - B - -/* Load an entry in a jump table into ECX and branch to it. TABLE is a - jump table with relative offsets. INDEX is a register contains the - index into the jump table. SCALE is the scale of INDEX. */ - -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - /* We first load PC into ECX. */ \ - SETUP_PIC_REG(cx); \ - /* Get the address of the jump table. */ \ - addl $(TABLE - .), %ecx; \ - /* Get the entry and convert the relative offset to the \ - absolute address. */ \ - addl (%ecx,INDEX,SCALE), %ecx; \ - /* We loaded the jump table and adjusted ECX. Go. */ \ - jmp *%ecx -# else -# define JMPTBL(I, B) I - -/* Branch to an entry in a jump table. TABLE is a jump table with - absolute offsets. INDEX is a register contains the index into the - jump table. SCALE is the scale of INDEX. 
*/ - -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - jmp *TABLE(,INDEX,SCALE) -# endif - -# ifndef STRCAT -# define STRCAT __strcat_sse2 -# endif - -# define PARMS 4 -# define STR1 PARMS+4 -# define STR2 STR1+4 - -# ifdef USE_AS_STRNCAT -# define LEN STR2+8 -# define STR3 STR1+4 -# else -# define STR3 STR1 -# endif - -# define USE_AS_STRCAT -# ifdef USE_AS_STRNCAT -# define RETURN POP(%ebx); POP(%esi); ret; CFI_PUSH(%ebx); CFI_PUSH(%esi); -# else -# define RETURN POP(%esi); ret; CFI_PUSH(%esi); -# endif - -.text -ENTRY (STRCAT) - PUSH (%esi) - mov STR1(%esp), %eax - mov STR2(%esp), %esi -# ifdef USE_AS_STRNCAT - PUSH (%ebx) - movl LEN(%esp), %ebx - test %ebx, %ebx - jz L(ExitZero) -# endif - cmpb $0, (%esi) - mov %esi, %ecx - mov %eax, %edx - jz L(ExitZero) - - and $63, %ecx - and $63, %edx - cmp $32, %ecx - ja L(StrlenCore7_1) - cmp $48, %edx - ja L(alignment_prolog) - - pxor %xmm0, %xmm0 - pxor %xmm4, %xmm4 - pxor %xmm7, %xmm7 - movdqu (%eax), %xmm1 - movdqu (%esi), %xmm5 - pcmpeqb %xmm1, %xmm0 - movdqu 16(%esi), %xmm6 - pmovmskb %xmm0, %ecx - pcmpeqb %xmm5, %xmm4 - pcmpeqb %xmm6, %xmm7 - test %ecx, %ecx - jnz L(exit_less16_) - mov %eax, %ecx - and $-16, %eax - jmp L(loop_prolog) - -L(alignment_prolog): - pxor %xmm0, %xmm0 - pxor %xmm4, %xmm4 - mov %edx, %ecx - pxor %xmm7, %xmm7 - and $15, %ecx - and $-16, %eax - pcmpeqb (%eax), %xmm0 - movdqu (%esi), %xmm5 - movdqu 16(%esi), %xmm6 - pmovmskb %xmm0, %edx - pcmpeqb %xmm5, %xmm4 - shr %cl, %edx - pcmpeqb %xmm6, %xmm7 - test %edx, %edx - jnz L(exit_less16) - add %eax, %ecx - - pxor %xmm0, %xmm0 -L(loop_prolog): - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 - .p2align 4 -L(align16_loop): - pcmpeqb 16(%eax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit16) - - pcmpeqb 32(%eax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit32) - - pcmpeqb 48(%eax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit48) - - pcmpeqb 64(%eax), %xmm3 - pmovmskb %xmm3, %edx - lea 
64(%eax), %eax - test %edx, %edx - jz L(align16_loop) - bsf %edx, %edx - add %edx, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit16): - bsf %edx, %edx - lea 16(%eax, %edx), %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit32): - bsf %edx, %edx - lea 32(%eax, %edx), %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit48): - bsf %edx, %edx - lea 48(%eax, %edx), %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_less16): - bsf %edx, %edx - add %ecx, %eax - add %edx, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_less16_): - bsf %ecx, %ecx - add %ecx, %eax - - .p2align 4 -L(StartStrcpyPart): - pmovmskb %xmm4, %edx -# ifdef USE_AS_STRNCAT - cmp $16, %ebx - jbe L(CopyFrom1To16BytesTail1Case2OrCase3) -# endif - test %edx, %edx - jnz L(CopyFrom1To16BytesTail1) - - movdqu %xmm5, (%eax) - pmovmskb %xmm7, %edx -# ifdef USE_AS_STRNCAT - cmp $32, %ebx - jbe L(CopyFrom1To32Bytes1Case2OrCase3) -# endif - test %edx, %edx - jnz L(CopyFrom1To32Bytes1) - - mov %esi, %ecx - and $-16, %esi - and $15, %ecx - pxor %xmm0, %xmm0 -# ifdef USE_AS_STRNCAT - add %ecx, %ebx -# endif - sub %ecx, %eax - jmp L(Unalign16Both) - -L(StrlenCore7_1): - mov %eax, %ecx - pxor %xmm0, %xmm0 - and $15, %ecx - and $-16, %eax - pcmpeqb (%eax), %xmm0 - pmovmskb %xmm0, %edx - shr %cl, %edx - test %edx, %edx - jnz L(exit_less16_1) - add %eax, %ecx - - pxor %xmm0, %xmm0 - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 - - .p2align 4 -L(align16_loop_1): - pcmpeqb 16(%eax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit16_1) - - pcmpeqb 32(%eax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit32_1) - - pcmpeqb 48(%eax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit48_1) - - pcmpeqb 64(%eax), %xmm3 - pmovmskb %xmm3, %edx - lea 64(%eax), %eax - test %edx, %edx - jz L(align16_loop_1) - bsf %edx, %edx - add %edx, %eax - jmp L(StartStrcpyPart_1) - - .p2align 4 -L(exit16_1): - bsf %edx, %edx - lea 16(%eax, %edx), %eax - jmp L(StartStrcpyPart_1) - - 
.p2align 4 -L(exit32_1): - bsf %edx, %edx - lea 32(%eax, %edx), %eax - jmp L(StartStrcpyPart_1) - - .p2align 4 -L(exit48_1): - bsf %edx, %edx - lea 48(%eax, %edx), %eax - jmp L(StartStrcpyPart_1) - - .p2align 4 -L(exit_less16_1): - bsf %edx, %edx - add %ecx, %eax - add %edx, %eax - - .p2align 4 -L(StartStrcpyPart_1): - mov %esi, %ecx - and $15, %ecx - and $-16, %esi - pxor %xmm0, %xmm0 - pxor %xmm1, %xmm1 - -# ifdef USE_AS_STRNCAT - cmp $48, %ebx - ja L(BigN) -# endif - pcmpeqb (%esi), %xmm1 -# ifdef USE_AS_STRNCAT - add %ecx, %ebx -# endif - pmovmskb %xmm1, %edx - shr %cl, %edx -# ifdef USE_AS_STRNCAT - cmp $16, %ebx - jbe L(CopyFrom1To16BytesTailCase2OrCase3) -# endif - test %edx, %edx - jnz L(CopyFrom1To16BytesTail) - - pcmpeqb 16(%esi), %xmm0 - pmovmskb %xmm0, %edx -# ifdef USE_AS_STRNCAT - cmp $32, %ebx - jbe L(CopyFrom1To32BytesCase2OrCase3) -# endif - test %edx, %edx - jnz L(CopyFrom1To32Bytes) - - movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */ - movdqu %xmm1, (%eax) - sub %ecx, %eax - - .p2align 4 -L(Unalign16Both): - mov $16, %ecx - movdqa (%esi, %ecx), %xmm1 - movaps 16(%esi, %ecx), %xmm2 - movdqu %xmm1, (%eax, %ecx) - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %edx - add $16, %ecx -# ifdef USE_AS_STRNCAT - sub $48, %ebx - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %edx, %edx - jnz L(CopyFrom1To16Bytes) -L(Unalign16BothBigN): - movaps 16(%esi, %ecx), %xmm3 - movdqu %xmm2, (%eax, %ecx) - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - add $16, %ecx -# ifdef USE_AS_STRNCAT - sub $16, %ebx - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %edx, %edx - jnz L(CopyFrom1To16Bytes) - - movaps 16(%esi, %ecx), %xmm4 - movdqu %xmm3, (%eax, %ecx) - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %edx - add $16, %ecx -# ifdef USE_AS_STRNCAT - sub $16, %ebx - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %edx, %edx - jnz L(CopyFrom1To16Bytes) - - movaps 16(%esi, %ecx), %xmm1 - movdqu %xmm4, (%eax, %ecx) - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %edx - add $16, 
%ecx -# ifdef USE_AS_STRNCAT - sub $16, %ebx - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %edx, %edx - jnz L(CopyFrom1To16Bytes) - - movaps 16(%esi, %ecx), %xmm2 - movdqu %xmm1, (%eax, %ecx) - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %edx - add $16, %ecx -# ifdef USE_AS_STRNCAT - sub $16, %ebx - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %edx, %edx - jnz L(CopyFrom1To16Bytes) - - movaps 16(%esi, %ecx), %xmm3 - movdqu %xmm2, (%eax, %ecx) - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - add $16, %ecx -# ifdef USE_AS_STRNCAT - sub $16, %ebx - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %edx, %edx - jnz L(CopyFrom1To16Bytes) - - movdqu %xmm3, (%eax, %ecx) - mov %esi, %edx - lea 16(%esi, %ecx), %esi - and $-0x40, %esi - sub %esi, %edx - sub %edx, %eax -# ifdef USE_AS_STRNCAT - lea 128(%ebx, %edx), %ebx -# endif - movaps (%esi), %xmm2 - movaps %xmm2, %xmm4 - movaps 16(%esi), %xmm5 - movaps 32(%esi), %xmm3 - movaps %xmm3, %xmm6 - movaps 48(%esi), %xmm7 - pminub %xmm5, %xmm2 - pminub %xmm7, %xmm3 - pminub %xmm2, %xmm3 - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %edx -# ifdef USE_AS_STRNCAT - sub $64, %ebx - jbe L(UnalignedLeaveCase2OrCase3) -# endif - test %edx, %edx - jnz L(Unaligned64Leave) - - .p2align 4 -L(Unaligned64Loop_start): - add $64, %eax - add $64, %esi - movdqu %xmm4, -64(%eax) - movaps (%esi), %xmm2 - movdqa %xmm2, %xmm4 - movdqu %xmm5, -48(%eax) - movaps 16(%esi), %xmm5 - pminub %xmm5, %xmm2 - movaps 32(%esi), %xmm3 - movdqu %xmm6, -32(%eax) - movaps %xmm3, %xmm6 - movdqu %xmm7, -16(%eax) - movaps 48(%esi), %xmm7 - pminub %xmm7, %xmm3 - pminub %xmm2, %xmm3 - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %edx -# ifdef USE_AS_STRNCAT - sub $64, %ebx - jbe L(UnalignedLeaveCase2OrCase3) -# endif - test %edx, %edx - jz L(Unaligned64Loop_start) - -L(Unaligned64Leave): - pxor %xmm1, %xmm1 - - pcmpeqb %xmm4, %xmm0 - pcmpeqb %xmm5, %xmm1 - pmovmskb %xmm0, %edx - pmovmskb %xmm1, %ecx - test %edx, %edx - jnz L(CopyFrom1To16BytesUnaligned_0) - test 
%ecx, %ecx - jnz L(CopyFrom1To16BytesUnaligned_16) - - pcmpeqb %xmm6, %xmm0 - pcmpeqb %xmm7, %xmm1 - pmovmskb %xmm0, %edx - pmovmskb %xmm1, %ecx - test %edx, %edx - jnz L(CopyFrom1To16BytesUnaligned_32) - - bsf %ecx, %edx - movdqu %xmm4, (%eax) - movdqu %xmm5, 16(%eax) - movdqu %xmm6, 32(%eax) - add $48, %esi - add $48, %eax - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) - -# ifdef USE_AS_STRNCAT - .p2align 4 -L(BigN): - pcmpeqb (%esi), %xmm1 - pmovmskb %xmm1, %edx - shr %cl, %edx - test %edx, %edx - jnz L(CopyFrom1To16BytesTail) - - pcmpeqb 16(%esi), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(CopyFrom1To32Bytes) - - movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */ - movdqu %xmm1, (%eax) - sub %ecx, %eax - sub $48, %ebx - add %ecx, %ebx - - mov $16, %ecx - movdqa (%esi, %ecx), %xmm1 - movaps 16(%esi, %ecx), %xmm2 - movdqu %xmm1, (%eax, %ecx) - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %edx - add $16, %ecx - test %edx, %edx - jnz L(CopyFrom1To16Bytes) - jmp L(Unalign16BothBigN) -# endif - -/*------------end of main part-------------------------------*/ - -/* Case1 */ - .p2align 4 -L(CopyFrom1To16Bytes): - add %ecx, %eax - add %ecx, %esi - bsf %edx, %edx - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) - - .p2align 4 -L(CopyFrom1To16BytesTail): - add %ecx, %esi - bsf %edx, %edx - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) - - .p2align 4 -L(CopyFrom1To32Bytes1): - add $16, %esi - add $16, %eax -L(CopyFrom1To16BytesTail1): - bsf %edx, %edx - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) - - .p2align 4 -L(CopyFrom1To32Bytes): - bsf %edx, %edx - add %ecx, %esi - add $16, %edx - sub %ecx, %edx - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) - - .p2align 4 -L(CopyFrom1To16BytesUnaligned_0): - bsf %edx, %edx - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) - - .p2align 4 -L(CopyFrom1To16BytesUnaligned_16): - bsf %ecx, %edx - movdqu %xmm4, (%eax) - add $16, %esi - add $16, %eax - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) - - .p2align 4 
-L(CopyFrom1To16BytesUnaligned_32): - bsf %edx, %edx - movdqu %xmm4, (%eax) - movdqu %xmm5, 16(%eax) - add $32, %esi - add $32, %eax - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) - -# ifdef USE_AS_STRNCAT - - .p2align 4 -L(CopyFrom1To16BytesExit): - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) - -/* Case2 */ - - .p2align 4 -L(CopyFrom1To16BytesCase2): - add $16, %ebx - add %ecx, %eax - add %ecx, %esi - bsf %edx, %edx - cmp %ebx, %edx - jb L(CopyFrom1To16BytesExit) - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) - - .p2align 4 -L(CopyFrom1To32BytesCase2): - sub %ecx, %ebx - add %ecx, %esi - bsf %edx, %edx - add $16, %edx - sub %ecx, %edx - cmp %ebx, %edx - jb L(CopyFrom1To16BytesExit) - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) - -L(CopyFrom1To16BytesTailCase2): - sub %ecx, %ebx - add %ecx, %esi - bsf %edx, %edx - cmp %ebx, %edx - jb L(CopyFrom1To16BytesExit) - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) - -L(CopyFrom1To16BytesTail1Case2): - bsf %edx, %edx - cmp %ebx, %edx - jb L(CopyFrom1To16BytesExit) - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) - -/* Case2 or Case3, Case3 */ - - .p2align 4 -L(CopyFrom1To16BytesCase2OrCase3): - test %edx, %edx - jnz L(CopyFrom1To16BytesCase2) -L(CopyFrom1To16BytesCase3): - add $16, %ebx - add %ecx, %eax - add %ecx, %esi - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) - - .p2align 4 -L(CopyFrom1To32BytesCase2OrCase3): - test %edx, %edx - jnz L(CopyFrom1To32BytesCase2) - sub %ecx, %ebx - add %ecx, %esi - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) - - .p2align 4 -L(CopyFrom1To16BytesTailCase2OrCase3): - test %edx, %edx - jnz L(CopyFrom1To16BytesTailCase2) - sub %ecx, %ebx - add %ecx, %esi - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) - - .p2align 4 -L(CopyFrom1To32Bytes1Case2OrCase3): - add $16, %eax - add $16, %esi - sub $16, %ebx -L(CopyFrom1To16BytesTail1Case2OrCase3): - test %edx, %edx - jnz L(CopyFrom1To16BytesTail1Case2) - BRANCH_TO_JMPTBL_ENTRY 
(L(ExitStrncatTable), %ebx, 4) - -# endif - -# ifdef USE_AS_STRNCAT - .p2align 4 -L(StrncatExit0): - movb %bh, (%eax) - mov STR3(%esp), %eax - RETURN -# endif - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit1): - movb %bh, 1(%eax) -# endif -L(Exit1): -# ifdef USE_AS_STRNCAT - movb (%esi), %dh -# endif - movb %dh, (%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit2): - movb %bh, 2(%eax) -# endif -L(Exit2): - movw (%esi), %dx - movw %dx, (%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit3): - movb %bh, 3(%eax) -# endif -L(Exit3): - movw (%esi), %cx - movw %cx, (%eax) -# ifdef USE_AS_STRNCAT - movb 2(%esi), %dh -# endif - movb %dh, 2(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit4): - movb %bh, 4(%eax) -# endif -L(Exit4): - movl (%esi), %edx - movl %edx, (%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit5): - movb %bh, 5(%eax) -# endif -L(Exit5): - movl (%esi), %ecx -# ifdef USE_AS_STRNCAT - movb 4(%esi), %dh -# endif - movb %dh, 4(%eax) - movl %ecx, (%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit6): - movb %bh, 6(%eax) -# endif -L(Exit6): - movl (%esi), %ecx - movw 4(%esi), %dx - movl %ecx, (%eax) - movw %dx, 4(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit7): - movb %bh, 7(%eax) -# endif -L(Exit7): - movl (%esi), %ecx - movl 3(%esi), %edx - movl %ecx, (%eax) - movl %edx, 3(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit8): - movb %bh, 8(%eax) -# endif -L(Exit8): - movlpd (%esi), %xmm0 - movlpd %xmm0, (%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit9): - movb %bh, 9(%eax) -# endif -L(Exit9): - movlpd (%esi), %xmm0 -# ifdef USE_AS_STRNCAT - movb 8(%esi), %dh -# endif - movb %dh, 8(%eax) - movlpd %xmm0, (%eax) - mov 
STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit10): - movb %bh, 10(%eax) -# endif -L(Exit10): - movlpd (%esi), %xmm0 - movw 8(%esi), %dx - movlpd %xmm0, (%eax) - movw %dx, 8(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit11): - movb %bh, 11(%eax) -# endif -L(Exit11): - movlpd (%esi), %xmm0 - movl 7(%esi), %edx - movlpd %xmm0, (%eax) - movl %edx, 7(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit12): - movb %bh, 12(%eax) -# endif -L(Exit12): - movlpd (%esi), %xmm0 - movl 8(%esi), %edx - movlpd %xmm0, (%eax) - movl %edx, 8(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit13): - movb %bh, 13(%eax) -# endif -L(Exit13): - movlpd (%esi), %xmm0 - movlpd 5(%esi), %xmm1 - movlpd %xmm0, (%eax) - movlpd %xmm1, 5(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit14): - movb %bh, 14(%eax) -# endif -L(Exit14): - movlpd (%esi), %xmm0 - movlpd 6(%esi), %xmm1 - movlpd %xmm0, (%eax) - movlpd %xmm1, 6(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit15): - movb %bh, 15(%eax) -# endif -L(Exit15): - movlpd (%esi), %xmm0 - movlpd 7(%esi), %xmm1 - movlpd %xmm0, (%eax) - movlpd %xmm1, 7(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit16): - movb %bh, 16(%eax) -# endif -L(Exit16): - movdqu (%esi), %xmm0 - movdqu %xmm0, (%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit17): - movb %bh, 17(%eax) -# endif -L(Exit17): - movdqu (%esi), %xmm0 -# ifdef USE_AS_STRNCAT - movb 16(%esi), %dh -# endif - movdqu %xmm0, (%eax) - movb %dh, 16(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit18): - movb %bh, 18(%eax) -# endif -L(Exit18): - movdqu (%esi), %xmm0 - movw 16(%esi), %cx - movdqu %xmm0, (%eax) - movw %cx, 16(%eax) - mov STR3(%esp), 
%eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit19): - movb %bh, 19(%eax) -# endif -L(Exit19): - movdqu (%esi), %xmm0 - movl 15(%esi), %ecx - movdqu %xmm0, (%eax) - movl %ecx, 15(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit20): - movb %bh, 20(%eax) -# endif -L(Exit20): - movdqu (%esi), %xmm0 - movl 16(%esi), %ecx - movdqu %xmm0, (%eax) - movl %ecx, 16(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit21): - movb %bh, 21(%eax) -# endif -L(Exit21): - movdqu (%esi), %xmm0 - movl 16(%esi), %ecx -# ifdef USE_AS_STRNCAT - movb 20(%esi), %dh -# endif - movdqu %xmm0, (%eax) - movl %ecx, 16(%eax) - movb %dh, 20(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit22): - movb %bh, 22(%eax) -# endif -L(Exit22): - movdqu (%esi), %xmm0 - movlpd 14(%esi), %xmm3 - movdqu %xmm0, (%eax) - movlpd %xmm3, 14(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit23): - movb %bh, 23(%eax) -# endif -L(Exit23): - movdqu (%esi), %xmm0 - movlpd 15(%esi), %xmm3 - movdqu %xmm0, (%eax) - movlpd %xmm3, 15(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit24): - movb %bh, 24(%eax) -# endif -L(Exit24): - movdqu (%esi), %xmm0 - movlpd 16(%esi), %xmm2 - movdqu %xmm0, (%eax) - movlpd %xmm2, 16(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit25): - movb %bh, 25(%eax) -# endif -L(Exit25): - movdqu (%esi), %xmm0 - movlpd 16(%esi), %xmm2 -# ifdef USE_AS_STRNCAT - movb 24(%esi), %dh -# endif - movdqu %xmm0, (%eax) - movlpd %xmm2, 16(%eax) - movb %dh, 24(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit26): - movb %bh, 26(%eax) -# endif -L(Exit26): - movdqu (%esi), %xmm0 - movlpd 16(%esi), %xmm2 - movw 24(%esi), %cx - movdqu %xmm0, (%eax) - movlpd %xmm2, 16(%eax) - movw %cx, 24(%eax) - mov STR3(%esp), 
%eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit27): - movb %bh, 27(%eax) -# endif -L(Exit27): - movdqu (%esi), %xmm0 - movlpd 16(%esi), %xmm2 - movl 23(%esi), %ecx - movdqu %xmm0, (%eax) - movlpd %xmm2, 16(%eax) - movl %ecx, 23(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit28): - movb %bh, 28(%eax) -# endif -L(Exit28): - movdqu (%esi), %xmm0 - movlpd 16(%esi), %xmm2 - movl 24(%esi), %ecx - movdqu %xmm0, (%eax) - movlpd %xmm2, 16(%eax) - movl %ecx, 24(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit29): - movb %bh, 29(%eax) -# endif -L(Exit29): - movdqu (%esi), %xmm0 - movdqu 13(%esi), %xmm2 - movdqu %xmm0, (%eax) - movdqu %xmm2, 13(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit30): - movb %bh, 30(%eax) -# endif -L(Exit30): - movdqu (%esi), %xmm0 - movdqu 14(%esi), %xmm2 - movdqu %xmm0, (%eax) - movdqu %xmm2, 14(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit31): - movb %bh, 31(%eax) -# endif -L(Exit31): - movdqu (%esi), %xmm0 - movdqu 15(%esi), %xmm2 - movdqu %xmm0, (%eax) - movdqu %xmm2, 15(%eax) - mov STR3(%esp), %eax - RETURN - - .p2align 4 -# ifdef USE_AS_STRNCAT -L(StrncatExit32): - movb %bh, 32(%eax) -# endif -L(Exit32): - movdqu (%esi), %xmm0 - movdqu 16(%esi), %xmm2 - movdqu %xmm0, (%eax) - movdqu %xmm2, 16(%eax) - mov STR3(%esp), %eax - RETURN - -# ifdef USE_AS_STRNCAT - - .p2align 4 -L(UnalignedLeaveCase2OrCase3): - test %edx, %edx - jnz L(Unaligned64LeaveCase2) -L(Unaligned64LeaveCase3): - lea 64(%ebx), %ecx - and $-16, %ecx - add $48, %ebx - jl L(CopyFrom1To16BytesCase3) - movdqu %xmm4, (%eax) - sub $16, %ebx - jb L(CopyFrom1To16BytesCase3) - movdqu %xmm5, 16(%eax) - sub $16, %ebx - jb L(CopyFrom1To16BytesCase3) - movdqu %xmm6, 32(%eax) - sub $16, %ebx - jb L(CopyFrom1To16BytesCase3) - movdqu %xmm7, 48(%eax) - xor %bh, %bh - movb %bh, 64(%eax) - mov 
STR3(%esp), %eax - RETURN - - .p2align 4 -L(Unaligned64LeaveCase2): - xor %ecx, %ecx - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %edx - add $48, %ebx - jle L(CopyFrom1To16BytesCase2OrCase3) - test %edx, %edx - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm5, %xmm0 - pmovmskb %xmm0, %edx - movdqu %xmm4, (%eax) - add $16, %ecx - sub $16, %ebx - jbe L(CopyFrom1To16BytesCase2OrCase3) - test %edx, %edx - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm6, %xmm0 - pmovmskb %xmm0, %edx - movdqu %xmm5, 16(%eax) - add $16, %ecx - sub $16, %ebx - jbe L(CopyFrom1To16BytesCase2OrCase3) - test %edx, %edx - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm7, %xmm0 - pmovmskb %xmm0, %edx - movdqu %xmm6, 32(%eax) - lea 16(%eax, %ecx), %eax - lea 16(%esi, %ecx), %esi - bsf %edx, %edx - cmp %ebx, %edx - jb L(CopyFrom1To16BytesExit) - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) -# endif - .p2align 4 -L(ExitZero): - RETURN - -END (STRCAT) - - .p2align 4 - .section .rodata -L(ExitTable): - .int JMPTBL(L(Exit1), L(ExitTable)) - .int JMPTBL(L(Exit2), L(ExitTable)) - .int JMPTBL(L(Exit3), L(ExitTable)) - .int JMPTBL(L(Exit4), L(ExitTable)) - .int JMPTBL(L(Exit5), L(ExitTable)) - .int JMPTBL(L(Exit6), L(ExitTable)) - .int JMPTBL(L(Exit7), L(ExitTable)) - .int JMPTBL(L(Exit8), L(ExitTable)) - .int JMPTBL(L(Exit9), L(ExitTable)) - .int JMPTBL(L(Exit10), L(ExitTable)) - .int JMPTBL(L(Exit11), L(ExitTable)) - .int JMPTBL(L(Exit12), L(ExitTable)) - .int JMPTBL(L(Exit13), L(ExitTable)) - .int JMPTBL(L(Exit14), L(ExitTable)) - .int JMPTBL(L(Exit15), L(ExitTable)) - .int JMPTBL(L(Exit16), L(ExitTable)) - .int JMPTBL(L(Exit17), L(ExitTable)) - .int JMPTBL(L(Exit18), L(ExitTable)) - .int JMPTBL(L(Exit19), L(ExitTable)) - .int JMPTBL(L(Exit20), L(ExitTable)) - .int JMPTBL(L(Exit21), L(ExitTable)) - .int JMPTBL(L(Exit22), L(ExitTable)) - .int JMPTBL(L(Exit23), L(ExitTable)) - .int JMPTBL(L(Exit24), L(ExitTable)) - .int JMPTBL(L(Exit25), L(ExitTable)) - .int JMPTBL(L(Exit26), L(ExitTable)) - .int 
JMPTBL(L(Exit27), L(ExitTable)) - .int JMPTBL(L(Exit28), L(ExitTable)) - .int JMPTBL(L(Exit29), L(ExitTable)) - .int JMPTBL(L(Exit30), L(ExitTable)) - .int JMPTBL(L(Exit31), L(ExitTable)) - .int JMPTBL(L(Exit32), L(ExitTable)) -# ifdef USE_AS_STRNCAT -L(ExitStrncatTable): - .int JMPTBL(L(StrncatExit0), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit1), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit2), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit3), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit4), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit5), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit6), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit7), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit8), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit9), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit10), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit11), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit12), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit13), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit14), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit15), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit16), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit17), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit18), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit19), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit20), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit21), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit22), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit23), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit24), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit25), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit26), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit27), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit28), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit29), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit30), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit31), L(ExitStrncatTable)) - .int JMPTBL(L(StrncatExit32), 
L(ExitStrncatTable)) -# endif -#endif diff --git a/sysdeps/i386/i686/multiarch/strcat-ssse3.S b/sysdeps/i386/i686/multiarch/strcat-ssse3.S deleted file mode 100644 index a5b0bc6818..0000000000 --- a/sysdeps/i386/i686/multiarch/strcat-ssse3.S +++ /dev/null @@ -1,572 +0,0 @@ -/* strcat with SSSE3 - Copyright (C) 2011-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - - -#if IS_IN (libc) - -# include <sysdep.h> - -# define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) - -# define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) - -# define PUSH(REG) pushl REG; CFI_PUSH (REG) -# define POP(REG) popl REG; CFI_POP (REG) - -# ifndef STRCAT -# define STRCAT __strcat_ssse3 -# endif - -# define PARMS 4 -# define STR1 PARMS+4 -# define STR2 STR1+4 - -# ifdef USE_AS_STRNCAT -# define LEN STR2+8 -# endif - -# define USE_AS_STRCAT - -.text -ENTRY (STRCAT) - PUSH (%edi) - mov STR1(%esp), %edi - mov %edi, %edx - -# define RETURN jmp L(StartStrcpyPart) -# include "strlen-sse2.S" - -L(StartStrcpyPart): - mov STR2(%esp), %ecx - lea (%edi, %eax), %edx -# ifdef USE_AS_STRNCAT - PUSH (%ebx) - mov LEN(%esp), %ebx - test %ebx, %ebx - jz L(StrncatExit0) - cmp $8, %ebx - jbe L(StrncatExit8Bytes) -# endif - cmpb $0, (%ecx) - jz L(Exit1) - cmpb $0, 1(%ecx) - jz L(Exit2) - cmpb $0, 2(%ecx) - jz L(Exit3) - cmpb $0, 3(%ecx) - jz L(Exit4) - cmpb $0, 4(%ecx) - jz L(Exit5) - cmpb $0, 5(%ecx) - jz L(Exit6) - cmpb $0, 6(%ecx) - jz L(Exit7) - cmpb $0, 7(%ecx) - jz L(Exit8) - cmpb $0, 8(%ecx) - jz L(Exit9) -# ifdef USE_AS_STRNCAT - cmp $16, %ebx - jb L(StrncatExit15Bytes) -# endif - cmpb $0, 9(%ecx) - jz L(Exit10) - cmpb $0, 10(%ecx) - jz L(Exit11) - cmpb $0, 11(%ecx) - jz L(Exit12) - cmpb $0, 12(%ecx) - jz L(Exit13) - cmpb $0, 13(%ecx) - jz L(Exit14) - cmpb $0, 14(%ecx) - jz L(Exit15) - cmpb $0, 15(%ecx) - jz L(Exit16) -# ifdef USE_AS_STRNCAT - cmp $16, %ebx - je L(StrncatExit16) - -# define RETURN1 \ - POP (%ebx); \ - POP (%edi); \ - ret; \ - CFI_PUSH (%ebx); \ - CFI_PUSH (%edi) -# define USE_AS_STRNCPY -# else -# define RETURN1 POP (%edi); ret; CFI_PUSH (%edi) -# endif -# include "strcpy-ssse3.S" - .p2align 4 -L(CopyFrom1To16Bytes): - add %esi, %edx - add %esi, %ecx - - POP (%esi) - test %al, %al - jz L(ExitHigh) - test $0x01, %al - jnz L(Exit1) - test $0x02, %al - jnz L(Exit2) - test $0x04, %al - jnz L(Exit3) - test 
$0x08, %al - jnz L(Exit4) - test $0x10, %al - jnz L(Exit5) - test $0x20, %al - jnz L(Exit6) - test $0x40, %al - jnz L(Exit7) - movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(ExitHigh): - test $0x01, %ah - jnz L(Exit9) - test $0x02, %ah - jnz L(Exit10) - test $0x04, %ah - jnz L(Exit11) - test $0x08, %ah - jnz L(Exit12) - test $0x10, %ah - jnz L(Exit13) - test $0x20, %ah - jnz L(Exit14) - test $0x40, %ah - jnz L(Exit15) - movlpd (%ecx), %xmm0 - movlpd 8(%ecx), %xmm1 - movlpd %xmm0, (%edx) - movlpd %xmm1, 8(%edx) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(StrncatExit1): - movb %bh, 1(%edx) -L(Exit1): - movb (%ecx), %al - movb %al, (%edx) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(StrncatExit2): - movb %bh, 2(%edx) -L(Exit2): - movw (%ecx), %ax - movw %ax, (%edx) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(StrncatExit3): - movb %bh, 3(%edx) -L(Exit3): - movw (%ecx), %ax - movw %ax, (%edx) - movb 2(%ecx), %al - movb %al, 2(%edx) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(StrncatExit4): - movb %bh, 4(%edx) -L(Exit4): - movl (%ecx), %eax - movl %eax, (%edx) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(StrncatExit5): - movb %bh, 5(%edx) -L(Exit5): - movl (%ecx), %eax - movl %eax, (%edx) - movb 4(%ecx), %al - movb %al, 4(%edx) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(StrncatExit6): - movb %bh, 6(%edx) -L(Exit6): - movl (%ecx), %eax - movl %eax, (%edx) - movw 4(%ecx), %ax - movw %ax, 4(%edx) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(StrncatExit7): - movb %bh, 7(%edx) -L(Exit7): - movl (%ecx), %eax - movl %eax, (%edx) - movl 3(%ecx), %eax - movl %eax, 3(%edx) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(StrncatExit8): - movb %bh, 8(%edx) -L(Exit8): - movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(StrncatExit9): - movb %bh, 9(%edx) -L(Exit9): - movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) - movb 8(%ecx), %al - movb %al, 8(%edx) - movl %edi, %eax - RETURN1 - - .p2align 4 
-L(StrncatExit10): - movb %bh, 10(%edx) -L(Exit10): - movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) - movw 8(%ecx), %ax - movw %ax, 8(%edx) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(StrncatExit11): - movb %bh, 11(%edx) -L(Exit11): - movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) - movl 7(%ecx), %eax - movl %eax, 7(%edx) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(StrncatExit12): - movb %bh, 12(%edx) -L(Exit12): - movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) - movl 8(%ecx), %eax - movl %eax, 8(%edx) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(StrncatExit13): - movb %bh, 13(%edx) -L(Exit13): - movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) - movlpd 5(%ecx), %xmm0 - movlpd %xmm0, 5(%edx) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(StrncatExit14): - movb %bh, 14(%edx) -L(Exit14): - movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) - movlpd 6(%ecx), %xmm0 - movlpd %xmm0, 6(%edx) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(StrncatExit15): - movb %bh, 15(%edx) -L(Exit15): - movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) - movlpd 7(%ecx), %xmm0 - movlpd %xmm0, 7(%edx) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(StrncatExit16): - movb %bh, 16(%edx) -L(Exit16): - movlpd (%ecx), %xmm0 - movlpd 8(%ecx), %xmm1 - movlpd %xmm0, (%edx) - movlpd %xmm1, 8(%edx) - movl %edi, %eax - RETURN1 - -# ifdef USE_AS_STRNCPY - - CFI_PUSH(%esi) - - .p2align 4 -L(CopyFrom1To16BytesCase2): - add $16, %ebx - add %esi, %ecx - lea (%esi, %edx), %esi - lea -9(%ebx), %edx - and $1<<7, %dh - or %al, %dh - test %dh, %dh - lea (%esi), %edx - POP (%esi) - jz L(ExitHighCase2) - - test $0x01, %al - jnz L(Exit1) - cmp $1, %ebx - je L(StrncatExit1) - test $0x02, %al - jnz L(Exit2) - cmp $2, %ebx - je L(StrncatExit2) - test $0x04, %al - jnz L(Exit3) - cmp $3, %ebx - je L(StrncatExit3) - test $0x08, %al - jnz L(Exit4) - cmp $4, %ebx - je L(StrncatExit4) - test $0x10, %al - jnz L(Exit5) - cmp $5, %ebx - je L(StrncatExit5) - test $0x20, %al - jnz L(Exit6) - cmp $6, %ebx - je L(StrncatExit6) - test $0x40, %al - jnz 
L(Exit7) - cmp $7, %ebx - je L(StrncatExit7) - movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) - lea 7(%edx), %eax - cmpb $1, (%eax) - sbb $-1, %eax - xor %cl, %cl - movb %cl, (%eax) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(ExitHighCase2): - test $0x01, %ah - jnz L(Exit9) - cmp $9, %ebx - je L(StrncatExit9) - test $0x02, %ah - jnz L(Exit10) - cmp $10, %ebx - je L(StrncatExit10) - test $0x04, %ah - jnz L(Exit11) - cmp $11, %ebx - je L(StrncatExit11) - test $0x8, %ah - jnz L(Exit12) - cmp $12, %ebx - je L(StrncatExit12) - test $0x10, %ah - jnz L(Exit13) - cmp $13, %ebx - je L(StrncatExit13) - test $0x20, %ah - jnz L(Exit14) - cmp $14, %ebx - je L(StrncatExit14) - test $0x40, %ah - jnz L(Exit15) - cmp $15, %ebx - je L(StrncatExit15) - movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) - movlpd 8(%ecx), %xmm1 - movlpd %xmm1, 8(%edx) - movl %edi, %eax - RETURN1 - - CFI_PUSH(%esi) - -L(CopyFrom1To16BytesCase2OrCase3): - test %eax, %eax - jnz L(CopyFrom1To16BytesCase2) - - .p2align 4 -L(CopyFrom1To16BytesCase3): - add $16, %ebx - add %esi, %edx - add %esi, %ecx - - POP (%esi) - - cmp $8, %ebx - ja L(ExitHighCase3) - cmp $1, %ebx - je L(StrncatExit1) - cmp $2, %ebx - je L(StrncatExit2) - cmp $3, %ebx - je L(StrncatExit3) - cmp $4, %ebx - je L(StrncatExit4) - cmp $5, %ebx - je L(StrncatExit5) - cmp $6, %ebx - je L(StrncatExit6) - cmp $7, %ebx - je L(StrncatExit7) - movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) - movb %bh, 8(%edx) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(ExitHighCase3): - cmp $9, %ebx - je L(StrncatExit9) - cmp $10, %ebx - je L(StrncatExit10) - cmp $11, %ebx - je L(StrncatExit11) - cmp $12, %ebx - je L(StrncatExit12) - cmp $13, %ebx - je L(StrncatExit13) - cmp $14, %ebx - je L(StrncatExit14) - cmp $15, %ebx - je L(StrncatExit15) - movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) - movlpd 8(%ecx), %xmm1 - movlpd %xmm1, 8(%edx) - movb %bh, 16(%edx) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(StrncatExit0): - movl %edi, %eax - RETURN1 - - .p2align 4 
-L(StrncatExit15Bytes): - cmp $9, %ebx - je L(StrncatExit9) - cmpb $0, 9(%ecx) - jz L(Exit10) - cmp $10, %ebx - je L(StrncatExit10) - cmpb $0, 10(%ecx) - jz L(Exit11) - cmp $11, %ebx - je L(StrncatExit11) - cmpb $0, 11(%ecx) - jz L(Exit12) - cmp $12, %ebx - je L(StrncatExit12) - cmpb $0, 12(%ecx) - jz L(Exit13) - cmp $13, %ebx - je L(StrncatExit13) - cmpb $0, 13(%ecx) - jz L(Exit14) - cmp $14, %ebx - je L(StrncatExit14) - movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) - movlpd 7(%ecx), %xmm0 - movlpd %xmm0, 7(%edx) - lea 14(%edx), %eax - cmpb $1, (%eax) - sbb $-1, %eax - movb %bh, (%eax) - movl %edi, %eax - RETURN1 - - .p2align 4 -L(StrncatExit8Bytes): - cmpb $0, (%ecx) - jz L(Exit1) - cmp $1, %ebx - je L(StrncatExit1) - cmpb $0, 1(%ecx) - jz L(Exit2) - cmp $2, %ebx - je L(StrncatExit2) - cmpb $0, 2(%ecx) - jz L(Exit3) - cmp $3, %ebx - je L(StrncatExit3) - cmpb $0, 3(%ecx) - jz L(Exit4) - cmp $4, %ebx - je L(StrncatExit4) - cmpb $0, 4(%ecx) - jz L(Exit5) - cmp $5, %ebx - je L(StrncatExit5) - cmpb $0, 5(%ecx) - jz L(Exit6) - cmp $6, %ebx - je L(StrncatExit6) - cmpb $0, 6(%ecx) - jz L(Exit7) - cmp $7, %ebx - je L(StrncatExit7) - movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) - lea 7(%edx), %eax - cmpb $1, (%eax) - sbb $-1, %eax - movb %bh, (%eax) - movl %edi, %eax - RETURN1 - -# endif -END (STRCAT) -#endif diff --git a/sysdeps/i386/i686/multiarch/strcat.S b/sysdeps/i386/i686/multiarch/strcat.S deleted file mode 100644 index e893815e24..0000000000 --- a/sysdeps/i386/i686/multiarch/strcat.S +++ /dev/null @@ -1,92 +0,0 @@ -/* Multiple versions of strcat - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2011-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - -#ifndef USE_AS_STRNCAT -# ifndef STRCAT -# define STRCAT strcat -# endif -#endif - -#ifdef USE_AS_STRNCAT -# define STRCAT_SSSE3 __strncat_ssse3 -# define STRCAT_SSE2 __strncat_sse2 -# define STRCAT_IA32 __strncat_ia32 -# define __GI_STRCAT __GI_strncat -#else -# define STRCAT_SSSE3 __strcat_ssse3 -# define STRCAT_SSE2 __strcat_sse2 -# define STRCAT_IA32 __strcat_ia32 -# define __GI_STRCAT __GI_strcat -#endif - - -/* Define multiple versions only for the definition in libc. Don't - define multiple versions for strncat in static library since we - need strncat before the initialization happened.
*/ -#if IS_IN (libc) - - .text -ENTRY(STRCAT) - .type STRCAT, @gnu_indirect_function - LOAD_GOT_AND_RTLD_GLOBAL_RO - LOAD_FUNC_GOT_EAX (STRCAT_IA32) - HAS_CPU_FEATURE (SSE2) - jz 2f - LOAD_FUNC_GOT_EAX (STRCAT_SSE2) - HAS_ARCH_FEATURE (Fast_Unaligned_Load) - jnz 2f - HAS_CPU_FEATURE (SSSE3) - jz 2f - LOAD_FUNC_GOT_EAX (STRCAT_SSSE3) -2: ret -END(STRCAT) - -# undef ENTRY -# define ENTRY(name) \ - .type STRCAT_IA32, @function; \ - .align 16; \ - .globl STRCAT_IA32; \ - .hidden STRCAT_IA32; \ - STRCAT_IA32: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size STRCAT_IA32, .-STRCAT_IA32 - -# ifdef SHARED -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal strcat calls through a PLT. - The speedup we get from using SSSE3 instruction is likely eaten away - by the indirect call in the PLT. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_STRCAT; __GI_STRCAT = STRCAT_IA32 -# undef libc_hidden_def -# define libc_hidden_def(name) \ - .globl __GI___STRCAT; __GI___STRCAT = STRCAT_IA32 - -# endif -#endif - -#ifndef USE_AS_STRNCAT -# include "../../strcat.S" -#endif diff --git a/sysdeps/i386/i686/multiarch/strlen-sse2.S b/sysdeps/i386/i686/multiarch/strlen-sse2.S deleted file mode 100644 index 3d30714b7a..0000000000 --- a/sysdeps/i386/i686/multiarch/strlen-sse2.S +++ /dev/null @@ -1,695 +0,0 @@ -/* strlen with SSE2 - Copyright (C) 2010-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -/* for strlen only SHARED version is optimized, for strcat, strncat, strnlen both STATIC and SHARED are optimized */ - -#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && IS_IN (libc) - -# ifndef USE_AS_STRCAT - -# include <sysdep.h> -# define PARMS 4 -# define STR PARMS -# define RETURN ret - -# ifdef USE_AS_STRNLEN -# define LEN PARMS + 8 -# define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) - -# define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) - -# define PUSH(REG) pushl REG; CFI_PUSH (REG) -# define POP(REG) popl REG; CFI_POP (REG) -# undef RETURN -# define RETURN POP (%edi); CFI_PUSH(%edi); ret -# endif - -# ifndef STRLEN -# define STRLEN __strlen_sse2 -# endif - - atom_text_section -ENTRY (STRLEN) - mov STR(%esp), %edx -# ifdef USE_AS_STRNLEN - PUSH (%edi) - movl LEN(%esp), %edi - sub $4, %edi - jbe L(len_less4_prolog) -# endif -# endif - xor %eax, %eax - cmpb $0, (%edx) - jz L(exit_tail0) - cmpb $0, 1(%edx) - jz L(exit_tail1) - cmpb $0, 2(%edx) - jz L(exit_tail2) - cmpb $0, 3(%edx) - jz L(exit_tail3) - -# ifdef USE_AS_STRNLEN - sub $4, %edi - jbe L(len_less8_prolog) -# endif - - cmpb $0, 4(%edx) - jz L(exit_tail4) - cmpb $0, 5(%edx) - jz L(exit_tail5) - cmpb $0, 6(%edx) - jz L(exit_tail6) - cmpb $0, 7(%edx) - jz L(exit_tail7) - -# ifdef USE_AS_STRNLEN - sub $4, %edi - jbe L(len_less12_prolog) -# endif - - cmpb $0, 8(%edx) - jz L(exit_tail8) - cmpb $0, 9(%edx) - jz L(exit_tail9) - cmpb $0, 10(%edx) - jz L(exit_tail10) - cmpb $0, 11(%edx) - jz L(exit_tail11) - -# ifdef USE_AS_STRNLEN - sub $4, %edi - jbe L(len_less16_prolog) -# endif - - cmpb $0, 12(%edx) - jz L(exit_tail12) - cmpb $0, 13(%edx) - jz L(exit_tail13) - cmpb $0, 14(%edx) - jz L(exit_tail14) - cmpb $0, 15(%edx) - jz L(exit_tail15) - - pxor
%xmm0, %xmm0 - lea 16(%edx), %eax - mov %eax, %ecx - and $-16, %eax - -# ifdef USE_AS_STRNLEN - and $15, %edx - add %edx, %edi - sub $64, %edi - jbe L(len_less64) -# endif - - pcmpeqb (%eax), %xmm0 - pmovmskb %xmm0, %edx - pxor %xmm1, %xmm1 - test %edx, %edx - lea 16(%eax), %eax - jnz L(exit) - - pcmpeqb (%eax), %xmm1 - pmovmskb %xmm1, %edx - pxor %xmm2, %xmm2 - test %edx, %edx - lea 16(%eax), %eax - jnz L(exit) - - pcmpeqb (%eax), %xmm2 - pmovmskb %xmm2, %edx - pxor %xmm3, %xmm3 - test %edx, %edx - lea 16(%eax), %eax - jnz L(exit) - - pcmpeqb (%eax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%eax), %eax - jnz L(exit) - -# ifdef USE_AS_STRNLEN - sub $64, %edi - jbe L(len_less64) -# endif - - pcmpeqb (%eax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%eax), %eax - jnz L(exit) - - pcmpeqb (%eax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%eax), %eax - jnz L(exit) - - pcmpeqb (%eax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%eax), %eax - jnz L(exit) - - pcmpeqb (%eax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%eax), %eax - jnz L(exit) - -# ifdef USE_AS_STRNLEN - sub $64, %edi - jbe L(len_less64) -# endif - - pcmpeqb (%eax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%eax), %eax - jnz L(exit) - - pcmpeqb (%eax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%eax), %eax - jnz L(exit) - - pcmpeqb (%eax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%eax), %eax - jnz L(exit) - - pcmpeqb (%eax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%eax), %eax - jnz L(exit) - -# ifdef USE_AS_STRNLEN - sub $64, %edi - jbe L(len_less64) -# endif - - pcmpeqb (%eax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%eax), %eax - jnz L(exit) - - pcmpeqb (%eax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%eax), %eax - jnz L(exit) - - pcmpeqb (%eax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%eax), %eax - jnz L(exit) - - pcmpeqb (%eax), 
%xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%eax), %eax - jnz L(exit) - -# ifdef USE_AS_STRNLEN - mov %eax, %edx - and $63, %edx - add %edx, %edi -# endif - - and $-0x40, %eax - - .p2align 4 -L(aligned_64_loop): -# ifdef USE_AS_STRNLEN - sub $64, %edi - jbe L(len_less64) -# endif - movaps (%eax), %xmm0 - movaps 16(%eax), %xmm1 - movaps 32(%eax), %xmm2 - movaps 48(%eax), %xmm6 - pminub %xmm1, %xmm0 - pminub %xmm6, %xmm2 - pminub %xmm0, %xmm2 - pcmpeqb %xmm3, %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 64(%eax), %eax - jz L(aligned_64_loop) - - pcmpeqb -64(%eax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 48(%ecx), %ecx - jnz L(exit) - - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea -16(%ecx), %ecx - jnz L(exit) - - pcmpeqb -32(%eax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea -16(%ecx), %ecx - jnz L(exit) - - pcmpeqb %xmm6, %xmm3 - pmovmskb %xmm3, %edx - lea -16(%ecx), %ecx -L(exit): - sub %ecx, %eax - test %dl, %dl - jz L(exit_high) - - mov %dl, %cl - and $15, %cl - jz L(exit_8) - test $0x01, %dl - jnz L(exit_tail0) - test $0x02, %dl - jnz L(exit_tail1) - test $0x04, %dl - jnz L(exit_tail2) - add $3, %eax - RETURN - - .p2align 4 -L(exit_8): - test $0x10, %dl - jnz L(exit_tail4) - test $0x20, %dl - jnz L(exit_tail5) - test $0x40, %dl - jnz L(exit_tail6) - add $7, %eax - RETURN - - .p2align 4 -L(exit_high): - mov %dh, %ch - and $15, %ch - jz L(exit_high_8) - test $0x01, %dh - jnz L(exit_tail8) - test $0x02, %dh - jnz L(exit_tail9) - test $0x04, %dh - jnz L(exit_tail10) - add $11, %eax - RETURN - - .p2align 4 -L(exit_high_8): - test $0x10, %dh - jnz L(exit_tail12) - test $0x20, %dh - jnz L(exit_tail13) - test $0x40, %dh - jnz L(exit_tail14) - add $15, %eax -L(exit_tail0): - RETURN - -# ifdef USE_AS_STRNLEN - - .p2align 4 -L(len_less64): - pxor %xmm0, %xmm0 - add $64, %edi - - pcmpeqb (%eax), %xmm0 - pmovmskb %xmm0, %edx - pxor %xmm1, %xmm1 - lea 16(%eax), %eax - test %edx, %edx - jnz L(strnlen_exit) - - 
sub $16, %edi - jbe L(return_start_len) - - pcmpeqb (%eax), %xmm1 - pmovmskb %xmm1, %edx - lea 16(%eax), %eax - test %edx, %edx - jnz L(strnlen_exit) - - sub $16, %edi - jbe L(return_start_len) - - pcmpeqb (%eax), %xmm0 - pmovmskb %xmm0, %edx - lea 16(%eax), %eax - test %edx, %edx - jnz L(strnlen_exit) - - sub $16, %edi - jbe L(return_start_len) - - pcmpeqb (%eax), %xmm1 - pmovmskb %xmm1, %edx - lea 16(%eax), %eax - test %edx, %edx - jnz L(strnlen_exit) - - movl LEN(%esp), %eax - RETURN - - .p2align 4 -L(strnlen_exit): - sub %ecx, %eax - - test %dl, %dl - jz L(strnlen_exit_high) - mov %dl, %cl - and $15, %cl - jz L(strnlen_exit_8) - test $0x01, %dl - jnz L(exit_tail0) - test $0x02, %dl - jnz L(strnlen_exit_tail1) - test $0x04, %dl - jnz L(strnlen_exit_tail2) - sub $4, %edi - jb L(return_start_len) - lea 3(%eax), %eax - RETURN - - .p2align 4 -L(strnlen_exit_8): - test $0x10, %dl - jnz L(strnlen_exit_tail4) - test $0x20, %dl - jnz L(strnlen_exit_tail5) - test $0x40, %dl - jnz L(strnlen_exit_tail6) - sub $8, %edi - jb L(return_start_len) - lea 7(%eax), %eax - RETURN - - .p2align 4 -L(strnlen_exit_high): - mov %dh, %ch - and $15, %ch - jz L(strnlen_exit_high_8) - test $0x01, %dh - jnz L(strnlen_exit_tail8) - test $0x02, %dh - jnz L(strnlen_exit_tail9) - test $0x04, %dh - jnz L(strnlen_exit_tail10) - sub $12, %edi - jb L(return_start_len) - lea 11(%eax), %eax - RETURN - - .p2align 4 -L(strnlen_exit_high_8): - test $0x10, %dh - jnz L(strnlen_exit_tail12) - test $0x20, %dh - jnz L(strnlen_exit_tail13) - test $0x40, %dh - jnz L(strnlen_exit_tail14) - sub $16, %edi - jb L(return_start_len) - lea 15(%eax), %eax - RETURN - - .p2align 4 -L(strnlen_exit_tail1): - sub $2, %edi - jb L(return_start_len) - lea 1(%eax), %eax - RETURN - - .p2align 4 -L(strnlen_exit_tail2): - sub $3, %edi - jb L(return_start_len) - lea 2(%eax), %eax - RETURN - - .p2align 4 -L(strnlen_exit_tail4): - sub $5, %edi - jb L(return_start_len) - lea 4(%eax), %eax - RETURN - - .p2align 4 
-L(strnlen_exit_tail5): - sub $6, %edi - jb L(return_start_len) - lea 5(%eax), %eax - RETURN - - .p2align 4 -L(strnlen_exit_tail6): - sub $7, %edi - jb L(return_start_len) - lea 6(%eax), %eax - RETURN - - .p2align 4 -L(strnlen_exit_tail8): - sub $9, %edi - jb L(return_start_len) - lea 8(%eax), %eax - RETURN - - .p2align 4 -L(strnlen_exit_tail9): - sub $10, %edi - jb L(return_start_len) - lea 9(%eax), %eax - RETURN - - .p2align 4 -L(strnlen_exit_tail10): - sub $11, %edi - jb L(return_start_len) - lea 10(%eax), %eax - RETURN - - .p2align 4 -L(strnlen_exit_tail12): - sub $13, %edi - jb L(return_start_len) - lea 12(%eax), %eax - RETURN - - .p2align 4 -L(strnlen_exit_tail13): - sub $14, %edi - jb L(return_start_len) - lea 13(%eax), %eax - RETURN - - .p2align 4 -L(strnlen_exit_tail14): - sub $15, %edi - jb L(return_start_len) - lea 14(%eax), %eax - RETURN - - .p2align 4 -L(return_start_len): - movl LEN(%esp), %eax - RETURN - -/* for prolog only */ - - .p2align 4 -L(len_less4_prolog): - xor %eax, %eax - - add $4, %edi - jz L(exit_tail0) - - cmpb $0, (%edx) - jz L(exit_tail0) - cmp $1, %edi - je L(exit_tail1) - - cmpb $0, 1(%edx) - jz L(exit_tail1) - cmp $2, %edi - je L(exit_tail2) - - cmpb $0, 2(%edx) - jz L(exit_tail2) - cmp $3, %edi - je L(exit_tail3) - - cmpb $0, 3(%edx) - jz L(exit_tail3) - mov $4, %eax - RETURN - - .p2align 4 -L(len_less8_prolog): - add $4, %edi - - cmpb $0, 4(%edx) - jz L(exit_tail4) - cmp $1, %edi - je L(exit_tail5) - - cmpb $0, 5(%edx) - jz L(exit_tail5) - cmp $2, %edi - je L(exit_tail6) - - cmpb $0, 6(%edx) - jz L(exit_tail6) - cmp $3, %edi - je L(exit_tail7) - - cmpb $0, 7(%edx) - jz L(exit_tail7) - mov $8, %eax - RETURN - - - .p2align 4 -L(len_less12_prolog): - add $4, %edi - - cmpb $0, 8(%edx) - jz L(exit_tail8) - cmp $1, %edi - je L(exit_tail9) - - cmpb $0, 9(%edx) - jz L(exit_tail9) - cmp $2, %edi - je L(exit_tail10) - - cmpb $0, 10(%edx) - jz L(exit_tail10) - cmp $3, %edi - je L(exit_tail11) - - cmpb $0, 11(%edx) - jz L(exit_tail11) - mov 
$12, %eax - RETURN - - .p2align 4 -L(len_less16_prolog): - add $4, %edi - - cmpb $0, 12(%edx) - jz L(exit_tail12) - cmp $1, %edi - je L(exit_tail13) - - cmpb $0, 13(%edx) - jz L(exit_tail13) - cmp $2, %edi - je L(exit_tail14) - - cmpb $0, 14(%edx) - jz L(exit_tail14) - cmp $3, %edi - je L(exit_tail15) - - cmpb $0, 15(%edx) - jz L(exit_tail15) - mov $16, %eax - RETURN -# endif - - .p2align 4 -L(exit_tail1): - add $1, %eax - RETURN - -L(exit_tail2): - add $2, %eax - RETURN - -L(exit_tail3): - add $3, %eax - RETURN - -L(exit_tail4): - add $4, %eax - RETURN - -L(exit_tail5): - add $5, %eax - RETURN - -L(exit_tail6): - add $6, %eax - RETURN - -L(exit_tail7): - add $7, %eax - RETURN - -L(exit_tail8): - add $8, %eax - RETURN - -L(exit_tail9): - add $9, %eax - RETURN - -L(exit_tail10): - add $10, %eax - RETURN - -L(exit_tail11): - add $11, %eax - RETURN - -L(exit_tail12): - add $12, %eax - RETURN - -L(exit_tail13): - add $13, %eax - RETURN - -L(exit_tail14): - add $14, %eax - RETURN - -L(exit_tail15): - add $15, %eax -# ifndef USE_AS_STRCAT - RETURN -END (STRLEN) -# endif -#endif diff --git a/sysdeps/i386/i686/multiarch/strncat-c.c b/sysdeps/i386/i686/multiarch/strncat-c.c deleted file mode 100644 index 132a000545..0000000000 --- a/sysdeps/i386/i686/multiarch/strncat-c.c +++ /dev/null @@ -1,8 +0,0 @@ -#define STRNCAT __strncat_ia32 -#ifdef SHARED -#undef libc_hidden_def -#define libc_hidden_def(name) \ - __hidden_ver1 (__strncat_ia32, __GI___strncat, __strncat_ia32); -#endif - -#include "string/strncat.c" diff --git a/sysdeps/i386/i686/multiarch/strncat-sse2.S b/sysdeps/i386/i686/multiarch/strncat-sse2.S deleted file mode 100644 index f1045b72b8..0000000000 --- a/sysdeps/i386/i686/multiarch/strncat-sse2.S +++ /dev/null @@ -1,4 +0,0 @@ -#define STRCAT __strncat_sse2 -#define USE_AS_STRNCAT - -#include "strcat-sse2.S" diff --git a/sysdeps/i386/i686/multiarch/strncat-ssse3.S b/sysdeps/i386/i686/multiarch/strncat-ssse3.S deleted file mode 100644 index 625b90a978..0000000000 
--- a/sysdeps/i386/i686/multiarch/strncat-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define STRCAT __strncat_ssse3 -#define USE_AS_STRNCAT - -#include "strcat-ssse3.S" diff --git a/sysdeps/i386/i686/multiarch/strncat.S b/sysdeps/i386/i686/multiarch/strncat.S deleted file mode 100644 index 5c1bf41453..0000000000 --- a/sysdeps/i386/i686/multiarch/strncat.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Multiple versions of strncat - All versions must be listed in ifunc-impl-list.c. */ -#define STRCAT strncat -#define USE_AS_STRNCAT -#include "strcat.S" diff --git a/sysdeps/i386/multiarch/Makefile b/sysdeps/i386/multiarch/Makefile index 78a99a7bbe..87d36a42ca 100644 --- a/sysdeps/i386/multiarch/Makefile +++ b/sysdeps/i386/multiarch/Makefile @@ -31,7 +31,9 @@ sysdep_routines += bcopy-i386 bcopy-i686 bcopy-sse2-unaligned \ strcasecmp_l-ssse3 \ strncase-i386 strncase_l-i386 strncase_l-sse4 \ strncase_l-ssse3 \ - strncmp-i386 strncmp-ssse3 strncmp-sse4 + strncmp-i386 strncmp-ssse3 strncmp-sse4 \ + strcat-i386 strcat-sse2 strcat-ssse3 \ + strncat-i386 strncat-sse2 strncat-ssse3 endif ifeq (mathyes,$(subdir)$(config-cflags-avx)) diff --git a/sysdeps/i386/multiarch/ifunc-impl-list.c b/sysdeps/i386/multiarch/ifunc-impl-list.c index bddc9e75da..05b9bcd622 100644 --- a/sysdeps/i386/multiarch/ifunc-impl-list.c +++ b/sysdeps/i386/multiarch/ifunc-impl-list.c @@ -206,15 +206,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, __strcasecmp_l_i386)) -#if 0 /* Support sysdeps/i386/i686/multiarch/strcat.S. */ IFUNC_IMPL (i, name, strcat, IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3), __strcat_ssse3) IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSE2), __strcat_sse2) - IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_ia32)) + IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_i386)) +#if 0 /* Support sysdeps/i386/i686/multiarch/strchr.S. 
*/ IFUNC_IMPL (i, name, strchr, IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2), @@ -278,15 +278,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, __strncasecmp_l_i386)) -#if 0 /* Support sysdeps/i386/i686/multiarch/strncat.S. */ IFUNC_IMPL (i, name, strncat, IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3), __strncat_ssse3) IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSE2), __strncat_sse2) - IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_ia32)) -#endif + IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_i386)) /* Support sysdeps/i386/i686/multiarch/strncpy.S. */ IFUNC_IMPL (i, name, strncpy, diff --git a/sysdeps/i386/multiarch/strcat-i386.S b/sysdeps/i386/multiarch/strcat-i386.S new file mode 100644 index 0000000000..c13503d3e4 --- /dev/null +++ b/sysdeps/i386/multiarch/strcat-i386.S @@ -0,0 +1,10 @@ +#define strcat __strcat_i386 +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) +#include <sysdeps/i386/strcat.S> + +#ifdef SHARED + .globl __GI_strcat + .hidden __GI_strcat + __GI_strcat = __strcat_i386 +#endif diff --git a/sysdeps/i386/multiarch/strcat-sse2.S b/sysdeps/i386/multiarch/strcat-sse2.S new file mode 100644 index 0000000000..0f9e13c6d3 --- /dev/null +++ b/sysdeps/i386/multiarch/strcat-sse2.S @@ -0,0 +1,1243 @@ +/* strcat with SSE2 + Copyright (C) 2011-2015 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#if IS_IN (libc) + +# include <sysdep.h> + + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifdef SHARED +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into ECX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into ECX. */ \ + SETUP_PIC_REG(cx); \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ecx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ecx,INDEX,SCALE), %ecx; \ + /* We loaded the jump table and adjusted ECX. Go. */ \ + jmp *%ecx +# else +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX.
*/ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +# endif + +# ifndef STRCAT +# define STRCAT __strcat_sse2 +# endif + +# define PARMS 4 +# define STR1 PARMS+4 +# define STR2 STR1+4 + +# ifdef USE_AS_STRNCAT +# define LEN STR2+8 +# define STR3 STR1+4 +# else +# define STR3 STR1 +# endif + +# define USE_AS_STRCAT +# ifdef USE_AS_STRNCAT +# define RETURN POP(%ebx); POP(%esi); ret; CFI_PUSH(%ebx); CFI_PUSH(%esi); +# else +# define RETURN POP(%esi); ret; CFI_PUSH(%esi); +# endif + +.text +ENTRY (STRCAT) + PUSH (%esi) + mov STR1(%esp), %eax + mov STR2(%esp), %esi +# ifdef USE_AS_STRNCAT + PUSH (%ebx) + movl LEN(%esp), %ebx + test %ebx, %ebx + jz L(ExitZero) +# endif + cmpb $0, (%esi) + mov %esi, %ecx + mov %eax, %edx + jz L(ExitZero) + + and $63, %ecx + and $63, %edx + cmp $32, %ecx + ja L(StrlenCore7_1) + cmp $48, %edx + ja L(alignment_prolog) + + pxor %xmm0, %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm7, %xmm7 + movdqu (%eax), %xmm1 + movdqu (%esi), %xmm5 + pcmpeqb %xmm1, %xmm0 + movdqu 16(%esi), %xmm6 + pmovmskb %xmm0, %ecx + pcmpeqb %xmm5, %xmm4 + pcmpeqb %xmm6, %xmm7 + test %ecx, %ecx + jnz L(exit_less16_) + mov %eax, %ecx + and $-16, %eax + jmp L(loop_prolog) + +L(alignment_prolog): + pxor %xmm0, %xmm0 + pxor %xmm4, %xmm4 + mov %edx, %ecx + pxor %xmm7, %xmm7 + and $15, %ecx + and $-16, %eax + pcmpeqb (%eax), %xmm0 + movdqu (%esi), %xmm5 + movdqu 16(%esi), %xmm6 + pmovmskb %xmm0, %edx + pcmpeqb %xmm5, %xmm4 + shr %cl, %edx + pcmpeqb %xmm6, %xmm7 + test %edx, %edx + jnz L(exit_less16) + add %eax, %ecx + + pxor %xmm0, %xmm0 +L(loop_prolog): + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + .p2align 4 +L(align16_loop): + pcmpeqb 16(%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%eax), %xmm3 + pmovmskb %xmm3, %edx + lea 
64(%eax), %eax + test %edx, %edx + jz L(align16_loop) + bsf %edx, %edx + add %edx, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit16): + bsf %edx, %edx + lea 16(%eax, %edx), %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit32): + bsf %edx, %edx + lea 32(%eax, %edx), %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit48): + bsf %edx, %edx + lea 48(%eax, %edx), %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_less16): + bsf %edx, %edx + add %ecx, %eax + add %edx, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_less16_): + bsf %ecx, %ecx + add %ecx, %eax + + .p2align 4 +L(StartStrcpyPart): + pmovmskb %xmm4, %edx +# ifdef USE_AS_STRNCAT + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1) + + movdqu %xmm5, (%eax) + pmovmskb %xmm7, %edx +# ifdef USE_AS_STRNCAT + cmp $32, %ebx + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes1) + + mov %esi, %ecx + and $-16, %esi + and $15, %ecx + pxor %xmm0, %xmm0 +# ifdef USE_AS_STRNCAT + add %ecx, %ebx +# endif + sub %ecx, %eax + jmp L(Unalign16Both) + +L(StrlenCore7_1): + mov %eax, %ecx + pxor %xmm0, %xmm0 + and $15, %ecx + and $-16, %eax + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + shr %cl, %edx + test %edx, %edx + jnz L(exit_less16_1) + add %eax, %ecx + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + .p2align 4 +L(align16_loop_1): + pcmpeqb 16(%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16_1) + + pcmpeqb 32(%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32_1) + + pcmpeqb 48(%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48_1) + + pcmpeqb 64(%eax), %xmm3 + pmovmskb %xmm3, %edx + lea 64(%eax), %eax + test %edx, %edx + jz L(align16_loop_1) + bsf %edx, %edx + add %edx, %eax + jmp L(StartStrcpyPart_1) + + .p2align 4 +L(exit16_1): + bsf %edx, %edx + lea 16(%eax, %edx), %eax + jmp L(StartStrcpyPart_1) + + 
.p2align 4 +L(exit32_1): + bsf %edx, %edx + lea 32(%eax, %edx), %eax + jmp L(StartStrcpyPart_1) + + .p2align 4 +L(exit48_1): + bsf %edx, %edx + lea 48(%eax, %edx), %eax + jmp L(StartStrcpyPart_1) + + .p2align 4 +L(exit_less16_1): + bsf %edx, %edx + add %ecx, %eax + add %edx, %eax + + .p2align 4 +L(StartStrcpyPart_1): + mov %esi, %ecx + and $15, %ecx + and $-16, %esi + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +# ifdef USE_AS_STRNCAT + cmp $48, %ebx + ja L(BigN) +# endif + pcmpeqb (%esi), %xmm1 +# ifdef USE_AS_STRNCAT + add %ecx, %ebx +# endif + pmovmskb %xmm1, %edx + shr %cl, %edx +# ifdef USE_AS_STRNCAT + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%esi), %xmm0 + pmovmskb %xmm0, %edx +# ifdef USE_AS_STRNCAT + cmp $32, %ebx + jbe L(CopyFrom1To32BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes) + + movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%eax) + sub %ecx, %eax + + .p2align 4 +L(Unalign16Both): + mov $16, %ecx + movdqa (%esi, %ecx), %xmm1 + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%eax, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $48, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) +L(Unalign16BothBigN): + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%eax, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm4 + movdqu %xmm3, (%eax, %ecx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm1 + movdqu %xmm4, (%eax, %ecx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + add $16, 
%ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%eax, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%eax, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movdqu %xmm3, (%eax, %ecx) + mov %esi, %edx + lea 16(%esi, %ecx), %esi + and $-0x40, %esi + sub %esi, %edx + sub %edx, %eax +# ifdef USE_AS_STRNCAT + lea 128(%ebx, %edx), %ebx +# endif + movaps (%esi), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%esi), %xmm5 + movaps 32(%esi), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%esi), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx +# ifdef USE_AS_STRNCAT + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jnz L(Unaligned64Leave) + + .p2align 4 +L(Unaligned64Loop_start): + add $64, %eax + add $64, %esi + movdqu %xmm4, -64(%eax) + movaps (%esi), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%eax) + movaps 16(%esi), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%esi), %xmm3 + movdqu %xmm6, -32(%eax) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%eax) + movaps 48(%esi), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx +# ifdef USE_AS_STRNCAT + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jz L(Unaligned64Loop_start) + +L(Unaligned64Leave): + pxor %xmm1, %xmm1 + + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm5, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_0) + test 
%ecx, %ecx + jnz L(CopyFrom1To16BytesUnaligned_16) + + pcmpeqb %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_32) + + bsf %ecx, %edx + movdqu %xmm4, (%eax) + movdqu %xmm5, 16(%eax) + movdqu %xmm6, 32(%eax) + add $48, %esi + add $48, %eax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +# ifdef USE_AS_STRNCAT + .p2align 4 +L(BigN): + pcmpeqb (%esi), %xmm1 + pmovmskb %xmm1, %edx + shr %cl, %edx + test %edx, %edx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%esi), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(CopyFrom1To32Bytes) + + movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%eax) + sub %ecx, %eax + sub $48, %ebx + add %ecx, %ebx + + mov $16, %ecx + movdqa (%esi, %ecx), %xmm1 + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%eax, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + jmp L(Unalign16BothBigN) +# endif + +/*------------end of main part-------------------------------*/ + +/* Case1 */ + .p2align 4 +L(CopyFrom1To16Bytes): + add %ecx, %eax + add %ecx, %esi + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesTail): + add %ecx, %esi + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1): + add $16, %esi + add $16, %eax +L(CopyFrom1To16BytesTail1): + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes): + bsf %edx, %edx + add %ecx, %esi + add $16, %edx + sub %ecx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_0): + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_16): + bsf %ecx, %edx + movdqu %xmm4, (%eax) + add $16, %esi + add $16, %eax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 
+L(CopyFrom1To16BytesUnaligned_32): + bsf %edx, %edx + movdqu %xmm4, (%eax) + movdqu %xmm5, 16(%eax) + add $32, %esi + add $32, %eax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +# ifdef USE_AS_STRNCAT + + .p2align 4 +L(CopyFrom1To16BytesExit): + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +/* Case2 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %ebx + add %ecx, %eax + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + add $16, %edx + sub %ecx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + +L(CopyFrom1To16BytesTailCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + +L(CopyFrom1To16BytesTail1Case2): + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesCase2) +L(CopyFrom1To16BytesCase3): + add $16, %ebx + add %ecx, %eax + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To32BytesCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To16BytesTailCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTailCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1Case2OrCase3): + add $16, %eax + add $16, %esi + sub $16, %ebx +L(CopyFrom1To16BytesTail1Case2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1Case2) + BRANCH_TO_JMPTBL_ENTRY 
(L(ExitStrncatTable), %ebx, 4) + +# endif + +# ifdef USE_AS_STRNCAT + .p2align 4 +L(StrncatExit0): + movb %bh, (%eax) + mov STR3(%esp), %eax + RETURN +# endif + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit1): + movb %bh, 1(%eax) +# endif +L(Exit1): +# ifdef USE_AS_STRNCAT + movb (%esi), %dh +# endif + movb %dh, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit2): + movb %bh, 2(%eax) +# endif +L(Exit2): + movw (%esi), %dx + movw %dx, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit3): + movb %bh, 3(%eax) +# endif +L(Exit3): + movw (%esi), %cx + movw %cx, (%eax) +# ifdef USE_AS_STRNCAT + movb 2(%esi), %dh +# endif + movb %dh, 2(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit4): + movb %bh, 4(%eax) +# endif +L(Exit4): + movl (%esi), %edx + movl %edx, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit5): + movb %bh, 5(%eax) +# endif +L(Exit5): + movl (%esi), %ecx +# ifdef USE_AS_STRNCAT + movb 4(%esi), %dh +# endif + movb %dh, 4(%eax) + movl %ecx, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit6): + movb %bh, 6(%eax) +# endif +L(Exit6): + movl (%esi), %ecx + movw 4(%esi), %dx + movl %ecx, (%eax) + movw %dx, 4(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit7): + movb %bh, 7(%eax) +# endif +L(Exit7): + movl (%esi), %ecx + movl 3(%esi), %edx + movl %ecx, (%eax) + movl %edx, 3(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit8): + movb %bh, 8(%eax) +# endif +L(Exit8): + movlpd (%esi), %xmm0 + movlpd %xmm0, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit9): + movb %bh, 9(%eax) +# endif +L(Exit9): + movlpd (%esi), %xmm0 +# ifdef USE_AS_STRNCAT + movb 8(%esi), %dh +# endif + movb %dh, 8(%eax) + movlpd %xmm0, (%eax) + mov 
STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit10): + movb %bh, 10(%eax) +# endif +L(Exit10): + movlpd (%esi), %xmm0 + movw 8(%esi), %dx + movlpd %xmm0, (%eax) + movw %dx, 8(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit11): + movb %bh, 11(%eax) +# endif +L(Exit11): + movlpd (%esi), %xmm0 + movl 7(%esi), %edx + movlpd %xmm0, (%eax) + movl %edx, 7(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit12): + movb %bh, 12(%eax) +# endif +L(Exit12): + movlpd (%esi), %xmm0 + movl 8(%esi), %edx + movlpd %xmm0, (%eax) + movl %edx, 8(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit13): + movb %bh, 13(%eax) +# endif +L(Exit13): + movlpd (%esi), %xmm0 + movlpd 5(%esi), %xmm1 + movlpd %xmm0, (%eax) + movlpd %xmm1, 5(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit14): + movb %bh, 14(%eax) +# endif +L(Exit14): + movlpd (%esi), %xmm0 + movlpd 6(%esi), %xmm1 + movlpd %xmm0, (%eax) + movlpd %xmm1, 6(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit15): + movb %bh, 15(%eax) +# endif +L(Exit15): + movlpd (%esi), %xmm0 + movlpd 7(%esi), %xmm1 + movlpd %xmm0, (%eax) + movlpd %xmm1, 7(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit16): + movb %bh, 16(%eax) +# endif +L(Exit16): + movdqu (%esi), %xmm0 + movdqu %xmm0, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit17): + movb %bh, 17(%eax) +# endif +L(Exit17): + movdqu (%esi), %xmm0 +# ifdef USE_AS_STRNCAT + movb 16(%esi), %dh +# endif + movdqu %xmm0, (%eax) + movb %dh, 16(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit18): + movb %bh, 18(%eax) +# endif +L(Exit18): + movdqu (%esi), %xmm0 + movw 16(%esi), %cx + movdqu %xmm0, (%eax) + movw %cx, 16(%eax) + mov STR3(%esp), 
%eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit19): + movb %bh, 19(%eax) +# endif +L(Exit19): + movdqu (%esi), %xmm0 + movl 15(%esi), %ecx + movdqu %xmm0, (%eax) + movl %ecx, 15(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit20): + movb %bh, 20(%eax) +# endif +L(Exit20): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movdqu %xmm0, (%eax) + movl %ecx, 16(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit21): + movb %bh, 21(%eax) +# endif +L(Exit21): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx +# ifdef USE_AS_STRNCAT + movb 20(%esi), %dh +# endif + movdqu %xmm0, (%eax) + movl %ecx, 16(%eax) + movb %dh, 20(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit22): + movb %bh, 22(%eax) +# endif +L(Exit22): + movdqu (%esi), %xmm0 + movlpd 14(%esi), %xmm3 + movdqu %xmm0, (%eax) + movlpd %xmm3, 14(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit23): + movb %bh, 23(%eax) +# endif +L(Exit23): + movdqu (%esi), %xmm0 + movlpd 15(%esi), %xmm3 + movdqu %xmm0, (%eax) + movlpd %xmm3, 15(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit24): + movb %bh, 24(%eax) +# endif +L(Exit24): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit25): + movb %bh, 25(%eax) +# endif +L(Exit25): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 +# ifdef USE_AS_STRNCAT + movb 24(%esi), %dh +# endif + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + movb %dh, 24(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit26): + movb %bh, 26(%eax) +# endif +L(Exit26): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movw 24(%esi), %cx + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + movw %cx, 24(%eax) + mov STR3(%esp), 
%eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit27): + movb %bh, 27(%eax) +# endif +L(Exit27): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 23(%esi), %ecx + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + movl %ecx, 23(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit28): + movb %bh, 28(%eax) +# endif +L(Exit28): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 24(%esi), %ecx + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + movl %ecx, 24(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit29): + movb %bh, 29(%eax) +# endif +L(Exit29): + movdqu (%esi), %xmm0 + movdqu 13(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 13(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit30): + movb %bh, 30(%eax) +# endif +L(Exit30): + movdqu (%esi), %xmm0 + movdqu 14(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 14(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit31): + movb %bh, 31(%eax) +# endif +L(Exit31): + movdqu (%esi), %xmm0 + movdqu 15(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 15(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit32): + movb %bh, 32(%eax) +# endif +L(Exit32): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 16(%eax) + mov STR3(%esp), %eax + RETURN + +# ifdef USE_AS_STRNCAT + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %edx, %edx + jnz L(Unaligned64LeaveCase2) +L(Unaligned64LeaveCase3): + lea 64(%ebx), %ecx + and $-16, %ecx + add $48, %ebx + jl L(CopyFrom1To16BytesCase3) + movdqu %xmm4, (%eax) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm5, 16(%eax) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm6, 32(%eax) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm7, 48(%eax) + xor %bh, %bh + movb %bh, 64(%eax) + mov 
STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+L(Unaligned64LeaveCase2):
+	xor	%ecx, %ecx
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm4, (%eax)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm5, 16(%eax)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm6, 32(%eax)
+	lea	16(%eax, %ecx), %eax
+	lea	16(%esi, %ecx), %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+# endif
+	.p2align 4
+L(ExitZero):
+	RETURN
+
+END (STRCAT)
+
+	.p2align 4
+	.section .rodata	/* Both dispatch tables are placed in read-only data.  */
+L(ExitTable):	/* Entry k (0-based) is L(Exit<k+1>); the code above indexes it with a bsf result in %edx.  */
+	.int	JMPTBL(L(Exit1), L(ExitTable))
+	.int	JMPTBL(L(Exit2), L(ExitTable))
+	.int	JMPTBL(L(Exit3), L(ExitTable))
+	.int	JMPTBL(L(Exit4), L(ExitTable))
+	.int	JMPTBL(L(Exit5), L(ExitTable))
+	.int	JMPTBL(L(Exit6), L(ExitTable))
+	.int	JMPTBL(L(Exit7), L(ExitTable))
+	.int	JMPTBL(L(Exit8), L(ExitTable))
+	.int	JMPTBL(L(Exit9), L(ExitTable))
+	.int	JMPTBL(L(Exit10), L(ExitTable))
+	.int	JMPTBL(L(Exit11), L(ExitTable))
+	.int	JMPTBL(L(Exit12), L(ExitTable))
+	.int	JMPTBL(L(Exit13), L(ExitTable))
+	.int	JMPTBL(L(Exit14), L(ExitTable))
+	.int	JMPTBL(L(Exit15), L(ExitTable))
+	.int	JMPTBL(L(Exit16), L(ExitTable))
+	.int	JMPTBL(L(Exit17), L(ExitTable))
+	.int	JMPTBL(L(Exit18), L(ExitTable))
+	.int	JMPTBL(L(Exit19), L(ExitTable))
+	.int	JMPTBL(L(Exit20), L(ExitTable))
+	.int	JMPTBL(L(Exit21), L(ExitTable))
+	.int	JMPTBL(L(Exit22), L(ExitTable))
+	.int	JMPTBL(L(Exit23), L(ExitTable))
+	.int	JMPTBL(L(Exit24), L(ExitTable))
+	.int	JMPTBL(L(Exit25), L(ExitTable))
+	.int	JMPTBL(L(Exit26), L(ExitTable))
+	.int	JMPTBL(L(Exit27), L(ExitTable))
+	.int	JMPTBL(L(Exit28), L(ExitTable))
+	.int	JMPTBL(L(Exit29), L(ExitTable))
+	.int	JMPTBL(L(Exit30), L(ExitTable))
+	.int	JMPTBL(L(Exit31), L(ExitTable))
+	.int	JMPTBL(L(Exit32), L(ExitTable))
+# ifdef USE_AS_STRNCAT
+L(ExitStrncatTable):	/* Entry n is L(StrncatExit<n>), n = 0..32; indexed with %ebx by the strncat paths.  */
+	.int	JMPTBL(L(StrncatExit0), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit1), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit2), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit3), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit4), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit5), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit6), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit7), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit8), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit9), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit10), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit11), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit12), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit13), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit14), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit15), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit16), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit17), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit18), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit19), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit20), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit21), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit22), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit23), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit24), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit25), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit26), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit27), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit28), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit29), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit30), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit31), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit32),
L(ExitStrncatTable))
+# endif
+#endif
diff --git a/sysdeps/i386/multiarch/strcat-ssse3.S b/sysdeps/i386/multiarch/strcat-ssse3.S
new file mode 100644
index 0000000000..a5b0bc6818
--- /dev/null
+++ b/sysdeps/i386/multiarch/strcat-ssse3.S
@@ -0,0 +1,572 @@
+/* strcat with SSSE3
+   Copyright (C) 2011-2015 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>	/* NOTE(review): header name restored; presumably supplies ENTRY/END/L and cfi_* -- confirm.  */
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef STRCAT
+#  define STRCAT __strcat_ssse3
+# endif
+
+# define PARMS 4
+# define STR1 PARMS+4	/* First argument once %edi has been pushed.  */
+# define STR2 STR1+4	/* Second argument once %edi has been pushed.  */
+
+# ifdef USE_AS_STRNCAT
+#  define LEN STR2+8	/* Third argument once %edi and %ebx have been pushed.  */
+# endif
+
+# define USE_AS_STRCAT
+
+.text
+ENTRY (STRCAT)	/* Appends the string at STR2 to the one at STR1; every exit returns %edi (dest).  */
+	PUSH (%edi)
+	mov	STR1(%esp), %edi	/* %edi = dest, kept for the return value.  */
+	mov	%edi, %edx
+
+# define RETURN jmp L(StartStrcpyPart)	/* The included strlen body leaves through RETURN, i.e. continues below.  */
+# include "strlen-sse2.S"
+
+L(StartStrcpyPart):
+	mov	STR2(%esp), %ecx	/* %ecx = src.  */
+	lea	(%edi, %eax), %edx	/* %edx = dest + %eax; presumably %eax is strlen (dest) -- confirm in strlen-sse2.S.  */
+# ifdef USE_AS_STRNCAT
+	PUSH (%ebx)
+	mov	LEN(%esp), %ebx	/* %ebx = n.  */
+	test	%ebx, %ebx
+	jz	L(StrncatExit0)	/* n == 0: return dest unchanged.  */
+	cmp	$8, %ebx
+	jbe	L(StrncatExit8Bytes)
+# endif
+	cmpb	$0, (%ecx)	/* Byte-wise scan of src[0..15]: a NUL at index k exits through L(Exit<k+1>).  */
+	jz	L(Exit1)
+	cmpb	$0, 1(%ecx)
+	jz	L(Exit2)
+	cmpb	$0, 2(%ecx)
+	jz	L(Exit3)
+	cmpb	$0, 3(%ecx)
+	jz	L(Exit4)
+	cmpb	$0, 4(%ecx)
+	jz	L(Exit5)
+	cmpb	$0, 5(%ecx)
+	jz	L(Exit6)
+	cmpb	$0, 6(%ecx)
+	jz	L(Exit7)
+	cmpb	$0, 7(%ecx)
+	jz	L(Exit8)
+	cmpb	$0, 8(%ecx)
+	jz	L(Exit9)
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	jb	L(StrncatExit15Bytes)
+# endif
+	cmpb	$0, 9(%ecx)
+	jz	L(Exit10)
+	cmpb	$0, 10(%ecx)
+	jz	L(Exit11)
+	cmpb	$0, 11(%ecx)
+	jz	L(Exit12)
+	cmpb	$0, 12(%ecx)
+	jz	L(Exit13)
+	cmpb	$0, 13(%ecx)
+	jz	L(Exit14)
+	cmpb	$0, 14(%ecx)
+	jz	L(Exit15)
+	cmpb	$0, 15(%ecx)
+	jz	L(Exit16)
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	je	L(StrncatExit16)
+
+#  define RETURN1	\
+	POP (%ebx);	\
+	POP (%edi);	\
+	ret;	\
+	CFI_PUSH (%ebx);	\
+	CFI_PUSH (%edi)
+#  define USE_AS_STRNCPY
+# else
+#  define RETURN1 POP (%edi); ret; CFI_PUSH (%edi)
+# endif
+# include "strcpy-ssse3.S"
+	.p2align 4
+L(CopyFrom1To16Bytes):	/* Lowest set bit k of %ax selects L(Exit<k+1>); presumably %eax is a pmovmskb NUL mask -- confirm in strcpy-ssse3.S.  */
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP (%esi)
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	movlpd	(%ecx), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit1):	/* L(StrncatExit<N>): store %bh (presumably 0, i.e. the terminator) at offset N, then fall into L(Exit<N>).  */
+	movb	%bh, 1(%edx)
+L(Exit1):	/* L(Exit<N>): copy N bytes from (%ecx) to (%edx) and return dest.  */
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit2):
+	movb	%bh, 2(%edx)
+L(Exit2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit3):
+	movb	%bh, 3(%edx)
+L(Exit3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit4):
+	movb	%bh, 4(%edx)
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit5):
+	movb	%bh, 5(%edx)
+L(Exit5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit6):
+	movb	%bh, 6(%edx)
+L(Exit6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit7):
+	movb	%bh, 7(%edx)
+L(Exit7):	/* Two overlapping 4-byte moves cover bytes 0..6.  */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit8):
+	movb	%bh, 8(%edx)
+L(Exit8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit9):
+	movb	%bh, 9(%edx)
+L(Exit9):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit10):
+	movb	%bh, 10(%edx)
+L(Exit10):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit11):
+	movb	%bh, 11(%edx)
+L(Exit11):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	7(%ecx), %eax
+	movl	%eax, 7(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit12):
+	movb	%bh, 12(%edx)
+L(Exit12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit13):
+	movb	%bh, 13(%edx)
+L(Exit13):	/* Two overlapping 8-byte moves cover bytes 0..12.  */
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0
+	movlpd	%xmm0, 5(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit14):
+	movb	%bh, 14(%edx)
+L(Exit14):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit15):
+	movb	%bh, 15(%edx)
+L(Exit15):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit16):
+	movb	%bh, 16(%edx)
+L(Exit16):
+	movlpd	(%ecx), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+# ifdef USE_AS_STRNCPY
+
+	CFI_PUSH(%esi)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):	/* %eax != 0: for each k, take L(Exit<k>) if mask bit k-1 is set, else L(StrncatExit<k>) if %ebx == k.  */
+	add	$16, %ebx
+	add	%esi, %ecx
+	lea	(%esi, %edx), %esi
+	lea	-9(%ebx), %edx
+	and	$1<<7, %dh
+	or	%al, %dh
+	test	%dh, %dh
+	lea	(%esi), %edx
+	POP (%esi)
+	jz	L(ExitHighCase2)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(StrncatExit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(StrncatExit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(StrncatExit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	cmp	$4, %ebx
+	je	L(StrncatExit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(StrncatExit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(StrncatExit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(StrncatExit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	lea	7(%edx), %eax
+	cmpb	$1, (%eax)	/* CF is set only when the byte at (%eax) is 0.  */
+	sbb	$-1, %eax	/* %eax += 1 - CF: stay on an existing NUL, otherwise step past the byte.  */
+	xor	%cl, %cl
+	movb	%cl, (%eax)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHighCase2):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	cmp	$9, %ebx
+	je	L(StrncatExit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(StrncatExit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(StrncatExit11)
+	test	$0x8, %ah
+	jnz	L(Exit12)
+	cmp	$12, %ebx
+	je	L(StrncatExit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(StrncatExit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(StrncatExit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	cmp	$15, %ebx
+	je	L(StrncatExit15)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm1, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	CFI_PUSH(%esi)
+
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase3):	/* %eax == 0: after the add, %ebx alone picks L(StrncatExit<%ebx>).  */
+	add	$16, %ebx
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP (%esi)
+
+	cmp	$8, %ebx
+	ja	L(ExitHighCase3)
+	cmp	$1, %ebx
+	je	L(StrncatExit1)
+	cmp	$2, %ebx
+	je	L(StrncatExit2)
+	cmp	$3, %ebx
+	je	L(StrncatExit3)
+	cmp	$4, %ebx
+	je	L(StrncatExit4)
+	cmp	$5, %ebx
+	je	L(StrncatExit5)
+	cmp	$6, %ebx
+	je	L(StrncatExit6)
+	cmp	$7, %ebx
+	je	L(StrncatExit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movb	%bh, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHighCase3):
+	cmp	$9, %ebx
+	je	L(StrncatExit9)
+	cmp	$10, %ebx
+	je	L(StrncatExit10)
+	cmp	$11, %ebx
+	je	L(StrncatExit11)
+	cmp	$12, %ebx
+	je	L(StrncatExit12)
+	cmp	$13, %ebx
+	je	L(StrncatExit13)
+	cmp	$14, %ebx
+	je	L(StrncatExit14)
+	cmp	$15, %ebx
+	je	L(StrncatExit15)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm1, 8(%edx)
+	movb	%bh, 16(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit0):
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit15Bytes):	/* Reached with 8 < n < 16 after src[0..8] were found non-NUL.  */
+	cmp	$9, %ebx
+	je	L(StrncatExit9)
+	cmpb	$0, 9(%ecx)
+	jz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(StrncatExit10)
+	cmpb	$0, 10(%ecx)
+	jz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(StrncatExit11)
+	cmpb	$0, 11(%ecx)
+	jz	L(Exit12)
+	cmp	$12, %ebx
+	je	L(StrncatExit12)
+	cmpb	$0, 12(%ecx)
+	jz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(StrncatExit13)
+	cmpb	$0, 13(%ecx)
+	jz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(StrncatExit14)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+	lea	14(%edx), %eax
+	cmpb	$1, (%eax)	/* CF is set only when the copied byte 14 is 0.  */
+	sbb	$-1, %eax	/* Terminator goes at offset 14 if that byte is NUL, else at 15.  */
+	movb	%bh, (%eax)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit8Bytes):	/* Reached with 1 <= n <= 8: interleave NUL checks with length checks.  */
+	cmpb	$0, (%ecx)
+	jz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(StrncatExit1)
+	cmpb	$0, 1(%ecx)
+	jz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(StrncatExit2)
+	cmpb	$0, 2(%ecx)
+	jz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(StrncatExit3)
+	cmpb	$0, 3(%ecx)
+	jz	L(Exit4)
+	cmp	$4, %ebx
+	je	L(StrncatExit4)
+	cmpb	$0, 4(%ecx)
+	jz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(StrncatExit5)
+	cmpb	$0, 5(%ecx)
+	jz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(StrncatExit6)
+	cmpb	$0, 6(%ecx)
+	jz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(StrncatExit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	lea	7(%edx), %eax
+	cmpb	$1, (%eax)	/* CF is set only when the copied byte 7 is 0.  */
+	sbb	$-1, %eax	/* Terminator goes at offset 7 if that byte is NUL, else at 8.  */
+	movb	%bh, (%eax)
+	movl	%edi, %eax
+	RETURN1
+
+# endif
+END (STRCAT)
+#endif
diff --git a/sysdeps/i386/multiarch/strcat.c b/sysdeps/i386/multiarch/strcat.c
new file mode 100644
index 0000000000..5a2aa9cc42
--- /dev/null
+++ b/sysdeps/i386/multiarch/strcat.c
@@ -0,0 +1,51 @@
+/* Multiple versions of strcat.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +/* Redefine strcat so that the compiler won't complain about the type + mismatch with the IFUNC selector in strong_alias, below. */ +# undef strcat +# define strcat __redirect_strcat +# include +# undef strcat + +# include + +extern __typeof (__redirect_strcat) __strcat_i386 attribute_hidden; +extern __typeof (__redirect_strcat) __strcat_sse2 attribute_hidden; +extern __typeof (__redirect_strcat) __strcat_ssse3 attribute_hidden; + +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle + ifunc symbol properly. */ +extern __typeof (__redirect_strcat) strcat; +extern void *strcat_ifunc (void) __asm__ ("strcat"); + +void * +strcat_ifunc (void) +{ + if (HAS_CPU_FEATURE (SSSE3)) + return __strcat_ssse3; + else if (HAS_CPU_FEATURE (SSE2)) + return __strcat_sse2; + + return __strcat_i386; +} +__asm__ (".type strcat, %gnu_indirect_function"); +#endif diff --git a/sysdeps/i386/multiarch/strlen-sse2.S b/sysdeps/i386/multiarch/strlen-sse2.S new file mode 100644 index 0000000000..3d30714b7a --- /dev/null +++ b/sysdeps/i386/multiarch/strlen-sse2.S @@ -0,0 +1,695 @@ +/* strlen with SSE2 + Copyright (C) 2010-2015 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* For strlen only the SHARED version is optimized; for strcat, strncat and strnlen both STATIC and SHARED are optimized.  */
+
+#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+
+#  include <sysdep.h>
+#  define PARMS	4	/* Stack offset of the first argument at entry.  */
+#  define STR	PARMS
+#  define RETURN	ret
+
+#  ifdef USE_AS_STRNLEN
+#   define LEN	PARMS + 8	/* Second argument, as seen after PUSH (%edi).  */
+#   define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#   define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#   define PUSH(REG)	pushl	REG;	CFI_PUSH (REG)
+#   define POP(REG)	popl	REG;	CFI_POP (REG)
+#   undef RETURN
+#   define RETURN	POP (%edi); ret; CFI_PUSH(%edi)	/* CFI_PUSH after ret: unwind info at ret sees %edi popped; code that follows sees it saved.  */
+#  endif
+
+#  ifndef STRLEN
+#   define STRLEN	__strlen_sse2
+#  endif
+
+	atom_text_section
+ENTRY (STRLEN)
+	mov	STR(%esp), %edx		/* %edx = string pointer argument.  */
+#  ifdef USE_AS_STRNLEN
+	PUSH	(%edi)
+	movl	LEN(%esp), %edi		/* %edi = length limit argument.  */
+	sub	$4, %edi
+	jbe	L(len_less4_prolog)	/* Limit <= 4: use the bounded prolog.  */
+#  endif
+# endif
+	xor	%eax, %eax		/* %eax = 0; L(exit_tailN) adds N to it.  */
+	cmpb	$0, (%edx)
+	jz	L(exit_tail0)
+	cmpb	$0, 1(%edx)
+	jz	L(exit_tail1)
+	cmpb	$0, 2(%edx)
+	jz	L(exit_tail2)
+	cmpb	$0, 3(%edx)
+	jz	L(exit_tail3)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less8_prolog)
+# endif
+
+	cmpb	$0, 4(%edx)
+	jz	L(exit_tail4)
+	cmpb	$0, 5(%edx)
+	jz	L(exit_tail5)
+	cmpb	$0, 6(%edx)
+	jz	L(exit_tail6)
+	cmpb	$0, 7(%edx)
+	jz	L(exit_tail7)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less12_prolog)
+# endif
+
+	cmpb	$0, 8(%edx)
+	jz	L(exit_tail8)
+	cmpb	$0, 9(%edx)
+	jz	L(exit_tail9)
+	cmpb	$0, 10(%edx)
+	jz	L(exit_tail10)
+	cmpb	$0, 11(%edx)
+	jz	L(exit_tail11)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less16_prolog)
+# endif
+
+	cmpb	$0, 12(%edx)
+	jz	L(exit_tail12)
+	cmpb	$0, 13(%edx)
+	jz	L(exit_tail13)
+	cmpb	$0, 14(%edx)
+	jz	L(exit_tail14)
+	cmpb	$0, 15(%edx)
+	jz	L(exit_tail15)
+
+	pxor	%xmm0, %xmm0
+	lea	16(%edx), %eax
+	mov	%eax, %ecx		/* %ecx = start + 16; subtracted again at L(exit).  */
+	and	$-16, %eax		/* Round the scan pointer down to 16-byte alignment.  */
+
+# ifdef USE_AS_STRNLEN
+	and	$15, %edx
+	add	%edx, %edi
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%eax), %xmm0		/* 0xff in every byte lane whose source byte is NUL.  */
+	pmovmskb %xmm0, %edx		/* Bit i of %edx set iff byte i of the chunk is NUL.  */
+	pxor	%xmm1, %xmm1
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	pxor	%xmm2, %xmm2
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	pxor	%xmm3, %xmm3
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	mov	%eax, %edx
+	and	$63, %edx
+	add	%edx, %edi
+# endif
+
+	and	$-0x40, %eax		/* Round the scan pointer down to 64-byte alignment.  */
+
+	.p2align 4
+L(aligned_64_loop):
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+	movaps	(%eax), %xmm0
+	movaps	16(%eax), %xmm1
+	movaps	32(%eax), %xmm2
+	movaps	48(%eax), %xmm6
+	pminub	%xmm1, %xmm0
+	pminub	%xmm6, %xmm2
+	pminub	%xmm0, %xmm2		/* Byte-wise minimum of all four chunks.  */
+	pcmpeqb	%xmm3, %xmm2		/* %xmm3 is still all-zero: earlier compares into it found no NUL.  */
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	64(%eax), %eax
+	jz	L(aligned_64_loop)
+
+	pcmpeqb	-64(%eax), %xmm3	/* A NUL is in this 64-byte block; find which chunk.  */
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	48(%ecx), %ecx		/* Bias %ecx so L(exit) yields this chunk's offset.  */
+	jnz	L(exit)
+
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqb	-32(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqb	%xmm6, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	-16(%ecx), %ecx
+L(exit):
+	sub	%ecx, %eax		/* %eax = offset of the chunk that holds the NUL.  */
+	test	%dl, %dl
+	jz	L(exit_high)		/* No NUL in bytes 0-7: look at mask bits 8-15.  */
+
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(exit_8)
+	test	$0x01, %dl
+	jnz	L(exit_tail0)
+	test	$0x02, %dl
+	jnz	L(exit_tail1)
+	test	$0x04, %dl
+	jnz	L(exit_tail2)
+	add	$3, %eax
+	RETURN
+
+	.p2align 4
+L(exit_8):
+	test	$0x10, %dl
+	jnz	L(exit_tail4)
+	test	$0x20, %dl
+	jnz	L(exit_tail5)
+	test	$0x40, %dl
+	jnz	L(exit_tail6)
+	add	$7, %eax
+	RETURN
+
+	.p2align 4
+L(exit_high):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(exit_high_8)
+	test	$0x01, %dh
+	jnz	L(exit_tail8)
+	test	$0x02, %dh
+	jnz	L(exit_tail9)
+	test	$0x04, %dh
+	jnz	L(exit_tail10)
+	add	$11, %eax
+	RETURN
+
+	.p2align 4
+L(exit_high_8):
+	test	$0x10, %dh
+	jnz	L(exit_tail12)
+	test	$0x20, %dh
+	jnz	L(exit_tail13)
+	test	$0x40, %dh
+	jnz	L(exit_tail14)
+	add	$15, %eax
+L(exit_tail0):
+	RETURN
+
+# ifdef USE_AS_STRNLEN
+
+	.p2align 4
+L(len_less64):
+	pxor	%xmm0, %xmm0
+	add	$64, %edi		/* Undo the "sub $64" done before jumping here.  */
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %edi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %edi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %edi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	movl	LEN(%esp), %eax		/* No NUL found: return the length limit.  */
+	RETURN
+
+	.p2align 4
+L(strnlen_exit):
+	sub	%ecx, %eax
+
+	test	%dl, %dl
+	jz	L(strnlen_exit_high)
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(strnlen_exit_8)
+	test	$0x01, %dl
+	jnz	L(exit_tail0)
+	test	$0x02, %dl
+	jnz	L(strnlen_exit_tail1)
+	test	$0x04, %dl
+	jnz	L(strnlen_exit_tail2)
+	sub	$4, %edi
+	jb	L(return_start_len)
+	lea	3(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_8):
+	test	$0x10, %dl
+	jnz	L(strnlen_exit_tail4)
+	test	$0x20, %dl
+	jnz	L(strnlen_exit_tail5)
+	test	$0x40, %dl
+	jnz	L(strnlen_exit_tail6)
+	sub	$8, %edi
+	jb	L(return_start_len)
+	lea	7(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_high):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(strnlen_exit_high_8)
+	test	$0x01, %dh
+	jnz	L(strnlen_exit_tail8)
+	test	$0x02, %dh
+	jnz	L(strnlen_exit_tail9)
+	test	$0x04, %dh
+	jnz	L(strnlen_exit_tail10)
+	sub	$12, %edi
+	jb	L(return_start_len)
+	lea	11(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_high_8):
+	test	$0x10, %dh
+	jnz	L(strnlen_exit_tail12)
+	test	$0x20, %dh
+	jnz	L(strnlen_exit_tail13)
+	test	$0x40, %dh
+	jnz	L(strnlen_exit_tail14)
+	sub	$16, %edi
+	jb	L(return_start_len)
+	lea	15(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail1):
+	sub	$2, %edi
+	jb	L(return_start_len)
+	lea	1(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail2):
+	sub	$3, %edi
+	jb	L(return_start_len)
+	lea	2(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail4):
+	sub	$5, %edi
+	jb	L(return_start_len)
+	lea	4(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail5):
+	sub	$6, %edi
+	jb	L(return_start_len)
+	lea	5(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail6):
+	sub	$7, %edi
+	jb	L(return_start_len)
+	lea	6(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail8):
+	sub	$9, %edi
+	jb	L(return_start_len)
+	lea	8(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail9):
+	sub	$10, %edi
+	jb	L(return_start_len)
+	lea	9(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail10):
+	sub	$11, %edi
+	jb	L(return_start_len)
+	lea	10(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail12):
+	sub	$13, %edi
+	jb	L(return_start_len)
+	lea	12(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail13):
+	sub	$14, %edi
+	jb	L(return_start_len)
+	lea	13(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail14):
+	sub	$15, %edi
+	jb	L(return_start_len)
+	lea	14(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(return_start_len):
+	movl	LEN(%esp), %eax		/* Return the length limit argument.  */
+	RETURN
+
+/* for prolog only */
+
+	.p2align 4
+L(len_less4_prolog):
+	xor	%eax, %eax
+
+	add	$4, %edi		/* Undo the "sub $4": %edi is the limit again.  */
+	jz	L(exit_tail0)		/* Limit of 0: return 0 without reading memory.  */
+
+	cmpb	$0, (%edx)
+	jz	L(exit_tail0)
+	cmp	$1, %edi
+	je	L(exit_tail1)
+
+	cmpb	$0, 1(%edx)
+	jz	L(exit_tail1)
+	cmp	$2, %edi
+	je	L(exit_tail2)
+
+	cmpb	$0, 2(%edx)
+	jz	L(exit_tail2)
+	cmp	$3, %edi
+	je	L(exit_tail3)
+
+	cmpb	$0, 3(%edx)
+	jz	L(exit_tail3)
+	mov	$4, %eax
+	RETURN
+
+	.p2align 4
+L(len_less8_prolog):
+	add	$4, %edi
+
+	cmpb	$0, 4(%edx)
+	jz	L(exit_tail4)
+	cmp	$1, %edi
+	je	L(exit_tail5)
+
+	cmpb	$0, 5(%edx)
+	jz	L(exit_tail5)
+	cmp	$2, %edi
+	je	L(exit_tail6)
+
+	cmpb	$0, 6(%edx)
+	jz	L(exit_tail6)
+	cmp	$3, %edi
+	je	L(exit_tail7)
+
+	cmpb	$0, 7(%edx)
+	jz	L(exit_tail7)
+	mov	$8, %eax
+	RETURN
+
+
+	.p2align 4
+L(len_less12_prolog):
+	add	$4, %edi
+
+	cmpb	$0, 8(%edx)
+	jz	L(exit_tail8)
+	cmp	$1, %edi
+	je	L(exit_tail9)
+
+	cmpb	$0, 9(%edx)
+	jz	L(exit_tail9)
+	cmp	$2, %edi
+	je	L(exit_tail10)
+
+	cmpb	$0, 10(%edx)
+	jz	L(exit_tail10)
+	cmp	$3, %edi
+	je	L(exit_tail11)
+
+	cmpb	$0, 11(%edx)
+	jz	L(exit_tail11)
+	mov	$12, %eax
+	RETURN
+
+	.p2align 4
+L(len_less16_prolog):
+	add	$4, %edi
+
+	cmpb	$0, 12(%edx)
+	jz	L(exit_tail12)
+	cmp	$1, %edi
+	je	L(exit_tail13)
+
+	cmpb	$0, 13(%edx)
+	jz	L(exit_tail13)
+	cmp	$2, %edi
+	je	L(exit_tail14)
+
+	cmpb	$0, 14(%edx)
+	jz	L(exit_tail14)
+	cmp	$3, %edi
+	je	L(exit_tail15)
+
+	cmpb	$0, 15(%edx)
+	jz	L(exit_tail15)
+	mov	$16, %eax
+	RETURN
+# endif
+
+	.p2align 4
+L(exit_tail1):
+	add	$1, %eax
+	RETURN
+
+L(exit_tail2):
+	add	$2, %eax
+	RETURN
+
+L(exit_tail3):
+	add	$3, %eax
+	RETURN
+
+L(exit_tail4):
+	add	$4, %eax
+	RETURN
+
+L(exit_tail5):
+	add	$5, %eax
+	RETURN
+
+L(exit_tail6):
+	add	$6, %eax
+	RETURN
+
+L(exit_tail7):
+	add	$7, %eax
+	RETURN
+
+L(exit_tail8):
+	add	$8, %eax
+	RETURN
+
+L(exit_tail9):
+	add	$9, %eax
+	RETURN
+
+L(exit_tail10):
+	add	$10, %eax
+	RETURN
+
+L(exit_tail11):
+	add	$11, %eax
+	RETURN
+
+L(exit_tail12):
+	add	$12, %eax
+	RETURN
+
+L(exit_tail13):
+	add	$13, %eax
+	RETURN
+
+L(exit_tail14):
+	add	$14, %eax
+	RETURN
+
+L(exit_tail15):
+	add	$15, %eax
+# ifndef USE_AS_STRCAT
+	RETURN
+END (STRLEN)
+# endif
+#endif
diff --git a/sysdeps/i386/multiarch/strncat-i386.c b/sysdeps/i386/multiarch/strncat-i386.c
new file mode 100644
index 0000000000..0f22fbc035
--- /dev/null
+++ b/sysdeps/i386/multiarch/strncat-i386.c
@@ -0,0 +1,8 @@
+#define STRNCAT __strncat_i386
+#ifdef SHARED
+#undef libc_hidden_def
+#define libc_hidden_def(name) \
+  __hidden_ver1 (__strncat_i386, __GI___strncat, __strncat_i386);
+#endif
+
+#include "string/strncat.c"
diff --git a/sysdeps/i386/multiarch/strncat-sse2.S b/sysdeps/i386/multiarch/strncat-sse2.S
new file mode 100644
index 0000000000..f1045b72b8
--- /dev/null
+++ b/sysdeps/i386/multiarch/strncat-sse2.S
@@ -0,0 +1,4 @@
+#define STRCAT __strncat_sse2
+#define USE_AS_STRNCAT
+
+#include "strcat-sse2.S"
diff --git
a/sysdeps/i386/multiarch/strncat-ssse3.S b/sysdeps/i386/multiarch/strncat-ssse3.S
new file mode 100644
index 0000000000..625b90a978
--- /dev/null
+++ b/sysdeps/i386/multiarch/strncat-ssse3.S
@@ -0,0 +1,4 @@
+#define STRCAT __strncat_ssse3
+#define USE_AS_STRNCAT
+
+#include "strcat-ssse3.S"
diff --git a/sysdeps/i386/multiarch/strncat.c b/sysdeps/i386/multiarch/strncat.c
new file mode 100644
index 0000000000..7ab3e38df6
--- /dev/null
+++ b/sysdeps/i386/multiarch/strncat.c
@@ -0,0 +1,54 @@
+/* Multiple versions of strncat.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncat in static library since we
+   need strncat before the initialization happened.  */
+#if defined SHARED && IS_IN (libc)
+# define _HAVE_STRING_ARCH_strncat
+/* Redefine strncat so that the compiler won't complain about the type
+   mismatch with the IFUNC selector, strncat_ifunc, defined below.  */
+# undef strncat
+# define strncat __redirect_strncat
+# include <string.h>
+# undef strncat
+
+# include <init-arch.h>
+
+extern __typeof (__redirect_strncat) __strncat_i386 attribute_hidden;
+extern __typeof (__redirect_strncat) __strncat_sse2 attribute_hidden;
+extern __typeof (__redirect_strncat) __strncat_ssse3 attribute_hidden;
+
+/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
+   ifunc symbol properly.  */
+extern __typeof (__redirect_strncat) strncat;
+extern void *strncat_ifunc (void) __asm__ ("strncat");	/* Emitted under the assembler name "strncat".  */
+
+void *	/* Returns the address of the implementation to use.  */
+strncat_ifunc (void)
+{
+  if (HAS_CPU_FEATURE (SSSE3))		/* Checked first, so SSSE3 wins when both are set.  */
+    return __strncat_ssse3;
+  else if (HAS_CPU_FEATURE (SSE2))
+    return __strncat_sse2;
+
+  return __strncat_i386;		/* Neither feature reported.  */
+}
+__asm__ (".type strncat, %gnu_indirect_function");	/* Mark the strncat symbol as an IFUNC.  */
+#endif
-- 
cgit 1.4.1