From 5fa16e9b016b34788b9a48b5ab9752a583bb987c Mon Sep 17 00:00:00 2001 From: Liubov Dmitrieva Date: Thu, 4 Aug 2011 15:33:38 -0400 Subject: Improve x86-32 strcat functions with SSE2/SSSE3 --- sysdeps/i386/i686/multiarch/Makefile | 3 +- sysdeps/i386/i686/multiarch/strcat-sse2.S | 1244 +++++++++++++++++++++++++++ sysdeps/i386/i686/multiarch/strcat-ssse3.S | 573 ++++++++++++ sysdeps/i386/i686/multiarch/strcat.S | 131 +++ sysdeps/i386/i686/multiarch/strcpy-ssse3.S | 547 ++++++------ sysdeps/i386/i686/multiarch/strlen-sse2.S | 114 ++- sysdeps/i386/i686/multiarch/strncat-c.c | 8 + sysdeps/i386/i686/multiarch/strncat-sse2.S | 4 + sysdeps/i386/i686/multiarch/strncat-ssse3.S | 4 + sysdeps/i386/i686/multiarch/strncat.S | 3 + 10 files changed, 2314 insertions(+), 317 deletions(-) create mode 100644 sysdeps/i386/i686/multiarch/strcat-sse2.S create mode 100644 sysdeps/i386/i686/multiarch/strcat-ssse3.S create mode 100644 sysdeps/i386/i686/multiarch/strcat.S create mode 100644 sysdeps/i386/i686/multiarch/strncat-c.c create mode 100644 sysdeps/i386/i686/multiarch/strncat-sse2.S create mode 100644 sysdeps/i386/i686/multiarch/strncat-ssse3.S create mode 100644 sysdeps/i386/i686/multiarch/strncat.S (limited to 'sysdeps/i386') diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index 4bae699caf..83b2818a9d 100644 --- a/sysdeps/i386/i686/multiarch/Makefile +++ b/sysdeps/i386/i686/multiarch/Makefile @@ -12,7 +12,8 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \ memcmp-ssse3 memcmp-sse4 strcasestr-nonascii varshift \ strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \ strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \ - strncpy-sse2 stpcpy-sse2 stpncpy-sse2 + strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \ + strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c CFLAGS-varshift.c += -msse4 diff --git 
a/sysdeps/i386/i686/multiarch/strcat-sse2.S b/sysdeps/i386/i686/multiarch/strcat-sse2.S new file mode 100644 index 0000000000..b692036cec --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strcat-sse2.S @@ -0,0 +1,1244 @@ +/* strcat with SSE2 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + + +#ifndef NOT_IN_libc + +# include <sysdep.h> + + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifdef SHARED +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into ECX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register that contains the + index into the jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into ECX. */ \ + call __i686.get_pc_thunk.cx; \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ecx; \ + /* Get the entry and convert the relative offset to the \ + absolute address.
*/ \ + addl (%ecx,INDEX,SCALE), %ecx; \ + /* We loaded the jump table and adjusted ECX. Go. */ \ + jmp *%ecx +# else +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. INDEX is a register that contains the index into the + jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +# endif + +# ifndef STRCAT +# define STRCAT __strcat_sse2 +# endif + +# define PARMS 4 +# define STR1 PARMS+4 +# define STR2 STR1+4 + +# ifdef USE_AS_STRNCAT +# define LEN STR2+8 +# define STR3 STR1+4 +# else +# define STR3 STR1 +# endif + +# define USE_AS_STRCAT +# ifdef USE_AS_STRNCAT +# define RETURN POP(%ebx); POP(%esi); ret; CFI_PUSH(%ebx); CFI_PUSH(%esi); +# else +# define RETURN POP(%esi); ret; CFI_PUSH(%esi); +# endif + +.text +ENTRY (STRCAT) + PUSH (%esi) + mov STR1(%esp), %eax + mov STR2(%esp), %esi +# ifdef USE_AS_STRNCAT + PUSH (%ebx) + movl LEN(%esp), %ebx + test %ebx, %ebx + jz L(ExitZero) +# endif + cmpb $0, (%esi) + mov %esi, %ecx + mov %eax, %edx + jz L(ExitZero) + + and $63, %ecx + and $63, %edx + cmp $32, %ecx + ja L(StrlenCore7_1) + cmp $48, %edx + ja L(alignment_prolog) + + pxor %xmm0, %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm7, %xmm7 + movdqu (%eax), %xmm1 + movdqu (%esi), %xmm5 + pcmpeqb %xmm1, %xmm0 + movdqu 16(%esi), %xmm6 + pmovmskb %xmm0, %ecx + pcmpeqb %xmm5, %xmm4 + pcmpeqb %xmm6, %xmm7 + test %ecx, %ecx + jnz L(exit_less16_) + mov %eax, %ecx + and $-16, %eax + jmp L(loop_prolog) + +L(alignment_prolog): + pxor %xmm0, %xmm0 + pxor %xmm4, %xmm4 + mov %edx, %ecx + pxor %xmm7, %xmm7 + and $15, %ecx + and $-16, %eax + pcmpeqb (%eax), %xmm0 + movdqu (%esi), %xmm5 + movdqu 16(%esi), %xmm6 + pmovmskb %xmm0, %edx + pcmpeqb %xmm5, %xmm4 + shr %cl, %edx + pcmpeqb %xmm6, %xmm7 + test %edx, %edx + jnz L(exit_less16) + add %eax, %ecx + + pxor %xmm0, %xmm0 +L(loop_prolog): + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + .p2align 4
+L(align16_loop): + pcmpeqb 16(%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%eax), %xmm3 + pmovmskb %xmm3, %edx + lea 64(%eax), %eax + test %edx, %edx + jz L(align16_loop) + bsf %edx, %edx + add %edx, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit16): + bsf %edx, %edx + lea 16(%eax, %edx), %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit32): + bsf %edx, %edx + lea 32(%eax, %edx), %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit48): + bsf %edx, %edx + lea 48(%eax, %edx), %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_less16): + bsf %edx, %edx + add %ecx, %eax + add %edx, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_less16_): + bsf %ecx, %ecx + add %ecx, %eax + + .p2align 4 +L(StartStrcpyPart): + pmovmskb %xmm4, %edx +# ifdef USE_AS_STRNCAT + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1) + + movdqu %xmm5, (%eax) + pmovmskb %xmm7, %edx +# ifdef USE_AS_STRNCAT + cmp $32, %ebx + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes1) + + mov %esi, %ecx + and $-16, %esi + and $15, %ecx + pxor %xmm0, %xmm0 +# ifdef USE_AS_STRNCAT + add %ecx, %ebx +# endif + sub %ecx, %eax + jmp L(Unalign16Both) + +L(StrlenCore7_1): + mov %eax, %ecx + pxor %xmm0, %xmm0 + and $15, %ecx + and $-16, %eax + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + shr %cl, %edx + test %edx, %edx + jnz L(exit_less16_1) + add %eax, %ecx + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + .p2align 4 +L(align16_loop_1): + pcmpeqb 16(%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16_1) + + pcmpeqb 32(%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32_1) + + pcmpeqb 48(%eax), %xmm2 + pmovmskb 
%xmm2, %edx + test %edx, %edx + jnz L(exit48_1) + + pcmpeqb 64(%eax), %xmm3 + pmovmskb %xmm3, %edx + lea 64(%eax), %eax + test %edx, %edx + jz L(align16_loop_1) + bsf %edx, %edx + add %edx, %eax + jmp L(StartStrcpyPart_1) + + .p2align 4 +L(exit16_1): + bsf %edx, %edx + lea 16(%eax, %edx), %eax + jmp L(StartStrcpyPart_1) + + .p2align 4 +L(exit32_1): + bsf %edx, %edx + lea 32(%eax, %edx), %eax + jmp L(StartStrcpyPart_1) + + .p2align 4 +L(exit48_1): + bsf %edx, %edx + lea 48(%eax, %edx), %eax + jmp L(StartStrcpyPart_1) + + .p2align 4 +L(exit_less16_1): + bsf %edx, %edx + add %ecx, %eax + add %edx, %eax + + .p2align 4 +L(StartStrcpyPart_1): + mov %esi, %ecx + and $15, %ecx + and $-16, %esi + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +# ifdef USE_AS_STRNCAT + cmp $48, %ebx + ja L(BigN) +# endif + pcmpeqb (%esi), %xmm1 +# ifdef USE_AS_STRNCAT + add %ecx, %ebx +# endif + pmovmskb %xmm1, %edx + shr %cl, %edx +# ifdef USE_AS_STRNCAT + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%esi), %xmm0 + pmovmskb %xmm0, %edx +# ifdef USE_AS_STRNCAT + cmp $32, %ebx + jbe L(CopyFrom1To32BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes) + + movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%eax) + sub %ecx, %eax + + .p2align 4 +L(Unalign16Both): + mov $16, %ecx + movdqa (%esi, %ecx), %xmm1 + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%eax, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $48, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) +L(Unalign16BothBigN): + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%eax, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm4 + movdqu %xmm3, 
(%eax, %ecx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm1 + movdqu %xmm4, (%eax, %ecx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%eax, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%eax, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movdqu %xmm3, (%eax, %ecx) + mov %esi, %edx + lea 16(%esi, %ecx), %esi + and $-0x40, %esi + sub %esi, %edx + sub %edx, %eax +# ifdef USE_AS_STRNCAT + lea 128(%ebx, %edx), %ebx +# endif + movaps (%esi), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%esi), %xmm5 + movaps 32(%esi), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%esi), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx +# ifdef USE_AS_STRNCAT + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jnz L(Unaligned64Leave) + + .p2align 4 +L(Unaligned64Loop_start): + add $64, %eax + add $64, %esi + movdqu %xmm4, -64(%eax) + movaps (%esi), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%eax) + movaps 16(%esi), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%esi), %xmm3 + movdqu %xmm6, -32(%eax) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%eax) + movaps 48(%esi), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx +# ifdef 
USE_AS_STRNCAT + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jz L(Unaligned64Loop_start) + +L(Unaligned64Leave): + pxor %xmm1, %xmm1 + + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm5, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_0) + test %ecx, %ecx + jnz L(CopyFrom1To16BytesUnaligned_16) + + pcmpeqb %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_32) + + bsf %ecx, %edx + movdqu %xmm4, (%eax) + movdqu %xmm5, 16(%eax) + movdqu %xmm6, 32(%eax) + add $48, %esi + add $48, %eax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +# ifdef USE_AS_STRNCAT + .p2align 4 +L(BigN): + pcmpeqb (%esi), %xmm1 + pmovmskb %xmm1, %edx + shr %cl, %edx + test %edx, %edx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%esi), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(CopyFrom1To32Bytes) + + movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%eax) + sub %ecx, %eax + sub $48, %ebx + add %ecx, %ebx + + mov $16, %ecx + movdqa (%esi, %ecx), %xmm1 + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%eax, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + jmp L(Unalign16BothBigN) +# endif + +/*------------end of main part-------------------------------*/ + +/* Case1 */ + .p2align 4 +L(CopyFrom1To16Bytes): + add %ecx, %eax + add %ecx, %esi + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesTail): + add %ecx, %esi + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1): + add $16, %esi + add $16, %eax +L(CopyFrom1To16BytesTail1): + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes): + bsf %edx, %edx + add %ecx, %esi + add $16, %edx + sub %ecx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 
4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_0): + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_16): + bsf %ecx, %edx + movdqu %xmm4, (%eax) + add $16, %esi + add $16, %eax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_32): + bsf %edx, %edx + movdqu %xmm4, (%eax) + movdqu %xmm5, 16(%eax) + add $32, %esi + add $32, %eax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +# ifdef USE_AS_STRNCAT + + .p2align 4 +L(CopyFrom1To16BytesExit): + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +/* Case2 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %ebx + add %ecx, %eax + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + add $16, %edx + sub %ecx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + +L(CopyFrom1To16BytesTailCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + +L(CopyFrom1To16BytesTail1Case2): + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesCase2) +L(CopyFrom1To16BytesCase3): + add $16, %ebx + add %ecx, %eax + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To32BytesCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To16BytesTailCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTailCase2) + sub %ecx, 
%ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1Case2OrCase3): + add $16, %eax + add $16, %esi + sub $16, %ebx +L(CopyFrom1To16BytesTail1Case2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1Case2) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + +# endif + +# ifdef USE_AS_STRNCAT + .p2align 4 +L(StrncatExit0): + movb %bh, (%eax) + mov STR3(%esp), %eax + RETURN +# endif + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit1): + movb %bh, 1(%eax) +# endif +L(Exit1): +# ifdef USE_AS_STRNCAT + movb (%esi), %dh +# endif + movb %dh, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit2): + movb %bh, 2(%eax) +# endif +L(Exit2): + movw (%esi), %dx + movw %dx, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit3): + movb %bh, 3(%eax) +# endif +L(Exit3): + movw (%esi), %cx + movw %cx, (%eax) +# ifdef USE_AS_STRNCAT + movb 2(%esi), %dh +# endif + movb %dh, 2(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit4): + movb %bh, 4(%eax) +# endif +L(Exit4): + movl (%esi), %edx + movl %edx, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit5): + movb %bh, 5(%eax) +# endif +L(Exit5): + movl (%esi), %ecx +# ifdef USE_AS_STRNCAT + movb 4(%esi), %dh +# endif + movb %dh, 4(%eax) + movl %ecx, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit6): + movb %bh, 6(%eax) +# endif +L(Exit6): + movl (%esi), %ecx + movw 4(%esi), %dx + movl %ecx, (%eax) + movw %dx, 4(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit7): + movb %bh, 7(%eax) +# endif +L(Exit7): + movl (%esi), %ecx + movl 3(%esi), %edx + movl %ecx, (%eax) + movl %edx, 3(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit8): + movb %bh, 8(%eax) +# endif +L(Exit8): + 
movlpd (%esi), %xmm0 + movlpd %xmm0, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit9): + movb %bh, 9(%eax) +# endif +L(Exit9): + movlpd (%esi), %xmm0 +# ifdef USE_AS_STRNCAT + movb 8(%esi), %dh +# endif + movb %dh, 8(%eax) + movlpd %xmm0, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit10): + movb %bh, 10(%eax) +# endif +L(Exit10): + movlpd (%esi), %xmm0 + movw 8(%esi), %dx + movlpd %xmm0, (%eax) + movw %dx, 8(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit11): + movb %bh, 11(%eax) +# endif +L(Exit11): + movlpd (%esi), %xmm0 + movl 7(%esi), %edx + movlpd %xmm0, (%eax) + movl %edx, 7(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit12): + movb %bh, 12(%eax) +# endif +L(Exit12): + movlpd (%esi), %xmm0 + movl 8(%esi), %edx + movlpd %xmm0, (%eax) + movl %edx, 8(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit13): + movb %bh, 13(%eax) +# endif +L(Exit13): + movlpd (%esi), %xmm0 + movlpd 5(%esi), %xmm1 + movlpd %xmm0, (%eax) + movlpd %xmm1, 5(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit14): + movb %bh, 14(%eax) +# endif +L(Exit14): + movlpd (%esi), %xmm0 + movlpd 6(%esi), %xmm1 + movlpd %xmm0, (%eax) + movlpd %xmm1, 6(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit15): + movb %bh, 15(%eax) +# endif +L(Exit15): + movlpd (%esi), %xmm0 + movlpd 7(%esi), %xmm1 + movlpd %xmm0, (%eax) + movlpd %xmm1, 7(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit16): + movb %bh, 16(%eax) +# endif +L(Exit16): + movdqu (%esi), %xmm0 + movdqu %xmm0, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit17): + movb %bh, 17(%eax) +# endif +L(Exit17): + movdqu (%esi), %xmm0 +# ifdef USE_AS_STRNCAT + movb 
16(%esi), %dh +# endif + movdqu %xmm0, (%eax) + movb %dh, 16(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit18): + movb %bh, 18(%eax) +# endif +L(Exit18): + movdqu (%esi), %xmm0 + movw 16(%esi), %cx + movdqu %xmm0, (%eax) + movw %cx, 16(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit19): + movb %bh, 19(%eax) +# endif +L(Exit19): + movdqu (%esi), %xmm0 + movl 15(%esi), %ecx + movdqu %xmm0, (%eax) + movl %ecx, 15(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit20): + movb %bh, 20(%eax) +# endif +L(Exit20): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movdqu %xmm0, (%eax) + movl %ecx, 16(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit21): + movb %bh, 21(%eax) +# endif +L(Exit21): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx +# ifdef USE_AS_STRNCAT + movb 20(%esi), %dh +# endif + movdqu %xmm0, (%eax) + movl %ecx, 16(%eax) + movb %dh, 20(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit22): + movb %bh, 22(%eax) +# endif +L(Exit22): + movdqu (%esi), %xmm0 + movlpd 14(%esi), %xmm3 + movdqu %xmm0, (%eax) + movlpd %xmm3, 14(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit23): + movb %bh, 23(%eax) +# endif +L(Exit23): + movdqu (%esi), %xmm0 + movlpd 15(%esi), %xmm3 + movdqu %xmm0, (%eax) + movlpd %xmm3, 15(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit24): + movb %bh, 24(%eax) +# endif +L(Exit24): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit25): + movb %bh, 25(%eax) +# endif +L(Exit25): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 +# ifdef USE_AS_STRNCAT + movb 24(%esi), %dh +# endif + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + 
movb %dh, 24(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit26): + movb %bh, 26(%eax) +# endif +L(Exit26): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movw 24(%esi), %cx + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + movw %cx, 24(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit27): + movb %bh, 27(%eax) +# endif +L(Exit27): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 23(%esi), %ecx + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + movl %ecx, 23(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit28): + movb %bh, 28(%eax) +# endif +L(Exit28): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 24(%esi), %ecx + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + movl %ecx, 24(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit29): + movb %bh, 29(%eax) +# endif +L(Exit29): + movdqu (%esi), %xmm0 + movdqu 13(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 13(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit30): + movb %bh, 30(%eax) +# endif +L(Exit30): + movdqu (%esi), %xmm0 + movdqu 14(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 14(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit31): + movb %bh, 31(%eax) +# endif +L(Exit31): + movdqu (%esi), %xmm0 + movdqu 15(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 15(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit32): + movb %bh, 32(%eax) +# endif +L(Exit32): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 16(%eax) + mov STR3(%esp), %eax + RETURN + +# ifdef USE_AS_STRNCAT + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %edx, %edx + jnz L(Unaligned64LeaveCase2) +L(Unaligned64LeaveCase3): + lea 64(%ebx), %ecx + and $-16, %ecx + add $48, %ebx + jl 
L(CopyFrom1To16BytesCase3) + movdqu %xmm4, (%eax) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm5, 16(%eax) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm6, 32(%eax) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm7, 48(%eax) + xor %bh, %bh + movb %bh, 64(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +L(Unaligned64LeaveCase2): + xor %ecx, %ecx + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $48, %ebx + jle L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm4, (%eax) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm5, 16(%eax) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm6, 32(%eax) + lea 16(%eax, %ecx), %eax + lea 16(%esi, %ecx), %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) +# endif + .p2align 4 +L(ExitZero): + RETURN + +END (STRCAT) + + .p2align 4 + .section .rodata +L(ExitTable): + .int JMPTBL(L(Exit1), L(ExitTable)) + .int JMPTBL(L(Exit2), L(ExitTable)) + .int JMPTBL(L(Exit3), L(ExitTable)) + .int JMPTBL(L(Exit4), L(ExitTable)) + .int JMPTBL(L(Exit5), L(ExitTable)) + .int JMPTBL(L(Exit6), L(ExitTable)) + .int JMPTBL(L(Exit7), L(ExitTable)) + .int JMPTBL(L(Exit8), L(ExitTable)) + .int JMPTBL(L(Exit9), L(ExitTable)) + .int JMPTBL(L(Exit10), L(ExitTable)) + .int JMPTBL(L(Exit11), L(ExitTable)) + .int JMPTBL(L(Exit12), L(ExitTable)) + .int JMPTBL(L(Exit13), L(ExitTable)) + .int JMPTBL(L(Exit14), L(ExitTable)) + .int JMPTBL(L(Exit15), L(ExitTable)) + .int JMPTBL(L(Exit16), L(ExitTable)) + .int JMPTBL(L(Exit17), L(ExitTable)) + .int JMPTBL(L(Exit18), L(ExitTable)) + .int JMPTBL(L(Exit19), 
L(ExitTable)) + .int JMPTBL(L(Exit20), L(ExitTable)) + .int JMPTBL(L(Exit21), L(ExitTable)) + .int JMPTBL(L(Exit22), L(ExitTable)) + .int JMPTBL(L(Exit23), L(ExitTable)) + .int JMPTBL(L(Exit24), L(ExitTable)) + .int JMPTBL(L(Exit25), L(ExitTable)) + .int JMPTBL(L(Exit26), L(ExitTable)) + .int JMPTBL(L(Exit27), L(ExitTable)) + .int JMPTBL(L(Exit28), L(ExitTable)) + .int JMPTBL(L(Exit29), L(ExitTable)) + .int JMPTBL(L(Exit30), L(ExitTable)) + .int JMPTBL(L(Exit31), L(ExitTable)) + .int JMPTBL(L(Exit32), L(ExitTable)) +# ifdef USE_AS_STRNCAT +L(ExitStrncatTable): + .int JMPTBL(L(StrncatExit0), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit1), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit2), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit3), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit4), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit5), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit6), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit7), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit8), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit9), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit10), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit11), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit12), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit13), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit14), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit15), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit16), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit17), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit18), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit19), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit20), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit21), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit22), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit23), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit24), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit25), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit26), L(ExitStrncatTable)) + .int 
JMPTBL(L(StrncatExit27), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit28), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit29), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit30), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit31), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit32), L(ExitStrncatTable)) +# endif +#endif diff --git a/sysdeps/i386/i686/multiarch/strcat-ssse3.S b/sysdeps/i386/i686/multiarch/strcat-ssse3.S new file mode 100644 index 0000000000..d03b40a5f3 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strcat-ssse3.S @@ -0,0 +1,573 @@ +/* strcat with SSSE3 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + + +#ifndef NOT_IN_libc + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifndef STRCAT +# define STRCAT __strcat_ssse3 +# endif + +# define PARMS 4 +# define STR1 PARMS+4 +# define STR2 STR1+4 + +# ifdef USE_AS_STRNCAT +# define LEN STR2+8 +# endif + +# define USE_AS_STRCAT + +.text +ENTRY (STRCAT) + PUSH (%edi) + mov STR1(%esp), %edi + mov %edi, %edx + +# define RETURN jmp L(StartStrcpyPart) +# include "strlen-sse2.S" + +L(StartStrcpyPart): + mov STR2(%esp), %ecx + lea (%edi, %eax), %edx +# ifdef USE_AS_STRNCAT + PUSH (%ebx) + mov LEN(%esp), %ebx + test %ebx, %ebx + jz L(StrncatExit0) + cmp $8, %ebx + jbe L(StrncatExit8Bytes) +# endif + cmpb $0, (%ecx) + jz L(Exit1) + cmpb $0, 1(%ecx) + jz L(Exit2) + cmpb $0, 2(%ecx) + jz L(Exit3) + cmpb $0, 3(%ecx) + jz L(Exit4) + cmpb $0, 4(%ecx) + jz L(Exit5) + cmpb $0, 5(%ecx) + jz L(Exit6) + cmpb $0, 6(%ecx) + jz L(Exit7) + cmpb $0, 7(%ecx) + jz L(Exit8) + cmpb $0, 8(%ecx) + jz L(Exit9) +# ifdef USE_AS_STRNCAT + cmp $16, %ebx + jb L(StrncatExit15Bytes) +# endif + cmpb $0, 9(%ecx) + jz L(Exit10) + cmpb $0, 10(%ecx) + jz L(Exit11) + cmpb $0, 11(%ecx) + jz L(Exit12) + cmpb $0, 12(%ecx) + jz L(Exit13) + cmpb $0, 13(%ecx) + jz L(Exit14) + cmpb $0, 14(%ecx) + jz L(Exit15) + cmpb $0, 15(%ecx) + jz L(Exit16) +# ifdef USE_AS_STRNCAT + cmp $16, %ebx + je L(StrncatExit16) + +# define RETURN1 \ + POP (%ebx); \ + POP (%edi); \ + ret; \ + CFI_PUSH (%ebx); \ + CFI_PUSH (%edi) +# define USE_AS_STRNCPY +# else +# define RETURN1 POP (%edi); ret; CFI_PUSH (%edi) +# endif +# include "strcpy-ssse3.S" + .p2align 4 +L(CopyFrom1To16Bytes): + add %esi, %edx + add %esi, %ecx + + POP (%esi) + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test
$0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + movlpd (%ecx), %xmm0 + movlpd 8(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit1): + movb %bh, 1(%edx) +L(Exit1): + movb (%ecx), %al + movb %al, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit2): + movb %bh, 2(%edx) +L(Exit2): + movw (%ecx), %ax + movw %ax, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit3): + movb %bh, 3(%edx) +L(Exit3): + movw (%ecx), %ax + movw %ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit4): + movb %bh, 4(%edx) +L(Exit4): + movl (%ecx), %eax + movl %eax, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit5): + movb %bh, 5(%edx) +L(Exit5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit6): + movb %bh, 6(%edx) +L(Exit6): + movl (%ecx), %eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit7): + movb %bh, 7(%edx) +L(Exit7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), %eax + movl %eax, 3(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit8): + movb %bh, 8(%edx) +L(Exit8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit9): + movb %bh, 9(%edx) +L(Exit9): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movb 8(%ecx), %al + movb %al, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 
+L(StrncatExit10): + movb %bh, 10(%edx) +L(Exit10): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movw 8(%ecx), %ax + movw %ax, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit11): + movb %bh, 11(%edx) +L(Exit11): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 7(%ecx), %eax + movl %eax, 7(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit12): + movb %bh, 12(%edx) +L(Exit12): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit13): + movb %bh, 13(%edx) +L(Exit13): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 5(%ecx), %xmm0 + movlpd %xmm0, 5(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit14): + movb %bh, 14(%edx) +L(Exit14): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 6(%ecx), %xmm0 + movlpd %xmm0, 6(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit15): + movb %bh, 15(%edx) +L(Exit15): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit16): + movb %bh, 16(%edx) +L(Exit16): + movlpd (%ecx), %xmm0 + movlpd 8(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) + movl %edi, %eax + RETURN1 + +# ifdef USE_AS_STRNCPY + + CFI_PUSH(%esi) + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %ebx + add %esi, %ecx + lea (%esi, %edx), %esi + lea -9(%ebx), %edx + and $1<<7, %dh + or %al, %dh + test %dh, %dh + lea (%esi), %edx + POP (%esi) + jz L(ExitHighCase2) + + test $0x01, %al + jnz L(Exit1) + cmp $1, %ebx + je L(StrncatExit1) + test $0x02, %al + jnz L(Exit2) + cmp $2, %ebx + je L(StrncatExit2) + test $0x04, %al + jnz L(Exit3) + cmp $3, %ebx + je L(StrncatExit3) + test $0x08, %al + jnz L(Exit4) + cmp $4, %ebx + je L(StrncatExit4) + test $0x10, %al + jnz L(Exit5) + cmp $5, %ebx + je L(StrncatExit5) + test $0x20, %al + jnz L(Exit6) + cmp $6, %ebx + je L(StrncatExit6) + test $0x40, %al + jnz 
L(Exit7) + cmp $7, %ebx + je L(StrncatExit7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + lea 7(%edx), %eax + cmpb $1, (%eax) + sbb $-1, %eax + xor %cl, %cl + movb %cl, (%eax) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(ExitHighCase2): + test $0x01, %ah + jnz L(Exit9) + cmp $9, %ebx + je L(StrncatExit9) + test $0x02, %ah + jnz L(Exit10) + cmp $10, %ebx + je L(StrncatExit10) + test $0x04, %ah + jnz L(Exit11) + cmp $11, %ebx + je L(StrncatExit11) + test $0x8, %ah + jnz L(Exit12) + cmp $12, %ebx + je L(StrncatExit12) + test $0x10, %ah + jnz L(Exit13) + cmp $13, %ebx + je L(StrncatExit13) + test $0x20, %ah + jnz L(Exit14) + cmp $14, %ebx + je L(StrncatExit14) + test $0x40, %ah + jnz L(Exit15) + cmp $15, %ebx + je L(StrncatExit15) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 8(%ecx), %xmm1 + movlpd %xmm1, 8(%edx) + movl %edi, %eax + RETURN1 + + CFI_PUSH(%esi) + +L(CopyFrom1To16BytesCase2OrCase3): + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + + .p2align 4 +L(CopyFrom1To16BytesCase3): + add $16, %ebx + add %esi, %edx + add %esi, %ecx + + POP (%esi) + + cmp $8, %ebx + ja L(ExitHighCase3) + cmp $1, %ebx + je L(StrncatExit1) + cmp $2, %ebx + je L(StrncatExit2) + cmp $3, %ebx + je L(StrncatExit3) + cmp $4, %ebx + je L(StrncatExit4) + cmp $5, %ebx + je L(StrncatExit5) + cmp $6, %ebx + je L(StrncatExit6) + cmp $7, %ebx + je L(StrncatExit7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movb %bh, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(ExitHighCase3): + cmp $9, %ebx + je L(StrncatExit9) + cmp $10, %ebx + je L(StrncatExit10) + cmp $11, %ebx + je L(StrncatExit11) + cmp $12, %ebx + je L(StrncatExit12) + cmp $13, %ebx + je L(StrncatExit13) + cmp $14, %ebx + je L(StrncatExit14) + cmp $15, %ebx + je L(StrncatExit15) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 8(%ecx), %xmm1 + movlpd %xmm1, 8(%edx) + movb %bh, 16(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit0): + movl %edi, %eax + RETURN1 + + .p2align 4 
+L(StrncatExit15Bytes): + cmp $9, %ebx + je L(StrncatExit9) + cmpb $0, 9(%ecx) + jz L(Exit10) + cmp $10, %ebx + je L(StrncatExit10) + cmpb $0, 10(%ecx) + jz L(Exit11) + cmp $11, %ebx + je L(StrncatExit11) + cmpb $0, 11(%ecx) + jz L(Exit12) + cmp $12, %ebx + je L(StrncatExit12) + cmpb $0, 12(%ecx) + jz L(Exit13) + cmp $13, %ebx + je L(StrncatExit13) + cmpb $0, 13(%ecx) + jz L(Exit14) + cmp $14, %ebx + je L(StrncatExit14) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) + lea 14(%edx), %eax + cmpb $1, (%eax) + sbb $-1, %eax + movb %bh, (%eax) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit8Bytes): + cmpb $0, (%ecx) + jz L(Exit1) + cmp $1, %ebx + je L(StrncatExit1) + cmpb $0, 1(%ecx) + jz L(Exit2) + cmp $2, %ebx + je L(StrncatExit2) + cmpb $0, 2(%ecx) + jz L(Exit3) + cmp $3, %ebx + je L(StrncatExit3) + cmpb $0, 3(%ecx) + jz L(Exit4) + cmp $4, %ebx + je L(StrncatExit4) + cmpb $0, 4(%ecx) + jz L(Exit5) + cmp $5, %ebx + je L(StrncatExit5) + cmpb $0, 5(%ecx) + jz L(Exit6) + cmp $6, %ebx + je L(StrncatExit6) + cmpb $0, 6(%ecx) + jz L(Exit7) + cmp $7, %ebx + je L(StrncatExit7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + lea 7(%edx), %eax + cmpb $1, (%eax) + sbb $-1, %eax + movb %bh, (%eax) + movl %edi, %eax + RETURN1 + +# endif +END (STRCAT) +#endif diff --git a/sysdeps/i386/i686/multiarch/strcat.S b/sysdeps/i386/i686/multiarch/strcat.S new file mode 100644 index 0000000000..50850f977f --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strcat.S @@ -0,0 +1,131 @@ +/* Multiple versions of strcat + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include + +#ifndef USE_AS_STRNCAT +# ifndef STRCAT +# define STRCAT strcat +# endif +#endif + +#ifdef USE_AS_STRNCAT +# define STRCAT_SSSE3 __strncat_ssse3 +# define STRCAT_SSE2 __strncat_sse2 +# define STRCAT_IA32 __strncat_ia32 +# define __GI_STRCAT __GI_strncat +#else +# define STRCAT_SSSE3 __strcat_ssse3 +# define STRCAT_SSE2 __strcat_sse2 +# define STRCAT_IA32 __strcat_ia32 +# define __GI_STRCAT __GI_strcat +#endif + + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strncat in static library since we + need strncat before the initialization happened. 
*/ +#ifndef NOT_IN_libc + +# ifdef SHARED + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(STRCAT) + .type STRCAT, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal STRCAT_IA32@GOTOFF(%ebx), %eax + testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx) + jz 2f + leal STRCAT_SSE2@GOTOFF(%ebx), %eax + testl $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx) + jnz 2f + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) + jz 2f + leal STRCAT_SSSE3@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + ret +END(STRCAT) +# else + +ENTRY(STRCAT) + .type STRCAT, @gnu_indirect_function + cmpl $0, KIND_OFFSET+__cpu_features + jne 1f + call __init_cpu_features +1: leal STRCAT_IA32, %eax + testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features + jz 2f + leal STRCAT_SSE2, %eax + testl $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features + jnz 2f + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features + jz 2f + leal STRCAT_SSSE3, %eax +2: ret +END(STRCAT) + +# endif + +# undef ENTRY +# define ENTRY(name) \ + .type STRCAT_IA32, @function; \ + .align 16; \ + STRCAT_IA32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCAT_IA32, .-STRCAT_IA32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcat calls through a PLT. + The speedup we get from using SSSE3 instruction is likely eaten away + by the indirect call in the PLT. 
*/ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCAT; __GI_STRCAT = STRCAT_IA32 +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI___STRCAT; __GI___STRCAT = STRCAT_IA32 + +# endif +#endif + +#ifndef USE_AS_STRNCAT +# include "../../i486/strcat.S" +#endif + diff --git a/sysdeps/i386/i686/multiarch/strcpy-ssse3.S b/sysdeps/i386/i686/multiarch/strcpy-ssse3.S index 75a1952e62..073856ff84 100644 --- a/sysdeps/i386/i686/multiarch/strcpy-ssse3.S +++ b/sysdeps/i386/i686/multiarch/strcpy-ssse3.S @@ -20,39 +20,39 @@ #ifndef NOT_IN_libc +# ifndef USE_AS_STRCAT +# include -# include - -# define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ cfi_rel_offset (REG, 0) -# define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ cfi_restore (REG) -# define PUSH(REG) pushl REG; CFI_PUSH (REG) -# define POP(REG) popl REG; CFI_POP (REG) +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) -# ifndef STRCPY -# define STRCPY __strcpy_ssse3 -# endif +# ifndef STRCPY +# define STRCPY __strcpy_ssse3 +# endif -# ifdef USE_AS_STRNCPY -# define PARMS 8 -# define ENTRANCE PUSH(%ebx) -# define RETURN POP(%ebx); ret; CFI_PUSH(%ebx); -# define RETURN1 POP(%edi); POP(%ebx); ret; CFI_PUSH(%ebx); CFI_PUSH(%edi) -# else -# define PARMS 4 -# define ENTRANCE -# define RETURN ret -# define RETURN1 POP(%edi); ret; CFI_PUSH(%edi) -# endif +# ifdef USE_AS_STRNCPY +# define PARMS 8 +# define ENTRANCE PUSH(%ebx) +# define RETURN POP(%ebx); ret; CFI_PUSH(%ebx); +# define RETURN1 POP(%edi); POP(%ebx); ret; CFI_PUSH(%ebx); CFI_PUSH(%edi) +# else +# define PARMS 4 +# define ENTRANCE +# define RETURN ret +# define RETURN1 POP(%edi); ret; CFI_PUSH(%edi) +# endif -# define STR1 PARMS -# define STR2 STR1+4 -# define LEN STR2+4 +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 /* In this code following instructions are used 
for copying: movb - 1 byte @@ -60,9 +60,9 @@ movl - 4 byte movlpd - 8 byte movaps - 16 byte - requires 16 byte alignment - of sourse and destination adresses. + of sourse and destination adresses. 16 byte alignment: adress is 32bit value, - right four bit of adress shall be 0. + right four bit of adress shall be 0. */ .text @@ -70,13 +70,13 @@ ENTRY (STRCPY) ENTRANCE mov STR1(%esp), %edx mov STR2(%esp), %ecx -# ifdef USE_AS_STRNCPY +# ifdef USE_AS_STRNCPY movl LEN(%esp), %ebx test %ebx, %ebx jz L(ExitTail0) cmp $8, %ebx jbe L(StrncpyExit8Bytes) -# endif +# endif cmpb $0, (%ecx) jz L(ExitTail1) cmpb $0, 1(%ecx) @@ -93,10 +93,10 @@ ENTRY (STRCPY) jz L(ExitTail7) cmpb $0, 7(%ecx) jz L(ExitTail8) -# ifdef USE_AS_STRNCPY +# ifdef USE_AS_STRNCPY cmp $16, %ebx jb L(StrncpyExit15Bytes) -# endif +# endif cmpb $0, 8(%ecx) jz L(ExitTail9) cmpb $0, 9(%ecx) @@ -111,18 +111,20 @@ ENTRY (STRCPY) jz L(ExitTail14) cmpb $0, 14(%ecx) jz L(ExitTail15) -# ifdef USE_AS_STRNCPY +# ifdef USE_AS_STRNCPY cmp $16, %ebx je L(ExitTail16) -# endif +# endif cmpb $0, 15(%ecx) jz L(ExitTail16) PUSH (%edi) mov %edx, %edi +# endif PUSH (%esi) # ifdef USE_AS_STRNCPY mov %ecx, %esi + sub $16, %ebx and $0xf, %esi /* add 16 bytes ecx_shift to ebx */ @@ -159,7 +161,7 @@ ENTRY (STRCPY) /* eax = 0: there isn't end of string from position esi to esi+15 */ # ifdef USE_AS_STRNCPY - sub $32, %ebx + sub $16, %ebx jbe L(CopyFrom1To16BytesCase2OrCase3) # endif test %eax, %eax @@ -2217,12 +2219,17 @@ L(Shl15LoopExit): mov $1, %esi palignr $15, %xmm1, %xmm6 movaps %xmm6, (%edx) +# ifdef USE_AS_STRCAT + jmp L(CopyFrom1To16Bytes) +# endif + +# ifndef USE_AS_STRCAT .p2align 4 L(CopyFrom1To16Bytes): -# ifdef USE_AS_STRNCPY +# ifdef USE_AS_STRNCPY add $16, %ebx -# endif +# endif add %esi, %edx add %esi, %ecx @@ -2248,20 +2255,20 @@ L(CopyFrom1To16Bytes): L(Exit8): movlpd (%ecx), %xmm0 movlpd %xmm0, (%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 7(%edx), %eax -# else +# else movl %edi, %eax -# endif -# ifdef 
USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $8, %ebx lea 8(%edx), %ecx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN1 .p2align 4 @@ -2287,23 +2294,23 @@ L(Exit16): movlpd %xmm0, (%edx) movlpd 8(%ecx), %xmm0 movlpd %xmm0, 8(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 15(%edx), %eax -# else +# else movl %edi, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $16, %ebx lea 16(%edx), %ecx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN1 -# ifdef USE_AS_STRNCPY +# ifdef USE_AS_STRNCPY CFI_PUSH(%esi) @@ -2425,46 +2432,46 @@ L(Less12Case3): /* but more than 8 */ jl L(Exit9) je L(Exit10) jg L(Exit11) -# endif +# endif .p2align 4 L(Exit1): movb (%ecx), %al movb %al, (%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea (%edx), %eax -# else +# else movl %edi, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $1, %ebx lea 1(%edx), %ecx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN1 .p2align 4 L(Exit2): movw (%ecx), %ax movw %ax, (%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 1(%edx), %eax -# else +# else movl %edi, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $2, %ebx lea 2(%edx), %ecx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN1 .p2align 4 @@ -2473,40 +2480,40 @@ L(Exit3): movw %ax, (%edx) movb 2(%ecx), %al movb %al, 2(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 2(%edx), %eax -# else +# else movl %edi, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $3, %ebx lea 3(%edx), %ecx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY 
cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN1 .p2align 4 L(Exit4): movl (%ecx), %eax movl %eax, (%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 3(%edx), %eax -# else +# else movl %edi, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $4, %ebx lea 4(%edx), %ecx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN1 .p2align 4 @@ -2515,20 +2522,20 @@ L(Exit5): movl %eax, (%edx) movb 4(%ecx), %al movb %al, 4(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 4(%edx), %eax -# else +# else movl %edi, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $5, %ebx lea 5(%edx), %ecx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN1 .p2align 4 @@ -2537,20 +2544,20 @@ L(Exit6): movl %eax, (%edx) movw 4(%ecx), %ax movw %ax, 4(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 5(%edx), %eax -# else +# else movl %edi, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $6, %ebx lea 6(%edx), %ecx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN1 .p2align 4 @@ -2559,20 +2566,20 @@ L(Exit7): movl %eax, (%edx) movl 3(%ecx), %eax movl %eax, 3(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 6(%edx), %eax -# else +# else movl %edi, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $7, %ebx lea 7(%edx), %ecx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN1 .p2align 4 @@ -2581,20 +2588,20 @@ L(Exit9): movlpd %xmm0, (%edx) movb 8(%ecx), %al movb %al, 8(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 8(%edx), %eax -# else +# else movl %edi, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif 
+# ifdef USE_AS_STRNCPY sub $9, %ebx lea 9(%edx), %ecx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN1 .p2align 4 @@ -2603,20 +2610,20 @@ L(Exit10): movlpd %xmm0, (%edx) movw 8(%ecx), %ax movw %ax, 8(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 9(%edx), %eax -# else +# else movl %edi, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $10, %ebx lea 10(%edx), %ecx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN1 .p2align 4 @@ -2625,20 +2632,20 @@ L(Exit11): movlpd %xmm0, (%edx) movl 7(%ecx), %eax movl %eax, 7(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 10(%edx), %eax -# else +# else movl %edi, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $11, %ebx lea 11(%edx), %ecx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN1 .p2align 4 @@ -2647,20 +2654,20 @@ L(Exit12): movlpd %xmm0, (%edx) movl 8(%ecx), %eax movl %eax, 8(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 11(%edx), %eax -# else +# else movl %edi, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $12, %ebx lea 12(%edx), %ecx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN1 .p2align 4 @@ -2669,20 +2676,20 @@ L(Exit13): movlpd %xmm0, (%edx) movlpd 5(%ecx), %xmm0 movlpd %xmm0, 5(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 12(%edx), %eax -# else +# else movl %edi, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $13, %ebx lea 13(%edx), %ecx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN1 .p2align 4 @@ -2691,20 
+2698,20 @@ L(Exit14): movlpd %xmm0, (%edx) movlpd 6(%ecx), %xmm0 movlpd %xmm0, 6(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 13(%edx), %eax -# else +# else movl %edi, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $14, %ebx lea 14(%edx), %ecx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN1 .p2align 4 @@ -2713,25 +2720,25 @@ L(Exit15): movlpd %xmm0, (%edx) movlpd 7(%ecx), %xmm0 movlpd %xmm0, 7(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 14(%edx), %eax -# else +# else movl %edi, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $15, %ebx lea 15(%edx), %ecx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN1 CFI_POP (%edi) -# ifdef USE_AS_STRNCPY +# ifdef USE_AS_STRNCPY .p2align 4 L(Fill0): RETURN @@ -2865,11 +2872,11 @@ L(FillLess12): /* but more than 8 */ je L(Fill10) jmp L(Fill11) - CFI_PUSH(%edi) + CFI_PUSH (%edi) .p2align 4 L(StrncpyFillTailWithZero1): - POP (%edi) + POP (%edi) L(StrncpyFillTailWithZero): pxor %xmm0, %xmm0 xor %edx, %edx @@ -2916,46 +2923,46 @@ L(StrncpyFillLess32): movdqa %xmm0, (%ecx) lea 16(%ecx), %ecx jmp L(FillFrom1To16Bytes) -# endif +# endif .p2align 4 L(ExitTail1): movb (%ecx), %al movb %al, (%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea (%edx), %eax -# else +# else movl %edx, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $1, %ebx lea 1(%edx), %ecx jnz L(StrncpyFillTailWithZero) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN .p2align 4 L(ExitTail2): movw (%ecx), %ax movw %ax, (%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 1(%edx), %eax -# else +# else movl %edx, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $2, %ebx lea 2(%edx), %ecx jnz 
L(StrncpyFillTailWithZero) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN .p2align 4 @@ -2964,40 +2971,40 @@ L(ExitTail3): movw %ax, (%edx) movb 2(%ecx), %al movb %al, 2(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 2(%edx), %eax -# else +# else movl %edx, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $3, %ebx lea 3(%edx), %ecx jnz L(StrncpyFillTailWithZero) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN .p2align 4 L(ExitTail4): movl (%ecx), %eax movl %eax, (%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 3(%edx), %eax -# else +# else movl %edx, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $4, %ebx lea 4(%edx), %ecx jnz L(StrncpyFillTailWithZero) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN .p2align 4 @@ -3006,20 +3013,20 @@ L(ExitTail5): movl %eax, (%edx) movb 4(%ecx), %al movb %al, 4(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 4(%edx), %eax -# else +# else movl %edx, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $5, %ebx lea 5(%edx), %ecx jnz L(StrncpyFillTailWithZero) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN .p2align 4 @@ -3028,20 +3035,20 @@ L(ExitTail6): movl %eax, (%edx) movw 4(%ecx), %ax movw %ax, 4(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 5(%edx), %eax -# else +# else movl %edx, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $6, %ebx lea 6(%edx), %ecx jnz L(StrncpyFillTailWithZero) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN .p2align 4 @@ -3050,20 +3057,40 @@ L(ExitTail7): movl %eax, (%edx) movl 3(%ecx), %eax movl %eax, 3(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 
6(%edx), %eax -# else +# else movl %edx, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $7, %ebx lea 7(%edx), %ecx jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) # ifdef USE_AS_STPCPY + lea 7(%edx), %eax +# else + movl %edx, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $8, %ebx + lea 8(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN .p2align 4 @@ -3072,20 +3099,20 @@ L(ExitTail9): movlpd %xmm0, (%edx) movb 8(%ecx), %al movb %al, 8(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 8(%edx), %eax -# else +# else movl %edx, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $9, %ebx lea 9(%edx), %ecx jnz L(StrncpyFillTailWithZero) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN .p2align 4 @@ -3094,20 +3121,20 @@ L(ExitTail10): movlpd %xmm0, (%edx) movw 8(%ecx), %ax movw %ax, 8(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 9(%edx), %eax -# else +# else movl %edx, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $10, %ebx lea 10(%edx), %ecx jnz L(StrncpyFillTailWithZero) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN .p2align 4 @@ -3116,20 +3143,20 @@ L(ExitTail11): movlpd %xmm0, (%edx) movl 7(%ecx), %eax movl %eax, 7(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 10(%edx), %eax -# else +# else movl %edx, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $11, %ebx lea 11(%edx), %ecx jnz L(StrncpyFillTailWithZero) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN .p2align 4 @@ -3138,20 +3165,20 @@ L(ExitTail12): movlpd %xmm0, (%edx) movl 8(%ecx), 
%eax movl %eax, 8(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 11(%edx), %eax -# else +# else movl %edx, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $12, %ebx lea 12(%edx), %ecx jnz L(StrncpyFillTailWithZero) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN .p2align 4 @@ -3160,20 +3187,20 @@ L(ExitTail13): movlpd %xmm0, (%edx) movlpd 5(%ecx), %xmm0 movlpd %xmm0, 5(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 12(%edx), %eax -# else +# else movl %edx, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $13, %ebx lea 13(%edx), %ecx jnz L(StrncpyFillTailWithZero) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN .p2align 4 @@ -3182,20 +3209,42 @@ L(ExitTail14): movlpd %xmm0, (%edx) movlpd 6(%ecx), %xmm0 movlpd %xmm0, 6(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 13(%edx), %eax -# else +# else movl %edx, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $14, %ebx lea 14(%edx), %ecx jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail15): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) # ifdef USE_AS_STPCPY + lea 14(%edx), %eax +# else + movl %edx, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $15, %ebx + lea 15(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN .p2align 4 @@ -3204,24 +3253,28 @@ L(ExitTail16): movlpd %xmm0, (%edx) movlpd 8(%ecx), %xmm0 movlpd %xmm0, 8(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 15(%edx), %eax -# else +# else movl %edx, %eax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $16, %ebx lea 16(%edx), %ecx jnz L(StrncpyFillTailWithZero) -# ifdef USE_AS_STPCPY +# ifdef 
USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax +# endif # endif -# endif RETURN +#endif + # ifdef USE_AS_STRNCPY - CFI_PUSH (%esi) - CFI_PUSH (%edi) +# ifndef USE_AS_STRCAT + CFI_PUSH (%esi) + CFI_PUSH (%edi) +# endif L(StrncpyLeaveCase2OrCase3): test %eax, %eax jnz L(Aligned64LeaveCase2) @@ -3979,9 +4032,13 @@ L(StrncpyExit15): movaps %xmm6, (%edx, %esi) lea 1(%esi), %esi jmp L(CopyFrom1To16BytesCase3) +# endif + +# ifndef USE_AS_STRCAT +# ifdef USE_AS_STRNCPY + CFI_POP (%esi) + CFI_POP (%edi) - CFI_POP (%esi) - CFI_POP (%edi) .p2align 4 L(ExitTail0): movl %edx, %eax @@ -4013,31 +4070,19 @@ L(StrncpyExit15Bytes): je L(ExitTail14) cmpb $0, 13(%ecx) jz L(ExitTail14) -# endif - - .p2align 4 -L(ExitTail15): movlpd (%ecx), %xmm0 movlpd %xmm0, (%edx) movlpd 7(%ecx), %xmm0 movlpd %xmm0, 7(%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 14(%edx), %eax -# else - movl %edx, %eax -# endif -# ifdef USE_AS_STRNCPY - sub $15, %ebx - lea 15(%edx), %ecx - jnz L(StrncpyFillTailWithZero) -# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax -# endif -# endif +# else + movl %edx, %eax +# endif RETURN -# ifdef USE_AS_STRNCPY .p2align 4 L(StrncpyExit8Bytes): cmp $1, %ebx @@ -4068,27 +4113,19 @@ L(StrncpyExit8Bytes): je L(ExitTail7) cmpb $0, 6(%ecx) jz L(ExitTail7) -# endif - .p2align 4 -L(ExitTail8): movlpd (%ecx), %xmm0 movlpd %xmm0, (%edx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 7(%edx), %eax -# else - movl %edx, %eax -# endif -# ifdef USE_AS_STRNCPY - sub $8, %ebx - lea 8(%edx), %ecx - jnz L(StrncpyFillTailWithZero) -# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax -# endif -# endif +# else + movl %edx, %eax +# endif RETURN +# endif -END (STRCPY) +END (STRCPY) +# endif #endif diff --git a/sysdeps/i386/i686/multiarch/strlen-sse2.S b/sysdeps/i386/i686/multiarch/strlen-sse2.S index 65809d985b..0eb872733d 100644 --- a/sysdeps/i386/i686/multiarch/strlen-sse2.S +++ b/sysdeps/i386/i686/multiarch/strlen-sse2.S @@ -1,5 +1,5 @@ /* strlen with SSE2 - Copyright (C) 2010 Free 
Software Foundation, Inc. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -18,30 +18,32 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ -#if defined SHARED && !defined NOT_IN_libc +#if (defined USE_AS_STRCAT || defined SHARED) && !defined NOT_IN_libc +# ifndef USE_AS_STRCAT -#include -#include "asm-syntax.h" +# include +# include "asm-syntax.h" -#define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) -#define PUSH(REG) pushl REG; CFI_PUSH (REG) -#define POP(REG) popl REG; CFI_POP (REG) -#define PARMS 4 -#define STR PARMS -#define ENTRANCE -#define RETURN ret +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) +# define PARMS 4 +# define STR PARMS +# define ENTRANCE +# define RETURN ret .text ENTRY (__strlen_sse2) ENTRANCE mov STR(%esp), %edx +# endif xor %eax, %eax cmpb $0, (%edx) jz L(exit_tail0) @@ -77,9 +79,8 @@ ENTRY (__strlen_sse2) jz L(exit_tail15) pxor %xmm0, %xmm0 mov %edx, %eax - mov %edx, %ecx + lea 16(%edx), %ecx and $-16, %eax - add $16, %ecx add $16, %eax pcmpeqb (%eax), %xmm0 @@ -183,51 +184,41 @@ ENTRY (__strlen_sse2) jnz L(exit) and $-0x40, %eax - PUSH (%esi) - PUSH (%edi) - PUSH (%ebx) - PUSH (%ebp) - xor %ebp, %ebp L(aligned_64): - pcmpeqb (%eax), %xmm0 - pcmpeqb 16(%eax), %xmm1 - pcmpeqb 32(%eax), %xmm2 - pcmpeqb 48(%eax), %xmm3 - pmovmskb %xmm0, %edx - pmovmskb %xmm1, %esi - pmovmskb %xmm2, %edi - pmovmskb %xmm3, %ebx - or %edx, %ebp - or %esi, %ebp - or %edi, %ebp - or %ebx, %ebp + movaps (%eax), %xmm0 + movaps 16(%eax), %xmm1 + movaps 32(%eax), %xmm2 + movaps 48(%eax), %xmm6 + pminub %xmm1, %xmm0 + pminub %xmm6, %xmm2 + pminub 
%xmm0, %xmm2 + pcmpeqb %xmm3, %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx lea 64(%eax), %eax jz L(aligned_64) -L(48leave): + + pcmpeqb -64(%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 48(%ecx), %ecx + jnz L(exit) + + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %edx test %edx, %edx - jnz L(aligned_64_exit_16) - test %esi, %esi - jnz L(aligned_64_exit_32) - test %edi, %edi - jnz L(aligned_64_exit_48) - mov %ebx, %edx - lea (%eax), %eax - jmp L(aligned_64_exit) -L(aligned_64_exit_48): - lea -16(%eax), %eax - mov %edi, %edx - jmp L(aligned_64_exit) -L(aligned_64_exit_32): - lea -32(%eax), %eax - mov %esi, %edx - jmp L(aligned_64_exit) -L(aligned_64_exit_16): - lea -48(%eax), %eax -L(aligned_64_exit): - POP (%ebp) - POP (%ebx) - POP (%edi) - POP (%esi) + lea -16(%ecx), %ecx + jnz L(exit) + + pcmpeqb -32(%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + pcmpeqb %xmm6, %xmm3 + pmovmskb %xmm3, %edx + lea -16(%ecx), %ecx L(exit): sub %ecx, %eax test %dl, %dl @@ -340,8 +331,9 @@ L(exit_tail14): L(exit_tail15): add $15, %eax +# ifndef USE_AS_STRCAT ret - END (__strlen_sse2) - +# endif #endif + diff --git a/sysdeps/i386/i686/multiarch/strncat-c.c b/sysdeps/i386/i686/multiarch/strncat-c.c new file mode 100644 index 0000000000..132a000545 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strncat-c.c @@ -0,0 +1,8 @@ +#define STRNCAT __strncat_ia32 +#ifdef SHARED +#undef libc_hidden_def +#define libc_hidden_def(name) \ + __hidden_ver1 (__strncat_ia32, __GI___strncat, __strncat_ia32); +#endif + +#include "string/strncat.c" diff --git a/sysdeps/i386/i686/multiarch/strncat-sse2.S b/sysdeps/i386/i686/multiarch/strncat-sse2.S new file mode 100644 index 0000000000..f1045b72b8 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strncat-sse2.S @@ -0,0 +1,4 @@ +#define STRCAT __strncat_sse2 +#define USE_AS_STRNCAT + +#include "strcat-sse2.S" diff --git a/sysdeps/i386/i686/multiarch/strncat-ssse3.S 
b/sysdeps/i386/i686/multiarch/strncat-ssse3.S new file mode 100644 index 0000000000..625b90a978 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strncat-ssse3.S @@ -0,0 +1,4 @@ +#define STRCAT __strncat_ssse3 +#define USE_AS_STRNCAT + +#include "strcat-ssse3.S" diff --git a/sysdeps/i386/i686/multiarch/strncat.S b/sysdeps/i386/i686/multiarch/strncat.S new file mode 100644 index 0000000000..fd569c2234 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strncat.S @@ -0,0 +1,3 @@ +#define STRCAT strncat +#define USE_AS_STRNCAT +#include "strcat.S" -- cgit 1.4.1