/* strcpy with SSSE3
   Copyright (C) 2009 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

/* NOTE(review): both #include directives had lost their header names
   (they read as a bare "#include").  <sysdep.h> and <init-arch.h> are
   restored because the code below uses ENTRY/END/L/CALL_MCOUNT and
   bit_SSSE3/index_SSSE3/KIND_OFFSET/CPUID_OFFSET, which are presumably
   provided by these headers -- confirm against the upstream file.  */
#include <sysdep.h>
#include <init-arch.h>

/* Syntax: AT&T (GAS), preprocessed with cpp.
   One source builds four entry points, selected by USE_AS_STPCPY and
   USE_AS_STRNCPY; the macros below pick the matching symbol names.  */
#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY)
# ifndef STRCPY
#  define STRCPY strcpy
# endif
#endif

#ifdef USE_AS_STPCPY
# ifdef USE_AS_STRNCPY
#  define STRCPY_SSSE3 __stpncpy_ssse3
#  define STRCPY_SSE2 __stpncpy_sse2
#  define __GI_STRCPY __GI_stpncpy
# else
#  define STRCPY_SSSE3 __stpcpy_ssse3
#  define STRCPY_SSE2 __stpcpy_sse2
#  define __GI_STRCPY __GI_stpcpy
#  define __GI___STRCPY __GI___stpcpy
# endif
#else
# ifdef USE_AS_STRNCPY
#  define STRCPY_SSSE3 __strncpy_ssse3
#  define STRCPY_SSE2 __strncpy_sse2
#  define __GI_STRCPY __GI_strncpy
# else
#  define STRCPY_SSSE3 __strcpy_ssse3
#  define STRCPY_SSE2 __strcpy_sse2
#  define __GI_STRCPY __GI_strcpy
# endif
#endif

#ifndef LABEL
#define LABEL(l) L(l)
#endif

/* Define multiple versions only for the definition in libc.  */
#ifndef NOT_IN_libc
	.text
/*
 * IFUNC resolver for STRCPY.
 * Out: %rax = address of STRCPY_SSSE3 when bit_SSSE3 is set in
 *      __cpu_features, otherwise address of STRCPY_SSE2.
 */
ENTRY(STRCPY)
	.type	STRCPY, @gnu_indirect_function
	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)	/* kind field still zero?  (presumably "not initialised yet") */
	jne	1f
	call	__init_cpu_features
1:	leaq	STRCPY_SSE2(%rip), %rax			/* default: SSE2 version */
	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
	jz	2f
	leaq	STRCPY_SSSE3(%rip), %rax		/* SSSE3 available: use this file's version */
2:	ret
END(STRCPY)

/*
 * STRCPY_SSSE3 -- SSSE3 implementation.
 *
 * In:   %rdi = destination, %rsi = source,
 *       %rdx = byte count (USE_AS_STRNCPY builds only)
 * Out:  %rax = value loaded from %rdi at entry; the tail_N code
 *       replaces it in USE_AS_STPCPY builds
 *
 * Register roles set up in this entry section:
 *   %r8   remaining byte count (USE_AS_STRNCPY only)
 *   %r9   offset of the source inside its 16-byte block
 *   %r10  16 - %r9
 *   %rcx  scratch / running index into the aligned blocks
 *   %xmm0 zero vector, and result of the NUL compare
 */
	.section .text.ssse3,"ax",@progbits
STRCPY_SSSE3:
	cfi_startproc
	CALL_MCOUNT

/*
 * This implementation uses SSE to copy up to 16 bytes at a time.
 */
#ifdef USE_AS_STRNCPY
	test	%rdx, %rdx
	jz	LABEL(strncpy_exitz)		/* n == 0: nothing to copy */
	mov	%rdx, %r8			/* r8 = remaining byte count */
#else
	xor	%edx, %edx			/* edx is overwritten by pmovmskb below */
#endif
	mov	%esi, %ecx
	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */
	and	$15, %ecx			/* ecx = offset of the source in its 16-byte block */
	mov	%rdi, %rax			/* store return parameter */

	pxor	%xmm0, %xmm0			/* clear %xmm0 */
	pcmpeqb	(%rsi), %xmm0			/* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */
	pmovmskb %xmm0, %edx			/* move each byte mask of %xmm0 to edx */
	shr	%cl, %edx			/* drop the mask bits of the bytes in front of the source start */
	test	%edx, %edx			/* edx must be 0 if there is no null char from rsi+%rcx */
	jnz	LABEL(less16bytes)

#ifdef USE_AS_STRNCPY
	lea	-16(%r8,%rcx), %r11
	test	%r11, %r11			/* same flags for jle as a compare with 0 */
	jle	LABEL(less16bytes)		/* if r8 + rcx <= 16, branch to less16bytes.  */
#endif

	mov	%rcx, %r9			/* r9 = source offset */
	or	%edi, %ecx
	and	$15, %ecx			/* ZF set when source and destination are both 16-byte aligned */
	lea	-16(%r9), %r10			/* lea leaves the flags of the 'and' intact */
	jz	LABEL(ashr_0)			/* ecx must be 0 if offset of rsi and rdi is 16 byte align */

	neg	%r10				/* r10 = 16 - r9: store the rest in rsi aligned 16 bytes for unaligned_exit */

	pxor	%xmm0, %xmm0			/* clear %xmm0, may be polluted by unaligned operation */
	pcmpeqb	16(%rsi), %xmm0			/* look for a null char in the next aligned 16 bytes */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(less32bytes)
	/*
	 * at least 16 byte available to fill destination rdi
	 */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(less32bytes_strncpy_truncation)
#endif
	mov	(%rsi, %r9), %rdx		/* copy 16 bytes from the unaligned source start ... */
	mov	%rdx, (%rdi)
	mov	8(%rsi, %r9), %rdx		/* ... with two 8-byte moves */
	mov	%rdx, 8(%rdi)

	/*
	 * so far destination rdi may be aligned by 16, re-calculate rsi to jump
	 * to the corresponding case
	 * rcx is offset of rsi
	 * rax is offset of rdi
	 */
	and	$0xfffffffffffffff0, %rdi	/* force rdi 16 byte align */
	mov	%rax, %rdx			/* rax store original rdi */
	xor	%rdi, %rdx			/* equal to and $15, %rdx */
#ifdef USE_AS_STRNCPY
	add	%rdx, %r8			/* re-credit the bytes the aligned loop will store again */
#endif

	add	$16, %rdi			/* next 16 bytes for rdi */
	sub	%rdx, %r9

	lea	16(%r9, %rsi), %rsi		/* re-calculate rsi by (16 - rdx) + rcx */
	mov	%esi, %ecx			/* store offset of rsi */
	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */

	and	$15, %ecx			/* ecx must be 0 if rdx is equal to rcx */
	jz	LABEL(ashr_0)

	lea	-16(%rcx), %r10
	mov	%rcx, %r9			/* r9 = new source offset */
	neg	%r10				/* r10 = 16 - r9 */
	lea	LABEL(unaligned_table)(%rip), %r11
	movslq	(%r11, %rcx,4), %rcx		/* 32-bit table entry, relative to the table base */
	lea	(%r11, %rcx), %rcx
	jmp	*%rcx				/* dispatch on the source offset */

/*
 * The following cases will be handled by ashr_0 & ashr_0_start
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        0                   0                   0             ashr_0
 *        n(1~15)             n(1~15)             0             ashr_0_start
 *
 */
	.p2align 5
LABEL(ashr_0):
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi), %xmm1			/* fetch first 16 bytes from rsi */
	movdqa	%xmm1, (%rdi)			/* store first 16 bytes into rdi */
	add	$16, %rsi
	add	$16, %rdi
	pcmpeqb	(%rsi), %xmm0			/* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */
/*
 * Rest of ashr_0: source and destination share the same 16-byte alignment.
 * Here %xmm0 holds the NUL-compare result for the 16 bytes at (%rsi).
 */
	pmovmskb %xmm0, %edx			/* move each byte mask of %xmm0 to edx */
	test	%edx, %edx			/* edx must be 0 if there is no null char in rsi */
	jnz	LABEL(aligned_16bytes)

/*
 * Aligned copy loop, unrolled four times.  Each step stores the 16 bytes
 * at (%rsi,%rcx) that the previous compare found free of NUL bytes,
 * advances %rcx by 16 and then checks the next 16 source bytes.
 * In USE_AS_STRNCPY builds %r8 is reduced by 16 before every store and the
 * loop is left through strncpy_truncation_aligned when the subtraction
 * reaches or crosses zero.
 */
LABEL(ashr_0_loop):
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	%xmm1, (%rdi, %rcx)
	add	$16, %rcx
	pcmpeqb	(%rsi, %rcx), %xmm0		/* %xmm0 is still all zero here, so this marks NUL bytes */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(aligned_exit)

#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	%xmm1, (%rdi, %rcx)
	add	$16, %rcx
	pcmpeqb	(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(aligned_exit)

#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	%xmm1, (%rdi, %rcx)
	add	$16, %rcx
	pcmpeqb	(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(aligned_exit)

#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	%xmm1, (%rdi, %rcx)
	add	$16, %rcx
	pcmpeqb	(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jz	LABEL(ashr_0_loop)		/* no NUL yet: run the four steps again */
	jmp	LABEL(aligned_exit)
	.p2align 4

/*
 * The following cases will be handled by ashr_15
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset            corresponding case
 *        n(15)               n - 15          15((16 - (n -15) + n)%16     ashr_15
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this
 * cache bank, there is no null byte
 *
 * Shape shared by every ashr_N section: load the aligned source block at
 * 16(%rsi,%rcx), leave through unaligned_exit if it contains a NUL,
 * otherwise merge it with the block at (%rsi,%rcx) using palignr $N and
 * store the 16 merged bytes to (%rdi,%rcx).  The loop body is unrolled
 * twice.
 */
	.p2align 4
LABEL(ashr_15):
	xor	%ecx, %ecx			/* clear ecx */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8			/* remaining count <= r10 (unsigned)? */
	jbe	LABEL(unaligned_exit)
#endif

	.p2align 4
LABEL(ashr_15_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* next aligned source block */
	pcmpeqb	%xmm3, %xmm0			/* mark NUL bytes in it */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$15, (%rsi, %rcx), %xmm3	/* xmm3 = 16 source bytes starting 15 bytes into (%rsi,%rcx) */
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3		/* second unrolled step */
	pcmpeqb	%xmm3, %xmm0
/*
 * Continues the second unrolled step of the ashr_15 loop: %xmm3 holds the
 * aligned source block at 16(%rsi,%rcx) and %xmm0 its NUL-compare result.
 */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)		/* NUL found in the block just loaded */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$15, (%rsi, %rcx), %xmm3	/* xmm3 = 16 source bytes starting 15 bytes into (%rsi,%rcx) */
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_15_use_ssse3)

/*
 * The following cases will be handled by ashr_14
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset            corresponding case
 *        n(14~15)            n - 14          14((16 - (n -14) + n)%16     ashr_14
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this
 * cache bank, there is no null byte
 *
 * Loop shape: load the aligned source block at 16(%rsi,%rcx), leave
 * through unaligned_exit if it contains a NUL, otherwise merge it with the
 * block at (%rsi,%rcx) using palignr $14 and store the result to
 * (%rdi,%rcx).  The body is unrolled twice.
 */
	.p2align 4
LABEL(ashr_14):
	xor	%ecx, %ecx			/* clear ecx */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8			/* remaining count <= r10 (unsigned)? */
	jbe	LABEL(unaligned_exit)
#endif

	.p2align 4
LABEL(ashr_14_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* next aligned source block */
	pcmpeqb	%xmm3, %xmm0			/* mark NUL bytes in it */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$14, (%rsi, %rcx), %xmm3	/* xmm3 = 16 source bytes starting 14 bytes into (%rsi,%rcx) */
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3		/* second unrolled step */
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$14, (%rsi, %rcx), %xmm3
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_14_use_ssse3)

/*
 * The following cases will be handled by ashr_13
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset            corresponding case
 *        n(13~15)            n - 13          13((16 - (n -13) + n)%16     ashr_13
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this
 * cache bank, there is no null byte
 */
	.p2align 4
LABEL(ashr_13):
	xor	%ecx, %ecx			/* clear ecx */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif

	.p2align 4
LABEL(ashr_13_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* next aligned source block */
	pcmpeqb	%xmm3, %xmm0			/* mark NUL bytes in it */
	pmovmskb %xmm0, %edx
/*
 * Continues the first unrolled step of the ashr_13 loop: %edx holds the
 * NUL mask of the aligned source block at 16(%rsi,%rcx), which is in %xmm3.
 *
 * Every ashr_N section below has the same shape: load the aligned source
 * block at 16(%rsi,%rcx), leave through unaligned_exit if it contains a
 * NUL, otherwise merge it with the block at (%rsi,%rcx) using palignr $N
 * and store the 16 merged bytes to (%rdi,%rcx).  The body is unrolled
 * twice.  In USE_AS_STRNCPY builds %r8 is reduced by 16 before each store
 * (leaving through strncpy_truncation_unaligned when the subtraction
 * reaches or crosses zero) and compared against %r10 after each store.
 */
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$13, (%rsi, %rcx), %xmm3	/* xmm3 = 16 source bytes starting 13 bytes into (%rsi,%rcx) */
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3		/* second unrolled step */
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$13, (%rsi, %rcx), %xmm3
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_13_use_ssse3)

/*
 * The following cases will be handled by ashr_12
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset            corresponding case
 *        n(12~15)            n - 12          12((16 - (n -12) + n)%16     ashr_12
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this
 * cache bank, there is no null byte
 */
	.p2align 4
LABEL(ashr_12):
	xor	%ecx, %ecx			/* clear ecx */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif

	.p2align 4
LABEL(ashr_12_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* next aligned source block */
	pcmpeqb	%xmm3, %xmm0			/* mark NUL bytes in it */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$12, (%rsi, %rcx), %xmm3	/* xmm3 = 16 source bytes starting 12 bytes into (%rsi,%rcx) */
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3		/* second unrolled step */
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$12, (%rsi, %rcx), %xmm3
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_12_use_ssse3)

/*
 * The following cases will be handled by ashr_11
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset            corresponding case
 *        n(11~15)            n - 11          11((16 - (n -11) + n)%16     ashr_11
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this
 * cache bank, there is no null byte
 */
	.p2align 4
LABEL(ashr_11):
	xor	%ecx, %ecx			/* clear ecx */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif

	.p2align 4
LABEL(ashr_11_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* next aligned source block */
	pcmpeqb	%xmm3, %xmm0			/* mark NUL bytes in it */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$11, (%rsi, %rcx), %xmm3	/* xmm3 = 16 source bytes starting 11 bytes into (%rsi,%rcx) */
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3		/* second unrolled step */
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$11, (%rsi, %rcx), %xmm3
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_11_use_ssse3)

/*
 * The following cases will be handled by ashr_10
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset            corresponding case
 *        n(10~15)            n - 10          10((16 - (n -10) + n)%16     ashr_10
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this
 * cache bank, there is no null byte
 */
	.p2align 4
LABEL(ashr_10):
	xor	%ecx, %ecx			/* clear ecx */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif

	.p2align 4
LABEL(ashr_10_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* next aligned source block */
	pcmpeqb	%xmm3, %xmm0			/* mark NUL bytes in it */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$10, (%rsi, %rcx), %xmm3	/* xmm3 = 16 source bytes starting 10 bytes into (%rsi,%rcx) */
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3		/* second unrolled step */
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$10, (%rsi, %rcx), %xmm3
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_10_use_ssse3)

/*
 * The following cases will be handled by ashr_9
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset            corresponding case
 *        n(9~15)             n - 9           9((16 - (n -9) + n)%16       ashr_9
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this
 * cache bank, there is no null byte
 */
	.p2align 4
LABEL(ashr_9):
	xor	%ecx, %ecx			/* clear ecx */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif

	.p2align 4
LABEL(ashr_9_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* next aligned source block */
	pcmpeqb	%xmm3, %xmm0			/* mark NUL bytes in it */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$9, (%rsi, %rcx), %xmm3		/* xmm3 = 16 source bytes starting 9 bytes into (%rsi,%rcx) */
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3		/* second unrolled step */
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$9, (%rsi, %rcx), %xmm3
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_9_use_ssse3)

/*
 * The following cases will be handled by ashr_8
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset            corresponding case
 *        n(8~15)             n - 8           8((16 - (n -8) + n)%16       ashr_8
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this
 * cache bank, there is no null byte
 */
	.p2align 4
LABEL(ashr_8):
	xor	%ecx, %ecx			/* clear ecx */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif

	.p2align 4
LABEL(ashr_8_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* next aligned source block */
	pcmpeqb	%xmm3, %xmm0			/* mark NUL bytes in it */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$8, (%rsi, %rcx), %xmm3		/* xmm3 = 16 source bytes starting 8 bytes into (%rsi,%rcx) */
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3		/* second unrolled step */
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$8, (%rsi, %rcx), %xmm3
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_8_use_ssse3)

/*
 * The following cases will be handled by ashr_7
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset            corresponding case
 *        n(7~15)             n - 7           7((16 - (n -7) + n)%16       ashr_7
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this
 * cache bank, there is no null byte
 */
	.p2align 4
LABEL(ashr_7):
	xor	%ecx, %ecx			/* clear ecx */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif

	.p2align 4
LABEL(ashr_7_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* next aligned source block */
	pcmpeqb	%xmm3, %xmm0			/* mark NUL bytes in it */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$7, (%rsi, %rcx), %xmm3		/* xmm3 = 16 source bytes starting 7 bytes into (%rsi,%rcx) */
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3		/* second unrolled step */
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$7, (%rsi, %rcx), %xmm3
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_7_use_ssse3)

/*
 * The following cases will be handled by ashr_6
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset            corresponding case
 *        n(6~15)             n - 6           6((16 - (n -6) + n)%16       ashr_6
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this
 * cache bank, there is no null byte
 */
	.p2align 4
LABEL(ashr_6):
	xor	%ecx, %ecx			/* clear ecx */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif

	.p2align 4
LABEL(ashr_6_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* next aligned source block */
	pcmpeqb	%xmm3, %xmm0			/* mark NUL bytes in it */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$6, (%rsi, %rcx), %xmm3		/* xmm3 = 16 source bytes starting 6 bytes into (%rsi,%rcx) */
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3		/* second unrolled step */
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$6, (%rsi, %rcx), %xmm3
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_6_use_ssse3)

/*
 * The following cases will be handled by ashr_5
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset            corresponding case
 *        n(5~15)             n - 5           5((16 - (n -5) + n)%16       ashr_5
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this
 * cache bank, there is no null byte
 */
	.p2align 4
LABEL(ashr_5):
	xor	%ecx, %ecx			/* clear ecx */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif

	.p2align 4
LABEL(ashr_5_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* next aligned source block */
	pcmpeqb	%xmm3, %xmm0			/* mark NUL bytes in it */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$5, (%rsi, %rcx), %xmm3		/* xmm3 = 16 source bytes starting 5 bytes into (%rsi,%rcx) */
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3		/* second unrolled step */
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$5, (%rsi, %rcx), %xmm3
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_5_use_ssse3)

/*
 *
 * The following cases will be handled by ashr_4
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset            corresponding case
 *        n(4~15)             n - 4           4((16 - (n -4) + n)%16       ashr_4
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this
 * cache bank, there is no null byte
 */
	.p2align 4
LABEL(ashr_4):
	xor	%ecx, %ecx			/* clear ecx */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif

	.p2align 4
LABEL(ashr_4_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* next aligned source block */
	pcmpeqb	%xmm3, %xmm0			/* mark NUL bytes in it */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$4, (%rsi, %rcx), %xmm3		/* xmm3 = 16 source bytes starting 4 bytes into (%rsi,%rcx) */
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3		/* second unrolled step */
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$4, (%rsi, %rcx), %xmm3
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_4_use_ssse3)

/*
 *
 * The following cases will be handled by ashr_3
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset            corresponding case
 *        n(3~15)             n - 3           3((16 - (n -3) + n)%16       ashr_3
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this
 * cache bank, there is no null byte
 */
	.p2align 4
LABEL(ashr_3):
	xor	%ecx, %ecx			/* clear ecx */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif

	.p2align 4
LABEL(ashr_3_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* next aligned source block */
	pcmpeqb	%xmm3, %xmm0			/* mark NUL bytes in it */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$3, (%rsi, %rcx), %xmm3		/* xmm3 = 16 source bytes starting 3 bytes into (%rsi,%rcx) */
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3		/* second unrolled step */
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$3, (%rsi, %rcx), %xmm3
/*
 * Continues the second unrolled step of the ashr_3 loop: %xmm3 holds the
 * 16 merged source bytes produced by the preceding palignr $3.
 */
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_3_use_ssse3)

/*
 *
 * The following cases will be handled by ashr_2
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset            corresponding case
 *        n(2~15)             n - 2           2((16 - (n -2) + n)%16       ashr_2
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this
 * cache bank, there is no null byte
 *
 * Loop shape: load the aligned source block at 16(%rsi,%rcx), leave
 * through unaligned_exit if it contains a NUL, otherwise merge it with the
 * block at (%rsi,%rcx) using palignr and store the result to (%rdi,%rcx).
 * The body is unrolled twice.
 */
	.p2align 4
LABEL(ashr_2):
	xor	%ecx, %ecx			/* clear ecx */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8			/* remaining count <= r10 (unsigned)? */
	jbe	LABEL(unaligned_exit)
#endif

	.p2align 4
LABEL(ashr_2_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* next aligned source block */
	pcmpeqb	%xmm3, %xmm0			/* mark NUL bytes in it */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$2, (%rsi, %rcx), %xmm3		/* xmm3 = 16 source bytes starting 2 bytes into (%rsi,%rcx) */
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3		/* second unrolled step */
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$2, (%rsi, %rcx), %xmm3
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_2_use_ssse3)

/*
 *
 * The following cases will be handled by ashr_1
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset            corresponding case
 *        n(1~15)             n - 1           1((16 - (n -1) + n)%16       ashr_1
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this
 * cache bank, there is no null byte
 */
	.p2align 4
LABEL(ashr_1):
	xor	%ecx, %ecx			/* clear ecx */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif

	.p2align 4
LABEL(ashr_1_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* next aligned source block */
	pcmpeqb	%xmm3, %xmm0			/* mark NUL bytes in it */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$1, (%rsi, %rcx), %xmm3		/* xmm3 = 16 source bytes starting 1 byte into (%rsi,%rcx) */
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3		/* second unrolled step */
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	palignr	$1, (%rsi, %rcx), %xmm3
	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_1_use_ssse3)

/*
 * Common exits.  They turn the aligned base pointers plus the running
 * index in %rcx back into exact source/destination addresses, then
 * dispatch through tail_table on the bit index of the first set bit in
 * %edx (the NUL mask).
 */
	.p2align 4
LABEL(less32bytes):
	xor	%ecx, %ecx			/* nothing stored through the index yet */
LABEL(unaligned_exit):
	add	%r9, %rsi			/* r9 stores original offset of rsi */
	mov	%rcx, %r9			/* park the index; the shift count must be in cl */
	mov	%r10, %rcx
	shl	%cl, %edx			/* after shl, calculate the exact number to be filled */
	mov	%r9, %rcx			/* restore the index */
	.p2align 4
LABEL(aligned_exit):
	add	%rcx, %rdi			/* locate exact address for rdi */
LABEL(less16bytes):
	add	%rcx, %rsi			/* locate exact address for rsi */
LABEL(aligned_16bytes):
#ifdef USE_AS_STRNCPY
	mov	$1, %r9d
	lea	-1(%r8), %rcx
	shl	%cl, %r9d			/* r9d = bit for the last byte allowed by the count */
	cmp	$32, %r8
	ja	LABEL(strncpy_tail)		/* more than 32 bytes left: the count adds no limit bit */
	or	%r9d, %edx			/* otherwise stop at the count limit at the latest */
LABEL(strncpy_tail):
#endif
	bsf	%rdx, %rcx			/* If a least significant 1 bit in %rdx is found, its bit index is stored in %rcx */
	lea	LABEL(tail_table)(%rip), %r11
	movslq	(%r11, %rcx,4), %rcx		/* 32-bit table entry, relative to the table base */
	lea	(%r11, %rcx), %rcx
	jmp	*%rcx

#ifdef USE_AS_STRNCPY
/*
 * The byte count ran out before a NUL was seen: copy exactly the
 * remaining bytes by dispatching on (count - 1).
 */
	.p2align 4
LABEL(less32bytes_strncpy_truncation):
	xor	%ecx, %ecx
LABEL(strncpy_truncation_unaligned):
	add	%r9, %rsi			/* back to the unaligned source position */
LABEL(strncpy_truncation_aligned):
	add	%rcx, %rdi
	add	%rcx, %rsi
	add	$16, %r8			/* undo the 'sub $16' that brought us here */
	lea	-1(%r8), %rcx
	lea	LABEL(tail_table)(%rip), %r11
	movslq	(%r11, %rcx,4), %rcx
	lea	(%r11, %rcx), %rcx
	jmp	*%rcx
	.p2align 4
LABEL(strncpy_exitz):
	mov	%rdi, %rax			/* n == 0: return the destination untouched */
	ret
#endif

#ifdef USE_AS_STRNCPY
/*
 * Zero-fill the rest of the destination once the string has been copied.
 * In:   %rdi = start of the tail just stored
 *       %cl  = number of bytes that tail stored
 *       %r8  = number of bytes still to write (non-zero)
 *       %rax = return-value candidate set by the tail code
 * Out:  %rax = return value
 */
	.p2align 4
LABEL(strncpy_fill_tail):
	mov	%rax, %rdx			/* keep the return-value candidate */
	movzx	%cl, %rax
	mov	%r8, %rcx
	add	%rax, %rdi			/* rdi = first byte to clear */
	xor	%eax, %eax			/* fill value 0 */
	/* Use the full 64-bit count.  A 32-bit shift on %ecx would drop
	   bits 32..63 of the remaining length and leave part of the buffer
	   unpadded when 4 GiB or more have to be cleared.  */
	shr	$3, %rcx			/* rcx = number of whole qwords */
	jz	LABEL(strncpy_fill_less_8)

	rep	stosq
LABEL(strncpy_fill_less_8):
	mov	%r8, %rcx
	and	$7, %ecx			/* 0..7 bytes left after the qwords */
	jz	LABEL(strncpy_fill_return)
LABEL(strncpy_fill_less_7):
	sub	$1, %ecx
	mov	%al, (%rdi, %rcx)		/* mov keeps the flags of the sub */
	jnz	LABEL(strncpy_fill_less_7)
LABEL(strncpy_fill_return):
#ifdef USE_AS_STPCPY
	cmpb	$1, (%rdx)			/* carry set only when the byte at rdx is 0 */
	sbb	$-1, %rdx			/* rdx += 1 unless that byte is 0 */
#endif
	mov	%rdx, %rax
	ret
#endif

/*
 * tail_N: copy the last N + 1 bytes from (%rsi) to (%rdi), using
 * overlapping moves where N + 1 is not a power of two.
 * USE_AS_STPCPY:  %rax = %rdi + N.
 * USE_AS_STRNCPY: %cl = bytes stored, %r8 -= bytes stored; a non-zero
 *                 remainder is cleared by strncpy_fill_tail.  When the
 *                 remainder is zero and USE_AS_STPCPY is also set, %rax is
 *                 advanced by one unless the byte it points to is 0.
 */
	.p2align 4
LABEL(tail_0):
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
#ifdef USE_AS_STPCPY
	mov	%rdi, %rax
#endif
#ifdef USE_AS_STRNCPY
	mov	$1, %cl
	sub	$1, %r8
	jnz	LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)
	sbb	$-1, %rax
#endif
#endif
	ret

	.p2align 4
LABEL(tail_1):
	mov	(%rsi), %cx
	mov	%cx, (%rdi)
#ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
	mov	$2, %cl
	sub	$2, %r8
	jnz	LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)
	sbb	$-1, %rax
#endif
#endif
	ret

	.p2align 4
LABEL(tail_2):
	mov	(%rsi), %cx
	mov	%cx, (%rdi)
	mov	1(%rsi), %cx			/* overlapping store covers byte 2 */
	mov	%cx, 1(%rdi)
#ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
	mov	$3, %cl
	sub	$3, %r8
	jnz	LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)
	sbb	$-1, %rax
#endif
#endif
	ret

	.p2align 4
LABEL(tail_3):
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
#ifdef USE_AS_STPCPY
	lea	3(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
	mov	$4, %cl
	sub	$4, %r8
	jnz	LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)
	sbb	$-1, %rax
#endif
#endif
	ret

	.p2align 4
LABEL(tail_4):
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
	mov	1(%rsi), %edx
	mov	%edx, 1(%rdi)
#ifdef USE_AS_STPCPY
	lea	4(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
	mov	$5, %cl
	sub	$5, %r8
	jnz	LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)
	sbb	$-1, %rax
#endif
#endif
	ret

	.p2align 4
LABEL(tail_5):
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
	mov	2(%rsi), %edx
	mov	%edx, 2(%rdi)
#ifdef USE_AS_STPCPY
	lea	5(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
	mov	$6, %cl
	sub	$6, %r8
	jnz	LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)
	sbb	$-1, %rax
#endif
#endif
	ret

	.p2align 4
LABEL(tail_6):
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
	mov	3(%rsi), %edx
	mov	%edx,3(%rdi)
#ifdef USE_AS_STPCPY
	lea	6(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
	mov	$7, %cl
	sub	$7, %r8
	jnz	LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)
	sbb	$-1, %rax
#endif
#endif
	ret

	.p2align 4
LABEL(tail_7):
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
#ifdef USE_AS_STPCPY lea 7(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $8, %cl sub $8, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_8): mov (%rsi), %rcx mov %rcx, (%rdi) mov 5(%rsi), %edx mov %edx, 5(%rdi) #ifdef USE_AS_STPCPY lea 8(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $9, %cl sub $9, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_9): mov (%rsi), %rcx mov %rcx, (%rdi) mov 6(%rsi), %edx mov %edx, 6(%rdi) #ifdef USE_AS_STPCPY lea 9(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $10, %cl sub $10, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_10): mov (%rsi), %rcx mov %rcx, (%rdi) mov 7(%rsi), %edx mov %edx, 7(%rdi) #ifdef USE_AS_STPCPY lea 10(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $11, %cl sub $11, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_11): mov (%rsi), %rcx mov %rcx, (%rdi) mov 8(%rsi), %edx mov %edx, 8(%rdi) #ifdef USE_AS_STPCPY lea 11(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $12, %cl sub $12, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_12): mov (%rsi), %rcx mov %rcx, (%rdi) mov 5(%rsi), %rcx mov %rcx, 5(%rdi) #ifdef USE_AS_STPCPY lea 12(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $13, %cl sub $13, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_13): mov (%rsi), %rcx mov %rcx, (%rdi) mov 6(%rsi), %rcx mov %rcx, 6(%rdi) #ifdef USE_AS_STPCPY lea 13(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $14, %cl sub $14, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_14): mov (%rsi), %rcx mov %rcx, (%rdi) mov 7(%rsi), 
%rcx mov %rcx, 7(%rdi) #ifdef USE_AS_STPCPY lea 14(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $15, %cl sub $15, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret LABEL(tail_15): mov (%rsi), %rcx mov %rcx, (%rdi) mov 8(%rsi), %rdx mov %rdx, 8(%rdi) #ifdef USE_AS_STPCPY lea 15(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $16, %cl sub $16, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_16): mov (%rsi), %rcx mov %rcx, (%rdi) mov 8(%rsi), %rdx mov %rdx, 8(%rdi) mov 16(%rsi), %cl mov %cl, 16(%rdi) #ifdef USE_AS_STPCPY lea 16(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $17, %cl sub $17, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_17): mov (%rsi), %rcx mov %rcx, (%rdi) mov 8(%rsi), %rdx mov %rdx, 8(%rdi) mov 16(%rsi), %cx mov %cx, 16(%rdi) #ifdef USE_AS_STPCPY lea 17(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $18, %cl sub $18, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_18): mov (%rsi), %rcx mov %rcx, (%rdi) mov 8(%rsi), %rdx mov %rdx, 8(%rdi) mov 15(%rsi), %ecx mov %ecx,15(%rdi) #ifdef USE_AS_STPCPY lea 18(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $19, %cl sub $19, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_19): mov (%rsi), %rcx mov %rcx, (%rdi) mov 8(%rsi), %rdx mov %rdx, 8(%rdi) mov 16(%rsi), %ecx mov %ecx, 16(%rdi) #ifdef USE_AS_STPCPY lea 19(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $20, %cl sub $20, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_20): mov (%rsi), %rcx mov %rcx, (%rdi) mov 8(%rsi), %rdx mov %rdx, 8(%rdi) mov 13(%rsi), %rcx mov %rcx, 13(%rdi) #ifdef USE_AS_STPCPY lea 20(%rdi), %rax #endif #ifdef 
USE_AS_STRNCPY mov $21, %cl sub $21, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_21): mov (%rsi), %rcx mov %rcx, (%rdi) mov 8(%rsi), %rdx mov %rdx, 8(%rdi) mov 14(%rsi), %rcx mov %rcx, 14(%rdi) #ifdef USE_AS_STPCPY lea 21(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $22, %cl sub $22, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_22): mov (%rsi), %rcx mov %rcx, (%rdi) mov 8(%rsi), %rdx mov %rdx, 8(%rdi) mov 15(%rsi), %rcx mov %rcx, 15(%rdi) #ifdef USE_AS_STPCPY lea 22(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $23, %cl sub $23, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_23): mov (%rsi), %rcx mov %rcx, (%rdi) mov 8(%rsi), %rdx mov %rdx, 8(%rdi) mov 16(%rsi), %rcx mov %rcx, 16(%rdi) #ifdef USE_AS_STPCPY lea 23(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $24, %cl sub $24, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_24): mov (%rsi), %rcx mov %rcx, (%rdi) mov 8(%rsi), %rdx mov %rdx, 8(%rdi) mov 16(%rsi), %rcx mov %rcx, 16(%rdi) mov 21(%rsi), %edx mov %edx, 21(%rdi) #ifdef USE_AS_STPCPY lea 24(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $25, %cl sub $25, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_25): mov (%rsi), %rcx mov %rcx, (%rdi) mov 8(%rsi), %rdx mov %rdx, 8(%rdi) mov 16(%rsi), %rcx mov %rcx, 16(%rdi) mov 22(%rsi), %edx mov %edx, 22(%rdi) #ifdef USE_AS_STPCPY lea 25(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $26, %cl sub $26, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_26): mov (%rsi), %rcx mov %rcx, (%rdi) mov 8(%rsi), %rdx mov %rdx, 8(%rdi) mov 16(%rsi), %rcx mov %rcx, 16(%rdi) mov 
23(%rsi), %edx mov %edx, 23(%rdi) #ifdef USE_AS_STPCPY lea 26(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $27, %cl sub $27, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_27): mov (%rsi), %rcx mov %rcx, (%rdi) mov 8(%rsi), %rdx mov %rdx, 8(%rdi) mov 16(%rsi), %rcx mov %rcx, 16(%rdi) mov 24(%rsi), %edx mov %edx, 24(%rdi) #ifdef USE_AS_STPCPY lea 27(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $28, %cl sub $28, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_28): mov (%rsi), %rcx mov %rcx, (%rdi) mov 8(%rsi), %rdx mov %rdx, 8(%rdi) mov 16(%rsi), %rcx mov %rcx, 16(%rdi) mov 21(%rsi), %rdx mov %rdx, 21(%rdi) #ifdef USE_AS_STPCPY lea 28(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $29, %cl sub $29, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_29): mov (%rsi), %rcx mov %rcx, (%rdi) mov 8(%rsi), %rdx mov %rdx, 8(%rdi) mov 16(%rsi), %rcx mov %rcx, 16(%rdi) mov 22(%rsi), %rdx mov %rdx, 22(%rdi) #ifdef USE_AS_STPCPY lea 29(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $30, %cl sub $30, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_30): mov (%rsi), %rcx mov %rcx, (%rdi) mov 8(%rsi), %rdx mov %rdx, 8(%rdi) mov 16(%rsi), %rcx mov %rcx, 16(%rdi) mov 23(%rsi), %rdx mov %rdx, 23(%rdi) #ifdef USE_AS_STPCPY lea 30(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $31, %cl sub $31, %r8 jnz LABEL(strncpy_fill_tail) #ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax #endif #endif ret .p2align 4 LABEL(tail_31): mov (%rsi), %rcx mov %rcx, (%rdi) mov 8(%rsi), %rdx mov %rdx, 8(%rdi) mov 16(%rsi), %rcx mov %rcx, 16(%rdi) mov 24(%rsi), %rdx mov %rdx, 24(%rdi) #ifdef USE_AS_STPCPY lea 31(%rdi), %rax #endif #ifdef USE_AS_STRNCPY mov $32, %cl sub $32, %r8 jnz LABEL(strncpy_fill_tail) #ifdef 
USE_AS_STPCPY
	/* The macro name above completes the `#ifdef' that ends the previous
	   line.  These are the last instructions of tail_31: advance %rax by
	   one unless the byte it addresses is NUL, then return.  */
	cmpb	$1, (%rax)
	sbb	$-1, %rax
#endif
#endif
	ret
	cfi_endproc
	.size	STRCPY_SSSE3, .-STRCPY_SSSE3

/* Jump table for the tail handlers.  Entry N is the 32-bit offset of tail_N
   from the start of the table; the dispatch code sign-extends the entry
   (movslq) and adds the table address to form the jump target.  */
	.p2align 4
	.section .rodata.ssse3,"a",@progbits
LABEL(tail_table):
	.int	LABEL(tail_0) - LABEL(tail_table)
	.int	LABEL(tail_1) - LABEL(tail_table)
	.int	LABEL(tail_2) - LABEL(tail_table)
	.int	LABEL(tail_3) - LABEL(tail_table)
	.int	LABEL(tail_4) - LABEL(tail_table)
	.int	LABEL(tail_5) - LABEL(tail_table)
	.int	LABEL(tail_6) - LABEL(tail_table)
	.int	LABEL(tail_7) - LABEL(tail_table)
	.int	LABEL(tail_8) - LABEL(tail_table)
	.int	LABEL(tail_9) - LABEL(tail_table)
	.int	LABEL(tail_10) - LABEL(tail_table)
	.int	LABEL(tail_11) - LABEL(tail_table)
	.int	LABEL(tail_12) - LABEL(tail_table)
	.int	LABEL(tail_13) - LABEL(tail_table)
	.int	LABEL(tail_14) - LABEL(tail_table)
	.int	LABEL(tail_15) - LABEL(tail_table)
	.int	LABEL(tail_16) - LABEL(tail_table)
	.int	LABEL(tail_17) - LABEL(tail_table)
	.int	LABEL(tail_18) - LABEL(tail_table)
	.int	LABEL(tail_19) - LABEL(tail_table)
	.int	LABEL(tail_20) - LABEL(tail_table)
	.int	LABEL(tail_21) - LABEL(tail_table)
	.int	LABEL(tail_22) - LABEL(tail_table)
	.int	LABEL(tail_23) - LABEL(tail_table)
	.int	LABEL(tail_24) - LABEL(tail_table)
	.int	LABEL(tail_25) - LABEL(tail_table)
	.int	LABEL(tail_26) - LABEL(tail_table)
	.int	LABEL(tail_27) - LABEL(tail_table)
	.int	LABEL(tail_28) - LABEL(tail_table)
	.int	LABEL(tail_29) - LABEL(tail_table)
	.int	LABEL(tail_30) - LABEL(tail_table)
	.int	LABEL(tail_31) - LABEL(tail_table)

/* Table of the sixteen ashr_N entry points, in the same table-relative
   32-bit offset format.  The code that indexes it lies outside this part of
   the file; NOTE(review): the index is presumably a 0..15 misalignment
   between source and destination -- confirm against the dispatch code.  */
	.p2align 4
LABEL(unaligned_table):
	.int	LABEL(ashr_0) - LABEL(unaligned_table)
	.int	LABEL(ashr_1) - LABEL(unaligned_table)
	.int	LABEL(ashr_2) - LABEL(unaligned_table)
	.int	LABEL(ashr_3) - LABEL(unaligned_table)
	.int	LABEL(ashr_4) - LABEL(unaligned_table)
	.int	LABEL(ashr_5) - LABEL(unaligned_table)
	.int	LABEL(ashr_6) - LABEL(unaligned_table)
	.int	LABEL(ashr_7) - LABEL(unaligned_table)
	.int	LABEL(ashr_8) - LABEL(unaligned_table)
	.int	LABEL(ashr_9) - LABEL(unaligned_table)
	.int	LABEL(ashr_10) - LABEL(unaligned_table)
	.int	LABEL(ashr_11) - LABEL(unaligned_table)
	.int	LABEL(ashr_12) - LABEL(unaligned_table)
	.int	LABEL(ashr_13) - LABEL(unaligned_table)
	.int	LABEL(ashr_14) - LABEL(unaligned_table)
	.int	LABEL(ashr_15) - LABEL(unaligned_table)

/* Redefine ENTRY and END so that whatever they wrap from here on is emitted
   as the function STRCPY_SSE2; the `name' argument is ignored.
   NOTE(review): the intended user is presumably the ../strcpy.S included at
   the bottom of this file -- confirm that it uses ENTRY/END.  */
# undef ENTRY
# define ENTRY(name) \
	.type STRCPY_SSE2, @function; \
	.align 16; \
	STRCPY_SSE2: cfi_startproc; \
	CALL_MCOUNT
# undef END
# define END(name) \
	cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2
# undef libc_hidden_builtin_def
/* It doesn't make sense to send libc-internal strcpy calls through a PLT.
   The speedup we get from using SSSE3 instructions is likely eaten away
   by the indirect call in the PLT.  So bind the internal alias __GI_STRCPY
   directly to the SSE2 version.  */
# define libc_hidden_builtin_def(name) \
	.globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2
/* Likewise for the __GI___STRCPY alias.  */
# undef libc_hidden_def
# define libc_hidden_def(name) \
	.globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2
/* NOTE(review): presumably closes the `#ifndef NOT_IN_libc' opened near the
   top of the file.  */
#endif

/* The generic implementation is pulled in for every variant except the
   strncpy ones.  */
#ifndef USE_AS_STRNCPY
#include "../strcpy.S"
#endif