about summary refs log tree commit diff
path: root/sysdeps/x86_64/multiarch/strcpy.S
diff options
context:
space:
mode:
author: H.J. Lu <hongjiu.lu@intel.com> 2011-06-24 15:14:22 -0400
committer: Ulrich Drepper <drepper@gmail.com> 2011-06-24 15:14:22 -0400
commit: 8912479f9ea9f56dc188d3d00c4ba4259f600661 (patch)
tree: fc91331de86b054859ce0dfe3fdec2a06812aa4c /sysdeps/x86_64/multiarch/strcpy.S
parent: d5495a116c6271c0ae8f6955b64b7b010b1b341a (diff)
download: glibc-8912479f9ea9f56dc188d3d00c4ba4259f600661.tar.gz
glibc-8912479f9ea9f56dc188d3d00c4ba4259f600661.tar.xz
glibc-8912479f9ea9f56dc188d3d00c4ba4259f600661.zip
Improved st{r,p}{,n}cpy for SSE2 and SSSE3 on x86-64
Diffstat (limited to 'sysdeps/x86_64/multiarch/strcpy.S')
-rw-r--r-- sysdeps/x86_64/multiarch/strcpy.S | 1860
1 files changed, 24 insertions, 1836 deletions
diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S
index 02fa8d0710..381060f643 100644
--- a/sysdeps/x86_64/multiarch/strcpy.S
+++ b/sysdeps/x86_64/multiarch/strcpy.S
@@ -1,5 +1,5 @@
-/* strcpy with SSSE3
-   Copyright (C) 2009 Free Software Foundation, Inc.
+/* Multiple versions of strcpy
+   Copyright (C) 2009, 2011 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -29,30 +29,32 @@
 
 #ifdef USE_AS_STPCPY
 # ifdef USE_AS_STRNCPY
-#  define STRCPY_SSSE3	__stpncpy_ssse3
-#  define STRCPY_SSE2	__stpncpy_sse2
-#  define __GI_STRCPY	__GI_stpncpy
+#  define STRCPY_SSSE3		__stpncpy_ssse3
+#  define STRCPY_SSE2		__stpncpy_sse2
+#  define STRCPY_SSE2_UNALIGNED __stpncpy_sse2_unaligned
+#  define __GI_STRCPY		__GI_stpncpy
+#  define __GI___STRCPY		__GI___stpncpy
 # else
-#  define STRCPY_SSSE3	__stpcpy_ssse3
-#  define STRCPY_SSE2	__stpcpy_sse2
-#  define __GI_STRCPY	__GI_stpcpy
-#  define __GI___STRCPY	__GI___stpcpy
+#  define STRCPY_SSSE3		__stpcpy_ssse3
+#  define STRCPY_SSE2		__stpcpy_sse2
+#  define STRCPY_SSE2_UNALIGNED	__stpcpy_sse2_unaligned
+#  define __GI_STRCPY		__GI_stpcpy
+#  define __GI___STRCPY		__GI___stpcpy
 # endif
 #else
 # ifdef USE_AS_STRNCPY
-#  define STRCPY_SSSE3	__strncpy_ssse3
-#  define STRCPY_SSE2	__strncpy_sse2
-#  define __GI_STRCPY	__GI_strncpy
+#  define STRCPY_SSSE3		__strncpy_ssse3
+#  define STRCPY_SSE2		__strncpy_sse2
+#  define STRCPY_SSE2_UNALIGNED	__strncpy_sse2_unaligned
+#  define __GI_STRCPY		__GI_strncpy
 # else
-#  define STRCPY_SSSE3	__strcpy_ssse3
-#  define STRCPY_SSE2	__strcpy_sse2
-#  define __GI_STRCPY	__GI_strcpy
+#  define STRCPY_SSSE3		__strcpy_ssse3
+#  define STRCPY_SSE2		__strcpy_sse2
+#  define STRCPY_SSE2_UNALIGNED	__strcpy_sse2_unaligned
+#  define __GI_STRCPY		__GI_strcpy
 # endif
 #endif
 
-#ifndef LABEL
-#define LABEL(l) L(l)
-#endif
 
 /* Define multiple versions only for the definition in libc.  */
 #ifndef NOT_IN_libc
@@ -62,1830 +64,16 @@ ENTRY(STRCPY)
 	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
 	jne	1f
 	call	__init_cpu_features
-1:	leaq	STRCPY_SSE2(%rip), %rax
+1:	leaq	STRCPY_SSE2_UNALIGNED(%rip), %rax
+	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+	jnz	2f
+	leaq	STRCPY_SSE2(%rip), %rax
 	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
 	jz	2f
 	leaq	STRCPY_SSSE3(%rip), %rax
 2:	ret
 END(STRCPY)
 
-	.section .text.ssse3,"ax",@progbits
-STRCPY_SSSE3:
-	cfi_startproc
-	CALL_MCOUNT
-
-/*
- * This implementation uses SSE to copy up to 16 bytes at a time.
- */
-#ifdef USE_AS_STRNCPY
-	test    %rdx, %rdx
-	jz      LABEL(strncpy_exitz)
-	mov     %rdx, %r8
-#else
-	xor	%edx, %edx
-#endif
-	mov	%esi, %ecx
-	and	$0xfffffffffffffff0, %rsi	/*force rsi 16 byte align*/
-	and	$15, %ecx
-	mov	%rdi, %rax			/*store return parameter*/
-
-
-	pxor	%xmm0, %xmm0			/* clear %xmm0 */
-	pcmpeqb	(%rsi), %xmm0			/* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/
-	pmovmskb %xmm0, %edx			/* move each byte mask of %xmm0 to edx*/
-	shr	%cl, %edx			/* get real bits left in edx*/
-	test	%edx, %edx			/* edx must be 0 if there is no null char from rsi+%rcx */
-	jnz	LABEL(less16bytes)
-
-#ifdef USE_AS_STRNCPY
-	lea	-16(%r8,%rcx), %r11
-	cmp	$0, %r11
-	jle	LABEL(less16bytes)		/* if r8 + rcx <= 16, branch to less16bytes.  */
-#endif
-
-	mov	%rcx, %r9
-	or	%edi, %ecx
-	and	$15, %ecx
-	lea	-16(%r9), %r10
-	jz	LABEL(ashr_0)			/* ecx must be 0 if offset of rsi and rdi is 16 byte align*/
-
-	neg	%r10				/* store the rest in rsi aligned 16 bytes for unaligned_exit*/
-
-	pxor	%xmm0, %xmm0			/* clear %xmm0, may be polluted by unaligned operation*/
-	pcmpeqb	16(%rsi), %xmm0			/* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(less32bytes)
-	/*
-	* at least 16 byte available to fill destination rdi
-	*/
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(less32bytes_strncpy_truncation)
-#endif
-	mov	(%rsi, %r9), %rdx
-	mov	%rdx, (%rdi)
-	mov	8(%rsi, %r9), %rdx
-	mov	%rdx, 8(%rdi)
-
-	/*
-	* so far destatination rdi may be aligned by 16, re-calculate rsi to jump
-	* crossponding case
-	* rcx is offset of rsi
-	* rax is offset of rdi
-	*/
-
-	and	$0xfffffffffffffff0, %rdi	/* force rdi 16 byte align */
-	mov	%rax, %rdx			/* rax store orignal rdi */
-	xor	%rdi, %rdx			/* equal to and $15, %rdx */
-#ifdef USE_AS_STRNCPY
-	add     %rdx, %r8
-#endif
-
-	add	$16, %rdi			/* next 16 bytes for rdi */
-	sub	%rdx, %r9
-
-	lea	16(%r9, %rsi), %rsi		/*re-calculate rsi by (16 - rdx)+ rcx */
-	mov	%esi, %ecx			/*store offset of rsi */
-	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */
-
-	and	$15, %ecx			/* ecx must be 0 if rdx is equal to rcx*/
-	jz	LABEL(ashr_0)
-
-	lea	-16(%rcx), %r10
-	mov	%rcx, %r9
-	neg	%r10
-	lea	LABEL(unaligned_table)(%rip), %r11
-	movslq  (%r11, %rcx,4), %rcx
-	lea	(%r11, %rcx), %rcx
-	jmp	*%rcx
-
- /*
- * The following cases will be handled by ashr_0 & ashr_0_start
- *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
- *	0		    0		  0		 ashr_0
- *	n(1~15)	     n(1~15)	   0		 ashr_0_start
- *
- */
-	.p2align 5
-LABEL(ashr_0):
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_aligned)
-#endif
-	movdqa  (%rsi), %xmm1	   /* fetch first 16 bytes from rsi */
-	movdqa  %xmm1, (%rdi)	   /* store first 16 bytes into rdi */
-	add     $16, %rsi
-	add     $16, %rdi
-	pcmpeqb  (%rsi), %xmm0		   /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */
-	pmovmskb  %xmm0, %edx		   /* move each byte mask of %xmm0 to edx*/
-
-	test    %edx, %edx		  /* edx must be 0 if there is no null char in rsi*/
-	jnz	LABEL(aligned_16bytes)
-
-LABEL(ashr_0_loop):
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_aligned)
-#endif
-	movdqa  (%rsi, %rcx), %xmm1
-	movdqa  %xmm1, (%rdi, %rcx)
-	add	$16, %rcx
-	pcmpeqb  (%rsi, %rcx), %xmm0
-	pmovmskb  %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(aligned_exit)
-
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_aligned)
-#endif
-	movdqa  (%rsi, %rcx), %xmm1
-	movdqa  %xmm1, (%rdi, %rcx)
-	add	$16, %rcx
-	pcmpeqb  (%rsi, %rcx), %xmm0
-	pmovmskb  %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(aligned_exit)
-
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_aligned)
-#endif
-	movdqa  (%rsi, %rcx), %xmm1
-	movdqa  %xmm1, (%rdi, %rcx)
-	add	$16, %rcx
-	pcmpeqb  (%rsi, %rcx), %xmm0
-	pmovmskb  %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(aligned_exit)
-
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_aligned)
-#endif
-	movdqa  (%rsi, %rcx), %xmm1
-	movdqa  %xmm1, (%rdi, %rcx)
-	add	$16, %rcx
-	pcmpeqb  (%rsi, %rcx), %xmm0
-	pmovmskb  %xmm0, %edx
-	test	%edx, %edx
-	jz	LABEL(ashr_0_loop)
-
-	jmp	LABEL(aligned_exit)
-        .p2align 4
-
-/*
- * The following cases will be handled by ashr_15
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(15)		n - 15		15((16 - (n -15) + n)%16	 ashr_15
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_15):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_15_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $15, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $15, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_15_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_14
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(14~15)		n - 14		14((16 - (n -14) + n)%16	 ashr_14
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_14):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_14_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $14, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $14, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_14_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_13
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(13~15)		n - 13		13((16 - (n -13) + n)%16	 ashr_13
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_13):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_13_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $13, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $13, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_13_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_12
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(12~15)		n - 12		12((16 - (n -12) + n)%16	 ashr_12
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_12):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_12_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $12, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $12, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_12_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_11
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(11~15)		n - 11		11((16 - (n -11) + n)%16	 ashr_11
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_11):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_11_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $11, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $11, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_11_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_10
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(10~15)		n - 10		10((16 - (n -10) + n)%16	 ashr_10
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_10):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_10_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $10, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $10, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_10_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_9
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(9~15)		n - 9		9((16 - (n -9) + n)%16	 ashr_9
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_9):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_9_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $9, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $9, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_9_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_8
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(8~15)		n - 8		8((16 - (n -8) + n)%16	 ashr_8
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_8):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_8_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $8, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $8, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_8_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_7
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(7~15)		n - 7		7((16 - (n -7) + n)%16	 ashr_7
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_7):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	.p2align 4
-
-LABEL(ashr_7_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $7, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $7, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_7_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_6
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(6~15)		n - 6		6((16 - (n -6) + n)%16	 ashr_6
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_6):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_6_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $6, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $6, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_6_use_ssse3)
-
- /*
- * The following cases will be handled by ashr_5
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(5~15)		n - 5		5((16 - (n -5) + n)%16	 ashr_5
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_5):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_5_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $5, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $5, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_5_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_4
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(4~15)		n - 4		4((16 - (n -4) + n)%16	 ashr_4
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_4):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_4_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $4, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $4, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_4_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_3
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(3~15)		n - 3		3((16 - (n -3) + n)%16	 ashr_3
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_3):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_3_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $3, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $3, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_3_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_2
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(2~15)		n - 2		2((16 - (n -2) + n)%16	 ashr_2
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_2):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_2_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $2, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $2, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_2_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_1
- *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  		corresponding case
- *	n(1~15)		n - 1	   	1 ((16 - (n -1) + n)%16	 ashr_1
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_1):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_1_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $1, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-	palignr $1, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_1_use_ssse3)
-
-	.p2align 4
-LABEL(less32bytes):
-	xor	%ecx, %ecx
-LABEL(unaligned_exit):
-	add	%r9, %rsi		/* r9 stores original offset of rsi*/
-	mov	%rcx, %r9
-	mov	%r10, %rcx
-	shl	%cl, %edx		/* after shl, calculate the exact number to be filled*/
-	mov	%r9, %rcx
-	.p2align 4
-LABEL(aligned_exit):
-	add	%rcx, %rdi		/*locate exact address for rdi */
-LABEL(less16bytes):
-	add	%rcx, %rsi		/*locate exact address for rsi */
-LABEL(aligned_16bytes):
-#ifdef USE_AS_STRNCPY
-	mov     $1, %r9d
-	lea     -1(%r8), %rcx
-	shl     %cl, %r9d
-	cmp     $32, %r8
-	ja      LABEL(strncpy_tail)
-	or      %r9d, %edx
-LABEL(strncpy_tail):
-#endif
-	bsf	%rdx, %rcx		/*If a least significant 1 bit in %rdx is found, its bit index is stored in %rcx*/
-	lea	LABEL(tail_table)(%rip), %r11
-	movslq	(%r11, %rcx,4), %rcx
-	lea	(%r11, %rcx), %rcx
-	jmp	*%rcx
-
-#ifdef USE_AS_STRNCPY
-	.p2align 4
-LABEL(less32bytes_strncpy_truncation):
-	xor     %ecx, %ecx
-LABEL(strncpy_truncation_unaligned):
-	add      %r9, %rsi
-LABEL(strncpy_truncation_aligned):
-	add      %rcx, %rdi
-	add      %rcx, %rsi
-	add     $16, %r8
-	lea     -1(%r8), %rcx
-	lea     LABEL(tail_table)(%rip), %r11
-	movslq  (%r11, %rcx,4), %rcx
-	lea     (%r11, %rcx), %rcx
-	jmp     *%rcx
-	.p2align 4
-LABEL(strncpy_exitz):
-	mov     %rdi, %rax
-	ret
-#endif
-
-#ifdef USE_AS_STRNCPY
-	.p2align 4
-LABEL(strncpy_fill_tail):
-	mov	%rax, %rdx
-	movzx	%cl, %rax
-	mov	%r8, %rcx
-	add	%rax, %rdi
-	xor	%eax, %eax
-	shr	$3, %ecx
-	jz	LABEL(strncpy_fill_less_8)
-
-	rep	stosq
-LABEL(strncpy_fill_less_8):
-	mov	%r8, %rcx
-	and	$7, %ecx
-	jz	LABEL(strncpy_fill_return)
-LABEL(strncpy_fill_less_7):
-	sub	$1, %ecx
-	mov	%al, (%rdi, %rcx)
-	jnz	LABEL(strncpy_fill_less_7)
-LABEL(strncpy_fill_return):
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rdx)
-	sbb	$-1, %rdx
-#endif
-	mov	%rdx, %rax
-	ret
-#endif
-	.p2align 4
-LABEL(tail_0):
-	mov	(%rsi), %cl
-	mov	%cl, (%rdi)
-#ifdef USE_AS_STPCPY
-	mov	%rdi, %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$1, %cl
-	sub	$1, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_1):
-	mov	(%rsi), %cx
-	mov	%cx, (%rdi)
-#ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$2, %cl
-	sub	$2, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_2):
-	mov	(%rsi), %cx
-	mov	%cx, (%rdi)
-	mov	1(%rsi), %cx
-	mov	%cx, 1(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$3, %cl
-	sub	$3, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_3):
-	mov	(%rsi), %ecx
-	mov	%ecx, (%rdi)
-#ifdef USE_AS_STPCPY
-	lea	3(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$4, %cl
-	sub	$4, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_4):
-	mov	(%rsi), %ecx
-	mov	%ecx, (%rdi)
-	mov	1(%rsi), %edx
-	mov	%edx, 1(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	4(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$5, %cl
-	sub	$5, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_5):
-	mov	(%rsi), %ecx
-	mov	%ecx, (%rdi)
-	mov	2(%rsi), %edx
-	mov	%edx, 2(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	5(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$6, %cl
-	sub	$6, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_6):
-	mov	(%rsi), %ecx
-	mov	%ecx, (%rdi)
-	mov	3(%rsi), %edx
-	mov	%edx,3(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	6(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$7, %cl
-	sub	$7, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_7):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-#ifdef USE_AS_STPCPY
-	lea	7(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$8, %cl
-	sub	$8, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_8):
-
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	5(%rsi), %edx
-	mov	%edx, 5(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	8(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$9, %cl
-	sub	$9, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_9):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	6(%rsi), %edx
-	mov	%edx, 6(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	9(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$10, %cl
-	sub	$10, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_10):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	7(%rsi), %edx
-	mov	%edx, 7(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	10(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$11, %cl
-	sub	$11, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_11):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %edx
-	mov	%edx, 8(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	11(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$12, %cl
-	sub	$12, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_12):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	5(%rsi), %rcx
-	mov	%rcx, 5(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	12(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$13, %cl
-	sub	$13, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_13):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	6(%rsi), %rcx
-	mov	%rcx, 6(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	13(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$14, %cl
-	sub	$14, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_14):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	7(%rsi), %rcx
-	mov	%rcx, 7(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	14(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$15, %cl
-	sub	$15, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-LABEL(tail_15):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	15(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$16, %cl
-	sub	$16, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-
-	ret
-
-	.p2align 4
-LABEL(tail_16):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %cl
-	mov	%cl, 16(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	16(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$17, %cl
-	sub	$17, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_17):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %cx
-	mov	%cx, 16(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	17(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$18, %cl
-	sub	$18, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_18):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	15(%rsi), %ecx
-	mov	%ecx,15(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	18(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$19, %cl
-	sub	$19, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_19):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %ecx
-	mov	%ecx, 16(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	19(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$20, %cl
-	sub	$20, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_20):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	13(%rsi), %rcx
-	mov	%rcx, 13(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	20(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$21, %cl
-	sub	$21, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_21):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	14(%rsi), %rcx
-	mov	%rcx, 14(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	21(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$22, %cl
-	sub	$22, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_22):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	15(%rsi), %rcx
-	mov	%rcx, 15(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	22(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$23, %cl
-	sub	$23, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_23):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %rcx
-	mov	%rcx, 16(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	23(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$24, %cl
-	sub	$24, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-
-	ret
-
-	.p2align 4
-LABEL(tail_24):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %rcx
-	mov	%rcx, 16(%rdi)
-	mov	21(%rsi), %edx
-	mov	%edx, 21(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	24(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$25, %cl
-	sub	$25, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_25):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %rcx
-	mov	%rcx, 16(%rdi)
-	mov	22(%rsi), %edx
-	mov	%edx, 22(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	25(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$26, %cl
-	sub	$26, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_26):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %rcx
-	mov	%rcx, 16(%rdi)
-	mov	23(%rsi), %edx
-	mov	%edx, 23(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	26(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$27, %cl
-	sub	$27, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_27):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %rcx
-	mov	%rcx, 16(%rdi)
-	mov	24(%rsi), %edx
-	mov	%edx, 24(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	27(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$28, %cl
-	sub	$28, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_28):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %rcx
-	mov	%rcx, 16(%rdi)
-	mov	21(%rsi), %rdx
-	mov	%rdx, 21(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	28(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$29, %cl
-	sub	$29, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-
-	ret
-
-	.p2align 4
-LABEL(tail_29):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %rcx
-	mov	%rcx, 16(%rdi)
-	mov	22(%rsi), %rdx
-	mov	%rdx, 22(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	29(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$30, %cl
-	sub	$30, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-
-	ret
-
-
-	.p2align 4
-LABEL(tail_30):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %rcx
-	mov	%rcx, 16(%rdi)
-	mov	23(%rsi), %rdx
-	mov	%rdx, 23(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	30(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$31, %cl
-	sub	$31, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_31):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %rcx
-	mov	%rcx, 16(%rdi)
-	mov	24(%rsi), %rdx
-	mov	%rdx, 24(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	31(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$32, %cl
-	sub	$32, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	cfi_endproc
-	.size	STRCPY_SSSE3, .-STRCPY_SSSE3
-
-	.p2align 4
-	.section .rodata.ssse3,"a",@progbits
-LABEL(tail_table):
-	.int	LABEL(tail_0) - LABEL(tail_table)
-	.int	LABEL(tail_1) - LABEL(tail_table)
-	.int	LABEL(tail_2) - LABEL(tail_table)
-	.int	LABEL(tail_3) - LABEL(tail_table)
-	.int	LABEL(tail_4) - LABEL(tail_table)
-	.int	LABEL(tail_5) - LABEL(tail_table)
-	.int	LABEL(tail_6) - LABEL(tail_table)
-	.int	LABEL(tail_7) - LABEL(tail_table)
-	.int	LABEL(tail_8) - LABEL(tail_table)
-	.int	LABEL(tail_9) - LABEL(tail_table)
-	.int	LABEL(tail_10) - LABEL(tail_table)
-	.int	LABEL(tail_11) - LABEL(tail_table)
-	.int	LABEL(tail_12) - LABEL(tail_table)
-	.int	LABEL(tail_13) - LABEL(tail_table)
-	.int	LABEL(tail_14) - LABEL(tail_table)
-	.int	LABEL(tail_15) - LABEL(tail_table)
-	.int	LABEL(tail_16) - LABEL(tail_table)
-	.int	LABEL(tail_17) - LABEL(tail_table)
-	.int	LABEL(tail_18) - LABEL(tail_table)
-	.int	LABEL(tail_19) - LABEL(tail_table)
-	.int	LABEL(tail_20) - LABEL(tail_table)
-	.int	LABEL(tail_21) - LABEL(tail_table)
-	.int	LABEL(tail_22) - LABEL(tail_table)
-	.int	LABEL(tail_23) - LABEL(tail_table)
-	.int	LABEL(tail_24) - LABEL(tail_table)
-	.int	LABEL(tail_25) - LABEL(tail_table)
-	.int	LABEL(tail_26) - LABEL(tail_table)
-	.int	LABEL(tail_27) - LABEL(tail_table)
-	.int	LABEL(tail_28) - LABEL(tail_table)
-	.int	LABEL(tail_29) - LABEL(tail_table)
-	.int	LABEL(tail_30) - LABEL(tail_table)
-	.int	LABEL(tail_31) - LABEL(tail_table)
-
-	.p2align 4
-LABEL(unaligned_table):
-	.int	LABEL(ashr_0) - LABEL(unaligned_table)
-	.int	LABEL(ashr_1) - LABEL(unaligned_table)
-	.int	LABEL(ashr_2) - LABEL(unaligned_table)
-	.int	LABEL(ashr_3) - LABEL(unaligned_table)
-	.int	LABEL(ashr_4) - LABEL(unaligned_table)
-	.int	LABEL(ashr_5) - LABEL(unaligned_table)
-	.int	LABEL(ashr_6) - LABEL(unaligned_table)
-	.int	LABEL(ashr_7) - LABEL(unaligned_table)
-	.int	LABEL(ashr_8) - LABEL(unaligned_table)
-	.int	LABEL(ashr_9) - LABEL(unaligned_table)
-	.int	LABEL(ashr_10) - LABEL(unaligned_table)
-	.int	LABEL(ashr_11) - LABEL(unaligned_table)
-	.int	LABEL(ashr_12) - LABEL(unaligned_table)
-	.int	LABEL(ashr_13) - LABEL(unaligned_table)
-	.int	LABEL(ashr_14) - LABEL(unaligned_table)
-	.int	LABEL(ashr_15) - LABEL(unaligned_table)
-
 # undef ENTRY
 # define ENTRY(name) \
 	.type STRCPY_SSE2, @function; \