about summary refs log tree commit diff
path: root/sysdeps/powerpc/powerpc64/power7/strcpy.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power7/strcpy.S')
-rw-r--r--sysdeps/powerpc/powerpc64/power7/strcpy.S327
1 files changed, 245 insertions, 82 deletions
diff --git a/sysdeps/powerpc/powerpc64/power7/strcpy.S b/sysdeps/powerpc/powerpc64/power7/strcpy.S
index ce71982eaf..115f98a304 100644
--- a/sysdeps/powerpc/powerpc64/power7/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/strcpy.S
@@ -31,8 +31,6 @@
 
    if (((((uintptr_t)dst & 0x7UL) == 0) && ((uintptr_t)src & 0x7UL) == 0))
      goto aligned_doubleword_copy;
-   if (((((uintptr_t)dst & 0x3UL) == 0) && ((uintptr_t)src & 0x3UL) == 0))
-     goto aligned_word_copy;
    if (((uintptr_t)dst & 0x7UL) == ((uintptr_t)src & 0x7UL))
      goto same_alignment;
    goto unaligned;
@@ -70,9 +68,18 @@ EALIGN (FUNC_NAME, 4, 0)
 #endif
 	or	rTMP, rSRC, rRTN
 	clrldi.	rTMP, rTMP, 61
-	bne	L(check_word_alignment)
+	bne	L(check_alignment)
 	b	L(aligned_doubleword_copy)
 
+	.align 4
+L(check_alignment):
+	rldicl	rRTNAL, rRTN, 0, 61
+	rldicl	rSRCAL, rSRC, 0, 61
+	cmpld	cr7, rSRCAL, rRTNAL
+	beq	cr7, L(same_alignment)
+	b	L(unaligned)
+
+	.align 4
 L(same_alignment):
 /* Src and dst with same alignment: align both to doubleword.  */
 	mr	rALCNT, rRTN
@@ -180,93 +187,249 @@ L(g1):
 #endif
 	blr
 
-L(check_word_alignment):
-	clrldi. rTMP, rTMP, 62
-	beq	L(aligned_word_copy)
-	rldicl	rRTNAL, rRTN, 0, 61
-	rldicl	rSRCAL, rSRC, 0, 61
-	cmpld	cr7, rSRCAL, rRTNAL
-	beq	cr7, L(same_alignment)
-	b	L(unaligned)
-
-/* For word aligned memory, operate using word load and stores.  */
 	.align	4
-L(aligned_word_copy):
-	li	rMASK, 0
-	addi	rRTN, rRTN, -4
-	lwz	rWORD, 0(rSRC)
-	b	L(g5)
+L(unaligned):
+	cmpdi	rSRCAL, 0		/* Check src alignment */
+	beq	L(srcaligndstunalign)
+	/* src is unaligned */
+	rlwinm	r10, rSRC, 3,26,28	/* Calculate padding.  */
+	clrrdi	rSRC, rSRC, 3		/* Align the addr to dw boundary */
+	ld	rWORD, 0(rSRC)		/* Load doubleword from memory.  */
+	li	rTMP, 0
+	/* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+	srd	rALT, rWORD, r10
+#else
+	sld	rALT, rWORD, r10
+#endif
+	cmpb	rTMP, rALT, rTMP	/* Compare each byte against null */
+	/* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+	sld	rTMP, rTMP, r10
+#else
+	srd	rTMP, rTMP, r10
+#endif
+	cmpdi	rTMP, 0
+	bne	L(bytebybyte)		/* if it has null, copy byte by byte */
+	subfic	r8, r9, 8
+	rlwinm	r5, rRTN, 3,26,28	/* Calculate padding in bits.  */
+	rldicl	r9, rRTN, 0, 61		/* Calculate padding in bytes. */
+	addi	rRTN, rRTN, -1
 
-	.align	4
-L(g3):	lwzu	rALT, 4(rSRC)
-	stwu	rWORD, 4(rRTN)
-	cmpb	rTMP, rALT, rMASK
-	cmpwi	rTMP, 0
-	bne	L(g4)
-	lwzu	rWORD, 4(rSRC)
-	stwu	rALT, 4(rRTN)
-L(g5):	cmpb	rTMP, rWORD, rMASK
-	cmpwi	rTMP, 0		/* If rTMP is 0, no null in word.  */
-	beq	L(g3)
-
-	mr      rALT, rWORD
-/* We've hit the end of the string.  Do the rest byte-by-byte.  */
-L(g4):
+	cmpdi	r5, 0			/* check dest alignment */
+	beq	L(srcunaligndstalign)
+
+	/* both src and dst unaligned */
 #ifdef __LITTLE_ENDIAN__
-	rlwinm.	rTMP, rALT, 0, 24, 31
-	stbu	rALT, 4(rRTN)
-	beqlr-
-	rlwinm.	rTMP, rALT, 24, 24, 31
-	stbu	rTMP, 1(rRTN)
-	beqlr-
-	rlwinm.	rTMP, rALT, 16, 24, 31
-	stbu	rTMP, 1(rRTN)
-	beqlr-
-	rlwinm	rTMP, rALT, 8, 24, 31
-	stbu	rTMP, 1(rRTN)
+	sld	rWORD, rALT, r10
+	mr 	r11, r10
+	addi	r11, r11, -8		/* Adjust byte pointer on loaded dw */
 #else
-	rlwinm. rTMP, rALT, 8, 24, 31
-	stbu    rTMP, 4(rRTN)
-	beqlr
-	rlwinm. rTMP, rALT, 16, 24, 31
-	stbu    rTMP, 1(rRTN)
-	beqlr
-	rlwinm. rTMP, rALT, 24, 24, 31
-	stbu    rTMP, 1(rRTN)
-	beqlr
-	stbu    rALT, 1(rRTN)
+	srd	rWORD, rALT, r10
+	subfic	r11, r10, 64
 #endif
-	blr
+	/* dst alignment is greater then src alignment? */
+	cmpd	cr7, r5, r10
+	blt	cr7, L(dst_align_small)
+	/* src alignment is less than dst */
 
-/* Oh well.  In this case, we just do a byte-by-byte copy.  */
-	.align	4
-L(unaligned):
-	lbz	rWORD, 0(rSRC)
-	addi	rRTN, rRTN, -1
-	cmpdi	rWORD, 0
-	beq	L(u2)
-
-	.align 	5
-L(u0):	lbzu	rALT, 1(rSRC)
-	stbu	rWORD, 1(rRTN)
-	cmpdi	rALT, 0
-	beq	L(u1)
-	lbzu	rWORD, 1(rSRC)
+	/* Calculate the dst alignment differnce */
+	subfic	rALT, r9, 8
+	mtctr	rALT
+
+	/* Write till dst is aligned */
+	cmpdi	rTMP, rALT, 4
+	blt	L(storebyte1)		/* less than 4, store byte by byte */
+	beq	L(equal1)		/* if its 4, store word */
+	addi	rTMP, rALT, -4		/* greater than 4, so stb and stw */
+	mtctr	rTMP
+L(storebyte1):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8		/* Adjust byte pointer on loaded dw */
+#else
+	addi	r11, r11, -8
+#endif
+	srd	rALT, rWORD, r11
+	stbu	rALT, 1(rRTN)
+	bdnz	L(storebyte1)
+
+	subfic	rALT, r9, 8		/* Check the remaining bytes */
+	cmpdi	rTMP, rALT, 4
+	blt	L(proceed)
+
+	.align 4
+L(equal1):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8		/* Adjust byte pointer on loaded dw */
+	srd	rALT, rWORD, r11
+#else
+	subfic	r11, r11, 64
+	sld	rALT, rWORD, r11
+	srdi	rALT, rALT, 32
+#endif
+	stw	rALT, 1(rRTN)
+	addi	rRTN, rRTN, 4
+
+L(proceed):
+	mr	rALT, rWORD
+	/* calculate the Left over bytes to be written */
+	subfic	r11, r10, 64
+	subfic	r5, r5, 64
+	subf	r5, r5, r11		/* remaining bytes on second dw */
+        subfic	r10, r5, 64		/* remaining bytes on first dw */
+	subfic	r9, r9, 8
+	subf	r8, r9, r8		/* recalculate padding */
+L(srcunaligndstalign):
+	addi	rRTN, rRTN, 1
+	subfic	r5, r10, 64		/* remaining bytes on second dw */
+	addi	rSRC, rSRC, 8
+	li	rTMP,0
+	b	L(storedouble)
+
+	.align 4
+L(dst_align_small):
+	mtctr	r8
+	/* Write till src is aligned */
+L(storebyte2):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8		/* Adjust byte pointer on dw */
+#else
+	addi	r11, r11, -8
+#endif
+	srd	rALT, rWORD, r11
 	stbu	rALT, 1(rRTN)
-	cmpdi	rWORD, 0
-	beq	L(u2)
-	lbzu	rALT, 1(rSRC)
-	stbu	rWORD, 1(rRTN)
-	cmpdi	rALT, 0
-	beq	L(u1)
-	lbzu	rWORD, 1(rSRC)
+	bdnz	L(storebyte2)
+
+	addi	rSRC, rSRC, 8		/* Increment src pointer */
+	addi	rRTN, rRTN, 1		/* Increment dst pointer */
+	rldicl	r8, rRTN, 0, 61		/* Recalculate padding */
+
+	/* src is aligned */
+L(srcaligndstunalign):
+	ld	rWORD, 0(rSRC)
+	mr	rALT, rWORD
+	li	rTMP, 0			/* Check null */
+	cmpb	rTMP, rWORD, rTMP
+	cmpdi	rTMP, 0
+	bne	L(bytebybyte)		/* Do byte by byte if there is NULL */
+	rlwinm	r5, rRTN, 3,26,28	/* Calculate padding */
+	addi	rRTN, rRTN, -1
+	subfic	r10, r8, 8
+	/* write byte by byte till aligned */
+#ifdef __LITTLE_ENDIAN__
+	li	r11, -8
+#else
+	li	r11, 64
+#endif
+	mtctr	r10
+	cmpdi	rTMP, r10, 4
+	blt	L(storebyte)
+	beq	L(equal)
+	addi	rTMP, r10, -4
+	mtctr	rTMP
+L(storebyte):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8		/* Adjust byte pointer on  dw */
+#else
+	addi	r11, r11, -8
+#endif
+	srd	rALT, rWORD, r11
 	stbu	rALT, 1(rRTN)
-	cmpdi	rWORD, 0
-	bne	L(u0)
-L(u2):	stbu	rWORD, 1(rRTN)
-	blr
-L(u1):	stbu	rALT, 1(rRTN)
-	blr
+	bdnz	L(storebyte)
+
+	cmpdi	rTMP, r10, 4
+	blt	L(align)
+
+	.align 4
+L(equal):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8
+	srd	rALT, rWORD, r11
+#else
+	subfic	r11, r11, 64
+	sld	rALT, rWORD, r11
+	srdi	rALT, rALT, 32
+#endif
+	stw	rALT, 1(rRTN)
+	addi	rRTN, rRTN, 4
+L(align):
+	addi	rRTN, rRTN, 1
+	addi	rSRC, rSRC, 8		/* Increment src pointer */
+	subfic	r10, r5, 64
+	li	rTMP, 0
+	/* dst addr aligned to 8 */
+L(storedouble):
+	ld	rALT, 0(rSRC)		/* load next dw */
+	cmpb	rTMP, rALT, rTMP
+	cmpdi	rTMP, 0			/* check for null on each new dw */
+	bne	L(null)
+#ifdef __LITTLE_ENDIAN__
+	srd	r9, rWORD, r10		/* bytes from first dw */
+	sld	r11, rALT, r5		/* bytes from second dw */
+#else
+	sld	r9, rWORD, r10
+	srd	r11, rALT, r5
+#endif
+	or	r11, r9, r11		/* make as a single dw */
+	std	r11, 0(rRTN)		/* store as std on aligned addr */
+	mr	rWORD, rALT		/* still few bytes left to be written */
+	addi	rRTN, rRTN, 8		/* increment dst addr */
+	addi	rSRC, rSRC, 8		/* increment src addr */
+	b	L(storedouble)		/* Loop till NULL */
+
+	.align 4
+
+/* We've hit the end of the string.  Do the rest byte-by-byte.  */
+L(null):
+	addi	rRTN, rRTN, -1
+	mr	r10, r5
+	mtctr	r8
+#ifdef __LITTLE_ENDIAN__
+	subfic	r10, r10, 64
+	addi	r10, r10, -8
+#endif
+	cmpdi	rTMP, r8, 4
+	blt	L(loop)
+
+	/* we can still use stw if leftover >= 4*/
+#ifdef __LITTLE_ENDIAN__
+	addi	r10, r10, 8
+	srd	r11, rWORD, r10
+#else
+	subfic	r10, r10, 64
+	sld	r11, rWORD, r10
+	srdi	r11, r11, 32
+#endif
+	stw	r11, 1(rRTN)
+	addi	rRTN, rRTN, 4
+
+	beq	L(bytebybyte1)
+	addi	r10, r10, 32
+#ifdef __LITTLE_ENDIAN__
+	addi	r10, r10, -8
+#else
+	subfic	r10, r10, 64
+#endif
+	addi	rTMP, r8, -4
+	mtctr	rTMP
+	/* remaining byte by byte part of first dw */
+L(loop):
+#ifdef __LITTLE_ENDIAN__
+	addi	r10, r10, 8
+#else
+	addi	r10, r10, -8
+#endif
+	srd	rTMP, rWORD, r10
+	stbu	rTMP, 1(rRTN)
+	bdnz	L(loop)
+
+L(bytebybyte1):
+	addi	rRTN, rRTN, 1
+	/* remaining byte by byte part of second dw */
+L(bytebybyte):
+	addi	rRTN, rRTN, -8
+	b	L(g1)
+
 END (FUNC_NAME)
 
 #ifndef USE_AS_STPCPY