about summary refs log tree commit diff
path: root/sysdeps
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps')
-rw-r--r-- sysdeps/ia64/memcpy.S | 124
1 file changed, 29 insertions, 95 deletions
diff --git a/sysdeps/ia64/memcpy.S b/sysdeps/ia64/memcpy.S
index 5ac965c99b..caa1aa058a 100644
--- a/sysdeps/ia64/memcpy.S
+++ b/sysdeps/ia64/memcpy.S
@@ -41,8 +41,7 @@
 #define OPSIZ 		8
 
 #define saved_pfs	r14
-#define sf		r15
-#define rescnt		r16
+#define adest		r15
 #define saved_pr	r17
 #define saved_lc	r18
 #define dest		r19
@@ -59,36 +58,24 @@
 #define loopcnt		r30
 #define	value		r31
 
-#define dl0		r22
-#define dh0		r23
-#define dl1		r24
-#define dh1		r25
-#define dl2		r26
-#define dh2		r27
-#define dl3		r28
-#define dh3		r29 
-
 #define LOOP(shift)							\
-		.align	32 ; 						\
+		.align	32 ;						\
 .loop##shift##:								\
 (p[0])		ld8	r[0] = [asrc], 8 ;	/* w1 */		\
 (p[MEMLAT+1])	st8	[dest] = value, 8 ;				\
-(p[MEMLAT])	shrp	value = r[MEMLAT], r[MEMLAT+1], shift ;	\
+(p[MEMLAT])	shrp	value = r[MEMLAT], r[MEMLAT+1], shift ;		\
 		nop.b	0 ;						\
 		nop.b	0 ;						\
 		br.ctop.sptk .loop##shift ;				\
 		br.cond.sptk .cpyfew ; /* deal with the remaining bytes */
 
 ENTRY(memcpy)
-	alloc 	saved_pfs = ar.pfs, 3, 40-3, 0, 40
+	alloc 	saved_pfs = ar.pfs, 3, 16 - 3, 0, 16
 #include "softpipe.h"
-	.rotr	r[MEMLAT + 2], q[MEMLAT + 1], s0[2], s1[2], s2[2], s3[2]
-	.rotf	tl0[5], th0[5], tl1[5], th1[5], tl2[5], th2[5], tl3[5], th3[5]
+	.rotr	r[MEMLAT + 2], q[MEMLAT + 1]
 	.rotp	p[MEMLAT + 2]
 	mov	ret0 = in0		// return value = dest
 	mov	saved_pr = pr		// save the predicate registers
-// brp is currently broken - reenable when it gets fixed.
-//	brp.loop.many	.l0, .done - 16
         mov 	saved_lc = ar.lc	// save the loop counter
 	or	tmp3 = in0, in1 ;;	// tmp3 = dest | src
 	or	tmp3 = tmp3, in2	// tmp3 = dest | src | len
@@ -99,88 +86,35 @@ ENTRY(memcpy)
 	cmp.eq	p6, p0 = in2, r0	// if (len == 0)
 (p6)	br.cond.spnt .restore_and_exit;;// 	return dest;
 	and	tmp4 = 7, tmp3 		// tmp4 = (dest | src | len) & 7
-	tbit.nz	p8, p0 = src, 3 ;;	// test for 16-byte boundary align
+	shr.u	loopcnt = len, 4 ;;	// loopcnt = len / 16
 	cmp.ne	p6, p0 = tmp4, r0	// if ((dest | src | len) & 7 != 0)
 (p6)	br.cond.sptk .next		//	goto next;
 
 // The optimal case, when dest, src and len are all multiples of 8
 
-(p8)	ld8	value = [src], 8	// align src if necessary
-(p8)	adds	len = -8, len ;;	// adjust len accordingly
-	shr.u	loopcnt = len, 6 	// loopcnt = len / 64
-	shr.u	rescnt = len, 3		// rescnt = len / 8
-	mov	pr.rot = 1 << 16 	// set rotating predicates
-	mov	ar.ec = 4 + 1 ;;	// set the epilog counter
-	cmp.eq	p6, p0 = loopcnt, r0 
-	and	rescnt = 7, rescnt	// resnt = residual word count
-	adds	loopcnt = -1, loopcnt	// --loopcnt
-(p8)	st8	[dest] = value, 8	// copy one word if aligning 
-(p6)	br.cond.spnt .epilog;;		// there are < 8 words to copy
-	add	sf = 64 * 4, src
+	and	tmp3 = 0xf, len		// tmp3 = len % 16
+	mov	pr.rot = 1 << 16	// set rotating predicates
+	mov	ar.ec = MEMLAT + 1 ;;	// set the epilog counter
+	cmp.ne	p6, p0 = tmp3, r0	// do we have to copy an extra word?
+	adds	loopcnt = -1, loopcnt;;	// --loopcnt
+(p6)	ld8	value = [src], 8;;	
+(p6)	st8	[dest] = value, 8	// copy the "extra" word
 	mov	ar.lc = loopcnt 	// set the loop counter		 
-	mov	s0[1] = src
-	add	s1[1] = 16*1, src
-	add     s2[1] = 16*2, src
-	add	s3[1] = 16*3, src
-	;;
-	mov     dl0 = dest
-	add	dh0 = 8 * 1, dest
-	add	dl1 = 8 * 2, dest
-	add     dh1 = 8 * 3, dest
-	add	dl2 = 8 * 4, dest
-	add	dh2 = 8 * 5, dest
-	add	dl3 = 8 * 6, dest
-	add	dh3 = 8 * 7, dest
-	;;	
+	cmp.eq	p6, p0 = 8, len
+(p6)	br.cond.spnt .restore_and_exit;;// there was only one word to copy
+	adds	adest = 8, dest
+	adds	asrc = 8, src ;;
+	.align	32
 .l0:
-(p[0]) 	lfetch.nta [sf], 64
-
-(p[0])  ldfp8   tl0[0], th0[0] = [s0[1]]
-(p[0])  ldfp8   tl1[0], th1[0] = [s1[1]]
-(p[0])  ldfp8   tl2[0], th2[0] = [s2[1]]
-(p[0])  ldfp8   tl3[0], th3[0] = [s3[1]]
-
-(p[0])  add     s0[0] = 64, s0[1]
-(p[0])  add     s1[0] = 64, s1[1]
-(p[0])  add     s2[0] = 64, s2[1]
-(p[0])  add     s3[0] = 64, s3[1]
-(p[1])	mov	src = s0[1]		// for the epilog code
-
-(p[4])  stf8    [dl0] = tl0[4], 64
-(p[4])  stf8    [dh0] = th0[4], 64
-(p[4])  stf8    [dl1] = tl1[4], 64
-(p[4])  stf8    [dh1] = th1[4], 64
-(p[4])  stf8    [dl2] = tl2[4], 64
-(p[4])  stf8    [dh2] = th2[4], 64
-(p[4])  stf8    [dl3] = tl3[4], 64
-(p[4])  stf8    [dh3] = th3[4], 64
-
-	br.ctop.sptk.many .l0
-.done:
-	mov	dest = dl0
-.epilog:
-	cmp.eq	p6, p0 = rescnt, r0	// are there any words left to copy?
-	tbit.nz	p10, p0 = rescnt, 0
-(p6)	br.cond.spnt .restore_and_exit ;;
-(p10)	ld8	r[0] = [src], 8
-	tbit.nz	p11, p0 = rescnt, 1 ;;
-(p11)	ld8	r[1] = [src], 8
-(p10)	st8	[dest] = r[0], 8 ;;
-(p11)	ld8	r[2] = [src], 8 
-(p11)	st8	[dest] = r[1], 8
-	tbit.nz	p12, p0 = rescnt, 2 ;;
-(p12)	ld8	r[3] = [src], 8
-(p11)	st8	[dest] = r[2], 8 ;;
-(p12)	ld8	r[4] = [src], 8
-(p12)	st8	[dest] = r[3], 8 ;;
-(p12)	ld8	r[5] = [src], 8
-(p12) 	st8	[dest] = r[4], 8 
-	mov	ar.lc = saved_lc ;;	// restore the loop counter
-(p12) 	ld8	r[6] = [src], 8
-(p12)	st8	[dest] = r[5], 8 
-	mov	ar.pfs = saved_pfs;;	// restore the PFS
-(p12)	st8	[dest] = r[6]
-	mov	pr = saved_pr, -1 	// restore the predicate registers
+(p[0])		ld8	r[0] = [src], 16
+(p[0])		ld8	q[0] = [asrc], 16
+(p[MEMLAT])	st8	[dest] = r[MEMLAT], 16
+(p[MEMLAT])	st8	[adest] = q[MEMLAT], 16
+		br.ctop.dptk .l0 ;;
+
+	mov	ar.pfs = saved_pfs	// restore the PFS
+	mov	pr = saved_pr, -1	// restore the predicate registers
+	mov	ar.lc = saved_lc	// restore the loop counter
 	br.ret.sptk.many b0
 .next:
 	cmp.ge	p6, p0 = OP_T_THRES, len	// is len <= OP_T_THRES
@@ -197,7 +131,7 @@ ENTRY(memcpy)
 	ld1	value = [src], 1	// value = *src++
 	;;
 	st1	[dest] = value, 1	// *dest++ = value  
-	br.cloop.dptk .l1	
+	br.cloop.dptk .l1 ;;	
 .dest_aligned:
 	and	sh1 = 7, src 		// sh1 = src % 8
 	and	tmp2 = -8, len   	// tmp2 = len & -OPSIZ
@@ -236,7 +170,7 @@ ENTRY(memcpy)
 .l3:
 (p[0])		ld8	r[0] = [src], 8
 (p[MEMLAT])	st8	[dest] = r[MEMLAT], 8
-		br.ctop.dptk .l3
+		br.ctop.dptk .l3 ;;
 .cpyfew:
 	cmp.eq	p6, p0 = len, r0	// is len == 0 ?
 	adds	len = -1, len		// --len;