summary refs log tree commit diff
path: root/sysdeps/ia64/memcpy.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/ia64/memcpy.S')
-rw-r--r--sysdeps/ia64/memcpy.S436
1 files changed, 0 insertions, 436 deletions
diff --git a/sysdeps/ia64/memcpy.S b/sysdeps/ia64/memcpy.S
deleted file mode 100644
index a2aeea00fd..0000000000
--- a/sysdeps/ia64/memcpy.S
+++ /dev/null
@@ -1,436 +0,0 @@
-/* Optimized version of the standard memcpy() function.
-   This file is part of the GNU C Library.
-   Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
-   Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
-   Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-/* Return: dest
-
-   Inputs:
-        in0:    dest
-        in1:    src
-        in2:    byte count
-
-   An assembly implementation of the algorithm used by the generic C
-   version from glibc.  The case when source and sest are aligned is
-   treated separately, for extra performance.
-
-   In this form, memcpy assumes little endian mode.  For big endian mode,
-   sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
-   and the order of r[MEMLAT] and r[MEMLAT+1] must be reverted in the
-   shrp instruction.  */
-
-#define USE_LFETCH
-#define USE_FLP
-#include <sysdep.h>
-#undef ret
-
-#define LFETCH_DIST     500
-
-#define ALIGN_UNROLL_no   4 // no. of elements
-#define ALIGN_UNROLL_sh	  2 // (shift amount)
-
-#define MEMLAT	8
-#define Nrot	((4*(MEMLAT+2) + 7) & ~7)
-
-#define OP_T_THRES 	16
-#define OPSIZ 		8
-
-#define loopcnt		r14
-#define elemcnt		r15
-#define saved_pr	r16
-#define saved_lc	r17
-#define adest		r18
-#define dest		r19
-#define asrc		r20
-#define src		r21
-#define len		r22
-#define tmp2		r23
-#define tmp3		r24
-#define	tmp4		r25
-#define ptable		r26
-#define ploop56		r27
-#define	loopaddr	r28
-#define	sh1		r29
-#define ptr1		r30
-#define ptr2		r31
-
-#define movi0 		mov
-
-#define p_scr		p6
-#define p_xtr		p7
-#define p_nxtr		p8
-#define p_few		p9
-
-#if defined(USE_FLP)
-#define load		ldf8
-#define store		stf8
-#define tempreg		f6
-#define the_r		fr
-#define the_s		fs
-#define the_t		ft
-#define the_q		fq
-#define the_w		fw
-#define the_x		fx
-#define the_y		fy
-#define the_z		fz
-#elif defined(USE_INT)
-#define load		ld8
-#define store		st8
-#define tempreg		tmp2
-#define the_r		r
-#define the_s		s
-#define the_t		t
-#define the_q		q
-#define the_w		w
-#define the_x		x
-#define the_y		y
-#define the_z		z
-#endif
-
-#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
-/* Manually force proper loop-alignment.  Note: be sure to
-   double-check the code-layout after making any changes to
-   this routine! */
-# define ALIGN(n)	{ nop 0 }
-#else
-# define ALIGN(n)	.align n
-#endif
-
-#if defined(USE_LFETCH)
-#define LOOP(shift)						\
-		ALIGN(32);					\
-.loop##shift##:							\
-{ .mmb								\
-(p[0])	ld8.nt1	r[0] = [asrc], 8 ;				\
-(p[0])	lfetch.nt1 [ptr1], 16 ;					\
-	nop.b 0 ;						\
-} { .mib							\
-(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ;				\
-(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ;		\
- 	nop.b 0 ;;						\
- } { .mmb							\
-(p[0])	ld8.nt1	s[0] = [asrc], 8 ;				\
-(p[0])	lfetch.nt1	[ptr2], 16 ;				\
-	nop.b 0 ;						\
-} { .mib							\
-(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ;				\
-(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ;		\
-	br.ctop.sptk.many .loop##shift 				\
-;; }								\
-{ .mib								\
-	br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */  \
-}
-#else
-#define LOOP(shift)						\
-		ALIGN(32);					\
-.loop##shift##:							\
-{ .mmb								\
-(p[0])	ld8.nt1	r[0] = [asrc], 8 ;				\
-	nop.b 0 ;						\
-} { .mib							\
-(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ;				\
-(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ;		\
- 	nop.b 0 ;;						\
- } { .mmb							\
-(p[0])	ld8.nt1	s[0] = [asrc], 8 ;				\
-	nop.b 0 ;						\
-} { .mib							\
-(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ;				\
-(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ;		\
-	br.ctop.sptk.many .loop##shift 				\
-;; }								\
-{ .mib								\
-	br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */  \
-}
-#endif
-
-
-ENTRY(memcpy)
-{ .mmi
-	.prologue
-	alloc 	r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
-	.rotr	r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1]
-	.rotp	p[MEMLAT+2]
-	.rotf	fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1]
-	mov	ret0 = in0		// return tmp2 = dest
-	.save   pr, saved_pr
-	movi0	saved_pr = pr		// save the predicate registers
-} { .mmi
-	and	tmp4 = 7, in0 		// check if destination is aligned
-	mov 	dest = in0		// dest
-	mov 	src = in1		// src
-;; }
-{ .mii
-	cmp.eq	p_scr, p0 = in2, r0	// if (len == 0)
-	.save   ar.lc, saved_lc
-        movi0 	saved_lc = ar.lc	// save the loop counter
-	.body
-	cmp.ge	p_few, p0 = OP_T_THRES, in2 // is len <= OP_T_THRESH
-} { .mbb
-	mov	len = in2		// len
-(p_scr)	br.cond.dpnt.few .restore_and_exit // 	Branch no. 1: return dest
-(p_few) br.cond.dpnt.many .copy_bytes	// Branch no. 2: copy byte by byte
-;; }
-{ .mmi
-#if defined(USE_LFETCH)
-	lfetch.nt1 [dest]		//
-	lfetch.nt1 [src]		//
-#endif
-	shr.u	elemcnt = len, 3	// elemcnt = len / 8
-} { .mib
-	cmp.eq	p_scr, p0 = tmp4, r0	// is destination aligned?
-	sub	loopcnt = 7, tmp4	//
-(p_scr) br.cond.dptk.many .dest_aligned
-;; }
-{ .mmi
-	ld1	tmp2 = [src], 1		//
-	sub	len = len, loopcnt, 1	// reduce len
-	movi0	ar.lc = loopcnt		//
-} { .mib
-	cmp.ne  p_scr, p0 = 0, loopcnt	// avoid loading beyond end-point
-;; }
-
-.l0:	// ---------------------------- // L0: Align src on 8-byte boundary
-{ .mmi
-	st1	[dest] = tmp2, 1	//
-(p_scr)	ld1	tmp2 = [src], 1		//
-} { .mib
-	cmp.lt	p_scr, p0 = 1, loopcnt	// avoid load beyond end-point
-	add	loopcnt = -1, loopcnt
-	br.cloop.dptk.few .l0		//
-;; }
-
-.dest_aligned:
-{ .mmi
-	and	tmp4 = 7, src		// ready for alignment check
-	shr.u	elemcnt = len, 3	// elemcnt = len / 8
-;; }
-{ .mib
-	cmp.ne	p_scr, p0 = tmp4, r0	// is source also aligned
-	tbit.nz p_xtr, p_nxtr = src, 3	// prepare a separate move if src
-} { .mib				// is not 16B aligned
-	add	ptr2 = LFETCH_DIST, dest	// prefetch address
-	add	ptr1 = LFETCH_DIST, src
-(p_scr) br.cond.dptk.many .src_not_aligned
-;; }
-
-// The optimal case, when dest, and src are aligned
-
-.both_aligned:
-{ .mmi
-	.pred.rel "mutex",p_xtr,p_nxtr
-(p_xtr)	cmp.gt  p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // Need N + 1 to qualify
-(p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt  // Need only N to qualify
-	movi0	pr.rot = 1 << 16	// set rotating predicates
-} { .mib
-(p_scr) br.cond.dpnt.many .copy_full_words
-;; }
-
-{ .mmi
-(p_xtr)	load	tempreg = [src], 8
-(p_xtr) add 	elemcnt = -1, elemcnt
-	movi0	ar.ec = MEMLAT + 1	// set the epilog counter
-;; }
-{ .mmi
-(p_xtr) add	len = -8, len		//
-	add 	asrc = 16, src 		// one bank apart (for USE_INT)
-	shr.u	loopcnt = elemcnt, ALIGN_UNROLL_sh  // cater for unrolling
-;;}
-{ .mmi
-	add	loopcnt = -1, loopcnt
-(p_xtr)	store	[dest] = tempreg, 8	// copy the "extra" word
-	nop.i	0
-;; }
-{ .mib
-	add	adest = 16, dest
-	movi0	ar.lc = loopcnt 	// set the loop counter
-;; }
-
-#ifdef  GAS_ALIGN_BREAKS_UNWIND_INFO
-	{ nop 0 }
-#else
-	.align	32
-#endif
-#if defined(USE_FLP)
-.l1: // ------------------------------- // L1: Everything a multiple of 8
-{ .mmi
-#if defined(USE_LFETCH)
-(p[0])	lfetch.nt1 [ptr2],32
-#endif
-(p[0])	ldfp8	the_r[0],the_q[0] = [src], 16
-(p[0])	add	len = -32, len
-} {.mmb
-(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
-(p[MEMLAT]) store [adest] = the_s[MEMLAT], 8
-;; }
-{ .mmi
-#if defined(USE_LFETCH)
-(p[0])	lfetch.nt1 [ptr1],32
-#endif
-(p[0])	ldfp8	the_s[0], the_t[0] = [src], 16
-} {.mmb
-(p[MEMLAT]) store [dest] = the_q[MEMLAT], 24
-(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
-	br.ctop.dptk.many .l1
-;; }
-#elif defined(USE_INT)
-.l1: // ------------------------------- // L1: Everything a multiple of 8
-{ .mmi
-(p[0])	load	the_r[0] = [src], 8
-(p[0])	load	the_q[0] = [asrc], 8
-(p[0])	add	len = -32, len
-} {.mmb
-(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
-(p[MEMLAT]) store [adest] = the_q[MEMLAT], 8
-;; }
-{ .mmi
-(p[0])	load	the_s[0]  = [src], 24
-(p[0])	load	the_t[0] = [asrc], 24
-} {.mmb
-(p[MEMLAT]) store [dest] = the_s[MEMLAT], 24
-(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
-#if defined(USE_LFETCH)
-;; }
-{ .mmb
-(p[0])	lfetch.nt1 [ptr2],32
-(p[0])	lfetch.nt1 [ptr1],32
-#endif
-	br.ctop.dptk.many .l1
-;; }
-#endif
-
-.copy_full_words:
-{ .mib
-	cmp.gt	p_scr, p0 = 8, len	//
-	shr.u	elemcnt = len, 3	//
-(p_scr) br.cond.dpnt.many .copy_bytes
-;; }
-{ .mii
-	load	tempreg = [src], 8
-	add	loopcnt = -1, elemcnt	//
-;; }
-{ .mii
-	cmp.ne	p_scr, p0 = 0, loopcnt	//
-	mov	ar.lc = loopcnt		//
-;; }
-
-.l2: // ------------------------------- // L2: Max 4 words copied separately
-{ .mmi
-	store	[dest] = tempreg, 8
-(p_scr)	load	tempreg = [src], 8	//
-	add	len = -8, len
-} { .mib
-	cmp.lt	p_scr, p0 = 1, loopcnt	// avoid load beyond end-point
-	add	loopcnt = -1, loopcnt
-	br.cloop.dptk.few  .l2
-;; }
-
-.copy_bytes:
-{ .mib
-	cmp.eq	p_scr, p0 = len, r0	// is len == 0 ?
-	add	loopcnt = -1, len	// len--;
-(p_scr)	br.cond.spnt	.restore_and_exit
-;; }
-{ .mii
-	ld1	tmp2 = [src], 1
-	movi0	ar.lc = loopcnt
-	cmp.ne	p_scr, p0 = 0, loopcnt	// avoid load beyond end-point
-;; }
-
-.l3: // ------------------------------- // L3: Final byte move
-{ .mmi
-	st1	[dest] = tmp2, 1
-(p_scr)	ld1	tmp2 = [src], 1
-} { .mib
-	cmp.lt	p_scr, p0 = 1, loopcnt	// avoid load beyond end-point
-	add	loopcnt = -1, loopcnt
-	br.cloop.dptk.few  .l3
-;; }
-
-.restore_and_exit:
-{ .mmi
-	movi0	pr = saved_pr, -1	// restore the predicate registers
-;; }
-{ .mib
-	movi0	ar.lc = saved_lc	// restore the loop counter
-	br.ret.sptk.many b0
-;; }
-
-
-.src_not_aligned:
-{ .mmi
-	cmp.gt	p_scr, p0 = 16, len
-	and	sh1 = 7, src 		// sh1 = src % 8
-	shr.u	loopcnt = len, 4	// element-cnt = len / 16
-} { .mib
-	add	tmp4 = @ltoff(.table), gp
-	add 	tmp3 = @ltoff(.loop56), gp
-(p_scr)	br.cond.dpnt.many .copy_bytes	// do byte by byte if too few
-;; }
-{ .mmi
-	and	asrc = -8, src		// asrc = (-8) -- align src for loop
-	add 	loopcnt = -1, loopcnt	// loopcnt--
-	shl	sh1 = sh1, 3		// sh1 = 8 * (src % 8)
-} { .mmi
-	ld8	ptable = [tmp4]		// ptable = &table
-	ld8	ploop56 = [tmp3]	// ploop56 = &loop56
-	and	tmp2 = -16, len		// tmp2 = len & -OPSIZ
-;; }
-{ .mmi
-	add	tmp3 = ptable, sh1	// tmp3 = &table + sh1
-	add	src = src, tmp2		// src += len & (-16)
-	movi0	ar.lc = loopcnt		// set LC
-;; }
-{ .mmi
-	ld8	tmp4 = [tmp3]		// tmp4 = loop offset
-	sub	len = len, tmp2		// len -= len & (-16)
-	movi0	ar.ec = MEMLAT + 2 	// one more pass needed
-;; }
-{ .mmi
-	ld8	s[1] = [asrc], 8	// preload
-	sub	loopaddr = ploop56,tmp4	// loopadd = &loop56 - loop offset
-	movi0   pr.rot = 1 << 16	// set rotating predicates
-;; }
-{ .mib
-	nop.m	0
-	movi0	b6 = loopaddr
-	br	b6			// jump to the appropriate loop
-;; }
-
-	LOOP(8)
-	LOOP(16)
-	LOOP(24)
-	LOOP(32)
-	LOOP(40)
-	LOOP(48)
-	LOOP(56)
-END(memcpy)
-libc_hidden_builtin_def (memcpy)
-
-	.rodata
-	.align 8
-.table:
-	data8	0			// dummy entry
-	data8 	.loop56 - .loop8
-	data8 	.loop56 - .loop16
-	data8 	.loop56 - .loop24
-	data8	.loop56 - .loop32
-	data8	.loop56 - .loop40
-	data8	.loop56 - .loop48
-	data8	.loop56 - .loop56