about summary refs log tree commit diff
path: root/sysdeps/alpha/alphaev6/memcpy.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/alpha/alphaev6/memcpy.S')
-rw-r--r--sysdeps/alpha/alphaev6/memcpy.S256
1 files changed, 0 insertions, 256 deletions
diff --git a/sysdeps/alpha/alphaev6/memcpy.S b/sysdeps/alpha/alphaev6/memcpy.S
deleted file mode 100644
index 7cff521da2..0000000000
--- a/sysdeps/alpha/alphaev6/memcpy.S
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Copyright (C) 2000, 2003 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-/*
- * Much of the information about 21264 scheduling/coding comes from:
- *	Compiler Writer's Guide for the Alpha 21264
- *	abbreviated as 'CWG' in other comments here
- *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
- * Scheduling notation:
- *	E	- either cluster
- *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
- *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
- *
- * Temp usage notes:
- *	$0		- destination address
- *	$1,$2,		- scratch
- */
-
-#include <sysdep.h>
-
-	.arch ev6
-	.set noreorder
-	.set noat
-
-ENTRY(memcpy)
-	.prologue 0
-
-	mov	$16, $0			# E : copy dest to return
-	ble	$18, $nomoredata	# U : done with the copy?
-	xor	$16, $17, $1		# E : are source and dest alignments the same?
-	and	$1, 7, $1		# E : are they the same mod 8?
-
-	bne	$1, $misaligned		# U : Nope - gotta do this the slow way
-	/* source and dest are same mod 8 address */
-	and	$16, 7, $1		# E : Are both 0mod8?
-	beq	$1, $both_0mod8		# U : Yes
-	nop				# E :
-
-	/*
-	 * source and dest are same misalignment.  move a byte at a time
-	 * until a 0mod8 alignment for both is reached.
-	 * At least one byte more to move
-	 */
-
-$head_align:
-	ldbu	$1, 0($17)		# L : grab a byte
-	subq	$18, 1, $18		# E : count--
-	addq	$17, 1, $17		# E : src++
-	stb	$1, 0($16)		# L :
-	addq	$16, 1, $16		# E : dest++
-	and	$16, 7, $1		# E : Are we at 0mod8 yet?
-	ble	$18, $nomoredata	# U : done with the copy?
-	bne	$1, $head_align		# U :
-
-$both_0mod8:
-	cmple	$18, 127, $1		# E : Can we unroll the loop?
-	bne	$1, $no_unroll		# U :
-	and	$16, 63, $1		# E : get mod64 alignment
-	beq	$1, $do_unroll		# U : no single quads to fiddle
-
-$single_head_quad:
-	ldq	$1, 0($17)		# L : get 8 bytes
-	subq	$18, 8, $18		# E : count -= 8
-	addq	$17, 8, $17		# E : src += 8
-	nop				# E :
-
-	stq	$1, 0($16)		# L : store
-	addq	$16, 8, $16		# E : dest += 8
-	and	$16, 63, $1		# E : get mod64 alignment
-	bne	$1, $single_head_quad	# U : still not fully aligned
-
-$do_unroll:
-	addq	$16, 64, $7		# E : Initial (+1 trip) wh64 address
-	cmple	$18, 127, $1		# E : Can we go through the unrolled loop?
-	bne	$1, $tail_quads		# U : Nope
-	nop				# E :
-
-$unroll_body:
-	wh64	($7)			# L1 : memory subsystem hint: 64 bytes at
-					# ($7) are about to be over-written
-	ldq	$6, 0($17)		# L0 : bytes 0..7
-	nop				# E :
-	nop				# E :
-
-	ldq	$4, 8($17)		# L : bytes 8..15
-	ldq	$5, 16($17)		# L : bytes 16..23
-	addq	$7, 64, $7		# E : Update next wh64 address
-	nop				# E :
-
-	ldq	$3, 24($17)		# L : bytes 24..31
-	addq	$16, 64, $1		# E : fallback value for wh64
-	nop				# E :
-	nop				# E :
-
-	addq	$17, 32, $17		# E : src += 32 bytes
-	stq	$6, 0($16)		# L : bytes 0..7
-	nop				# E :
-	nop				# E :
-
-	stq	$4, 8($16)		# L : bytes 8..15
-	stq	$5, 16($16)		# L : bytes 16..23
-	subq	$18, 192, $2		# E : At least two more trips to go?
-	nop				# E :
-
-	stq	$3, 24($16)		# L : bytes 24..31
-	addq	$16, 32, $16		# E : dest += 32 bytes
-	nop				# E :
-	nop				# E :
-
-	ldq	$6, 0($17)		# L : bytes 0..7
-	ldq	$4, 8($17)		# L : bytes 8..15
-	cmovlt	$2, $1, $7		# E : Latency 2, extra map slot - Use
-					# fallback wh64 address if < 2 more trips
-	nop				# E :
-
-	ldq	$5, 16($17)		# L : bytes 16..23
-	ldq	$3, 24($17)		# L : bytes 24..31
-	addq	$16, 32, $16		# E : dest += 32
-	subq	$18, 64, $18		# E : count -= 64
-
-	addq	$17, 32, $17		# E : src += 32
-	stq	$6, -32($16)		# L : bytes 0..7
-	stq	$4, -24($16)		# L : bytes 8..15
-	cmple	$18, 63, $1		# E : At least one more trip?
-
-	stq	$5, -16($16)		# L : bytes 16..23
-	stq	$3, -8($16)		# L : bytes 24..31
-	nop				# E :
-	beq	$1, $unroll_body
-
-$tail_quads:
-$no_unroll:
-	.align 4
-	subq	$18, 8, $18		# E : At least a quad left?
-	blt	$18, $less_than_8	# U : Nope
-	nop				# E :
-	nop				# E :
-
-$move_a_quad:
-	ldq	$1, 0($17)		# L : fetch 8
-	subq	$18, 8, $18		# E : count -= 8
-	addq	$17, 8, $17		# E : src += 8
-	nop				# E :
-
-	stq	$1, 0($16)		# L : store 8
-	addq	$16, 8, $16		# E : dest += 8
-	bge	$18, $move_a_quad	# U :
-	nop				# E :
-
-$less_than_8:
-	.align 4
-	addq	$18, 8, $18		# E : add back for trailing bytes
-	ble	$18, $nomoredata	# U : All-done
-	nop				# E :
-	nop				# E :
-
-	/* Trailing bytes */
-$tail_bytes:
-	subq	$18, 1, $18		# E : count--
-	ldbu	$1, 0($17)		# L : fetch a byte
-	addq	$17, 1, $17		# E : src++
-	nop				# E :
-
-	stb	$1, 0($16)		# L : store a byte
-	addq	$16, 1, $16		# E : dest++
-	bgt	$18, $tail_bytes	# U : more to be done?
-	nop				# E :
-
-	/* branching to exit takes 3 extra cycles, so replicate exit here */
-	ret	$31, ($26), 1		# L0 :
-	nop				# E :
-	nop				# E :
-	nop				# E :
-
-$misaligned:
-	mov	$0, $4			# E : dest temp
-	and	$0, 7, $1		# E : dest alignment mod8
-	beq	$1, $dest_0mod8		# U : life doesnt totally suck
-	nop
-
-$aligndest:
-	ble	$18, $nomoredata	# U :
-	ldbu	$1, 0($17)		# L : fetch a byte
-	subq	$18, 1, $18		# E : count--
-	addq	$17, 1, $17		# E : src++
-
-	stb	$1, 0($4)		# L : store it
-	addq	$4, 1, $4		# E : dest++
-	and	$4, 7, $1		# E : dest 0mod8 yet?
-	bne	$1, $aligndest		# U : go until we are aligned.
-
-	/* Source has unknown alignment, but dest is known to be 0mod8 */
-$dest_0mod8:
-	subq	$18, 8, $18		# E : At least a quad left?
-	blt	$18, $misalign_tail	# U : Nope
-	ldq_u	$3, 0($17)		# L : seed (rotating load) of 8 bytes
-	nop				# E :
-
-$mis_quad:
-	ldq_u	$16, 8($17)		# L : Fetch next 8
-	extql	$3, $17, $3		# U : masking
-	extqh	$16, $17, $1		# U : masking
-	bis	$3, $1, $1		# E : merged bytes to store
-
-	subq	$18, 8, $18		# E : count -= 8
-	addq	$17, 8, $17		# E : src += 8
-	stq	$1, 0($4)		# L : store 8 (aligned)
-	mov	$16, $3			# E : "rotate" source data
-
-	addq	$4, 8, $4		# E : dest += 8
-	bge	$18, $mis_quad		# U : More quads to move
-	nop
-	nop
-
-$misalign_tail:
-	addq	$18, 8, $18		# E : account for tail stuff
-	ble	$18, $nomoredata	# U :
-	nop
-	nop
-
-$misalign_byte:
-	ldbu	$1, 0($17)		# L : fetch 1
-	subq	$18, 1, $18		# E : count--
-	addq	$17, 1, $17		# E : src++
-	nop				# E :
-
-	stb	$1, 0($4)		# L : store
-	addq	$4, 1, $4		# E : dest++
-	bgt	$18, $misalign_byte	# U : more to go?
-	nop
-
-
-$nomoredata:
-	ret	$31, ($26), 1		# L0 :
-	nop				# E :
-	nop				# E :
-	nop				# E :
-
-END(memcpy)
-libc_hidden_builtin_def (memcpy)