summary refs log tree commit diff
path: root/sysdeps/ia64/memset.S
blob: 56db66fd0b64a7a48734b087dc4fd0041d7fb791 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
/* Optimized version of the standard memset() function.
   This file is part of the GNU C Library.
   Copyright (C) 2000, 2001 Free Software Foundation, Inc.
   Contributed by Dan Pop <Dan.Pop@cern.ch>.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

/* Return: dest
  
   Inputs:
        in0:    dest
        in1:    value
        in2:    count

   The algorithm is fairly straightforward: set byte by byte until we
   get to a word (8-byte) aligned address, then set word by word as much
   as possible; the remaining few bytes are set one by one.  */

#include <sysdep.h>
#undef ret

/* Symbolic names for the three input registers (see "Inputs" above).  */
#define dest		in0
#define byteval		in1
#define	cnt		in2

/* Symbolic names for the seven local registers allocated by memset:
     save_pfs  value of ar.pfs handed back by alloc, restored on exit
     ptr1      running store pointer (post-incremented by every store)
     ptr2      second store pointer, ptr1 + 8, used by the 16-byte loop
     tmp       dest & 7, the misalignment of dest
     loopcnt   scratch for the iteration counts loaded into ar.lc
     save_lc   caller's ar.lc, restored on exit
     wordval   byteval broadcast by mux1 @brcst, stored with st8  */
#define save_pfs 	loc0
#define ptr1		loc1
#define ptr2		loc2
#define tmp		loc3
#define	loopcnt		loc4
#define save_lc		loc5
#define wordval		loc6

/* memset: store the low byte of byteval (in1) into cnt (in2) bytes
   starting at dest (in0) and return dest in ret0.

   Layout of the routine:
     1. cnt == 0          -> return immediately.
     2. cnt < 16          -> byte loop only (.set_few).
     3. otherwise         -> byte loop up to an 8-byte boundary (.l1),
                             16 bytes per iteration with two st8 (.l2),
                             at most one further st8 (.one_more),
                             then a byte loop for what is left (.l3).

   Every loop is a br.cloop loop, so ar.lc is loaded with the number of
   iterations minus one before each of them; the caller's ar.lc is kept
   in save_lc and put back before returning.  */
ENTRY(memset)
	.prologue
	/* Frame: 3 inputs, 7 locals, 0 outputs, 0 rotating registers.  */
	alloc	save_pfs = ar.pfs, 3, 7, 0, 0	
	.save ar.lc, save_lc
	mov	save_lc = ar.lc
	.body
	/* Set up the return value and the misalignment of dest, and leave
	   at once when there is nothing to store.  */
	mov	ret0 = dest
	and	tmp = 7, dest
	cmp.eq	p6, p0 = cnt, r0
(p6)	br.cond.spnt .restore_and_exit ;;
	/* loopcnt = 8 - tmp is the number of bytes up to the next 8-byte
	   boundary; it is only used when tmp != 0.  Counts below 16 go
	   straight to the byte loop (cmp.gt is a signed comparison).  */
	mov	ptr1 = dest
	sub	loopcnt = 8, tmp
	cmp.gt	p6, p0 = 16, cnt
(p6)	br.cond.spnt .set_few;;
	/* Skip the head loop when dest is already 8-byte aligned.
	   Otherwise take the head bytes off cnt and load ar.lc with
	   (head bytes - 1); the sub and the adds share an instruction
	   group, so the sub reads loopcnt before the adds changes it.  */
	cmp.eq	p6, p0 = tmp, r0
(p6)	br.cond.sptk .dest_aligned
	sub	cnt = cnt, loopcnt
	adds	loopcnt = -1, loopcnt;;
	mov	ar.lc = loopcnt;;	
	/* Head loop: one byte per iteration until ptr1 is 8-byte aligned.  */
.l1:
	st1	[ptr1] = byteval, 1
	br.cloop.dptk	.l1 ;;
	/* ptr1 is 8-byte aligned here.  Prepare the second pointer and the
	   8-byte fill pattern, and compute the number of 16-byte chunks.  */
.dest_aligned:
	adds	ptr2 = 8, ptr1
	mux1	wordval = byteval, @brcst
	shr.u	loopcnt = cnt, 4 ;;	// loopcnt = cnt / 16
	/* No full 16-byte chunk: jump to .one_more with p6 still set, so
	   the predicated st8 there is executed.  That is safe because cnt
	   was at least 16 on entry and the head loop removed at most 7
	   bytes, hence 9 <= cnt <= 15 on this path.  */
	cmp.eq	p6, p0 = loopcnt, r0
(p6)	br.cond.spnt	.one_more
	and	cnt = 0xf, cnt		// compute the remaining cnt
	adds	loopcnt = -1, loopcnt;;
	mov     ar.lc = loopcnt;;	
	/* Bulk loop: two 8-byte stores per iteration, each pointer
	   advancing by 16.  The cmp.le after the branch is only reached
	   when the loop falls through; it sets p6 when 8 <= cnt, i.e. when
	   one more 8-byte store still fits.  */
.l2:
	st8	[ptr1] = wordval, 16
	st8	[ptr2] = wordval, 16
	br.cloop.dptk .l2
	cmp.le	p6, p0 = 8, cnt	;;
	/* Optional single 8-byte store (predicated on p6), then return if
	   nothing is left.  */
.one_more:
(p6)	st8     [ptr1] = wordval, 8
(p6)	adds	cnt = -8, cnt ;;
	cmp.eq	p6, p0 = cnt, r0
(p6)	br.cond.spnt	.restore_and_exit
	/* Tail loop: cnt is non-zero here; store the remaining cnt bytes
	   one at a time (ar.lc = cnt - 1).  */
.set_few:
	adds	loopcnt = -1, cnt;;
	mov	ar.lc = loopcnt;;
.l3:	
	st1     [ptr1] = byteval, 1
	br.cloop.dptk   .l3 ;;	
	/* Put back the caller's ar.lc and ar.pfs and return; ret0 already
	   holds the original dest.  */
.restore_and_exit:
	mov	ar.lc = save_lc
	mov	ar.pfs = save_pfs
	br.ret.sptk.many b0					
END(memset)