path: root/sysdeps/riscv/multiarch/memcpy_noalignment.S
blob: fa39be21d634d264e7276629f1956ad2512c0b50
/* memcpy for RISC-V, ignoring buffer alignment
   Copyright (C) 2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <sys/asm.h>

/* memcpy optimization for CPUs with fast unaligned support
   (RISCV_HWPROBE_MISALIGNED_FAST).

   Copies are split into three main cases: small copies of up to SZREG bytes,
   copies of up to BLOCK_SIZE bytes (128 on 64-bit, 64 on 32-bit), and copies
   larger than BLOCK_SIZE.

   Large copies use a software-pipelined loop that processes BLOCK_SIZE bytes
   per iteration.  The destination pointer is aligned to SZREG bytes to
   minimize unaligned stores.

   The tail is handled with branchless copies.  */

#define BLOCK_SIZE (16 * SZREG)

	.attribute unaligned_access, 1
ENTRY (__memcpy_noalignment)
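	/* a0 = DEST, a1 = SRC, a2 = LEN; DEST is returned unchanged in a0.  */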
	beq	a2, zero, L(ret)

	/* If LEN < SZREG jump to tail handling.  */
	li	a5, SZREG-1
	mv	a6, a0
	bleu	a2, a5, L(tail)

	/* Copy the first word, then align DEST to SZREG: advance DEST/SRC and
	   reduce LEN by the number of bytes needed to reach the alignment.  */
	REG_L	a3, 0(a1)
	andi	a5, a0, SZREG-1
	addi	a2, a2, -SZREG
	li	a4, SZREG
	sub	a4, a4, a5
	REG_S	a3, 0(a0)
	add	a2, a5, a2
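	/* LEN is now the number of bytes left after the aligning copy:
	   LEN - (SZREG - (DEST & (SZREG-1))).  */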

	/* If LEN < BLOCK_SIZE jump to word copy.  */
	li	a3, BLOCK_SIZE-1
	add	a5, a0, a4
	add	a1, a1, a4
	bleu	a2, a3, L(word_copy_adjust)
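	/* Round LEN down to a multiple of BLOCK_SIZE (a7) and compute the
	   DEST limit of the block loop (a3).  */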
	addi	a7, a2, -BLOCK_SIZE
	andi	a7, a7, -BLOCK_SIZE
	addi	a7, a7, BLOCK_SIZE
	add	a3, a5, a7
	mv	a4, a1
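	/* Copy BLOCK_SIZE bytes per iteration, with a4/a5 as the SRC/DEST
	   cursors.  Loads are grouped ahead of the matching stores, eight
	   words at a time.  */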
L(block_copy):
	REG_L	a6,          0(a4)
	REG_L	t0,      SZREG(a4)
	REG_L	t1,  (2*SZREG)(a4)
	REG_L	t2,  (3*SZREG)(a4)
	REG_L	t3,  (4*SZREG)(a4)
	REG_L	t4,  (5*SZREG)(a4)
	REG_L	t5,  (6*SZREG)(a4)
	REG_L	t6,  (7*SZREG)(a4)
	REG_S	a6,          0(a5)
	REG_S	t0,      SZREG(a5)
	REG_S	t1,  (2*SZREG)(a5)
	REG_S	t2,  (3*SZREG)(a5)
	REG_S	t3,  (4*SZREG)(a5)
	REG_S	t4,  (5*SZREG)(a5)
	REG_S	t5,  (6*SZREG)(a5)
	REG_S	t6,  (7*SZREG)(a5)
	REG_L	a6,  (8*SZREG)(a4)
	REG_L	t0,  (9*SZREG)(a4)
	REG_L	t1, (10*SZREG)(a4)
	REG_L	t2, (11*SZREG)(a4)
	REG_L	t3, (12*SZREG)(a4)
	REG_L	t4, (13*SZREG)(a4)
	REG_L	t5, (14*SZREG)(a4)
	REG_L	t6, (15*SZREG)(a4)
	addi	a4, a4, BLOCK_SIZE
	REG_S	a6,  (8*SZREG)(a5)
	REG_S	t0,  (9*SZREG)(a5)
	REG_S	t1, (10*SZREG)(a5)
	REG_S	t2, (11*SZREG)(a5)
	REG_S	t3, (12*SZREG)(a5)
	REG_S	t4, (13*SZREG)(a5)
	REG_S	t5, (14*SZREG)(a5)
	REG_S	t6, (15*SZREG)(a5)
	addi	a5, a5, BLOCK_SIZE
	bne	a5, a3, L(block_copy)
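	/* Advance SRC past the copied blocks and reduce LEN to the remainder;
	   the DEST cursor is already in a3 (== a5 on loop exit).  */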
	add	a1, a1, a7
	andi	a2, a2, BLOCK_SIZE-1

	/* 0 <= a2/LEN  < BLOCK_SIZE.  */
L(word_copy):
	li	a5, SZREG-1
	/* If LEN < SZREG jump to tail handling.  */
	bleu	a2, a5, L(tail_adjust)
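	/* Round LEN down to a multiple of SZREG (a7) and compute the DEST
	   limit of the word loop (a6).  */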
	addi	a7, a2, -SZREG
	andi	a7, a7, -SZREG
	addi	a7, a7, SZREG
	add	a6, a3, a7
	mv	a5, a1
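	/* Copy SZREG bytes per iteration, with a5/a3 as the SRC/DEST
	   cursors.  */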
L(word_copy_loop):
	REG_L	a4, 0(a5)
	addi	a3, a3, SZREG
	addi	a5, a5, SZREG
	REG_S	a4, -SZREG(a3)
	bne	a3, a6, L(word_copy_loop)
	add	a1, a1, a7
	andi	a2, a2, SZREG-1

	/* Copy the last SZREG bytes with one unaligned word copy ending at
	   DEST + LEN; it may overlap bytes stored by the loop above.  */
	add	a3, a1, a2
	add	a4, a6, a2
	REG_L	t0, -SZREG(a3)
	REG_S	t0, -SZREG(a4)
	ret

L(tail):
	/* Copy the remaining 0 to SZREG-1 bytes: compute the SRC/DEST end
	   pointers, then dispatch on bit 2 of LEN.  */
	andi	a5, a2, 4
	add	a3, a1, a2
	add	a4, a6, a2
	beq	a5, zero, L(copy_0_3)
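	/* 4 <= LEN <= 7: copy the first and the last 4 bytes; the two
	   accesses overlap and together cover all LEN bytes.  */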
	lw	t0, 0(a1)
	lw	t1, -4(a3)
	sw	t0, 0(a6)
	sw	t1, -4(a4)
	ret

	/* Copy 0-3 bytes.  */
L(copy_0_3):
	beq	a2, zero, L(ret)
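	/* 1 <= LEN <= 3: branchlessly store the first byte, the byte at
	   LEN/2, and the last byte; for LEN < 3 some of them coincide.  */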
	srli    a2, a2, 1
	add     t4, a1, a2
	add     t5, a6, a2
	lbu     t0, 0(a1)
	lbu     t1, -1(a3)
	lbu     t2, 0(t4)
	sb      t0, 0(a6)
	sb      t1, -1(a4)
	sb      t2, 0(t5)
L(ret):
	ret
L(tail_adjust):
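	/* The DEST cursor is in a3 here; L(tail) expects it in a6.  */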
	mv	a6, a3
	j	L(tail)
L(word_copy_adjust):
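	/* The aligned DEST is in a5 here; L(word_copy) expects it in a3.  */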
	mv	a3, a5
	j	L(word_copy)
END (__memcpy_noalignment)