/* memcpy for RISC-V, ignoring buffer alignment
   Copyright (C) 2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */
#include <sysdep.h>
#include <sys/asm.h>
/* memcpy optimization for CPUs with fast unaligned support
   (RISCV_HWPROBE_MISALIGNED_FAST).

   Copies are split into 3 main cases: small copies of up to SZREG bytes,
   copies of up to BLOCK_SIZE bytes (128 for 64-bit, 64 for 32-bit), and
   copies larger than BLOCK_SIZE.

   Large copies use a software-pipelined loop processing BLOCK_SIZE bytes
   per iteration.  The destination pointer is SZREG-byte aligned to
   minimize unaligned store accesses.

   The tail is handled with branchless copies.  */
#define BLOCK_SIZE (16 * SZREG)
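/* Illustrative outline of the flow below, in C-like pseudocode.  The
   names are for explanation only; the actual register use differs.

     if (n == 0) return dest;
     if (n < SZREG) goto tail;
     copy SZREG bytes, then advance DEST/SRC to the next SZREG
       boundary of DEST;
     if (remaining < BLOCK_SIZE) goto word_copy;
     copy BLOCK_SIZE bytes per iteration while a full block remains;
   word_copy:
     if (remaining < SZREG) goto tail;
     copy a word at a time, then copy the final SZREG bytes with one
       unaligned access ending at DEST + LEN and return;
   tail:
     branchless copy of the remaining 0 to SZREG-1 bytes.  */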
.attribute unaligned_access, 1
ENTRY (__memcpy_noalignment)
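/* a0 = DEST, a1 = SRC, a2 = LEN.  a0 is never modified so it can be
   returned as-is; a6 carries the DEST pointer whenever the tail code
   is reached.  */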
beq a2, zero, L(ret)
/* if LEN < SZREG jump to tail handling. */
li a5, SZREG-1
mv a6, a0
bleu a2, a5, L(tail)
/* Copy the first word, align DEST to word, and adjust DEST/SRC/LEN
based on the amount adjusted to align DEST. */
REG_L a3, 0(a1)
andi a5, a0, SZREG-1
addi a2, a2, -SZREG
li a4, SZREG
sub a4, a4, a5
REG_S a3, 0(a0)
add a2, a5, a2
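/* LEN (a2) now counts the bytes still to copy once DEST/SRC are
   advanced by a4 = SZREG - (DEST % SZREG).  */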
/* If LEN < BLOCK_SIZE jump to word copy. */
li a3, BLOCK_SIZE-1
add a5, a0, a4
add a1, a1, a4
bleu a2, a3, L(word_copy_adjust)
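/* Round LEN down to a multiple of BLOCK_SIZE; a7 is the number of
   bytes copied by whole blocks.  */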
addi a7, a2, -BLOCK_SIZE
andi a7, a7, -BLOCK_SIZE
addi a7, a7, BLOCK_SIZE
add a3, a5, a7
mv a4, a1
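/* a4/a5 are the SRC/DEST cursors for the block loop; a3 is the DEST
   address at which the loop stops.  */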
L(block_copy):
REG_L a6, 0(a4)
REG_L t0, SZREG(a4)
REG_L t1, (2*SZREG)(a4)
REG_L t2, (3*SZREG)(a4)
REG_L t3, (4*SZREG)(a4)
REG_L t4, (5*SZREG)(a4)
REG_L t5, (6*SZREG)(a4)
REG_L t6, (7*SZREG)(a4)
REG_S a6, 0(a5)
REG_S t0, SZREG(a5)
REG_S t1, (2*SZREG)(a5)
REG_S t2, (3*SZREG)(a5)
REG_S t3, (4*SZREG)(a5)
REG_S t4, (5*SZREG)(a5)
REG_S t5, (6*SZREG)(a5)
REG_S t6, (7*SZREG)(a5)
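/* Second half of the block: as above, all eight loads are issued
   before their stores so the memory accesses can overlap.  */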
REG_L a6, (8*SZREG)(a4)
REG_L t0, (9*SZREG)(a4)
REG_L t1, (10*SZREG)(a4)
REG_L t2, (11*SZREG)(a4)
REG_L t3, (12*SZREG)(a4)
REG_L t4, (13*SZREG)(a4)
REG_L t5, (14*SZREG)(a4)
REG_L t6, (15*SZREG)(a4)
addi a4, a4, BLOCK_SIZE
REG_S a6, (8*SZREG)(a5)
REG_S t0, (9*SZREG)(a5)
REG_S t1, (10*SZREG)(a5)
REG_S t2, (11*SZREG)(a5)
REG_S t3, (12*SZREG)(a5)
REG_S t4, (13*SZREG)(a5)
REG_S t5, (14*SZREG)(a5)
REG_S t6, (15*SZREG)(a5)
addi a5, a5, BLOCK_SIZE
bne a5, a3, L(block_copy)
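/* Advance SRC past the block-copied bytes.  */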
add a1, a1, a7
andi a2, a2, BLOCK_SIZE-1
/* 0 <= a2/LEN < BLOCK_SIZE. */
L(word_copy):
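/* a3 is the DEST cursor here: either the end of the block-copied
   region or the SZREG-aligned DEST.  */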
li a5, SZREG-1
/* if LEN < SZREG jump to tail handling. */
bleu a2, a5, L(tail_adjust)
addi a7, a2, -SZREG
andi a7, a7, -SZREG
addi a7, a7, SZREG
add a6, a3, a7
mv a5, a1
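/* a5 is the SRC cursor, a6 the DEST address at which the loop stops,
   and a7 the number of bytes copied one word at a time.  */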
L(word_copy_loop):
REG_L a4, 0(a5)
addi a3, a3, SZREG
addi a5, a5, SZREG
REG_S a4, -SZREG(a3)
bne a3, a6, L(word_copy_loop)
add a1, a1, a7
andi a2, a2, SZREG-1
/* Copy the last word unaligned. */
add a3, a1, a2
add a4, a6, a2
REG_L t0, -SZREG(a3)
REG_S t0, -SZREG(a4)
ret
L(tail):
/* Copy 4-7 bytes. */
andi a5, a2, 4
add a3, a1, a2
add a4, a6, a2
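/* a3/a4 now point one byte past the end of SRC/DEST.  */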
beq a5, zero, L(copy_0_3)
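/* 4-7 bytes: two possibly overlapping 4-byte copies cover the first
   and last four bytes of the remainder.  */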
lw t0, 0(a1)
lw t1, -4(a3)
sw t0, 0(a6)
sw t1, -4(a4)
ret
/* Copy 0-3 bytes. */
L(copy_0_3):
beq a2, zero, L(ret)
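/* 1-3 bytes: copy the first, last, and middle byte (offset LEN/2);
   for lengths 1 and 2 these accesses overlap, so no extra branches
   are needed.  */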
srli a2, a2, 1
add t4, a1, a2
add t5, a6, a2
lbu t0, 0(a1)
lbu t1, -1(a3)
lbu t2, 0(t4)
sb t0, 0(a6)
sb t1, -1(a4)
sb t2, 0(t5)
L(ret):
ret
L(tail_adjust):
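/* Reached from the word-copy path with fewer than SZREG bytes left:
   the current DEST cursor (a3) becomes the tail's DEST pointer (a6).  */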
mv a6, a3
j L(tail)
L(word_copy_adjust):
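/* Reached when fewer than BLOCK_SIZE bytes remain after aligning DEST:
   the aligned DEST (a5) becomes the word-copy cursor (a3).  */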
mv a3, a5
j L(word_copy)
END (__memcpy_noalignment)