summary refs log tree commit diff
path: root/ports/sysdeps/mips/memcpy.S
blob: aeea491e76c8dec6659f9385e23fe86f50bb46a2 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
/* Copyright (C) 2012-2013 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Select the platform headers and the prefetch hints for the environment
   this file is being built in.  The three named environments all choose
   the LOAD_STREAMED hint for loads and the PREPAREFORSTORE hint for
   stores; the generic fallback leaves both hints undefined so that the
   defaults chosen in the USE_PREFETCH section apply.  */
#ifdef ANDROID_CHANGES
#include "machine/asm.h"
#include "machine/regdef.h"
/* Route calls with overlapping buffers to memmove (see the routine entry).  */
#define USE_MEMMOVE_FOR_OVERLAP
#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
#elif _LIBC
#include <sysdep.h>
#include <regdef.h>
#include <sys/asm.h>
#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
#elif _COMPILING_NEWLIB
#include "machine/asm.h"
#include "machine/regdef.h"
#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
#else
/* Unknown environment: system headers only, default prefetch hints.  */
#include <regdef.h>
#include <sys/asm.h>
#endif

/* Enable prefetching when the target ISA is one of MIPS4, MIPS5, MIPS32
   or MIPS64, unless the build defines DISABLE_PREFETCH.  */
#if (_MIPS_ISA == _MIPS_ISA_MIPS4) || (_MIPS_ISA == _MIPS_ISA_MIPS5) || \
    (_MIPS_ISA == _MIPS_ISA_MIPS32) || (_MIPS_ISA == _MIPS_ISA_MIPS64)
#ifndef DISABLE_PREFETCH
#define USE_PREFETCH
#endif
#endif

/* Copy in 64-bit units when _MIPS_SIM reports the _ABI64 or _ABIN32 ABI,
   unless the build defines DISABLE_DOUBLE.  USE_DOUBLE selects the
   load/store mnemonics and the unit size used throughout the routine.  */
#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32))
#ifndef DISABLE_DOUBLE
#define USE_DOUBLE
#endif
#endif



/* Some asm.h files do not have the L macro definition.  */
#ifndef L
#if _MIPS_SIM == _ABIO32
# define L(label) $L ## label
#else
# define L(label) .L ## label
#endif
#endif

/* The PTR_* fallbacks below operate on addresses, so the instruction
   width must follow the ABI and not USE_DOUBLE.  USE_DOUBLE only selects
   the width of the data loads/stores and can be switched off with
   DISABLE_DOUBLE; keying the address arithmetic on it would select the
   32-bit addiu/sra for address registers on the 64-bit-register ABIs
   whenever DISABLE_DOUBLE is defined.  The condition used here is the
   one that enables USE_DOUBLE by default, so builds that do not define
   DISABLE_DOUBLE get exactly the same expansion as before.  */

/* Some asm.h files do not have the PTR_ADDIU macro definition.  */
#ifndef PTR_ADDIU
#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32))
#define PTR_ADDIU	daddiu
#else
#define PTR_ADDIU	addiu
#endif
#endif

/* Some asm.h files do not have the PTR_SRA macro definition.  */
#ifndef PTR_SRA
#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32))
#define PTR_SRA		dsra
#else
#define PTR_SRA		sra
#endif
#endif


/*
 * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load
 * prefetches appears to offer a slight performance advantage.
 *
 * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
 * or PREFETCH_STORE_STREAMED offers a large performance advantage
 * but PREPAREFORSTORE has some special restrictions to consider.
 *
 * Prefetch with the 'prepare for store' hint does not copy a memory
 * location into the cache, it just allocates a cache line and zeros
 * it out.  This means that if you do not write to the entire cache
 * line before writing it out to memory some data will get zero'ed out
 * when the cache line is written back to memory and data will be lost.
 *
 * Also if you are using this memcpy to copy overlapping buffers it may
 * not behave correctly when using the 'prepare for store' hint.  If you
 * use the 'prepare for store' prefetch on a memory area that is in the
 * memcpy source (as well as the memcpy destination), then you will get
 * some data zero'ed out before you have a chance to read it and data will
 * be lost.
 *
 * If you are going to use this memcpy routine with the 'prepare for store'
 * prefetch you may want to set USE_MEMMOVE_FOR_OVERLAP in order to avoid
 * the problem of running memcpy on overlapping buffers.
 *
 * There are ifdef'ed sections of this memcpy to make sure that it does not
 * do prefetches on cache lines that are not going to be completely written.
 * This code is only needed and only used when PREFETCH_STORE_HINT is set to
 * PREFETCH_HINT_PREPAREFORSTORE.  This code assumes that cache lines are
 * 32 bytes and if the cache line is larger it will not work correctly.
 */

/* Prefetch configuration.  With USE_PREFETCH defined this section names
   the hint codes given to the "pref" instruction, picks default hints,
   defines the PREFETCH_FOR_LOAD / PREFETCH_FOR_STORE helper macros and
   computes PREFETCH_LIMIT.  Without USE_PREFETCH the two helper macros
   expand to nothing.  */
#ifdef USE_PREFETCH
# define PREFETCH_HINT_LOAD		0
# define PREFETCH_HINT_STORE		1
# define PREFETCH_HINT_LOAD_STREAMED	4
# define PREFETCH_HINT_STORE_STREAMED	5
# define PREFETCH_HINT_LOAD_RETAINED	6
# define PREFETCH_HINT_STORE_RETAINED	7
# define PREFETCH_HINT_WRITEBACK_INVAL	25
# define PREFETCH_HINT_PREPAREFORSTORE	30

/*
 * If we have not picked out what hints to use at this point use the
 * standard load and store prefetch hints.
 */
#ifndef PREFETCH_STORE_HINT
# define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
#endif
#ifndef PREFETCH_LOAD_HINT
# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD
#endif

/*
 * We double everything when USE_DOUBLE is true so we do 2 prefetches to
 * get 64 bytes in that case.  The assumption is that each individual
 * prefetch brings in 32 bytes.
 *
 * The first macro argument is a chunk index, not a byte offset: the
 * prefetched address is chunk * PREFETCH_CHUNK bytes past reg.
 */

#ifdef USE_DOUBLE
# define PREFETCH_CHUNK 64
# define PREFETCH_FOR_LOAD(chunk, reg) \
 pref PREFETCH_LOAD_HINT, (chunk)*64(reg); \
 pref PREFETCH_LOAD_HINT, ((chunk)*64)+32(reg)
# define PREFETCH_FOR_STORE(chunk, reg) \
 pref PREFETCH_STORE_HINT, (chunk)*64(reg); \
 pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg)
#else
# define PREFETCH_CHUNK 32
# define PREFETCH_FOR_LOAD(chunk, reg) \
 pref PREFETCH_LOAD_HINT, (chunk)*32(reg)
# define PREFETCH_FOR_STORE(chunk, reg) \
 pref PREFETCH_STORE_HINT, (chunk)*32(reg)
#endif
/* MAX_PREFETCH_SIZE is the maximum size of a prefetch, it must not be less
 * than PREFETCH_CHUNK, the assumed size of each prefetch.  If the real size
 * of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE
 * hint is used, the code will not work correctly.  If PREPAREFORSTORE is not
 * used then MAX_PREFETCH_SIZE does not matter.  */
#define MAX_PREFETCH_SIZE 128
/* PREFETCH_LIMIT is set based on the fact that we never use an offset greater
 * than 5 on a STORE prefetch and that a single prefetch can never be larger
 * than MAX_PREFETCH_SIZE.  We add the extra 32 when USE_DOUBLE is set because
 * we actually do two prefetches in that case, one 32 bytes after the other.
 * Note that the expansion is not parenthesized; it is only used as the
 * final operand of a subtract instruction in this file.  */
#ifdef USE_DOUBLE
# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE
#else
# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE
#endif
#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \
    && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE)
/* We cannot handle this because the initial prefetches may fetch bytes that
 * are before the buffer being copied.  We start copies with an offset
 * of 4 to avoid this situation when using PREPAREFORSTORE.  */
#error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small."
#endif
#else /* USE_PREFETCH not defined */
/* No prefetching: both helpers expand to nothing.  */
# define PREFETCH_FOR_LOAD(offset, reg)
# define PREFETCH_FOR_STORE(offset, reg)
#endif

/* Allow the routine to be named something else if desired.  A build can
   define MEMCPY_NAME before this point to change the exported symbol.  */
#ifndef MEMCPY_NAME
#define MEMCPY_NAME memcpy
#endif

/* We use these 32/64 bit registers as temporaries to do the copying.
   REG0..REG3 are always t0..t3.  REG4..REG7 are t4..t7 when _MIPS_SIM is
   _ABIO32 or _ABIO64 and ta0..ta3 otherwise.  */
#define REG0 t0
#define REG1 t1
#define REG2 t2
#define REG3 t3
#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABIO32) || (_MIPS_SIM == _ABIO64))
#  define REG4 t4
#  define REG5 t5
#  define REG6 t6
#  define REG7 t7
#else
#  define REG4 ta0
#  define REG5 ta1
#  define REG6 ta2
#  define REG7 ta3
#endif

/* We load/store 64 bits at a time when USE_DOUBLE is true.
 * The C_ prefix stands for CHUNK and is used to avoid macro name
 * conflicts with system header files.
 *
 * C_LD / C_ST are the full-unit load and store.  C_LDHI / C_LDLO and
 * C_STHI / C_STLO are the partial "left"/"right" forms; which mnemonic
 * is the high part depends on the byte order (__MIPSEB).  The routine
 * pairs C_LDHI at the first byte of a unit with C_LDLO at its last byte
 * to read a unit from an unaligned source address.  */

#ifdef USE_DOUBLE
#  define C_ST	sd
#  define C_LD	ld
#if __MIPSEB
#  define C_LDHI	ldl	/* high part is left in big-endian	*/
#  define C_STHI	sdl	/* high part is left in big-endian	*/
#  define C_LDLO	ldr	/* low part is right in big-endian	*/
#  define C_STLO	sdr	/* low part is right in big-endian	*/
#else
#  define C_LDHI	ldr	/* high part is right in little-endian	*/
#  define C_STHI	sdr	/* high part is right in little-endian	*/
#  define C_LDLO	ldl	/* low part is left in little-endian	*/
#  define C_STLO	sdl	/* low part is left in little-endian	*/
#endif
#else
#  define C_ST	sw
#  define C_LD	lw
#if __MIPSEB
#  define C_LDHI	lwl	/* high part is left in big-endian	*/
#  define C_STHI	swl	/* high part is left in big-endian	*/
#  define C_LDLO	lwr	/* low part is right in big-endian	*/
#  define C_STLO	swr	/* low part is right in big-endian	*/
#else
#  define C_LDHI	lwr	/* high part is right in little-endian	*/
#  define C_STHI	swr	/* high part is right in little-endian	*/
#  define C_LDLO	lwl	/* low part is left in little-endian	*/
#  define C_STLO	swl	/* low part is left in little-endian	*/
#endif
#endif

/* Bookkeeping values for 32 vs. 64 bit mode.
 *   NSIZE       bytes moved by one C_LD / C_ST.
 *   NSIZEMASK   (8 * NSIZE) - 1: byte count modulo one 8-unit chunk.
 *   NSIZEDMASK  (16 * NSIZE) - 1: byte count modulo one 16-unit chunk.
 *   UNIT(n)     byte offset of the first byte of unit n.
 *   UNITM1(n)   UNIT(n) - 1, i.e. the last byte of unit n-1; used as the
 *               offset of the C_LDLO half of an unaligned load.  */
#ifdef USE_DOUBLE
#  define NSIZE 8
#  define NSIZEMASK 0x3f
#  define NSIZEDMASK 0x7f
#else
#  define NSIZE 4
#  define NSIZEMASK 0x1f
#  define NSIZEDMASK 0x3f
#endif
#define UNIT(unit) ((unit)*NSIZE)
#define UNITM1(unit) (((unit)*NSIZE)-1)

/*
 * MEMCPY_NAME (memcpy unless overridden).
 *
 * Registers as used by the code below:
 *   a0  destination pointer (advanced as bytes are stored)
 *   a1  source pointer (advanced as bytes are loaded)
 *   a2  number of bytes still to copy
 *   v0  return value: the original a0, unless RETURN_FIRST_PREFETCH or
 *       RETURN_LAST_PREFETCH is defined, in which case v0 starts at zero
 *       and is later set to a prefetch address
 *
 * The routine is assembled under ".set noreorder": the instruction that
 * follows each branch or jump is its delay slot and is executed whether
 * or not the branch is taken.
 */
#ifdef ANDROID_CHANGES
LEAF(MEMCPY_NAME, 0)
#else
LEAF(MEMCPY_NAME)
#endif
	.set	nomips16
	.set	noreorder
/*
 * Below we handle the case where memcpy is called with overlapping src and dst.
 * Although memcpy is not required to handle this case, some parts of Android
 * like Skia rely on such usage. We call memmove to handle such cases.
 *
 * The sequence computes t0 = |a0 - a1| using a sign mask (t2) and jumps to
 * memmove when that distance is below the byte count a2.
 * NOTE(review): the shift count is the literal 31 even when PTR_SRA is the
 * 64-bit shift, so t2 is only a pure 0/-1 mask when the difference fits in
 * 32 bits; confirm this is acceptable for 64-bit pointer builds.
 */
#ifdef USE_MEMMOVE_FOR_OVERLAP
	PTR_SUBU t0,a0,a1
	PTR_SRA	t2,t0,31
	xor	t1,t0,t2
	PTR_SUBU t0,t1,t2
	sltu	t2,t0,a2
	beq	t2,zero,L(memcpy)
	/* Delay slot of the beq: t9 is loaded on both paths.  */
	la	t9,memmove
	jr	t9
	 nop
L(memcpy):
#endif
/*
 * If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless of
 * size, copy dst pointer to v0 for the return value (the move sits in the
 * branch delay slot).
 */
	slti	t2,a2,(2 * NSIZE)
	bne	t2,zero,L(lastb)
#if defined(RETURN_FIRST_PREFETCH) || defined(RETURN_LAST_PREFETCH)
	move	v0,zero
#else
	move	v0,a0
#endif
/*
 * If src and dst have different alignments, go to L(unaligned), if they
 * have the same alignment (but are not actually aligned) do a partial
 * load/store to make them aligned.  If they are both already aligned
 * we can start copying at L(aligned).
 */
	xor	t8,a1,a0
	andi	t8,t8,(NSIZE-1)		/* t8 is a0/a1 word-displacement */
	bne	t8,zero,L(unaligned)
	/* Delay slot: a3 = -a0, also consumed by L(unaligned).  */
	PTR_SUBU a3, zero, a0

	andi	a3,a3,(NSIZE-1)		/* copy a3 bytes to align a0/a1 */
	beq	a3,zero,L(aligned)	/* if a3=0, it is already aligned */
	PTR_SUBU a2,a2,a3		/* a2 is the remaining bytes count */

	/* One partial load/store moves the a3 leading bytes.  */
	C_LDHI	t8,0(a1)
	PTR_ADDU a1,a1,a3
	C_STHI	t8,0(a0)
	PTR_ADDU a0,a0,a3

/*
 * Now dst/src are both aligned to (word or double word) aligned addresses.
 * Set t8 to the number of bytes left over once all the 64/128 byte chunks
 * are copied (it is moved into a2 after the loop) and a3 to the dst pointer
 * after all the 64/128 byte chunks have been copied.  We will loop,
 * incrementing a0 and a1 until a0 equals a3.
 */

L(aligned):
	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
	PTR_SUBU a3,a2,t8	 /* delay slot: a2 minus the remainder */
	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */

/* When in the loop we may prefetch with the 'prepare to store' hint,
 * in this case the a0+x should not be past the "t0-32" address.  This
 * means: for x=128 the last "safe" a0 address is "t0-160".  Alternatively,
 * for x=64 the last "safe" a0 address is "t0-96" In the current version we
 * will use "prefetch hint,128(a0)", so "t0-160" is the limit.
 *
 * The literal numbers above are illustrative; the instructions below use
 * PREFETCH_LIMIT for the distance between t0 and t9.
 */
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address */
	PTR_SUBU t9,t0,PREFETCH_LIMIT	/* t9 is the "last safe pref" address */
#endif
	/* Prime the load prefetches for the first four source chunks.  */
	PREFETCH_FOR_LOAD  (0, a1)
	PREFETCH_FOR_LOAD  (1, a1)
	PREFETCH_FOR_LOAD  (2, a1)
	PREFETCH_FOR_LOAD  (3, a1)
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
	/* Store prefetches for chunks 1-3 are only issued up front when the
	   store hint is not PREPAREFORSTORE.  */
	PREFETCH_FOR_STORE (1, a0)
	PREFETCH_FOR_STORE (2, a0)
	PREFETCH_FOR_STORE (3, a0)
#endif
	/* Optional builds: return a prefetch address in v0 instead of dst.  */
#if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
#if PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE
	sltu    v1,t9,a0
	bgtz    v1,L(skip_set)
	nop
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
L(skip_set):
#else
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
#endif
#endif
#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) \
    && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*3)
#ifdef USE_DOUBLE
	PTR_ADDIU v0,v0,32
#endif
#endif
/*
 * Main aligned loop: each iteration copies 16 units (64 or 128 bytes) as
 * two groups of 8 loads followed by 8 stores, and runs until a0 reaches a3.
 */
L(loop16w):
	C_LD	t0,UNIT(0)(a1)
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	sltu	v1,t9,a0		/* If a0 > t9 don't use next prefetch */
	bgtz	v1,L(skip_pref)
#endif
	/* In the PREPAREFORSTORE build this load is the bgtz delay slot, so it
	   is performed whether or not the store prefetches are skipped.  */
	C_LD	t1,UNIT(1)(a1)
	PREFETCH_FOR_STORE (4, a0)
	PREFETCH_FOR_STORE (5, a0)
#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH)
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*5)
#ifdef USE_DOUBLE
	PTR_ADDIU v0,v0,32
#endif
#endif
L(skip_pref):
	C_LD	REG2,UNIT(2)(a1)
	C_LD	REG3,UNIT(3)(a1)
	C_LD	REG4,UNIT(4)(a1)
	C_LD	REG5,UNIT(5)(a1)
	C_LD	REG6,UNIT(6)(a1)
	C_LD	REG7,UNIT(7)(a1)
        PREFETCH_FOR_LOAD (4, a1)

	/* Store the first 8 units.  */
	C_ST	t0,UNIT(0)(a0)
	C_ST	t1,UNIT(1)(a0)
	C_ST	REG2,UNIT(2)(a0)
	C_ST	REG3,UNIT(3)(a0)
	C_ST	REG4,UNIT(4)(a0)
	C_ST	REG5,UNIT(5)(a0)
	C_ST	REG6,UNIT(6)(a0)
	C_ST	REG7,UNIT(7)(a0)

	/* Second group: units 8-15.  */
	C_LD	t0,UNIT(8)(a1)
	C_LD	t1,UNIT(9)(a1)
	C_LD	REG2,UNIT(10)(a1)
	C_LD	REG3,UNIT(11)(a1)
	C_LD	REG4,UNIT(12)(a1)
	C_LD	REG5,UNIT(13)(a1)
	C_LD	REG6,UNIT(14)(a1)
	C_LD	REG7,UNIT(15)(a1)
        PREFETCH_FOR_LOAD (5, a1)
	C_ST	t0,UNIT(8)(a0)
	C_ST	t1,UNIT(9)(a0)
	C_ST	REG2,UNIT(10)(a0)
	C_ST	REG3,UNIT(11)(a0)
	C_ST	REG4,UNIT(12)(a0)
	C_ST	REG5,UNIT(13)(a0)
	C_ST	REG6,UNIT(14)(a0)
	C_ST	REG7,UNIT(15)(a0)
	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
	bne	a0,a3,L(loop16w)
	PTR_ADDIU a1,a1,UNIT(16)	/* delay slot: adding 64/128 to src */
	move	a2,t8			/* a2 = bytes left after the chunks */

/* Here we have src and dest word-aligned but less than 64-bytes or
 * 128 bytes to go.  Check for a 32(64) byte chunk and copy it if there
 * is one.  Otherwise jump down to L(chk1w) to handle the tail end of
 * the copy.
 */

L(chkw):
	PREFETCH_FOR_LOAD (0, a1)
	andi	t8,a2,NSIZEMASK	/* Is there a 32-byte/64-byte chunk.  */
				/* The t8 is the remainder count past 32-bytes */
	beq	a2,t8,L(chk1w)	/* When a2=t8, no 32-byte chunk  */
	nop
	/* Copy one 8-unit chunk: 8 loads, then 8 stores.  */
	C_LD	t0,UNIT(0)(a1)
	C_LD	t1,UNIT(1)(a1)
	C_LD	REG2,UNIT(2)(a1)
	C_LD	REG3,UNIT(3)(a1)
	C_LD	REG4,UNIT(4)(a1)
	C_LD	REG5,UNIT(5)(a1)
	C_LD	REG6,UNIT(6)(a1)
	C_LD	REG7,UNIT(7)(a1)
	PTR_ADDIU a1,a1,UNIT(8)
	C_ST	t0,UNIT(0)(a0)
	C_ST	t1,UNIT(1)(a0)
	C_ST	REG2,UNIT(2)(a0)
	C_ST	REG3,UNIT(3)(a0)
	C_ST	REG4,UNIT(4)(a0)
	C_ST	REG5,UNIT(5)(a0)
	C_ST	REG6,UNIT(6)(a0)
	C_ST	REG7,UNIT(7)(a0)
	PTR_ADDIU a0,a0,UNIT(8)

/*
 * Here we have less than 32(64) bytes to copy.  Set up for a loop to
 * copy one word (or double word) at a time.  Set a2 to count how many
 * bytes we have to copy after all the word (or double word) chunks are
 * copied and a3 to the dst pointer after all the (d)word chunks have
 * been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
 * On entry t8 holds the number of bytes still to copy.
 */
L(chk1w):
	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past one (d)word chunks */
	beq	a2,t8,L(lastb)	/* no whole (d)word left */
	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */

/* copying in words (4-byte or 8-byte chunks) */
L(wordCopy_loop):
	C_LD	REG3,UNIT(0)(a1)
	PTR_ADDIU a0,a0,UNIT(1)
	PTR_ADDIU a1,a1,UNIT(1)
	bne	a0,a3,L(wordCopy_loop)
	/* Delay slot: a0 was already advanced, hence the UNIT(-1) offset.  */
	C_ST	REG3,UNIT(-1)(a0)

/* Copy the remaining a2 bytes one byte at a time.  This is reached with
 * fewer than 2*NSIZE bytes directly from the entry check, or with fewer
 * than NSIZE bytes from L(chk1w).  A count of zero or less goes straight
 * to L(leave).  */
L(lastb):
	blez	a2,L(leave)
	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
L(lastbloop):
	lb	v1,0(a1)
	PTR_ADDIU a0,a0,1
	PTR_ADDIU a1,a1,1
	bne	a0,a3,L(lastbloop)
	sb	v1,-1(a0)	/* delay slot: a0 was already advanced */
/* Common return point; v0 was set near the routine entry.  */
L(leave):
	j	ra
	nop
/*
 * UNALIGNED case, got here with a3 = "negu a0"
 * This code is nearly identical to the aligned code above
 * but only the destination (not the source) gets aligned
 * so we need to do partial loads of the source followed
 * by normal stores to the destination (once we have aligned
 * the destination).
 */

L(unaligned):
	andi	a3,a3,(NSIZE-1)	/* copy a3 bytes to align a0/a1 */
	beqz	a3,L(ua_chk16w) /* if a3=0, it is already aligned */
	PTR_SUBU a2,a2,a3	/* a2 is the remaining bytes count */

	/* Load one unit from the unaligned source with the HI/LO pair, then
	   use a single partial store for the a3 leading destination bytes.  */
	C_LDHI	v1,UNIT(0)(a1)
	C_LDLO	v1,UNITM1(1)(a1)
	PTR_ADDU a1,a1,a3
	C_STHI	v1,UNIT(0)(a0)
	PTR_ADDU a0,a0,a3

/*
 * Now the destination (but not the source) is aligned.
 * Set t8 to the number of bytes left over once all the 64/128 byte chunks
 * are copied (it is moved into a2 after the loop) and a3 to the dst pointer
 * after all the 64/128 byte chunks have been copied.  We will loop,
 * incrementing a0 and a1 until a0 equals a3.
 */

L(ua_chk16w):
	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
	beq	a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
	PTR_SUBU a3,a2,t8	 /* delay slot: a2 minus the remainder */
	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */

#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	PTR_ADDU t0,a0,a2	  /* t0 is the "past the end" address */
	PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */
#endif
	/* Prime load prefetches for chunks 0-2; chunk 3 is prefetched at the
	   top of L(ua_loop16w).  */
	PREFETCH_FOR_LOAD  (0, a1)
	PREFETCH_FOR_LOAD  (1, a1)
	PREFETCH_FOR_LOAD  (2, a1)
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
	PREFETCH_FOR_STORE (1, a0)
	PREFETCH_FOR_STORE (2, a0)
	PREFETCH_FOR_STORE (3, a0)
#endif
	/* Optional build: return a prefetch address in v0 instead of dst.  */
#if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	sltu    v1,t9,a0
	bgtz    v1,L(ua_skip_set)
	nop
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
L(ua_skip_set):
#else
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
#endif
#endif
/*
 * Main loop for an unaligned source: each iteration copies 16 units (64 or
 * 128 bytes).  Every unit is read with a C_LDHI at its first byte plus a
 * C_LDLO at its last byte into the same register, then written with a
 * normal C_ST to the aligned destination.  Runs until a0 reaches a3.
 */
L(ua_loop16w):
	PREFETCH_FOR_LOAD  (3, a1)
	C_LDHI	t0,UNIT(0)(a1)
	C_LDHI	t1,UNIT(1)(a1)
	C_LDHI	REG2,UNIT(2)(a1)
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	sltu	v1,t9,a0
	bgtz	v1,L(ua_skip_pref)
#endif
	/* In the PREPAREFORSTORE build this load is the bgtz delay slot, so it
	   is performed whether or not the store prefetches are skipped.  */
	C_LDHI	REG3,UNIT(3)(a1)
	PREFETCH_FOR_STORE (4, a0)
	PREFETCH_FOR_STORE (5, a0)
L(ua_skip_pref):
	C_LDHI	REG4,UNIT(4)(a1)
	C_LDHI	REG5,UNIT(5)(a1)
	C_LDHI	REG6,UNIT(6)(a1)
	C_LDHI	REG7,UNIT(7)(a1)
	/* Second halves of units 0-7.  */
	C_LDLO	t0,UNITM1(1)(a1)
	C_LDLO	t1,UNITM1(2)(a1)
	C_LDLO	REG2,UNITM1(3)(a1)
	C_LDLO	REG3,UNITM1(4)(a1)
	C_LDLO	REG4,UNITM1(5)(a1)
	C_LDLO	REG5,UNITM1(6)(a1)
	C_LDLO	REG6,UNITM1(7)(a1)
	C_LDLO	REG7,UNITM1(8)(a1)
        PREFETCH_FOR_LOAD (4, a1)
	C_ST	t0,UNIT(0)(a0)
	C_ST	t1,UNIT(1)(a0)
	C_ST	REG2,UNIT(2)(a0)
	C_ST	REG3,UNIT(3)(a0)
	C_ST	REG4,UNIT(4)(a0)
	C_ST	REG5,UNIT(5)(a0)
	C_ST	REG6,UNIT(6)(a0)
	C_ST	REG7,UNIT(7)(a0)
	/* Units 8-15: first halves, second halves, then stores.  */
	C_LDHI	t0,UNIT(8)(a1)
	C_LDHI	t1,UNIT(9)(a1)
	C_LDHI	REG2,UNIT(10)(a1)
	C_LDHI	REG3,UNIT(11)(a1)
	C_LDHI	REG4,UNIT(12)(a1)
	C_LDHI	REG5,UNIT(13)(a1)
	C_LDHI	REG6,UNIT(14)(a1)
	C_LDHI	REG7,UNIT(15)(a1)
	C_LDLO	t0,UNITM1(9)(a1)
	C_LDLO	t1,UNITM1(10)(a1)
	C_LDLO	REG2,UNITM1(11)(a1)
	C_LDLO	REG3,UNITM1(12)(a1)
	C_LDLO	REG4,UNITM1(13)(a1)
	C_LDLO	REG5,UNITM1(14)(a1)
	C_LDLO	REG6,UNITM1(15)(a1)
	C_LDLO	REG7,UNITM1(16)(a1)
        PREFETCH_FOR_LOAD (5, a1)
	C_ST	t0,UNIT(8)(a0)
	C_ST	t1,UNIT(9)(a0)
	C_ST	REG2,UNIT(10)(a0)
	C_ST	REG3,UNIT(11)(a0)
	C_ST	REG4,UNIT(12)(a0)
	C_ST	REG5,UNIT(13)(a0)
	C_ST	REG6,UNIT(14)(a0)
	C_ST	REG7,UNIT(15)(a0)
	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
	bne	a0,a3,L(ua_loop16w)
	PTR_ADDIU a1,a1,UNIT(16)	/* delay slot: adding 64/128 to src */
	move	a2,t8			/* a2 = bytes left after the chunks */

/* Here the destination is aligned (the source is not) and there are less
 * than 64 bytes or 128 bytes to go.  Check for a 32(64) byte chunk and copy
 * it if there is one.  Otherwise jump down to L(ua_chk1w) to handle the
 * tail end of the copy.  */

L(ua_chkw):
	PREFETCH_FOR_LOAD (0, a1)
	andi	t8,a2,NSIZEMASK	  /* Is there a 32-byte/64-byte chunk.  */
				  /* t8 is the remainder count past 32-bytes */
	beq	a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
	nop
	/* Copy one 8-unit chunk: HI halves, LO halves, then aligned stores.  */
	C_LDHI	t0,UNIT(0)(a1)
	C_LDHI	t1,UNIT(1)(a1)
	C_LDHI	REG2,UNIT(2)(a1)
	C_LDHI	REG3,UNIT(3)(a1)
	C_LDHI	REG4,UNIT(4)(a1)
	C_LDHI	REG5,UNIT(5)(a1)
	C_LDHI	REG6,UNIT(6)(a1)
	C_LDHI	REG7,UNIT(7)(a1)
	C_LDLO	t0,UNITM1(1)(a1)
	C_LDLO	t1,UNITM1(2)(a1)
	C_LDLO	REG2,UNITM1(3)(a1)
	C_LDLO	REG3,UNITM1(4)(a1)
	C_LDLO	REG4,UNITM1(5)(a1)
	C_LDLO	REG5,UNITM1(6)(a1)
	C_LDLO	REG6,UNITM1(7)(a1)
	C_LDLO	REG7,UNITM1(8)(a1)
	PTR_ADDIU a1,a1,UNIT(8)
	C_ST	t0,UNIT(0)(a0)
	C_ST	t1,UNIT(1)(a0)
	C_ST	REG2,UNIT(2)(a0)
	C_ST	REG3,UNIT(3)(a0)
	C_ST	REG4,UNIT(4)(a0)
	C_ST	REG5,UNIT(5)(a0)
	C_ST	REG6,UNIT(6)(a0)
	C_ST	REG7,UNIT(7)(a0)
	PTR_ADDIU a0,a0,UNIT(8)
/*
 * Here we have less than 32(64) bytes to copy.  Set up for a loop to
 * copy one word (or double word) at a time.  On entry t8 holds the number
 * of bytes still to copy; a2 becomes the byte remainder and a3 the dst
 * address at which the (d)word loop stops.
 */
L(ua_chk1w):
	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past one (d)word chunks */
	beq	a2,t8,L(ua_smallCopy)	/* no whole (d)word left */
	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */

/* copying in words (4-byte or 8-byte chunks) */
L(ua_wordCopy_loop):
	C_LDHI	v1,UNIT(0)(a1)
	C_LDLO	v1,UNITM1(1)(a1)
	PTR_ADDIU a0,a0,UNIT(1)
	PTR_ADDIU a1,a1,UNIT(1)
	bne	a0,a3,L(ua_wordCopy_loop)
	/* Delay slot: a0 was already advanced, hence the UNIT(-1) offset.  */
	C_ST	v1,UNIT(-1)(a0)

/* Copy the remaining a2 bytes (a2 was masked with NSIZE-1 in L(ua_chk1w))
 * one byte at a time, then return.  A zero count goes to L(leave).  */
L(ua_smallCopy):
	beqz	a2,L(leave)
	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
L(ua_smallCopy_loop):
	lb	v1,0(a1)
	PTR_ADDIU a0,a0,1
	PTR_ADDIU a1,a1,1
	bne	a0,a3,L(ua_smallCopy_loop)
	sb	v1,-1(a0)	/* delay slot: a0 was already advanced */

	j	ra
	nop

	/* Restore the assembler modes and close the function.  */
	.set	at
	.set	reorder
END(MEMCPY_NAME)
/* Hidden builtin definition is emitted only for the _LIBC build without
   ANDROID_CHANGES.  */
#ifndef ANDROID_CHANGES
#ifdef _LIBC
libc_hidden_builtin_def (MEMCPY_NAME)
#endif
#endif