summary refs log tree commit diff
path: root/ports/sysdeps/powerpc/powerpc32/405/memcpy.S
diff options
context:
space:
mode:
Diffstat (limited to 'ports/sysdeps/powerpc/powerpc32/405/memcpy.S')
-rw-r--r--ports/sysdeps/powerpc/powerpc32/405/memcpy.S132
1 files changed, 132 insertions, 0 deletions
diff --git a/ports/sysdeps/powerpc/powerpc32/405/memcpy.S b/ports/sysdeps/powerpc/powerpc32/405/memcpy.S
new file mode 100644
index 0000000000..61025cf818
--- /dev/null
+++ b/ports/sysdeps/powerpc/powerpc32/405/memcpy.S
@@ -0,0 +1,132 @@
+/* Optimized memcpy implementation for PowerPC476.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+/* memcpy
+
+       r0:return address
+       r3:destination address
+       r4:source address
+       r5:byte count
+
+       Save return address in r0.
+       If destinationn and source are unaligned and copy count is greater than 256
+       then copy 0-3 bytes to make destination aligned.
+       If 32 or more bytes to copy we use 32 byte copy loop.
+       Finaly we copy 0-31 extra bytes. */
+
+EALIGN (BP_SYM (memcpy), 5, 0)
+/* Check if bytes to copy are greater than 256 and if
+       source and destination are unaligned */
+       cmpwi   r5,0x0100
+       addi    r0,r3,0
+       ble     L(string_count_loop)
+       neg     r6,r3
+       clrlwi. r6,r6,30
+       beq     L(string_count_loop)
+       neg     r6,r4
+       clrlwi. r6,r6,30
+       beq     L(string_count_loop)
+       mtctr   r6
+       subf    r5,r6,r5
+
+L(unaligned_bytecopy_loop): /* Align destination by coping 0-3 bytes */
+       lbz     r8,0x0(r4)
+       addi    r4,r4,1
+       stb     r8,0x0(r3)
+       addi    r3,r3,1
+       bdnz    L(unaligned_bytecopy_loop)
+       srwi.   r7,r5,5
+       beq     L(preword2_count_loop)
+       mtctr   r7
+
+L(word8_count_loop_no_dcbt): /* Copy 32 bytes at a time */
+       lwz     r6,0(r4)
+       lwz     r7,4(r4)
+       lwz     r8,8(r4)
+       lwz     r9,12(r4)
+       subi    r5,r5,0x20
+       stw     r6,0(r3)
+       stw     r7,4(r3)
+       stw     r8,8(r3)
+       stw     r9,12(r3)
+       lwz     r6,16(r4)
+       lwz     r7,20(r4)
+       lwz     r8,24(r4)
+       lwz     r9,28(r4)
+       addi    r4,r4,0x20
+       stw     r6,16(r3)
+       stw     r7,20(r3)
+       stw     r8,24(r3)
+       stw     r9,28(r3)
+       addi    r3,r3,0x20
+       bdnz    L(word8_count_loop_no_dcbt)
+
+L(preword2_count_loop): /* Copy remaining 0-31 bytes */
+       clrlwi. r12,r5,27
+       beq     L(end_memcpy)
+       mtxer   r12
+       lswx    r5,0,r4
+       stswx   r5,0,r3
+       mr       r3,r0
+       blr
+
+L(string_count_loop): /* Copy odd 0-31 bytes */
+       clrlwi. r12,r5,28
+       add     r3,r3,r5
+       add     r4,r4,r5
+       beq     L(pre_string_copy)
+       mtxer   r12
+       subf    r4,r12,r4
+       subf    r3,r12,r3
+       lswx    r6,0,r4
+       stswx   r6,0,r3
+
+L(pre_string_copy): /* Check how many 32 byte chunck to copy */
+       srwi.   r7,r5,4
+       beq     L(end_memcpy)
+       mtctr   r7
+
+L(word4_count_loop_no_dcbt): /* Copy 32 bytes at a time */
+       lwz     r6,-4(r4)
+       lwz     r7,-8(r4)
+       lwz     r8,-12(r4)
+       lwzu    r9,-16(r4)
+       stw     r6,-4(r3)
+       stw     r7,-8(r3)
+       stw     r8,-12(r3)
+       stwu    r9,-16(r3)
+       bdz     L(end_memcpy)
+       lwz     r6,-4(r4)
+       lwz     r7,-8(r4)
+       lwz     r8,-12(r4)
+       lwzu    r9,-16(r4)
+       stw     r6,-4(r3)
+       stw     r7,-8(r3)
+       stw     r8,-12(r3)
+       stwu    r9,-16(r3)
+       bdnz    L(word4_count_loop_no_dcbt)
+
+L(end_memcpy):
+       mr       r3,r0
+       blr
+END (BP_SYM (memcpy))
+libc_hidden_builtin_def (memcpy)