Diffstat (limited to 'REORG.TODO/sysdeps/arm/armv7')
-rw-r--r--  REORG.TODO/sysdeps/arm/armv7/Implies                      |   2
-rw-r--r--  REORG.TODO/sysdeps/arm/armv7/multiarch/Makefile           |   3
-rw-r--r--  REORG.TODO/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c     |   2
-rw-r--r--  REORG.TODO/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c  |  56
-rw-r--r--  REORG.TODO/sysdeps/arm/armv7/multiarch/memcpy.S           |  76
-rw-r--r--  REORG.TODO/sysdeps/arm/armv7/multiarch/memcpy_impl.S      | 728
-rw-r--r--  REORG.TODO/sysdeps/arm/armv7/multiarch/memcpy_neon.S      |   9
-rw-r--r--  REORG.TODO/sysdeps/arm/armv7/multiarch/memcpy_vfp.S       |   7
-rw-r--r--  REORG.TODO/sysdeps/arm/armv7/strcmp.S                     | 519
9 files changed, 1402 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/arm/armv7/Implies b/REORG.TODO/sysdeps/arm/armv7/Implies
new file mode 100644
index 0000000000..c6cd0eb877
--- /dev/null
+++ b/REORG.TODO/sysdeps/arm/armv7/Implies
@@ -0,0 +1,2 @@
+# We can do everything that 6T2 can
+arm/armv6t2
diff --git a/REORG.TODO/sysdeps/arm/armv7/multiarch/Makefile b/REORG.TODO/sysdeps/arm/armv7/multiarch/Makefile
new file mode 100644
index 0000000000..e834cc937f
--- /dev/null
+++ b/REORG.TODO/sysdeps/arm/armv7/multiarch/Makefile
@@ -0,0 +1,3 @@
+ifeq ($(subdir),string)
+sysdep_routines += memcpy_neon memcpy_vfp
+endif
diff --git a/REORG.TODO/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c b/REORG.TODO/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c
new file mode 100644
index 0000000000..c6a2a98a55
--- /dev/null
+++ b/REORG.TODO/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c
@@ -0,0 +1,2 @@
+/* Empty file to override sysdeps/arm version.  See memcpy.S for definitions
+   of these functions.  */
diff --git a/REORG.TODO/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c b/REORG.TODO/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
new file mode 100644
index 0000000000..b8094fd393
--- /dev/null
+++ b/REORG.TODO/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
@@ -0,0 +1,56 @@
+/* Enumerate available IFUNC implementations of a function.  ARM version.
+   Copyright (C) 2013-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdbool.h>
+#include <string.h>
+#include <ldsodefs.h>
+#include <sysdep.h>
+#include <ifunc-impl-list.h>
+
+/* Fill ARRAY of MAX elements with IFUNC implementations for function
+   NAME and return the number of valid entries.  */
+
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+                        size_t max)
+{
+  size_t i = 0;
+
+  bool use_neon = true;
+#ifdef __ARM_NEON__
+# define __memcpy_neon memcpy
+#else
+  use_neon = (GLRO(dl_hwcap) & HWCAP_ARM_NEON) != 0;
+#endif
+
+#ifndef __ARM_NEON__
+  bool use_vfp = true;
+# ifdef __SOFTFP__
+  use_vfp = (GLRO(dl_hwcap) & HWCAP_ARM_VFP) != 0;
+# endif
+#endif
+
+  IFUNC_IMPL (i, name, memcpy,
+              IFUNC_IMPL_ADD (array, i, memcpy, use_neon, __memcpy_neon)
+#ifndef __ARM_NEON__
+              IFUNC_IMPL_ADD (array, i, memcpy, use_vfp, __memcpy_vfp)
+#endif
+              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm));
+
+  return i;
+}
diff --git a/REORG.TODO/sysdeps/arm/armv7/multiarch/memcpy.S b/REORG.TODO/sysdeps/arm/armv7/multiarch/memcpy.S
new file mode 100644
index 0000000000..8a53bdaf91
--- /dev/null
+++ b/REORG.TODO/sysdeps/arm/armv7/multiarch/memcpy.S
@@ -0,0 +1,76 @@
+/* Multiple versions of memcpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2013-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Thumb requires excess IT instructions here. */ +#define NO_THUMB +#include <sysdep.h> +#include <rtld-global-offsets.h> + +#if IS_IN (libc) +/* Under __ARM_NEON__, memcpy_neon.S defines the name memcpy. */ +# ifndef __ARM_NEON__ + .text +ENTRY(memcpy) + .type memcpy, %gnu_indirect_function +# ifdef __SOFTFP__ + ldr r1, .Lmemcpy_arm + tst r0, #HWCAP_ARM_VFP + ldrne r1, .Lmemcpy_vfp +# else + ldr r1, .Lmemcpy_vfp +# endif + tst r0, #HWCAP_ARM_NEON + ldrne r1, .Lmemcpy_neon +1: + add r0, r1, pc + DO_RET(lr) + +# ifdef __SOFTFP__ +.Lmemcpy_arm: + .long C_SYMBOL_NAME(__memcpy_arm) - 1b - PC_OFS +# endif +.Lmemcpy_neon: + .long C_SYMBOL_NAME(__memcpy_neon) - 1b - PC_OFS +.Lmemcpy_vfp: + .long C_SYMBOL_NAME(__memcpy_vfp) - 1b - PC_OFS + +END(memcpy) + +libc_hidden_builtin_def (memcpy) +#endif /* Not __ARM_NEON__. */ + +/* These versions of memcpy are defined not to clobber any VFP or NEON + registers so they must always call the ARM variant of the memcpy code. */ +strong_alias (__memcpy_arm, __aeabi_memcpy) +strong_alias (__memcpy_arm, __aeabi_memcpy4) +strong_alias (__memcpy_arm, __aeabi_memcpy8) +libc_hidden_def (__memcpy_arm) + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) +#undef weak_alias +#define weak_alias(x, y) +#undef libc_hidden_def +#define libc_hidden_def(name) + +#define memcpy __memcpy_arm + +#endif + +#include "memcpy_impl.S" diff --git a/REORG.TODO/sysdeps/arm/armv7/multiarch/memcpy_impl.S b/REORG.TODO/sysdeps/arm/armv7/multiarch/memcpy_impl.S new file mode 100644 index 0000000000..c1b9fb0ab5 --- /dev/null +++ b/REORG.TODO/sysdeps/arm/armv7/multiarch/memcpy_impl.S @@ -0,0 +1,728 @@ +/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15. + Copyright (C) 2013-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. + + This memcpy routine is optimised for Cortex-A15 cores and takes advantage + of VFP or NEON when built with the appropriate flags. + + Assumptions: + + ARMv6 (ARMv7-a if using Neon) + ARM state + Unaligned accesses + + */ + +/* Thumb cannot encode negative immediate offsets in memory operations. 
*/ +#ifndef NO_THUMB +#define NO_THUMB +#endif +#include <sysdep.h> +#include <arm-features.h> + + .syntax unified + /* This implementation requires ARM state. */ + .arm + +#ifdef MEMCPY_NEON + + .fpu neon + .arch armv7-a +# define FRAME_SIZE 4 +# define USE_VFP +# define USE_NEON + +#elif defined (MEMCPY_VFP) + + .arch armv6 + .fpu vfpv2 +# define FRAME_SIZE 32 +# define USE_VFP + +#else + .arch armv6 +# define FRAME_SIZE 32 + +#endif + +#define ALIGN(addr, align) addr:align + +#define INSN_SIZE 4 + +/* Call parameters. */ +#define dstin r0 +#define src r1 +#define count r2 + +/* Locals. */ +#define tmp1 r3 +#define dst ip +#define tmp2 r8 + +/* These two macros both work by repeated invocation of the macro + dispatch_step (not defined here). That macro performs one "step", + doing one load instruction and one store instruction to copy one + "unit". On entry, TMP1 contains the number of bytes to be copied, + a multiple of the unit size. The macro clobbers TMP1 in the + process of doing a computed jump to the tail containing the + appropriate number of steps. + + In dispatch_7_dword, dispatch_step is invoked seven times, with an + argument that is 7 for the first and 1 for the last. Units are + double-words (8 bytes). TMP1 is at most 56. + + In dispatch_15_word, dispatch_step is invoked fifteen times, + with an argument that is 15 for the first and 1 for the last. + Units are words (4 bytes). TMP1 is at most 60. */ + +#ifndef ARM_ALWAYS_BX +# if ARM_BX_ALIGN_LOG2 != 2 +# error case not handled +# endif + .macro dispatch_7_dword + rsb tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE) + add pc, pc, tmp1 + dispatch_step 7 + dispatch_step 6 + dispatch_step 5 + dispatch_step 4 + dispatch_step 3 + dispatch_step 2 + dispatch_step 1 + .purgem dispatch_step + .endm + + .macro dispatch_15_word + rsb tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2) + add pc, pc, tmp1, lsl #1 + dispatch_step 15 + dispatch_step 14 + dispatch_step 13 + dispatch_step 12 + dispatch_step 11 + dispatch_step 10 + dispatch_step 9 + dispatch_step 8 + dispatch_step 7 + dispatch_step 6 + dispatch_step 5 + dispatch_step 4 + dispatch_step 3 + dispatch_step 2 + dispatch_step 1 + .purgem dispatch_step + .endm +#else +# if ARM_BX_ALIGN_LOG2 < 3 +# error case not handled +# endif + .macro dispatch_helper steps, log2_bytes_per_step + /* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is + (STEPS << LOG2_BYTES_PER_STEP). + So this is (steps_to_skip << LOG2_BYTES_PER_STEP). + Then it needs further adjustment to compensate for the + distance between the PC value taken below (0f + PC_OFS) + and the first step's instructions (1f). */ + rsb tmp1, tmp1, #((\steps << \log2_bytes_per_step) \ + + ((1f - PC_OFS - 0f) \ + >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step))) + /* Shifting down LOG2_BYTES_PER_STEP gives us the number of + steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us + the (byte) distance to add to the PC. 
*/ +0: add tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step) + bx tmp1 + .p2align ARM_BX_ALIGN_LOG2 +1: + .endm + + .macro dispatch_7_dword + dispatch_helper 7, 3 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 7 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 6 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 5 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 4 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 3 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 2 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 1 + .p2align ARM_BX_ALIGN_LOG2 + .purgem dispatch_step + .endm + + .macro dispatch_15_word + dispatch_helper 15, 2 + dispatch_step 15 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 14 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 13 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 12 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 11 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 10 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 9 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 8 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 7 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 6 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 5 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 4 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 3 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 2 + .p2align ARM_BX_ALIGN_LOG2 + dispatch_step 1 + .p2align ARM_BX_ALIGN_LOG2 + .purgem dispatch_step + .endm + +#endif + +#ifndef USE_NEON +/* For bulk copies using GP registers. */ +#define A_l r2 /* Call-clobbered. */ +#define A_h r3 /* Call-clobbered. */ +#define B_l r4 +#define B_h r5 +#define C_l r6 +#define C_h r7 +/* Don't use the pair r8,r9 because in some EABI variants r9 is reserved. */ +#define D_l r10 +#define D_h r11 +#endif + +/* Number of lines ahead to pre-fetch data. If you change this the code + below will need adjustment to compensate. */ + +#define prefetch_lines 5 + +#ifdef USE_VFP + .macro cpy_line_vfp vreg, base + vstr \vreg, [dst, #\base] + vldr \vreg, [src, #\base] + vstr d0, [dst, #\base + 8] + vldr d0, [src, #\base + 8] + vstr d1, [dst, #\base + 16] + vldr d1, [src, #\base + 16] + vstr d2, [dst, #\base + 24] + vldr d2, [src, #\base + 24] + vstr \vreg, [dst, #\base + 32] + vldr \vreg, [src, #\base + prefetch_lines * 64 - 32] + vstr d0, [dst, #\base + 40] + vldr d0, [src, #\base + 40] + vstr d1, [dst, #\base + 48] + vldr d1, [src, #\base + 48] + vstr d2, [dst, #\base + 56] + vldr d2, [src, #\base + 56] + .endm + + .macro cpy_tail_vfp vreg, base + vstr \vreg, [dst, #\base] + vldr \vreg, [src, #\base] + vstr d0, [dst, #\base + 8] + vldr d0, [src, #\base + 8] + vstr d1, [dst, #\base + 16] + vldr d1, [src, #\base + 16] + vstr d2, [dst, #\base + 24] + vldr d2, [src, #\base + 24] + vstr \vreg, [dst, #\base + 32] + vstr d0, [dst, #\base + 40] + vldr d0, [src, #\base + 40] + vstr d1, [dst, #\base + 48] + vldr d1, [src, #\base + 48] + vstr d2, [dst, #\base + 56] + vldr d2, [src, #\base + 56] + .endm +#endif + + .p2align 6 +ENTRY(memcpy) + + mov dst, dstin /* Preserve dstin, we need to return it. */ + cmp count, #64 + bge .Lcpy_not_short + /* Deal with small copies quickly by dropping straight into the + exit block. */ + +.Ltail63unaligned: +#ifdef USE_NEON + /* These need an extra layer of macro just to work around a + bug in the assembler's parser when an operand starts with + a {...}. http://sourceware.org/bugzilla/show_bug.cgi?id=15647 + tracks that bug; it was not fixed as of binutils-2.23.2. */ + .macro neon_load_d0 reg + vld1.8 {d0}, [\reg]! + .endm + .macro neon_store_d0 reg + vst1.8 {d0}, [\reg]! 
+ .endm + + and tmp1, count, #0x38 + .macro dispatch_step i + neon_load_d0 src + neon_store_d0 dst + .endm + dispatch_7_dword + + tst count, #4 + ldrne tmp1, [src], #4 + strne tmp1, [dst], #4 +#else + /* Copy up to 15 full words of data. May not be aligned. */ + /* Cannot use VFP for unaligned data. */ + and tmp1, count, #0x3c + add dst, dst, tmp1 + add src, src, tmp1 + /* Jump directly into the sequence below at the correct offset. */ + .macro dispatch_step i + ldr tmp1, [src, #-(\i * 4)] + str tmp1, [dst, #-(\i * 4)] + .endm + dispatch_15_word +#endif + + lsls count, count, #31 + ldrhcs tmp1, [src], #2 + ldrbne src, [src] /* Src is dead, use as a scratch. */ + strhcs tmp1, [dst], #2 + strbne src, [dst] + bx lr + +.Lcpy_not_short: + /* At least 64 bytes to copy, but don't know the alignment yet. */ + str tmp2, [sp, #-FRAME_SIZE]! + cfi_adjust_cfa_offset (FRAME_SIZE) + cfi_rel_offset (tmp2, 0) + cfi_remember_state + and tmp2, src, #7 + and tmp1, dst, #7 + cmp tmp1, tmp2 + bne .Lcpy_notaligned + +#ifdef USE_VFP + /* Magic dust alert! Force VFP on Cortex-A9. Experiments show + that the FP pipeline is much better at streaming loads and + stores. This is outside the critical loop. */ + vmov.f32 s0, s0 +#endif + + /* SRC and DST have the same mutual 64-bit alignment, but we may + still need to pre-copy some bytes to get to natural alignment. + We bring SRC and DST into full 64-bit alignment. */ + lsls tmp2, dst, #29 + beq 1f + rsbs tmp2, tmp2, #0 + sub count, count, tmp2, lsr #29 + ldrmi tmp1, [src], #4 + strmi tmp1, [dst], #4 + lsls tmp2, tmp2, #2 + ldrhcs tmp1, [src], #2 + ldrbne tmp2, [src], #1 + strhcs tmp1, [dst], #2 + strbne tmp2, [dst], #1 + +1: + subs tmp2, count, #64 /* Use tmp2 for count. */ + blt .Ltail63aligned + + cmp tmp2, #512 + bge .Lcpy_body_long + +.Lcpy_body_medium: /* Count in tmp2. */ +#ifdef USE_VFP +1: + vldr d0, [src, #0] + subs tmp2, tmp2, #64 + vldr d1, [src, #8] + vstr d0, [dst, #0] + vldr d0, [src, #16] + vstr d1, [dst, #8] + vldr d1, [src, #24] + vstr d0, [dst, #16] + vldr d0, [src, #32] + vstr d1, [dst, #24] + vldr d1, [src, #40] + vstr d0, [dst, #32] + vldr d0, [src, #48] + vstr d1, [dst, #40] + vldr d1, [src, #56] + vstr d0, [dst, #48] + add src, src, #64 + vstr d1, [dst, #56] + add dst, dst, #64 + bge 1b + tst tmp2, #0x3f + beq .Ldone + +.Ltail63aligned: /* Count in tmp2. */ + and tmp1, tmp2, #0x38 + add dst, dst, tmp1 + add src, src, tmp1 + .macro dispatch_step i + vldr d0, [src, #-(\i * 8)] + vstr d0, [dst, #-(\i * 8)] + .endm + dispatch_7_dword +#else + sub src, src, #8 + sub dst, dst, #8 +1: + ldrd A_l, A_h, [src, #8] + strd A_l, A_h, [dst, #8] + ldrd A_l, A_h, [src, #16] + strd A_l, A_h, [dst, #16] + ldrd A_l, A_h, [src, #24] + strd A_l, A_h, [dst, #24] + ldrd A_l, A_h, [src, #32] + strd A_l, A_h, [dst, #32] + ldrd A_l, A_h, [src, #40] + strd A_l, A_h, [dst, #40] + ldrd A_l, A_h, [src, #48] + strd A_l, A_h, [dst, #48] + ldrd A_l, A_h, [src, #56] + strd A_l, A_h, [dst, #56] + ldrd A_l, A_h, [src, #64]! + strd A_l, A_h, [dst, #64]! + subs tmp2, tmp2, #64 + bge 1b + tst tmp2, #0x3f + bne 1f + ldr tmp2,[sp], #FRAME_SIZE + cfi_adjust_cfa_offset (-FRAME_SIZE) + cfi_restore (tmp2) + bx lr + + cfi_restore_state + cfi_remember_state +1: + add src, src, #8 + add dst, dst, #8 + +.Ltail63aligned: /* Count in tmp2. */ + /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but + we know that the src and dest are 64-bit aligned so we can use + LDRD/STRD to improve efficiency. */ + /* TMP2 is now negative, but we don't care about that. 
The bottom + six bits still tell us how many bytes are left to copy. */ + + and tmp1, tmp2, #0x38 + add dst, dst, tmp1 + add src, src, tmp1 + .macro dispatch_step i + ldrd A_l, A_h, [src, #-(\i * 8)] + strd A_l, A_h, [dst, #-(\i * 8)] + .endm + dispatch_7_dword +#endif + + tst tmp2, #4 + ldrne tmp1, [src], #4 + strne tmp1, [dst], #4 + lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */ + ldrhcs tmp1, [src], #2 + ldrbne tmp2, [src] + strhcs tmp1, [dst], #2 + strbne tmp2, [dst] + +.Ldone: + ldr tmp2, [sp], #FRAME_SIZE + cfi_adjust_cfa_offset (-FRAME_SIZE) + cfi_restore (tmp2) + bx lr + + cfi_restore_state + cfi_remember_state + +.Lcpy_body_long: /* Count in tmp2. */ + + /* Long copy. We know that there's at least (prefetch_lines * 64) + bytes to go. */ +#ifdef USE_VFP + /* Don't use PLD. Instead, read some data in advance of the current + copy position into a register. This should act like a PLD + operation but we won't have to repeat the transfer. */ + + vldr d3, [src, #0] + vldr d4, [src, #64] + vldr d5, [src, #128] + vldr d6, [src, #192] + vldr d7, [src, #256] + + vldr d0, [src, #8] + vldr d1, [src, #16] + vldr d2, [src, #24] + add src, src, #32 + + subs tmp2, tmp2, #prefetch_lines * 64 * 2 + blt 2f +1: + cpy_line_vfp d3, 0 + cpy_line_vfp d4, 64 + cpy_line_vfp d5, 128 + add dst, dst, #3 * 64 + add src, src, #3 * 64 + cpy_line_vfp d6, 0 + cpy_line_vfp d7, 64 + add dst, dst, #2 * 64 + add src, src, #2 * 64 + subs tmp2, tmp2, #prefetch_lines * 64 + bge 1b + +2: + cpy_tail_vfp d3, 0 + cpy_tail_vfp d4, 64 + cpy_tail_vfp d5, 128 + add src, src, #3 * 64 + add dst, dst, #3 * 64 + cpy_tail_vfp d6, 0 + vstr d7, [dst, #64] + vldr d7, [src, #64] + vstr d0, [dst, #64 + 8] + vldr d0, [src, #64 + 8] + vstr d1, [dst, #64 + 16] + vldr d1, [src, #64 + 16] + vstr d2, [dst, #64 + 24] + vldr d2, [src, #64 + 24] + vstr d7, [dst, #64 + 32] + add src, src, #96 + vstr d0, [dst, #64 + 40] + vstr d1, [dst, #64 + 48] + vstr d2, [dst, #64 + 56] + add dst, dst, #128 + add tmp2, tmp2, #prefetch_lines * 64 + b .Lcpy_body_medium +#else + /* Long copy. Use an SMS style loop to maximize the I/O + bandwidth of the core. We don't have enough spare registers + to synthesise prefetching, so use PLD operations. */ + /* Pre-bias src and dst. */ + sub src, src, #8 + sub dst, dst, #8 + pld [src, #8] + pld [src, #72] + subs tmp2, tmp2, #64 + pld [src, #136] + ldrd A_l, A_h, [src, #8] + strd B_l, B_h, [sp, #8] + cfi_rel_offset (B_l, 8) + cfi_rel_offset (B_h, 12) + ldrd B_l, B_h, [src, #16] + strd C_l, C_h, [sp, #16] + cfi_rel_offset (C_l, 16) + cfi_rel_offset (C_h, 20) + ldrd C_l, C_h, [src, #24] + strd D_l, D_h, [sp, #24] + cfi_rel_offset (D_l, 24) + cfi_rel_offset (D_h, 28) + pld [src, #200] + ldrd D_l, D_h, [src, #32]! + b 1f + .p2align 6 +2: + pld [src, #232] + strd A_l, A_h, [dst, #40] + ldrd A_l, A_h, [src, #40] + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [src, #48] + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [src, #56] + strd D_l, D_h, [dst, #64]! + ldrd D_l, D_h, [src, #64]! + subs tmp2, tmp2, #64 +1: + strd A_l, A_h, [dst, #8] + ldrd A_l, A_h, [src, #8] + strd B_l, B_h, [dst, #16] + ldrd B_l, B_h, [src, #16] + strd C_l, C_h, [dst, #24] + ldrd C_l, C_h, [src, #24] + strd D_l, D_h, [dst, #32] + ldrd D_l, D_h, [src, #32] + bcs 2b + /* Save the remaining bytes and restore the callee-saved regs. 
*/ + strd A_l, A_h, [dst, #40] + add src, src, #40 + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [sp, #8] + cfi_restore (B_l) + cfi_restore (B_h) + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [sp, #16] + cfi_restore (C_l) + cfi_restore (C_h) + strd D_l, D_h, [dst, #64] + ldrd D_l, D_h, [sp, #24] + cfi_restore (D_l) + cfi_restore (D_h) + add dst, dst, #72 + tst tmp2, #0x3f + bne .Ltail63aligned + ldr tmp2, [sp], #FRAME_SIZE + cfi_adjust_cfa_offset (-FRAME_SIZE) + cfi_restore (tmp2) + bx lr +#endif + + cfi_restore_state + cfi_remember_state + +.Lcpy_notaligned: + pld [src, #0] + pld [src, #64] + /* There's at least 64 bytes to copy, but there is no mutual + alignment. */ + /* Bring DST to 64-bit alignment. */ + lsls tmp2, dst, #29 + pld [src, #(2 * 64)] + beq 1f + rsbs tmp2, tmp2, #0 + sub count, count, tmp2, lsr #29 + ldrmi tmp1, [src], #4 + strmi tmp1, [dst], #4 + lsls tmp2, tmp2, #2 + ldrbne tmp1, [src], #1 + ldrhcs tmp2, [src], #2 + strbne tmp1, [dst], #1 + strhcs tmp2, [dst], #2 +1: + pld [src, #(3 * 64)] + subs count, count, #64 + ldrmi tmp2, [sp], #FRAME_SIZE + bmi .Ltail63unaligned + pld [src, #(4 * 64)] + +#ifdef USE_NEON + /* These need an extra layer of macro just to work around a + bug in the assembler's parser when an operand starts with + a {...}. */ + .macro neon_load_multi reglist, basereg + vld1.8 {\reglist}, [\basereg]! + .endm + .macro neon_store_multi reglist, basereg + vst1.8 {\reglist}, [ALIGN (\basereg, 64)]! + .endm + + neon_load_multi d0-d3, src + neon_load_multi d4-d7, src + subs count, count, #64 + bmi 2f +1: + pld [src, #(4 * 64)] + neon_store_multi d0-d3, dst + neon_load_multi d0-d3, src + neon_store_multi d4-d7, dst + neon_load_multi d4-d7, src + subs count, count, #64 + bpl 1b +2: + neon_store_multi d0-d3, dst + neon_store_multi d4-d7, dst + ands count, count, #0x3f +#else + /* Use an SMS style loop to maximize the I/O bandwidth. */ + sub src, src, #4 + sub dst, dst, #8 + subs tmp2, count, #64 /* Use tmp2 for count. */ + ldr A_l, [src, #4] + ldr A_h, [src, #8] + strd B_l, B_h, [sp, #8] + cfi_rel_offset (B_l, 8) + cfi_rel_offset (B_h, 12) + ldr B_l, [src, #12] + ldr B_h, [src, #16] + strd C_l, C_h, [sp, #16] + cfi_rel_offset (C_l, 16) + cfi_rel_offset (C_h, 20) + ldr C_l, [src, #20] + ldr C_h, [src, #24] + strd D_l, D_h, [sp, #24] + cfi_rel_offset (D_l, 24) + cfi_rel_offset (D_h, 28) + ldr D_l, [src, #28] + ldr D_h, [src, #32]! + b 1f + .p2align 6 +2: + pld [src, #(5 * 64) - (32 - 4)] + strd A_l, A_h, [dst, #40] + ldr A_l, [src, #36] + ldr A_h, [src, #40] + strd B_l, B_h, [dst, #48] + ldr B_l, [src, #44] + ldr B_h, [src, #48] + strd C_l, C_h, [dst, #56] + ldr C_l, [src, #52] + ldr C_h, [src, #56] + strd D_l, D_h, [dst, #64]! + ldr D_l, [src, #60] + ldr D_h, [src, #64]! + subs tmp2, tmp2, #64 +1: + strd A_l, A_h, [dst, #8] + ldr A_l, [src, #4] + ldr A_h, [src, #8] + strd B_l, B_h, [dst, #16] + ldr B_l, [src, #12] + ldr B_h, [src, #16] + strd C_l, C_h, [dst, #24] + ldr C_l, [src, #20] + ldr C_h, [src, #24] + strd D_l, D_h, [dst, #32] + ldr D_l, [src, #28] + ldr D_h, [src, #32] + bcs 2b + + /* Save the remaining bytes and restore the callee-saved regs. 
*/ + strd A_l, A_h, [dst, #40] + add src, src, #36 + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [sp, #8] + cfi_restore (B_l) + cfi_restore (B_h) + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [sp, #16] + cfi_restore (C_l) + cfi_restore (C_h) + strd D_l, D_h, [dst, #64] + ldrd D_l, D_h, [sp, #24] + cfi_restore (D_l) + cfi_restore (D_h) + add dst, dst, #72 + ands count, tmp2, #0x3f +#endif + ldr tmp2, [sp], #FRAME_SIZE + cfi_adjust_cfa_offset (-FRAME_SIZE) + cfi_restore (tmp2) + bne .Ltail63unaligned + bx lr + +END(memcpy) +libc_hidden_builtin_def (memcpy) diff --git a/REORG.TODO/sysdeps/arm/armv7/multiarch/memcpy_neon.S b/REORG.TODO/sysdeps/arm/armv7/multiarch/memcpy_neon.S new file mode 100644 index 0000000000..e60d1cc0e1 --- /dev/null +++ b/REORG.TODO/sysdeps/arm/armv7/multiarch/memcpy_neon.S @@ -0,0 +1,9 @@ +#ifdef __ARM_NEON__ +/* Under __ARM_NEON__, this file defines memcpy directly. */ +libc_hidden_builtin_def (memcpy) +#else +# define memcpy __memcpy_neon +#endif + +#define MEMCPY_NEON +#include "memcpy_impl.S" diff --git a/REORG.TODO/sysdeps/arm/armv7/multiarch/memcpy_vfp.S b/REORG.TODO/sysdeps/arm/armv7/multiarch/memcpy_vfp.S new file mode 100644 index 0000000000..e008c041ed --- /dev/null +++ b/REORG.TODO/sysdeps/arm/armv7/multiarch/memcpy_vfp.S @@ -0,0 +1,7 @@ +/* Under __ARM_NEON__, memcpy_neon.S defines memcpy directly + and the __memcpy_vfp code will never be used. */ +#ifndef __ARM_NEON__ +# define MEMCPY_VFP +# define memcpy __memcpy_vfp +# include "memcpy_impl.S" +#endif diff --git a/REORG.TODO/sysdeps/arm/armv7/strcmp.S b/REORG.TODO/sysdeps/arm/armv7/strcmp.S new file mode 100644 index 0000000000..25d055754e --- /dev/null +++ b/REORG.TODO/sysdeps/arm/armv7/strcmp.S @@ -0,0 +1,519 @@ +/* strcmp implementation for ARMv7-A, optimized for Cortex-A15. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#include <arm-features.h> +#include <sysdep.h> + +/* Implementation of strcmp for ARMv7 when DSP instructions are + available. Use ldrd to support wider loads, provided the data + is sufficiently aligned. Use saturating arithmetic to optimize + the compares. */ + +/* Build Options: + STRCMP_PRECHECK: Run a quick pre-check of the first byte in the + string. If comparing completely random strings the pre-check will + save time, since there is a very high probability of a mismatch in + the first character: we save significant overhead if this is the + common case. However, if strings are likely to be identical (e.g. + because we're verifying a hit in a hash table), then this check + is largely redundant. 
*/ + +#define STRCMP_PRECHECK 1 + + .syntax unified + +#ifdef __ARM_BIG_ENDIAN +# define S2LO lsl +# define S2LOEQ lsleq +# define S2HI lsr +# define MSB 0x000000ff +# define LSB 0xff000000 +# define BYTE0_OFFSET 24 +# define BYTE1_OFFSET 16 +# define BYTE2_OFFSET 8 +# define BYTE3_OFFSET 0 +#else /* not __ARM_BIG_ENDIAN */ +# define S2LO lsr +# define S2LOEQ lsreq +# define S2HI lsl +# define BYTE0_OFFSET 0 +# define BYTE1_OFFSET 8 +# define BYTE2_OFFSET 16 +# define BYTE3_OFFSET 24 +# define MSB 0xff000000 +# define LSB 0x000000ff +#endif /* not __ARM_BIG_ENDIAN */ + +/* Parameters and result. */ +#define src1 r0 +#define src2 r1 +#define result r0 /* Overlaps src1. */ + +/* Internal variables. */ +#define tmp1 r4 +#define tmp2 r5 +#define const_m1 r12 + +/* Additional internal variables for 64-bit aligned data. */ +#define data1a r2 +#define data1b r3 +#define data2a r6 +#define data2b r7 +#define syndrome_a tmp1 +#define syndrome_b tmp2 + +/* Additional internal variables for 32-bit aligned data. */ +#define data1 r2 +#define data2 r3 +#define syndrome tmp2 + + +#ifndef NO_THUMB +/* This code is best on Thumb. */ + .thumb + +/* In Thumb code we can't use MVN with a register shift, but we do have ORN. */ +.macro prepare_mask mask_reg, nbits_reg + S2HI \mask_reg, const_m1, \nbits_reg +.endm +.macro apply_mask data_reg, mask_reg + orn \data_reg, \data_reg, \mask_reg +.endm +#else +/* In ARM code we don't have ORN, but we can use MVN with a register shift. */ +.macro prepare_mask mask_reg, nbits_reg + mvn \mask_reg, const_m1, S2HI \nbits_reg +.endm +.macro apply_mask data_reg, mask_reg + orr \data_reg, \data_reg, \mask_reg +.endm + +/* These clobber the condition codes, which the real Thumb cbz/cbnz + instructions do not. But it doesn't matter for any of the uses here. */ +.macro cbz reg, label + cmp \reg, #0 + beq \label +.endm +.macro cbnz reg, label + cmp \reg, #0 + bne \label +.endm +#endif + + + /* Macro to compute and return the result value for word-aligned + cases. */ + .macro strcmp_epilogue_aligned synd d1 d2 restore_r6 +#ifdef __ARM_BIG_ENDIAN + /* If data1 contains a zero byte, then syndrome will contain a 1 in + bit 7 of that byte. Otherwise, the highest set bit in the + syndrome will highlight the first different bit. It is therefore + sufficient to extract the eight bits starting with the syndrome + bit. */ + clz tmp1, \synd + lsl r1, \d2, tmp1 + .if \restore_r6 + ldrd r6, r7, [sp, #8] + .endif + lsl \d1, \d1, tmp1 + lsr result, \d1, #24 + ldrd r4, r5, [sp], #16 + cfi_remember_state + cfi_def_cfa_offset (0) + cfi_restore (r4) + cfi_restore (r5) + cfi_restore (r6) + cfi_restore (r7) + sub result, result, r1, lsr #24 + bx lr +#else + /* To use the big-endian trick we'd have to reverse all three words. + that's slower than this approach. */ + rev \synd, \synd + clz tmp1, \synd + bic tmp1, tmp1, #7 + lsr r1, \d2, tmp1 + .if \restore_r6 + ldrd r6, r7, [sp, #8] + .endif + lsr \d1, \d1, tmp1 + and result, \d1, #255 + and r1, r1, #255 + ldrd r4, r5, [sp], #16 + cfi_remember_state + cfi_def_cfa_offset (0) + cfi_restore (r4) + cfi_restore (r5) + cfi_restore (r6) + cfi_restore (r7) + sub result, result, r1 + + bx lr +#endif + .endm + + .text + .p2align 5 +.Lstrcmp_start_addr: +#if STRCMP_PRECHECK == 1 +.Lfastpath_exit: + sub r0, r2, r3 + bx lr + nop +#endif +ENTRY (strcmp) +#if STRCMP_PRECHECK == 1 + ldrb r2, [src1] + ldrb r3, [src2] + cmp r2, #1 + it cs + cmpcs r2, r3 + bne .Lfastpath_exit +#endif + strd r4, r5, [sp, #-16]! 
+ cfi_def_cfa_offset (16) + cfi_offset (r4, -16) + cfi_offset (r5, -12) + orr tmp1, src1, src2 + strd r6, r7, [sp, #8] + cfi_offset (r6, -8) + cfi_offset (r7, -4) + mvn const_m1, #0 + lsl r2, tmp1, #29 + cbz r2, .Lloop_aligned8 + +.Lnot_aligned: + eor tmp1, src1, src2 + tst tmp1, #7 + bne .Lmisaligned8 + + /* Deal with mutual misalignment by aligning downwards and then + masking off the unwanted loaded data to prevent a difference. */ + and tmp1, src1, #7 + bic src1, src1, #7 + and tmp2, tmp1, #3 + bic src2, src2, #7 + lsl tmp2, tmp2, #3 /* Bytes -> bits. */ + ldrd data1a, data1b, [src1], #16 + tst tmp1, #4 + ldrd data2a, data2b, [src2], #16 + prepare_mask tmp1, tmp2 + apply_mask data1a, tmp1 + apply_mask data2a, tmp1 + beq .Lstart_realigned8 + apply_mask data1b, tmp1 + mov data1a, const_m1 + apply_mask data2b, tmp1 + mov data2a, const_m1 + b .Lstart_realigned8 + + /* Unwind the inner loop by a factor of 2, giving 16 bytes per + pass. */ + .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */ + .p2align 2 /* Always word aligned. */ +.Lloop_aligned8: + ldrd data1a, data1b, [src1], #16 + ldrd data2a, data2b, [src2], #16 +.Lstart_realigned8: + uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ + eor syndrome_a, data1a, data2a + sel syndrome_a, syndrome_a, const_m1 + cbnz syndrome_a, .Ldiff_in_a + uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ + eor syndrome_b, data1b, data2b + sel syndrome_b, syndrome_b, const_m1 + cbnz syndrome_b, .Ldiff_in_b + + ldrd data1a, data1b, [src1, #-8] + ldrd data2a, data2b, [src2, #-8] + uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ + eor syndrome_a, data1a, data2a + sel syndrome_a, syndrome_a, const_m1 + uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ + eor syndrome_b, data1b, data2b + sel syndrome_b, syndrome_b, const_m1 + /* Can't use CBZ for backwards branch. */ + orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */ + beq .Lloop_aligned8 + +.Ldiff_found: + cbnz syndrome_a, .Ldiff_in_a + +.Ldiff_in_b: + strcmp_epilogue_aligned syndrome_b, data1b, data2b 1 + +.Ldiff_in_a: + cfi_restore_state + strcmp_epilogue_aligned syndrome_a, data1a, data2a 1 + + cfi_restore_state +.Lmisaligned8: + tst tmp1, #3 + bne .Lmisaligned4 + ands tmp1, src1, #3 + bne .Lmutual_align4 + + /* Unrolled by a factor of 2, to reduce the number of post-increment + operations. */ +.Lloop_aligned4: + ldr data1, [src1], #8 + ldr data2, [src2], #8 +.Lstart_realigned4: + uadd8 syndrome, data1, const_m1 /* Only need GE bits. */ + eor syndrome, data1, data2 + sel syndrome, syndrome, const_m1 + cbnz syndrome, .Laligned4_done + ldr data1, [src1, #-4] + ldr data2, [src2, #-4] + uadd8 syndrome, data1, const_m1 + eor syndrome, data1, data2 + sel syndrome, syndrome, const_m1 + cmp syndrome, #0 + beq .Lloop_aligned4 + +.Laligned4_done: + strcmp_epilogue_aligned syndrome, data1, data2, 0 + +.Lmutual_align4: + cfi_restore_state + /* Deal with mutual misalignment by aligning downwards and then + masking off the unwanted loaded data to prevent a difference. */ + lsl tmp1, tmp1, #3 /* Bytes -> bits. 
*/ + bic src1, src1, #3 + ldr data1, [src1], #8 + bic src2, src2, #3 + ldr data2, [src2], #8 + + prepare_mask tmp1, tmp1 + apply_mask data1, tmp1 + apply_mask data2, tmp1 + b .Lstart_realigned4 + +.Lmisaligned4: + ands tmp1, src1, #3 + beq .Lsrc1_aligned + sub src2, src2, tmp1 + bic src1, src1, #3 + lsls tmp1, tmp1, #31 + ldr data1, [src1], #4 + beq .Laligned_m2 + bcs .Laligned_m1 + +#if STRCMP_PRECHECK == 0 + ldrb data2, [src2, #1] + uxtb tmp1, data1, ror #BYTE1_OFFSET + subs tmp1, tmp1, data2 + bne .Lmisaligned_exit + cbz data2, .Lmisaligned_exit + +.Laligned_m2: + ldrb data2, [src2, #2] + uxtb tmp1, data1, ror #BYTE2_OFFSET + subs tmp1, tmp1, data2 + bne .Lmisaligned_exit + cbz data2, .Lmisaligned_exit + +.Laligned_m1: + ldrb data2, [src2, #3] + uxtb tmp1, data1, ror #BYTE3_OFFSET + subs tmp1, tmp1, data2 + bne .Lmisaligned_exit + add src2, src2, #4 + cbnz data2, .Lsrc1_aligned +#else /* STRCMP_PRECHECK */ + /* If we've done the pre-check, then we don't need to check the + first byte again here. */ + ldrb data2, [src2, #2] + uxtb tmp1, data1, ror #BYTE2_OFFSET + subs tmp1, tmp1, data2 + bne .Lmisaligned_exit + cbz data2, .Lmisaligned_exit + +.Laligned_m2: + ldrb data2, [src2, #3] + uxtb tmp1, data1, ror #BYTE3_OFFSET + subs tmp1, tmp1, data2 + bne .Lmisaligned_exit + cbnz data2, .Laligned_m1 +#endif + +.Lmisaligned_exit: + mov result, tmp1 + ldr r4, [sp], #16 + cfi_remember_state + cfi_def_cfa_offset (0) + cfi_restore (r4) + cfi_restore (r5) + cfi_restore (r6) + cfi_restore (r7) + bx lr + +#if STRCMP_PRECHECK == 1 +.Laligned_m1: + add src2, src2, #4 +#endif +.Lsrc1_aligned: + cfi_restore_state + /* src1 is word aligned, but src2 has no common alignment + with it. */ + ldr data1, [src1], #4 + lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */ + + bic src2, src2, #3 + ldr data2, [src2], #4 + bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */ + bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */ + + /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */ +.Loverlap3: + bic tmp1, data1, #MSB + uadd8 syndrome, data1, const_m1 + eors syndrome, tmp1, data2, S2LO #8 + sel syndrome, syndrome, const_m1 + bne 4f + cbnz syndrome, 5f + ldr data2, [src2], #4 + eor tmp1, tmp1, data1 + cmp tmp1, data2, S2HI #24 + bne 6f + ldr data1, [src1], #4 + b .Loverlap3 +4: + S2LO data2, data2, #8 + b .Lstrcmp_tail + +5: + bics syndrome, syndrome, #MSB + bne .Lstrcmp_done_equal + + /* We can only get here if the MSB of data1 contains 0, so + fast-path the exit. */ + ldrb result, [src2] + ldrd r4, r5, [sp], #16 + cfi_remember_state + cfi_def_cfa_offset (0) + cfi_restore (r4) + cfi_restore (r5) + /* R6/7 Not used in this sequence. */ + cfi_restore (r6) + cfi_restore (r7) + neg result, result + bx lr + +6: + cfi_restore_state + S2LO data1, data1, #24 + and data2, data2, #LSB + b .Lstrcmp_tail + + .p2align 5,,12 /* Ensure at least 3 instructions in cache line. 
*/ +.Loverlap2: + and tmp1, data1, const_m1, S2LO #16 + uadd8 syndrome, data1, const_m1 + eors syndrome, tmp1, data2, S2LO #16 + sel syndrome, syndrome, const_m1 + bne 4f + cbnz syndrome, 5f + ldr data2, [src2], #4 + eor tmp1, tmp1, data1 + cmp tmp1, data2, S2HI #16 + bne 6f + ldr data1, [src1], #4 + b .Loverlap2 +4: + S2LO data2, data2, #16 + b .Lstrcmp_tail +5: + ands syndrome, syndrome, const_m1, S2LO #16 + bne .Lstrcmp_done_equal + + ldrh data2, [src2] + S2LO data1, data1, #16 +#ifdef __ARM_BIG_ENDIAN + lsl data2, data2, #16 +#endif + b .Lstrcmp_tail + +6: + S2LO data1, data1, #16 + and data2, data2, const_m1, S2LO #16 + b .Lstrcmp_tail + + .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */ +.Loverlap1: + and tmp1, data1, #LSB + uadd8 syndrome, data1, const_m1 + eors syndrome, tmp1, data2, S2LO #24 + sel syndrome, syndrome, const_m1 + bne 4f + cbnz syndrome, 5f + ldr data2, [src2], #4 + eor tmp1, tmp1, data1 + cmp tmp1, data2, S2HI #8 + bne 6f + ldr data1, [src1], #4 + b .Loverlap1 +4: + S2LO data2, data2, #24 + b .Lstrcmp_tail +5: + tst syndrome, #LSB + bne .Lstrcmp_done_equal + ldr data2, [src2] +6: + S2LO data1, data1, #8 + bic data2, data2, #MSB + b .Lstrcmp_tail + +.Lstrcmp_done_equal: + mov result, #0 + ldrd r4, r5, [sp], #16 + cfi_remember_state + cfi_def_cfa_offset (0) + cfi_restore (r4) + cfi_restore (r5) + /* R6/7 not used in this sequence. */ + cfi_restore (r6) + cfi_restore (r7) + bx lr + +.Lstrcmp_tail: + cfi_restore_state +#ifndef __ARM_BIG_ENDIAN + rev data1, data1 + rev data2, data2 + /* Now everything looks big-endian... */ +#endif + uadd8 tmp1, data1, const_m1 + eor tmp1, data1, data2 + sel syndrome, tmp1, const_m1 + clz tmp1, syndrome + lsl data1, data1, tmp1 + lsl data2, data2, tmp1 + lsr result, data1, #24 + ldrd r4, r5, [sp], #16 + cfi_def_cfa_offset (0) + cfi_restore (r4) + cfi_restore (r5) + /* R6/7 not used in this sequence. */ + cfi_restore (r6) + cfi_restore (r7) + sub result, result, data2, lsr #24 + bx lr +END (strcmp) +libc_hidden_builtin_def (strcmp) |
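
Note on the dispatch mechanism added above: ifunc-impl-list.c only enumerates the candidate implementations for testing and tooling, while the resolver in memcpy.S is what actually binds memcpy to __memcpy_neon, __memcpy_vfp or __memcpy_arm by testing the hwcap word it is handed in r0. For readers who have not met GNU IFUNCs before, here is a minimal user-level sketch of the same idea in C. It is illustrative only: my_memcpy, the stub variants and the use of getauxval are assumptions of this sketch, not glibc's internal interface, and HWCAP_NEON/HWCAP_VFP are the kernel's <asm/hwcap.h> spellings rather than glibc's HWCAP_ARM_* names.

/* Sketch: runtime-selected memcpy via a GNU IFUNC, outside glibc.
   Assumes GCC/binutils IFUNC support on ARM Linux; the variant
   bodies are placeholders standing in for the assembly above.  */
#include <stddef.h>
#include <string.h>
#include <sys/auxv.h>    /* getauxval, AT_HWCAP */
#include <asm/hwcap.h>   /* HWCAP_NEON, HWCAP_VFP (kernel names) */

static void *memcpy_arm_c (void *d, const void *s, size_t n)  { return memcpy (d, s, n); }
static void *memcpy_vfp_c (void *d, const void *s, size_t n)  { return memcpy (d, s, n); }
static void *memcpy_neon_c (void *d, const void *s, size_t n) { return memcpy (d, s, n); }

/* The resolver runs once, while the program is being relocated, and
   returns the implementation that my_memcpy should bind to.  */
static void *(*resolve_my_memcpy (void)) (void *, const void *, size_t)
{
  unsigned long hwcap = getauxval (AT_HWCAP);

  if (hwcap & HWCAP_NEON)
    return memcpy_neon_c;
  if (hwcap & HWCAP_VFP)
    return memcpy_vfp_c;
  return memcpy_arm_c;
}

void *my_memcpy (void *d, const void *s, size_t n)
     __attribute__ ((ifunc ("resolve_my_memcpy")));

After relocation, callers of my_memcpy go straight to the chosen variant with no per-call branching, which is exactly what the PC-relative table in memcpy.S arranges for memcpy itself.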
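The dispatch_7_dword and dispatch_15_word macros in memcpy_impl.S handle the tail of a copy without a loop: the leftover byte count (already a multiple of the unit size) is turned into a forward jump into a run of identical load/store steps, so exactly the right number of steps execute. The same idea in portable C is the familiar switch-with-fallthrough pattern; the sketch below is an analogue for the 8-byte-unit case under that assumption, not a drop-in replacement for the assembly.

#include <stddef.h>
#include <string.h>

/* Copy the trailing bytes left after the 64-byte main loop, where
   N is a multiple of 8 and at most 56.  Advancing the pointers by N
   and then storing at negative offsets mirrors what the assembly
   does; falling through the switch plays the role of the computed
   "add pc, pc, tmp1" jump.  */
static void
copy_tail_dwords (unsigned char *dst, const unsigned char *src, size_t n)
{
  dst += n;
  src += n;
  switch (n / 8)
    {
    case 7: memcpy (dst - 56, src - 56, 8);  /* falls through */
    case 6: memcpy (dst - 48, src - 48, 8);  /* falls through */
    case 5: memcpy (dst - 40, src - 40, 8);  /* falls through */
    case 4: memcpy (dst - 32, src - 32, 8);  /* falls through */
    case 3: memcpy (dst - 24, src - 24, 8);  /* falls through */
    case 2: memcpy (dst - 16, src - 16, 8);  /* falls through */
    case 1: memcpy (dst - 8,  src - 8,  8);  /* falls through */
    case 0: break;
    }
}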
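strcmp.S detects both the end of string and the first mismatch in one pass per word: uadd8 of a data word with 0xffffffff sets the per-byte GE flags only for the non-zero bytes, and sel then replaces the zero-byte lanes of data1 XOR data2 with 0xff, so the resulting "syndrome" word becomes non-zero as soon as either the string terminates or the words differ. The portable equivalent of the zero-byte test is the classic SWAR trick; the sketch below shows the word-at-a-time comparison for 4-byte-aligned, little-endian input and illustrates the technique only, whereas the code in the diff works on 8-byte blocks and handles misalignment.

#include <stdint.h>
#include <string.h>

/* Non-zero iff some byte of X is zero: the standard SWAR test that
   the uadd8 GE flags compute in hardware.  The lowest flagged byte
   is always a genuine zero byte.  */
static inline uint32_t
has_zero_byte (uint32_t x)
{
  return (x - 0x01010101u) & ~x & 0x80808080u;
}

/* Word-at-a-time strcmp for inputs that are both 4-byte aligned,
   little-endian only.  A zero syndrome means "equal so far, no
   terminator"; otherwise the lowest flagged byte is the first
   difference or the NUL, as in the epilogue of strcmp.S.  */
static int
strcmp_aligned_words (const char *s1, const char *s2)
{
  for (;; s1 += 4, s2 += 4)
    {
      uint32_t w1, w2;
      memcpy (&w1, s1, 4);              /* aligned loads; avoids aliasing issues */
      memcpy (&w2, s2, 4);

      uint32_t syndrome = (w1 ^ w2) | has_zero_byte (w1);
      if (syndrome == 0)
        continue;

      unsigned shift = (unsigned) __builtin_ctz (syndrome) & ~7u;
      return (int) ((w1 >> shift) & 0xff) - (int) ((w2 >> shift) & 0xff);
    }
}

As with the assembly, reading whole aligned words never crosses a page boundary beyond the terminating NUL, which is what makes the word-at-a-time scan safe.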