diff options
author | Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com> | 2018-08-16 12:12:02 +0530 |
---|---|---|
committer | Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com> | 2018-08-16 12:12:02 +0530 |
commit | 7793ad7a2c00434398aa8bb3f5932e2fdf43536a (patch) | |
tree | 4adb3fc19bdde52c7ccdc0f110646cf18de8a32d /sysdeps/powerpc/powerpc64/power9/strcmp.S | |
parent | 436e4d5b965abe592d26150cb518accf9ded8fe4 (diff) | |
download | glibc-7793ad7a2c00434398aa8bb3f5932e2fdf43536a.tar.gz glibc-7793ad7a2c00434398aa8bb3f5932e2fdf43536a.tar.xz glibc-7793ad7a2c00434398aa8bb3f5932e2fdf43536a.zip |
powerpc: Rearrange little endian specific files
This patch moves little endian specific POWER9 optimization files to sysdeps/powerpc/powerpc64/le and creates POWER9 ifunc functions only for little endian.
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power9/strcmp.S')
-rw-r--r-- | sysdeps/powerpc/powerpc64/power9/strcmp.S | 268 |
1 files changed, 0 insertions, 268 deletions
diff --git a/sysdeps/powerpc/powerpc64/power9/strcmp.S b/sysdeps/powerpc/powerpc64/power9/strcmp.S deleted file mode 100644 index 98243a9d51..0000000000 --- a/sysdeps/powerpc/powerpc64/power9/strcmp.S +++ /dev/null @@ -1,268 +0,0 @@ -/* Optimized strcmp implementation for PowerPC64/POWER9. - Copyright (C) 2016-2018 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ -#ifdef __LITTLE_ENDIAN__ -#include <sysdep.h> - -#ifndef STRCMP -# define STRCMP strcmp -#endif - -/* Implements the function - - int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) - - The implementation uses unaligned doubleword access for first 32 bytes - as in POWER8 patch and uses vectorised loops after that. */ - -/* TODO: Change this to actual instructions when minimum binutils is upgraded - to 2.27. Macros are defined below for these newer instructions in order - to maintain compatibility. */ -# define VCTZLSBB(r,v) .long (0x10010602 | ((r)<<(32-11)) | ((v)<<(32-21))) - -# define VEXTUBRX(t,a,b) .long (0x1000070d \ - | ((t)<<(32-11)) \ - | ((a)<<(32-16)) \ - | ((b)<<(32-21)) ) - -# define VCMPNEZB(t,a,b) .long (0x10000507 \ - | ((t)<<(32-11)) \ - | ((a)<<(32-16)) \ - | ((b)<<(32-21)) ) - -/* Get 16 bytes for unaligned case. - reg1: Vector to hold next 16 bytes. - reg2: Address to read from. - reg3: Permute control vector. */ -# define GET16BYTES(reg1, reg2, reg3) \ - lvx reg1, 0, reg2; \ - vperm v8, v2, reg1, reg3; \ - vcmpequb. v8, v0, v8; \ - beq cr6, 1f; \ - vspltisb v9, 0; \ - b 2f; \ - .align 4; \ -1: \ - addi r6, reg2, 16; \ - lvx v9, 0, r6; \ -2: \ - vperm reg1, v9, reg1, reg3; - -/* TODO: change this to .machine power9 when the minimum required binutils - allows it. */ - - .machine power7 -ENTRY_TOCLESS (STRCMP, 4) - li r0, 0 - - /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using - the code: - - (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE)) - - with PAGE_SIZE being 4096 and ITER_SIZE begin 16. */ - - rldicl r7, r3, 0, 52 - rldicl r9, r4, 0, 52 - cmpldi cr7, r7, 4096-16 - bgt cr7, L(pagecross_check) - cmpldi cr5, r9, 4096-16 - bgt cr5, L(pagecross_check) - - /* For short strings up to 16 bytes, load both s1 and s2 using - unaligned dwords and compare. */ - ld r8, 0(r3) - ld r10, 0(r4) - cmpb r12, r8, r0 - cmpb r11, r8, r10 - orc. r9, r12, r11 - bne cr0, L(different_nocmpb) - - ld r8, 8(r3) - ld r10, 8(r4) - cmpb r12, r8, r0 - cmpb r11, r8, r10 - orc. r9, r12, r11 - bne cr0, L(different_nocmpb) - - addi r7, r3, 16 - addi r4, r4, 16 - -L(align): - /* Now it has checked for first 16 bytes. */ - vspltisb v0, 0 - vspltisb v2, -1 - lvsr v6, 0, r4 /* Compute mask. */ - or r5, r4, r7 - andi. r5, r5, 0xF - beq cr0, L(aligned) - andi. r5, r7, 0xF - beq cr0, L(s1_align) - lvsr v10, 0, r7 /* Compute mask. */ - - /* Both s1 and s2 are unaligned. */ - GET16BYTES(v4, r7, v10) - GET16BYTES(v5, r4, v6) - VCMPNEZB(v7, v5, v4) - beq cr6, L(match) - b L(different) - - /* Align s1 to qw and adjust s2 address. */ - .align 4 -L(match): - clrldi r6, r7, 60 - subfic r5, r6, 16 - add r7, r7, r5 - add r4, r4, r5 - andi. r5, r4, 0xF - beq cr0, L(aligned) - lvsr v6, 0, r4 - /* There are 2 loops depending on the input alignment. - Each loop gets 16 bytes from s1 and s2 and compares. - Loop until a mismatch or null occurs. */ -L(s1_align): - lvx v4, r7, r0 - GET16BYTES(v5, r4, v6) - VCMPNEZB(v7, v5, v4) - addi r7, r7, 16 - addi r4, r4, 16 - bne cr6, L(different) - - lvx v4, r7, r0 - GET16BYTES(v5, r4, v6) - VCMPNEZB(v7, v5, v4) - addi r7, r7, 16 - addi r4, r4, 16 - bne cr6, L(different) - - lvx v4, r7, r0 - GET16BYTES(v5, r4, v6) - VCMPNEZB(v7, v5, v4) - addi r7, r7, 16 - addi r4, r4, 16 - bne cr6, L(different) - - lvx v4, r7, r0 - GET16BYTES(v5, r4, v6) - VCMPNEZB(v7, v5, v4) - addi r7, r7, 16 - addi r4, r4, 16 - beq cr6, L(s1_align) - b L(different) - - .align 4 -L(aligned): - lvx v4, 0, r7 - lvx v5, 0, r4 - VCMPNEZB(v7, v5, v4) - addi r7, r7, 16 - addi r4, r4, 16 - bne cr6, L(different) - - lvx v4, 0, r7 - lvx v5, 0, r4 - VCMPNEZB(v7, v5, v4) - addi r7, r7, 16 - addi r4, r4, 16 - bne cr6, L(different) - - lvx v4, 0, r7 - lvx v5, 0, r4 - VCMPNEZB(v7, v5, v4) - addi r7, r7, 16 - addi r4, r4, 16 - bne cr6, L(different) - - lvx v4, 0, r7 - lvx v5, 0, r4 - VCMPNEZB(v7, v5, v4) - addi r7, r7, 16 - addi r4, r4, 16 - beq cr6, L(aligned) - - /* Calculate and return the difference. */ -L(different): - VCTZLSBB(r6, v7) - VEXTUBRX(r5, r6, v4) - VEXTUBRX(r4, r6, v5) - subf r3, r4, r5 - extsw r3, r3 - blr - - .align 4 -L(different_nocmpb): - neg r3, r9 - and r9, r9, r3 - cntlzd r9, r9 - subfic r9, r9, 63 - srd r3, r8, r9 - srd r10, r10, r9 - rldicl r10, r10, 0, 56 - rldicl r3, r3, 0, 56 - subf r3, r10, r3 - extsw r3, r3 - blr - - .align 4 -L(pagecross_check): - subfic r9, r9, 4096 - subfic r7, r7, 4096 - cmpld cr7, r7, r9 - bge cr7, L(pagecross) - mr r7, r9 - - /* If unaligned 16 bytes reads across a 4K page boundary, it uses - a simple byte a byte comparison until the page alignment for s1 - is reached. */ -L(pagecross): - add r7, r3, r7 - subf r9, r3, r7 - mtctr r9 - - .align 4 -L(pagecross_loop): - /* Loads a byte from s1 and s2, compare if *s1 is equal to *s2 - and if *s1 is '\0'. */ - lbz r9, 0(r3) - lbz r10, 0(r4) - addi r3, r3, 1 - addi r4, r4, 1 - cmplw cr7, r9, r10 - cmpdi cr5, r9, r0 - bne cr7, L(pagecross_ne) - beq cr5, L(pagecross_nullfound) - bdnz L(pagecross_loop) - b L(align) - - .align 4 -L(pagecross_ne): - extsw r3, r9 - mr r9, r10 -L(pagecross_retdiff): - subf r9, r9, r3 - extsw r3, r9 - blr - - .align 4 -L(pagecross_nullfound): - li r3, 0 - b L(pagecross_retdiff) -END (STRCMP) -libc_hidden_builtin_def (strcmp) -#else -#include <sysdeps/powerpc/powerpc64/power8/strcmp.S> -#endif |