author     Andreas Schwab <schwab@redhat.com>	2010-02-22 16:34:12 +0100
committer  Andreas Schwab <schwab@redhat.com>	2010-02-22 16:41:22 +0100
commit     2b6e16c5e2fa8ad12b02a218ba32d63a4dffda18 (patch)
tree       b931e143f212d5b656497bb0c3f3e3a413fc78e1 /sysdeps
parent     0c3f133ed106996172dd2c106b22e38ce695e63d (diff)
parent     199428c19774c12b3c4b6e6486ea9d4a021288af (diff)
Merge remote branch 'origin/master' into fedora/master
Diffstat (limited to 'sysdeps')
75 files changed, 6701 insertions, 389 deletions
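
The centerpiece of this merge is the new IFUNC dispatcher in sysdeps/i386/i686/multiarch/memcmp.S, which picks one of three memcmp implementations at symbol-resolution time based on CPU features. For orientation, the same dispatch expressed in C looks roughly like the sketch below; the cpu_has_* helpers are hypothetical stand-ins for the __cpu_features bit tests the assembly performs directly, and the sketch is illustrative, not part of the patch.

/* Minimal sketch of the memcmp.S dispatch, assuming GCC's ifunc
   attribute and hypothetical cpu_has_* feature queries.  */
#include <stddef.h>

extern int __memcmp_ia32 (const void *, const void *, size_t);
extern int __memcmp_ssse3 (const void *, const void *, size_t);
extern int __memcmp_sse4_2 (const void *, const void *, size_t);

extern int cpu_has_ssse3 (void);
extern int cpu_has_sse4_2 (void);

/* Runs once at dynamic-link time; the pointer it returns becomes the
   target of every later memcmp call.  */
static __typeof__ (__memcmp_ia32) *
resolve_memcmp (void)
{
  if (cpu_has_sse4_2 ())
    return __memcmp_sse4_2;
  if (cpu_has_ssse3 ())
    return __memcmp_ssse3;
  return __memcmp_ia32;
}

int memcmp (const void *, const void *, size_t)
     __attribute__ ((ifunc ("resolve_memcmp")));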
diff --git a/sysdeps/i386/fpu/fegetenv.c b/sysdeps/i386/fpu/fegetenv.c
index fb955cf565..ddb67e5d83 100644
--- a/sysdeps/i386/fpu/fegetenv.c
+++ b/sysdeps/i386/fpu/fegetenv.c
@@ -40,4 +40,5 @@ strong_alias (__fegetenv, __old_fegetenv)
 compat_symbol (libm, BP_SYM (__old_fegetenv), BP_SYM (fegetenv), GLIBC_2_1);
 #endif
+libm_hidden_ver (__fegetenv, fegetenv)
 versioned_symbol (libm, BP_SYM (__fegetenv), BP_SYM (fegetenv), GLIBC_2_2);
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index fbad9ae734..e8847d6fc4 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -7,7 +7,9 @@ ifeq ($(subdir),string)
 sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \
 		   memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \
-		   memset-sse2-rep bzero-sse2-rep
+		   memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \
+		   strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \
+		   memcmp-ssse3 memcmp-sse4
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-strcspn-c.c += -msse4
diff --git a/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/sysdeps/i386/i686/multiarch/memcmp-sse4.S
new file mode 100644
index 0000000000..b1ed778f1f
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memcmp-sse4.S
@@ -0,0 +1,1004 @@
+/* memcmp with SSE4.2
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef MEMCMP
+# define MEMCMP		__memcmp_sse4_2
+#endif
+
+#define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#define PARMS	4
+#define BLK1	PARMS
+#define BLK2	BLK1+4
+#define LEN	BLK2+4
+#define RETURN	POP (%ebx); ret; CFI_PUSH (%ebx)
+
+
+#ifdef SHARED
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register that contains
+   the index into the jump table.  SCALE is the scale of INDEX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+	/* We first load PC into EBX.  */	\
+	call	__i686.get_pc_thunk.bx;	\
+	/* Get the address of the jump table.  */	\
+	addl	$(TABLE - .), %ebx;	\
+	/* Get the entry and convert the relative offset to the	\
+	   absolute address.  */	\
+	addl	(%ebx,INDEX,SCALE), %ebx;	\
+	/* We loaded the jump table and adjusted EDX/ESI.  Go.  */	\
+	jmp	*%ebx
+
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	ALIGN (4)
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+#else
+# define JMPTBL(I, B)	I
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register that contains
+   the index into the jump table.  SCALE is the scale of INDEX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+	jmp	*TABLE(,INDEX,SCALE)
+#endif
+
+	.section	.text.sse4.2,"ax",@progbits
+ENTRY (MEMCMP)
+	movl	BLK1(%esp), %eax
+	movl	BLK2(%esp), %edx
+	movl	LEN(%esp), %ecx
+	cmp	$1, %ecx
+	jbe	L(less1bytes)
+	pxor	%xmm0, %xmm0
+	cmp	$64, %ecx
+	ja	L(64bytesormore)
+	cmp	$8, %ecx
+	PUSH	(%ebx)
+	jb	L(less8bytes)
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+	ALIGN (4)
+L(less8bytes):
+	mov	(%eax), %bl
+	cmpb	(%edx), %bl
+	jne	L(nonzero)
+
+	mov	1(%eax), %bl
+	cmpb	1(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$2, %ecx
+	jz	L(0bytes)
+
+	mov	2(%eax), %bl
+	cmpb	2(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$3, %ecx
+	jz	L(0bytes)
+
+	mov	3(%eax), %bl
+	cmpb	3(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$4, %ecx
+	jz	L(0bytes)
+
+	mov	4(%eax), %bl
+	cmpb	4(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$5, %ecx
+	jz	L(0bytes)
+
+	mov	5(%eax), %bl
+	cmpb	5(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$6, %ecx
+	jz	L(0bytes)
+
+	mov	6(%eax), %bl
+	cmpb	6(%edx), %bl
+	je	L(0bytes)
+L(nonzero):
+	POP	(%ebx)
+	mov	$1, %eax
+	ja	L(above)
+	neg	%eax
+L(above):
+	ret
+	CFI_PUSH (%ebx)
+
+	ALIGN (4)
+L(0bytes):
+	POP	(%ebx)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(less1bytes):
+	jb	L(0bytesend)
+	movzbl	(%eax), %eax
+	movzbl	(%edx), %edx
+	sub	%edx, %eax
+	ret
+
+	ALIGN (4)
+L(0bytesend):
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(64bytesormore):
+	PUSH	(%ebx)
+	mov	%ecx, %ebx
+	mov	$64, %ecx
+	sub	$64, %ebx
+L(64bytesormore_loop):
+	movdqu	(%eax), %xmm1
+	movdqu	(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_16diff)
+
+	movdqu	16(%eax), %xmm1
+	movdqu	16(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_32diff)
+
+	movdqu	32(%eax), %xmm1
+	movdqu	32(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_48diff)
+
+	movdqu	48(%eax), %xmm1
+	movdqu	48(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_64diff)
+	add	%ecx, %eax
+	add	%ecx, %edx
+	sub	%ecx, %ebx
+	jae	L(64bytesormore_loop)
+	add	%ebx, %ecx
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+	ALIGN (4)
+L(find_16diff):
+	sub	$16, %ecx
+L(find_32diff):
+	sub	$16, %ecx
+L(find_48diff):
+	sub	$16, %ecx
+L(find_64diff):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	jmp	L(16bytes)
+
+	ALIGN (4)
+L(16bytes):
+	mov	-16(%eax), %ecx
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+	ALIGN (4)
+L(49bytes):
+	movdqu	-49(%eax), %xmm1
+	movdqu	-49(%edx), %xmm2
+	mov	$-49, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(33bytes):
+	movdqu	-33(%eax), %xmm1
+	movdqu	-33(%edx), %xmm2
+	mov	$-33, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(17bytes):
+	mov	-17(%eax), %ecx
+	mov	-17(%edx), %ebx
cmp %ebx, %ecx + jne L(find_diff) +L(13bytes): + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(9bytes): + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(5bytes): + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + ALIGN (4) +L(50bytes): + mov $-50, %ebx + movdqu -50(%eax), %xmm1 + movdqu -50(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(34bytes): + mov $-34, %ebx + movdqu -34(%eax), %xmm1 + movdqu -34(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(18bytes): + mov -18(%eax), %ecx + mov -18(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(14bytes): + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(10bytes): + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(6bytes): + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(2bytes): + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + ALIGN (4) +L(51bytes): + mov $-51, %ebx + movdqu -51(%eax), %xmm1 + movdqu -51(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(35bytes): + mov $-35, %ebx + movdqu -35(%eax), %xmm1 + movdqu -35(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(19bytes): + movl -19(%eax), %ecx + movl -19(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(15bytes): + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(11bytes): + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(7bytes): + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(3bytes): + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) +L(1bytes): + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN + + ALIGN (4) +L(52bytes): + movdqu -52(%eax), %xmm1 + movdqu -52(%edx), %xmm2 + mov $-52, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(36bytes): + movdqu -36(%eax), %xmm1 + movdqu -36(%edx), %xmm2 + mov $-36, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(20bytes): + movdqu -20(%eax), %xmm1 + movdqu -20(%edx), %xmm2 + mov $-20, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -4(%eax), %ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN + + ALIGN (4) +L(53bytes): + movdqu -53(%eax), %xmm1 + movdqu -53(%edx), %xmm2 + mov $-53, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(37bytes): + mov $-37, %ebx + movdqu -37(%eax), %xmm1 + movdqu -37(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(21bytes): + mov $-21, %ebx + movdqu -21(%eax), %xmm1 + movdqu -21(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + ALIGN (4) +L(54bytes): + movdqu -54(%eax), %xmm1 + movdqu -54(%edx), %xmm2 + mov $-54, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(38bytes): + mov $-38, %ebx + movdqu -38(%eax), %xmm1 + movdqu -38(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) 
+L(22bytes): + mov $-22, %ebx + movdqu -22(%eax), %xmm1 + movdqu -22(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + ALIGN (4) +L(55bytes): + movdqu -55(%eax), %xmm1 + movdqu -55(%edx), %xmm2 + mov $-55, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(39bytes): + mov $-39, %ebx + movdqu -39(%eax), %xmm1 + movdqu -39(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(23bytes): + mov $-23, %ebx + movdqu -23(%eax), %xmm1 + movdqu -23(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN + + ALIGN (4) +L(56bytes): + movdqu -56(%eax), %xmm1 + movdqu -56(%edx), %xmm2 + mov $-56, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(40bytes): + mov $-40, %ebx + movdqu -40(%eax), %xmm1 + movdqu -40(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(24bytes): + mov $-24, %ebx + movdqu -24(%eax), %xmm1 + movdqu -24(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -8(%eax), %ecx + mov -8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -4(%eax), %ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN + + ALIGN (4) +L(57bytes): + movdqu -57(%eax), %xmm1 + movdqu -57(%edx), %xmm2 + mov $-57, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(41bytes): + mov $-41, %ebx + movdqu -41(%eax), %xmm1 + movdqu -41(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(25bytes): + mov $-25, %ebx + movdqu -25(%eax), %xmm1 + movdqu -25(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + ALIGN (4) +L(58bytes): + movdqu -58(%eax), %xmm1 + movdqu -58(%edx), %xmm2 + mov $-58, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(42bytes): + mov $-42, %ebx + movdqu -42(%eax), %xmm1 + movdqu -42(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(26bytes): + mov $-26, %ebx + movdqu -26(%eax), %xmm1 + movdqu -26(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + ALIGN (4) +L(59bytes): + movdqu -59(%eax), %xmm1 + movdqu -59(%edx), %xmm2 + mov $-59, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(43bytes): + mov $-43, %ebx + movdqu -43(%eax), %xmm1 + movdqu -43(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(27bytes): + mov $-27, %ebx + movdqu -27(%eax), %xmm1 + movdqu -27(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc 
L(less16bytes) + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN + + ALIGN (4) +L(60bytes): + movdqu -60(%eax), %xmm1 + movdqu -60(%edx), %xmm2 + mov $-60, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(44bytes): + mov $-44, %ebx + movdqu -44(%eax), %xmm1 + movdqu -44(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(28bytes): + mov $-28, %ebx + movdqu -28(%eax), %xmm1 + movdqu -28(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -12(%eax), %ecx + mov -12(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -8(%eax), %ecx + mov -8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -4(%eax), %ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN + + ALIGN (4) +L(61bytes): + movdqu -61(%eax), %xmm1 + movdqu -61(%edx), %xmm2 + mov $-61, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(45bytes): + mov $-45, %ebx + movdqu -45(%eax), %xmm1 + movdqu -45(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(29bytes): + mov $-29, %ebx + movdqu -29(%eax), %xmm1 + movdqu -29(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + ALIGN (4) +L(62bytes): + movdqu -62(%eax), %xmm1 + movdqu -62(%edx), %xmm2 + mov $-62, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(46bytes): + mov $-46, %ebx + movdqu -46(%eax), %xmm1 + movdqu -46(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(30bytes): + mov $-30, %ebx + movdqu -30(%eax), %xmm1 + movdqu -30(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + ALIGN (4) +L(63bytes): + movdqu -63(%eax), %xmm1 + movdqu -63(%edx), %xmm2 + mov $-63, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(47bytes): + mov $-47, %ebx + movdqu -47(%eax), %xmm1 + movdqu -47(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(31bytes): + mov $-31, %ebx + movdqu -31(%eax), %xmm1 + movdqu -31(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN + + ALIGN (4) 
+L(64bytes): + movdqu -64(%eax), %xmm1 + movdqu -64(%edx), %xmm2 + mov $-64, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(48bytes): + movdqu -48(%eax), %xmm1 + movdqu -48(%edx), %xmm2 + mov $-48, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(32bytes): + movdqu -32(%eax), %xmm1 + movdqu -32(%edx), %xmm2 + mov $-32, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -16(%eax), %ecx + mov -16(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -12(%eax), %ecx + mov -12(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -8(%eax), %ecx + mov -8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -4(%eax), %ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN + + ALIGN (4) +L(less16bytes): + add %ebx, %eax + add %ebx, %edx + + mov (%eax), %ecx + mov (%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 4(%eax), %ecx + mov 4(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 8(%eax), %ecx + mov 8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 12(%eax), %ecx + mov 12(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN + + ALIGN (4) +L(find_diff): + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + shr $16,%ecx + shr $16,%ebx + cmp %bl, %cl + jne L(end) + cmp %bx, %cx +L(end): + POP (%ebx) + mov $1, %eax + ja L(bigger) + neg %eax +L(bigger): + ret +END (MEMCMP) + + .section .rodata.sse4.2,"a",@progbits + ALIGN (2) + .type L(table_64bytes), @object +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(1bytes), L(table_64bytes)) + .int JMPTBL (L(2bytes), L(table_64bytes)) + .int JMPTBL (L(3bytes), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(5bytes), L(table_64bytes)) + .int JMPTBL (L(6bytes), L(table_64bytes)) + .int JMPTBL (L(7bytes), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(9bytes), L(table_64bytes)) + .int JMPTBL (L(10bytes), L(table_64bytes)) + .int JMPTBL (L(11bytes), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(13bytes), L(table_64bytes)) + .int JMPTBL (L(14bytes), L(table_64bytes)) + .int JMPTBL (L(15bytes), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(17bytes), L(table_64bytes)) + .int JMPTBL (L(18bytes), L(table_64bytes)) + .int JMPTBL (L(19bytes), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(21bytes), L(table_64bytes)) + .int JMPTBL (L(22bytes), L(table_64bytes)) + .int JMPTBL (L(23bytes), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(25bytes), L(table_64bytes)) + .int JMPTBL (L(26bytes), L(table_64bytes)) + .int JMPTBL (L(27bytes), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(29bytes), L(table_64bytes)) + .int JMPTBL (L(30bytes), L(table_64bytes)) + .int JMPTBL (L(31bytes), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(33bytes), L(table_64bytes)) + .int JMPTBL (L(34bytes), L(table_64bytes)) + .int JMPTBL (L(35bytes), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(37bytes), L(table_64bytes)) + .int JMPTBL (L(38bytes), L(table_64bytes)) + .int JMPTBL (L(39bytes), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(41bytes), L(table_64bytes)) + .int JMPTBL (L(42bytes), L(table_64bytes)) + .int JMPTBL (L(43bytes), L(table_64bytes)) + .int JMPTBL (L(44bytes), 
L(table_64bytes)) + .int JMPTBL (L(45bytes), L(table_64bytes)) + .int JMPTBL (L(46bytes), L(table_64bytes)) + .int JMPTBL (L(47bytes), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(49bytes), L(table_64bytes)) + .int JMPTBL (L(50bytes), L(table_64bytes)) + .int JMPTBL (L(51bytes), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(53bytes), L(table_64bytes)) + .int JMPTBL (L(54bytes), L(table_64bytes)) + .int JMPTBL (L(55bytes), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(57bytes), L(table_64bytes)) + .int JMPTBL (L(58bytes), L(table_64bytes)) + .int JMPTBL (L(59bytes), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(61bytes), L(table_64bytes)) + .int JMPTBL (L(62bytes), L(table_64bytes)) + .int JMPTBL (L(63bytes), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) + .size L(table_64bytes), .-L(table_64bytes) +#endif diff --git a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S new file mode 100644 index 0000000000..d2f852f726 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S @@ -0,0 +1,1966 @@ +/* memcmp with SSSE3 + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#ifndef NOT_IN_libc + +#include <sysdep.h> +#include "asm-syntax.h" + +#ifndef MEMCMP +# define MEMCMP __memcmp_ssse3 +#endif + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#define PARMS 4 +#define BLK1 PARMS +#define BLK2 BLK1+4 +#define LEN BLK2+4 +#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret +#define RETURN RETURN_END; cfi_restore_state; cfi_remember_state + + .section .text.ssse3,"ax",@progbits +ENTRY (MEMCMP) + movl LEN(%esp), %ecx + movl BLK1(%esp), %eax + cmp $48, %ecx + movl BLK2(%esp), %edx + jae L(48bytesormore) + cmp $1, %ecx + jbe L(less1bytes) + PUSH (%ebx) + add %ecx, %edx + add %ecx, %eax + jmp L(less48bytes) + + ALIGN (4) + CFI_POP (%ebx) +L(less1bytes): + jb L(zero) + movb (%eax), %cl + cmp (%edx), %cl + je L(zero) + mov $1, %eax + ja L(1bytesend) + neg %eax +L(1bytesend): + ret + + ALIGN (4) +L(zero): + mov $0, %eax + ret + + ALIGN (4) +L(48bytesormore): + PUSH (%ebx) + PUSH (%esi) + PUSH (%edi) + cfi_remember_state + movdqu (%eax), %xmm3 + movdqu (%edx), %xmm0 + movl %eax, %edi + movl %edx, %esi + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + lea 16(%edi), %edi + + sub $0xffff, %edx + lea 16(%esi), %esi + jnz L(less16bytes) + mov %edi, %edx + and $0xf, %edx + xor %edx, %edi + sub %edx, %esi + add %edx, %ecx + mov %esi, %edx + and $0xf, %edx + jz L(shr_0) + xor %edx, %esi + + cmp $8, %edx + jae L(next_unaligned_table) + cmp $0, %edx + je L(shr_0) + cmp $1, %edx + je L(shr_1) + cmp $2, %edx + je L(shr_2) + cmp $3, %edx + je L(shr_3) + cmp $4, %edx + je L(shr_4) + cmp $5, %edx + je L(shr_5) + cmp $6, %edx + je L(shr_6) + jmp L(shr_7) + + ALIGN (4) +L(next_unaligned_table): + cmp $8, %edx + je L(shr_8) + cmp $9, %edx + je L(shr_9) + cmp $10, %edx + je L(shr_10) + cmp $11, %edx + je L(shr_11) + cmp $12, %edx + je L(shr_12) + cmp $13, %edx + je L(shr_13) + cmp $14, %edx + je L(shr_14) + jmp L(shr_15) + + ALIGN (4) +L(shr_0): + cmp $80, %ecx + jae L(shr_0_gobble) + lea -48(%ecx), %ecx + xor %eax, %eax + movaps (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + movaps 16(%esi), %xmm2 + pcmpeqb 16(%edi), %xmm2 + pand %xmm1, %xmm2 + pmovmskb %xmm2, %edx + add $32, %edi + add $32, %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea (%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_0_gobble): + lea -48(%ecx), %ecx + movdqa (%esi), %xmm0 + xor %eax, %eax + pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm2 + pcmpeqb 16(%edi), %xmm2 +L(shr_0_gobble_loop): + pand %xmm0, %xmm2 + sub $32, %ecx + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + movdqa 32(%esi), %xmm0 + movdqa 48(%esi), %xmm2 + sbb $0xffff, %edx + pcmpeqb 32(%edi), %xmm0 + pcmpeqb 48(%edi), %xmm2 + lea 32(%edi), %edi + lea 32(%esi), %esi + jz L(shr_0_gobble_loop) + + pand %xmm0, %xmm2 + cmp $0, %ecx + jge L(shr_0_gobble_loop_next) + inc %edx + add $32, %ecx +L(shr_0_gobble_loop_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea (%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_1): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_1_gobble) + + movdqa 16(%esi), %xmm1 
+ movdqa %xmm1, %xmm2 + palignr $1,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $1,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 1(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_1_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $1,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $1,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_1_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $1,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $1,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_1_gobble_loop) + + cmp $0, %ecx + jge L(shr_1_gobble_next) + inc %edx + add $32, %ecx +L(shr_1_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 1(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_2): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_2_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $2,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $2,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 2(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_2_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $2,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $2,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_2_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $2,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $2,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_2_gobble_loop) + + cmp $0, %ecx + jge L(shr_2_gobble_next) + inc %edx + add $32, %ecx +L(shr_2_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 2(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_3): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_3_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $3,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $3,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 3(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_3_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $3,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + 
movdqa 32(%esi), %xmm3 + palignr $3,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_3_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $3,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $3,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_3_gobble_loop) + + cmp $0, %ecx + jge L(shr_3_gobble_next) + inc %edx + add $32, %ecx +L(shr_3_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 3(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_4): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_4_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $4,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $4,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 4(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_4_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $4,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $4,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_4_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $4,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $4,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_4_gobble_loop) + + cmp $0, %ecx + jge L(shr_4_gobble_next) + inc %edx + add $32, %ecx +L(shr_4_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 4(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_5): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_5_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $5,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $5,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 5(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_5_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $5,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $5,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_5_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $5,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $5,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_5_gobble_loop) + + cmp $0, %ecx + jge L(shr_5_gobble_next) + inc %edx + add $32, %ecx +L(shr_5_gobble_next): + test %edx, %edx 
+ jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 5(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_6): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_6_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $6,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $6,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 6(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_6_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $6,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $6,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_6_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $6,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $6,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_6_gobble_loop) + + cmp $0, %ecx + jge L(shr_6_gobble_next) + inc %edx + add $32, %ecx +L(shr_6_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 6(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_7): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_7_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $7,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $7,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 7(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_7_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $7,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $7,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_7_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $7,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $7,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_7_gobble_loop) + + cmp $0, %ecx + jge L(shr_7_gobble_next) + inc %edx + add $32, %ecx +L(shr_7_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 7(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_8): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_8_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $8,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $8,%xmm2, %xmm3 + pcmpeqb 
16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 8(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_8_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $8,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $8,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_8_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $8,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $8,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_8_gobble_loop) + + cmp $0, %ecx + jge L(shr_8_gobble_next) + inc %edx + add $32, %ecx +L(shr_8_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 8(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_9): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_9_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $9,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $9,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 9(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_9_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $9,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $9,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_9_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $9,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $9,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_9_gobble_loop) + + cmp $0, %ecx + jge L(shr_9_gobble_next) + inc %edx + add $32, %ecx +L(shr_9_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 9(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_10): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_10_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $10, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $10,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 10(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_10_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $10, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $10, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_10_gobble_loop): + pand %xmm0, %xmm3 + sub 
$32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $10,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $10,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_10_gobble_loop) + + cmp $0, %ecx + jge L(shr_10_gobble_next) + inc %edx + add $32, %ecx +L(shr_10_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 10(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_11): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_11_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $11, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $11, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 11(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_11_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $11, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $11, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_11_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $11,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $11,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_11_gobble_loop) + + cmp $0, %ecx + jge L(shr_11_gobble_next) + inc %edx + add $32, %ecx +L(shr_11_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 11(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_12): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_12_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $12, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $12, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 12(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_12_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $12, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $12, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_12_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $12,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $12,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_12_gobble_loop) + + cmp $0, %ecx + jge L(shr_12_gobble_next) + inc %edx + add $32, %ecx +L(shr_12_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + 
lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 12(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_13): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_13_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $13, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $13, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 13(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_13_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $13, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $13, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_13_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $13,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $13,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_13_gobble_loop) + + cmp $0, %ecx + jge L(shr_13_gobble_next) + inc %edx + add $32, %ecx +L(shr_13_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 13(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_14): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_14_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $14, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $14, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 14(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_14_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $14, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $14, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_14_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $14,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $14,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_14_gobble_loop) + + cmp $0, %ecx + jge L(shr_14_gobble_next) + inc %edx + add $32, %ecx +L(shr_14_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 14(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_15): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_15_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $15, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $15, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + 
pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 15(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shr_15_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $15, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $15, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_15_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $15,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $15,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_15_gobble_loop) + + cmp $0, %ecx + jge L(shr_15_gobble_next) + inc %edx + add $32, %ecx +L(shr_15_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 15(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(exit): + pmovmskb %xmm1, %ebx + sub $0xffff, %ebx + jz L(first16bytes) + lea -16(%esi), %esi + lea -16(%edi), %edi + mov %ebx, %edx +L(first16bytes): + add %eax, %esi +L(less16bytes): + test %dl, %dl + jz L(next_24_bytes) + + test $0x01, %dl + jnz L(Byte16) + + test $0x02, %dl + jnz L(Byte17) + + test $0x04, %dl + jnz L(Byte18) + + test $0x08, %dl + jnz L(Byte19) + + test $0x10, %dl + jnz L(Byte20) + + test $0x20, %dl + jnz L(Byte21) + + test $0x40, %dl + jnz L(Byte22) +L(Byte23): + movzbl -9(%edi), %eax + movzbl -9(%esi), %edx + sub %edx, %eax + RETURN + + ALIGN (4) +L(Byte16): + movzbl -16(%edi), %eax + movzbl -16(%esi), %edx + sub %edx, %eax + RETURN + + ALIGN (4) +L(Byte17): + movzbl -15(%edi), %eax + movzbl -15(%esi), %edx + sub %edx, %eax + RETURN + + ALIGN (4) +L(Byte18): + movzbl -14(%edi), %eax + movzbl -14(%esi), %edx + sub %edx, %eax + RETURN + + ALIGN (4) +L(Byte19): + movzbl -13(%edi), %eax + movzbl -13(%esi), %edx + sub %edx, %eax + RETURN + + ALIGN (4) +L(Byte20): + movzbl -12(%edi), %eax + movzbl -12(%esi), %edx + sub %edx, %eax + RETURN + + ALIGN (4) +L(Byte21): + movzbl -11(%edi), %eax + movzbl -11(%esi), %edx + sub %edx, %eax + RETURN + + ALIGN (4) +L(Byte22): + movzbl -10(%edi), %eax + movzbl -10(%esi), %edx + sub %edx, %eax + RETURN + + ALIGN (4) +L(next_24_bytes): + lea 8(%edi), %edi + lea 8(%esi), %esi + test $0x01, %dh + jnz L(Byte16) + + test $0x02, %dh + jnz L(Byte17) + + test $0x04, %dh + jnz L(Byte18) + + test $0x08, %dh + jnz L(Byte19) + + test $0x10, %dh + jnz L(Byte20) + + test $0x20, %dh + jnz L(Byte21) + + test $0x40, %dh + jnz L(Byte22) + + ALIGN (4) +L(Byte31): + movzbl -9(%edi), %eax + movzbl -9(%esi), %edx + sub %edx, %eax + RETURN_END + + CFI_PUSH (%ebx) + ALIGN (4) +L(more8bytes): + cmp $16, %ecx + jae L(more16bytes) + cmp $8, %ecx + je L(8bytes) + cmp $9, %ecx + je L(9bytes) + cmp $10, %ecx + je L(10bytes) + cmp $11, %ecx + je L(11bytes) + cmp $12, %ecx + je L(12bytes) + cmp $13, %ecx + je L(13bytes) + cmp $14, %ecx + je L(14bytes) + jmp L(15bytes) + + ALIGN (4) +L(more16bytes): + cmp $24, %ecx + jae L(more24bytes) + cmp $16, %ecx + je L(16bytes) + cmp $17, %ecx + je L(17bytes) + cmp $18, %ecx + je L(18bytes) + cmp $19, %ecx + je L(19bytes) + cmp $20, %ecx + je L(20bytes) + cmp $21, %ecx + je L(21bytes) + cmp $22, %ecx + je L(22bytes) + jmp 
L(23bytes) + + ALIGN (4) +L(more24bytes): + cmp $32, %ecx + jae L(more32bytes) + cmp $24, %ecx + je L(24bytes) + cmp $25, %ecx + je L(25bytes) + cmp $26, %ecx + je L(26bytes) + cmp $27, %ecx + je L(27bytes) + cmp $28, %ecx + je L(28bytes) + cmp $29, %ecx + je L(29bytes) + cmp $30, %ecx + je L(30bytes) + jmp L(31bytes) + + ALIGN (4) +L(more32bytes): + cmp $40, %ecx + jae L(more40bytes) + cmp $32, %ecx + je L(32bytes) + cmp $33, %ecx + je L(33bytes) + cmp $34, %ecx + je L(34bytes) + cmp $35, %ecx + je L(35bytes) + cmp $36, %ecx + je L(36bytes) + cmp $37, %ecx + je L(37bytes) + cmp $38, %ecx + je L(38bytes) + jmp L(39bytes) + + ALIGN (4) +L(more40bytes): + cmp $40, %ecx + je L(40bytes) + cmp $41, %ecx + je L(41bytes) + cmp $42, %ecx + je L(42bytes) + cmp $43, %ecx + je L(43bytes) + cmp $44, %ecx + je L(44bytes) + cmp $45, %ecx + je L(45bytes) + cmp $46, %ecx + je L(46bytes) + jmp L(47bytes) + + ALIGN (4) +L(less48bytes): + cmp $8, %ecx + jae L(more8bytes) + cmp $2, %ecx + je L(2bytes) + cmp $3, %ecx + je L(3bytes) + cmp $4, %ecx + je L(4bytes) + cmp $5, %ecx + je L(5bytes) + cmp $6, %ecx + je L(6bytes) + jmp L(7bytes) + + ALIGN (4) +L(44bytes): + mov -44(%eax), %ecx + mov -44(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(40bytes): + mov -40(%eax), %ecx + mov -40(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(36bytes): + mov -36(%eax), %ecx + mov -36(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(32bytes): + mov -32(%eax), %ecx + mov -32(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(28bytes): + mov -28(%eax), %ecx + mov -28(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(24bytes): + mov -24(%eax), %ecx + mov -24(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(20bytes): + mov -20(%eax), %ecx + mov -20(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(16bytes): + mov -16(%eax), %ecx + mov -16(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + mov -12(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + mov -8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + POP (%ebx) + ret + CFI_PUSH (%ebx) + + ALIGN (4) +L(45bytes): + mov -45(%eax), %ecx + mov -45(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(41bytes): + mov -41(%eax), %ecx + mov -41(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(37bytes): + mov -37(%eax), %ecx + mov -37(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(33bytes): + mov -33(%eax), %ecx + mov -33(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(29bytes): + mov -29(%eax), %ecx + mov -29(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(25bytes): + mov -25(%eax), %ecx + mov -25(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(21bytes): + mov -21(%eax), %ecx + mov -21(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(17bytes): + mov -17(%eax), %ecx + mov -17(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(13bytes): + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(9bytes): + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(5bytes): + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + POP (%ebx) + ret + CFI_PUSH (%ebx) + + ALIGN (4) +L(46bytes): + mov -46(%eax), %ecx + mov -46(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(42bytes): + mov -42(%eax), %ecx + mov -42(%edx), %ebx + cmp %ebx, %ecx + jne 
L(find_diff) +L(38bytes): + mov -38(%eax), %ecx + mov -38(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(34bytes): + mov -34(%eax), %ecx + mov -34(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(30bytes): + mov -30(%eax), %ecx + mov -30(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(26bytes): + mov -26(%eax), %ecx + mov -26(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(22bytes): + mov -22(%eax), %ecx + mov -22(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(18bytes): + mov -18(%eax), %ecx + mov -18(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(14bytes): + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(10bytes): + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(6bytes): + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(2bytes): + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + POP (%ebx) + ret + CFI_PUSH (%ebx) + + ALIGN (4) +L(47bytes): + movl -47(%eax), %ecx + movl -47(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(43bytes): + movl -43(%eax), %ecx + movl -43(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(39bytes): + movl -39(%eax), %ecx + movl -39(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(35bytes): + movl -35(%eax), %ecx + movl -35(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(31bytes): + movl -31(%eax), %ecx + movl -31(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(27bytes): + movl -27(%eax), %ecx + movl -27(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(23bytes): + movl -23(%eax), %ecx + movl -23(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(19bytes): + movl -19(%eax), %ecx + movl -19(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(15bytes): + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(11bytes): + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(7bytes): + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(3bytes): + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + POP (%ebx) + ret + CFI_PUSH (%ebx) + + ALIGN (4) +L(find_diff): + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + shr $16,%ecx + shr $16,%ebx + cmp %bl, %cl + jne L(end) + cmp %bx, %cx +L(end): + POP (%ebx) + mov $1, %eax + ja L(bigger) + neg %eax +L(bigger): + ret + +END (MEMCMP) + +#endif diff --git a/sysdeps/i386/i686/multiarch/memcmp.S b/sysdeps/i386/i686/multiarch/memcmp.S new file mode 100644 index 0000000000..cf606a5959 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/memcmp.S @@ -0,0 +1,88 @@ +/* Multiple versions of memcmp + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ +#ifndef NOT_IN_libc +# ifdef SHARED + .text +ENTRY(memcmp) + .type memcmp, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __memcmp_ia32@GOTOFF(%ebx), %eax + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __memcmp_ssse3@GOTOFF(%ebx), %eax + testl $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __memcmp_sse4_2@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + ret +END(memcmp) +# else + .text +ENTRY(memcmp) + .type memcmp, @gnu_indirect_function + cmpl $0, KIND_OFFSET+__cpu_features + jne 1f + call __init_cpu_features +1: leal __memcmp_ia32, %eax + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features + jz 2f + leal __memcmp_ssse3, %eax + testl $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features + jz 2f + leal __memcmp_sse4_2, %eax +2: ret +END(memcmp) +# endif + +# undef ENTRY +# define ENTRY(name) \ + .type __memcmp_ia32, @function; \ + .p2align 4; \ + __memcmp_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memcmp_ia32, .-__memcmp_ia32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memcmp; __GI_memcmp = __memcmp_ia32 +# endif +#endif + +#include "../memcmp.S" diff --git a/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/sysdeps/i386/i686/multiarch/strcmp-sse4.S new file mode 100644 index 0000000000..d5fd23e15c --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strcmp-sse4.S @@ -0,0 +1,378 @@ +/* strcmp with SSE4.2 + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
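[Editorial aside: the resolver in multiarch/memcmp.S above runs once and hands back the address of the best implementation for the running CPU; note that the SSE4.2 candidate is only considered after the SSSE3 test passes, because the checks fall through. A hedged C analogue of the selection logic (the two flag parameters stand in for the __cpu_features bit tests and are illustrative only):

#include <stddef.h>

extern int __memcmp_ia32 (const void *, const void *, size_t);
extern int __memcmp_ssse3 (const void *, const void *, size_t);
extern int __memcmp_sse4_2 (const void *, const void *, size_t);

typedef int (*memcmp_fn) (const void *, const void *, size_t);

static memcmp_fn
select_memcmp (int has_ssse3, int has_sse4_2)
{
  memcmp_fn fn = __memcmp_ia32;      /* baseline, always available */
  if (has_ssse3)
    {
      fn = __memcmp_ssse3;           /* SSSE3 beats the generic code */
      if (has_sse4_2)
        fn = __memcmp_sse4_2;        /* SSE4.2 beats SSSE3 */
    }
  return fn;
}
]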
*/ + +#ifndef NOT_IN_libc + +#include <sysdep.h> +#include "asm-syntax.h" + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifndef USE_AS_STRNCMP +# ifndef STRCMP +# define STRCMP __strcmp_sse4_2 +# endif +# define STR1 4 +# define STR2 STR1+4 +# define RETURN ret; .p2align 4 +#else +# ifndef STRCMP +# define STRCMP __strncmp_sse4_2 +# endif +# define STR1 8 +# define STR2 STR1+4 +# define CNT STR2+4 +# define RETURN POP (%ebp); ret; .p2align 4; CFI_PUSH (%ebp) +#endif + + .section .text.sse4.2,"ax",@progbits +ENTRY (STRCMP) +#ifdef USE_AS_STRNCMP + PUSH (%ebp) +#endif + mov STR1(%esp), %edx + mov STR2(%esp), %eax +#ifdef USE_AS_STRNCMP + movl CNT(%esp), %ebp + test %ebp, %ebp + je L(eq) +#endif + mov %dx, %cx + and $0xfff, %cx + cmp $0xff0, %cx + ja L(first4bytes) + movdqu (%edx), %xmm2 + mov %eax, %ecx + and $0xfff, %ecx + cmp $0xff0, %ecx + ja L(first4bytes) + movd %xmm2, %ecx + cmp (%eax), %ecx + jne L(less4bytes) + movdqu (%eax), %xmm1 + pxor %xmm2, %xmm1 + pxor %xmm0, %xmm0 + ptest %xmm1, %xmm0 + jnc L(less16bytes) + pcmpeqb %xmm0, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + +#ifdef USE_AS_STRNCMP + sub $16, %ebp + jbe L(eq) +#endif + add $16, %edx + add $16, %eax +L(first4bytes): + movzbl (%eax), %ecx + cmpb %cl, (%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#ifdef USE_AS_STRNCMP + cmp $1, %ebp + je L(eq) +#endif + + movzbl 1(%eax), %ecx + cmpb %cl, 1(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#ifdef USE_AS_STRNCMP + cmp $2, %ebp + je L(eq) +#endif + movzbl 2(%eax), %ecx + cmpb %cl, 2(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#ifdef USE_AS_STRNCMP + cmp $3, %ebp + je L(eq) +#endif + movzbl 3(%eax), %ecx + cmpb %cl, 3(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#ifdef USE_AS_STRNCMP + cmp $4, %ebp + je L(eq) +#endif + movzbl 4(%eax), %ecx + cmpb %cl, 4(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#ifdef USE_AS_STRNCMP + cmp $5, %ebp + je L(eq) +#endif + movzbl 5(%eax), %ecx + cmpb %cl, 5(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#ifdef USE_AS_STRNCMP + cmp $6, %ebp + je L(eq) +#endif + movzbl 6(%eax), %ecx + cmpb %cl, 6(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#ifdef USE_AS_STRNCMP + cmp $7, %ebp + je L(eq) +#endif + movzbl 7(%eax), %ecx + cmpb %cl, 7(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#ifdef USE_AS_STRNCMP + sub $8, %ebp + je L(eq) +#endif + add $8, %eax + add $8, %edx + + PUSH (%ebx) + PUSH (%edi) + PUSH (%esi) + cfi_remember_state + mov %edx, %edi + mov %eax, %esi + xorl %eax, %eax +L(check_offset): + movl %edi, %ebx + movl %esi, %ecx + andl $0xfff, %ebx + andl $0xfff, %ecx + cmpl %ebx, %ecx + cmovl %ebx, %ecx + lea -0xff0(%ecx), %edx + sub %edx, %edi + sub %edx, %esi + testl %edx, %edx + jg L(crosspage) +L(loop): + movdqu (%esi,%edx), %xmm2 + movdqu (%edi,%edx), %xmm1 + pcmpistri $0x1a, %xmm2, %xmm1 + jbe L(end) + +#ifdef USE_AS_STRNCMP + sub $16, %ebp + jbe L(more16byteseq) +#endif + + add $16, %edx + jle L(loop) +L(crosspage): + movzbl (%edi,%edx), %eax + movzbl (%esi,%edx), %ebx + subl %ebx, %eax + jne L(ret) + testl %ebx, %ebx + je L(ret) +#ifdef USE_AS_STRNCMP + sub $1, %ebp + jbe L(more16byteseq) +#endif + inc %edx + cmp $15, %edx + jle L(crosspage) + add $16, %edi + add $16, %esi + jmp L(check_offset) + + .p2align 4 +L(end): + jnc L(ret) +#ifdef USE_AS_STRNCMP + sub %ecx, %ebp + jbe L(more16byteseq) +#endif + 
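[Editorial aside: the hot loop above rides on pcmpistri $0x1a: one instruction scans the 16-byte pair for the first position that differs or terminates a string, leaves that index in %ecx, and reports "difference found" / "string ended" through CF/ZF (hence the jbe out of the loop and the jnc at L(end)). A scalar C model of a single step, illustrative only; the equal-each/negative-polarity encoding of the immediate is folded into a plain comparison:

/* Returns 1 and sets *idx when this 16-byte chunk pair decides the
   comparison; returns 0 when all 16 bytes match and none is NUL.  */
static int
chunk_step (const unsigned char a[16], const unsigned char b[16], int *idx)
{
  for (int i = 0; i < 16; i++)
    if (a[i] != b[i] || a[i] == 0)
      {
        *idx = i;          /* first difference or terminator */
        return 1;
      }
  return 0;                /* keep gobbling 16 bytes per iteration */
}
]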
lea (%ecx,%edx), %ebx + movzbl (%edi,%ebx), %eax + movzbl (%esi,%ebx), %ecx + subl %ecx, %eax +L(ret): + POP (%esi) + POP (%edi) + POP (%ebx) +#ifdef USE_AS_STRNCMP + POP (%ebp) +#endif + ret + + .p2align 4 + cfi_restore_state +#ifdef USE_AS_STRNCMP +L(more16byteseq): + POP (%esi) + POP (%edi) + POP (%ebx) +#endif +L(eq): + xorl %eax, %eax + RETURN + +L(neq): + mov $1, %eax + ja L(neq_bigger) + neg %eax +L(neq_bigger): + RETURN + +L(less16bytes): + add $0xfefefeff, %ecx + jnc L(less4bytes) + xor (%edx), %ecx + or $0xfefefeff, %ecx + add $1, %ecx + jnz L(less4bytes) + +#ifdef USE_AS_STRNCMP + cmp $4, %ebp + jbe L(eq) +#endif + mov 4(%edx), %ecx + cmp 4(%eax), %ecx + jne L(more4bytes) + add $0xfefefeff, %ecx + jnc L(more4bytes) + xor 4(%edx), %ecx + or $0xfefefeff, %ecx + add $1, %ecx + jnz L(more4bytes) + +#ifdef USE_AS_STRNCMP + sub $8, %ebp + jbe L(eq) +#endif + + add $8, %edx + add $8, %eax +L(less4bytes): + + movzbl (%eax), %ecx + cmpb %cl, (%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#ifdef USE_AS_STRNCMP + cmp $1, %ebp + je L(eq) +#endif + movzbl 1(%eax), %ecx + cmpb %cl, 1(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#ifdef USE_AS_STRNCMP + cmp $2, %ebp + je L(eq) +#endif + + movzbl 2(%eax), %ecx + cmpb %cl, 2(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#ifdef USE_AS_STRNCMP + cmp $3, %ebp + je L(eq) +#endif + movzbl 3(%eax), %ecx + cmpb %cl, 3(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +L(more4bytes): +#ifdef USE_AS_STRNCMP + cmp $4, %ebp + je L(eq) +#endif + movzbl 4(%eax), %ecx + cmpb %cl, 4(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + +#ifdef USE_AS_STRNCMP + cmp $5, %ebp + je L(eq) +#endif + movzbl 5(%eax), %ecx + cmpb %cl, 5(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#ifdef USE_AS_STRNCMP + cmp $6, %ebp + je L(eq) +#endif + movzbl 6(%eax), %ecx + cmpb %cl, 6(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#ifdef USE_AS_STRNCMP + cmp $7, %ebp + je L(eq) +#endif + movzbl 7(%eax), %ecx + cmpb %cl, 7(%edx) + jne L(neq) + jmp L(eq) + +END (STRCMP) + +#endif diff --git a/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S new file mode 100644 index 0000000000..40994c05b1 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S @@ -0,0 +1,2220 @@ +/* strcmp with SSSE3 + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
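[Editorial aside: the L(less16bytes) entry of strcmp-sse4.S above screens a dword with the classic 0xfefefeff carry trick before dropping to byte compares: the test may report false positives, but it never misses an embedded NUL, which is all the fast path needs. Roughly in C (an illustrative model, not glibc source):

/* Conservative: nonzero means "w may contain a zero byte".  */
static int
may_have_nul (unsigned int w)
{
  unsigned int t = w + 0xfefefeffu; /* carry escapes every nonzero byte */
  if (t >= w)                       /* no carry out of bit 31: the jnc path */
    return 1;
  t = (t ^ w) | 0xfefefeffu;        /* keep only the per-byte carry bits */
  return t + 1 != 0;                /* a missing carry marks a zero byte */
}
]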
*/ + +#ifndef NOT_IN_libc + +#include <sysdep.h> +#include "asm-syntax.h" + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifndef USE_AS_STRNCMP +# ifndef STRCMP +# define STRCMP __strcmp_ssse3 +# endif +# define STR1 4 +# define STR2 STR1+4 +# define RETURN ret; .p2align 4 +# define UPDATE_STRNCMP_COUNTER +#else +# ifndef STRCMP +# define STRCMP __strncmp_ssse3 +# endif +# define STR1 8 +# define STR2 STR1+4 +# define CNT STR2+4 +# define RETURN POP (%ebp); ret; .p2align 4; CFI_PUSH (%ebp) +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + mov $16, %esi; \ + sub %ecx, %esi; \ + cmp %esi, %ebp; \ + jbe L(more8byteseq); \ + sub %esi, %ebp +#endif + + .section .text.ssse3,"ax",@progbits +ENTRY (STRCMP) +#ifdef USE_AS_STRNCMP + PUSH (%ebp) +#endif + movl STR1(%esp), %edx + movl STR2(%esp), %eax +#ifdef USE_AS_STRNCMP + movl CNT(%esp), %ebp + cmp $16, %ebp + jb L(less16bytes_sncmp) + jmp L(more16bytes) +#endif + + movzbl (%eax), %ecx + cmpb %cl, (%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 1(%eax), %ecx + cmpb %cl, 1(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 2(%eax), %ecx + cmpb %cl, 2(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 3(%eax), %ecx + cmpb %cl, 3(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 4(%eax), %ecx + cmpb %cl, 4(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 5(%eax), %ecx + cmpb %cl, 5(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 6(%eax), %ecx + cmpb %cl, 6(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 7(%eax), %ecx + cmpb %cl, 7(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + add $8, %edx + add $8, %eax +#ifdef USE_AS_STRNCMP + cmp $8, %ebp + lea -8(%ebp), %ebp + je L(eq) +L(more16bytes): +#endif + movl %edx, %ecx + and $0xfff, %ecx + cmp $0xff0, %ecx + ja L(crosspage) + mov %eax, %ecx + and $0xfff, %ecx + cmp $0xff0, %ecx + ja L(crosspage) + pxor %xmm0, %xmm0 + movlpd (%eax), %xmm1 + movlpd (%edx), %xmm2 + movhpd 8(%eax), %xmm1 + movhpd 8(%edx), %xmm2 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %ecx + sub $0xffff, %ecx + jnz L(less16bytes) +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(eq) +#endif + add $16, %eax + add $16, %edx + +L(crosspage): + + PUSH (%ebx) + PUSH (%edi) + PUSH (%esi) +#ifdef USE_AS_STRNCMP + cfi_remember_state +#endif + + movl %edx, %edi + movl %eax, %ecx + and $0xf, %ecx + and $0xf, %edi + xor %ecx, %eax + xor %edi, %edx + xor %ebx, %ebx + cmp %edi, %ecx + je L(ashr_0) + ja L(bigger) + or $0x20, %ebx + xchg %edx, %eax + xchg %ecx, %edi +L(bigger): + lea 15(%edi), %edi + sub %ecx, %edi + cmp $8, %edi + jle L(ashr_less_8) + cmp $14, %edi + je L(ashr_15) + cmp $13, %edi + je L(ashr_14) + cmp $12, %edi + je L(ashr_13) + cmp $11, %edi + je L(ashr_12) + cmp $10, %edi + je L(ashr_11) + cmp $9, %edi + je L(ashr_10) +L(ashr_less_8): + je L(ashr_9) + cmp $7, %edi + je L(ashr_8) + cmp $6, %edi + je L(ashr_7) + cmp $5, %edi + je L(ashr_6) + cmp $4, %edi + je L(ashr_5) + cmp $3, %edi + je L(ashr_4) + cmp $2, %edi + je L(ashr_3) + cmp $1, %edi + je L(ashr_2) + cmp $0, %edi + je L(ashr_1) + +/* + * The following cases will be handled by ashr_0 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(0~15) n(0~15) 15(15+ n-n) ashr_0 + */ + .p2align 4 +L(ashr_0): + 
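[Editorial aside: UPDATE_STRNCMP_COUNTER above keeps the strncmp byte budget in %ebp consistent whenever a chunk is entered at offset %ecx: either the remaining count ends inside the current 16-byte chunk (branch to L(more8byteseq) and finish byte-wise) or the budget is reduced by the bytes this chunk covers. A hedged C model of the macro's bookkeeping (names invented for illustration):

static int
update_strncmp_counter (unsigned int *remaining, unsigned int offset)
{
  unsigned int in_chunk = 16 - offset;  /* the "mov $16; sub %ecx" part */
  if (*remaining <= in_chunk)
    return 1;                           /* take the L(more8byteseq) exit */
  *remaining -= in_chunk;
  return 0;
}
]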
mov $0xffff, %esi + movdqa (%eax), %xmm1 + pxor %xmm0, %xmm0 + pcmpeqb %xmm1, %xmm0 + pcmpeqb (%edx), %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + mov %ecx, %edi + jne L(less32bytes) + UPDATE_STRNCMP_COUNTER + mov $0x10, %ebx + mov $0x10, %ecx + pxor %xmm0, %xmm0 + .p2align 4 +L(loop_ashr_0): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + jmp L(loop_ashr_0) + +/* + * The following cases will be handled by ashr_1 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(15) n -15 0(15 +(n-15) - n) ashr_1 + */ + .p2align 4 +L(ashr_1): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $15, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -15(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $1, %ebx + lea 1(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_1): + add $16, %edi + jg L(nibble_ashr_1) + +L(gobble_ashr_1): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $1, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_1) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $1, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_1) + + .p2align 4 +L(nibble_ashr_1): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfffe, %esi + jnz L(ashr_1_exittail) + +#ifdef USE_AS_STRNCMP + cmp $15, %ebp + jbe L(ashr_1_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_1) + + .p2align 4 +L(ashr_1_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $1, %xmm0 + psrldq $1, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_2 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 + */ + .p2align 4 +L(ashr_2): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $14, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -14(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $2, %ebx + lea 2(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_2): + add $16, %edi + jg L(nibble_ashr_2) + +L(gobble_ashr_2): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $2, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi 
+ jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_2) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $2, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_2) + + .p2align 4 +L(nibble_ashr_2): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfffc, %esi + jnz L(ashr_2_exittail) + +#ifdef USE_AS_STRNCMP + cmp $14, %ebp + jbe L(ashr_2_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_2) + + .p2align 4 +L(ashr_2_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $2, %xmm0 + psrldq $2, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_3 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 + */ + .p2align 4 +L(ashr_3): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $13, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -13(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $3, %ebx + lea 3(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_3): + add $16, %edi + jg L(nibble_ashr_3) + +L(gobble_ashr_3): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $3, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_3) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $3, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_3) + + .p2align 4 +L(nibble_ashr_3): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfff8, %esi + jnz L(ashr_3_exittail) + +#ifdef USE_AS_STRNCMP + cmp $13, %ebp + jbe L(ashr_3_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_3) + + .p2align 4 +L(ashr_3_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $3, %xmm0 + psrldq $3, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_4 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 + */ + .p2align 4 +L(ashr_4): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $12, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -12(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $4, %ebx + lea 4(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + 
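[Editorial aside: each L(gobble_ashr_N) loop above synthesizes an unaligned 16-byte chunk of one string by merging the previous and the current aligned chunk with palignr $N, so both inputs can then be compared using aligned loads. Per byte, the concatenate-and-shift reduces to the following C (an illustrative model of PALIGNR, not library code):

/* Byte i of (concat (cur, prev) >> 8*shift), i.e. of palignr $shift.  */
static unsigned char
merged_byte (const unsigned char prev[16], const unsigned char cur[16],
             int shift, int i)
{
  return (i + shift < 16) ? prev[i + shift] : cur[i + shift - 16];
}
]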
.p2align 4 +L(loop_ashr_4): + add $16, %edi + jg L(nibble_ashr_4) + +L(gobble_ashr_4): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $4, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_4) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $4, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_4) + + .p2align 4 +L(nibble_ashr_4): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfff0, %esi + jnz L(ashr_4_exittail) + +#ifdef USE_AS_STRNCMP + cmp $12, %ebp + jbe L(ashr_4_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_4) + + .p2align 4 +L(ashr_4_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $4, %xmm0 + psrldq $4, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_5 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(11~15) n -11 4(15 +(n-11) - n) ashr_5 + */ + .p2align 4 +L(ashr_5): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $11, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -11(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $5, %ebx + lea 5(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_5): + add $16, %edi + jg L(nibble_ashr_5) + +L(gobble_ashr_5): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $5, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_5) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $5, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_5) + + .p2align 4 +L(nibble_ashr_5): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xffe0, %esi + jnz L(ashr_5_exittail) + +#ifdef USE_AS_STRNCMP + cmp $11, %ebp + jbe L(ashr_5_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_5) + + .p2align 4 +L(ashr_5_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $5, %xmm0 + psrldq $5, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_6 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(10~15) n -10 5(15 +(n-10) - n) ashr_6 + */ + + .p2align 4 +L(ashr_6): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $10, %xmm2 + pcmpeqb %xmm1, 
%xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -10(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $6, %ebx + lea 6(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_6): + add $16, %edi + jg L(nibble_ashr_6) + +L(gobble_ashr_6): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $6, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_6) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $6, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_6) + + .p2align 4 +L(nibble_ashr_6): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xffc0, %esi + jnz L(ashr_6_exittail) + +#ifdef USE_AS_STRNCMP + cmp $10, %ebp + jbe L(ashr_6_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_6) + + .p2align 4 +L(ashr_6_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $6, %xmm0 + psrldq $6, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_7 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(9~15) n - 9 6(15 +(n-9) - n) ashr_7 + */ + + .p2align 4 +L(ashr_7): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $9, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -9(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $7, %ebx + lea 8(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_7): + add $16, %edi + jg L(nibble_ashr_7) + +L(gobble_ashr_7): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $7, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_7) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $7, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_7) + + .p2align 4 +L(nibble_ashr_7): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xff80, %esi + jnz L(ashr_7_exittail) + +#ifdef USE_AS_STRNCMP + cmp $9, %ebp + jbe L(ashr_7_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_7) + + .p2align 4 +L(ashr_7_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $7, %xmm0 + psrldq $7, %xmm3 + jmp L(aftertail) + +/* + * The following cases 
will be handled by ashr_8 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(8~15) n - 8 7(15 +(n-8) - n) ashr_8 + */ + .p2align 4 +L(ashr_8): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $8, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -8(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $8, %ebx + lea 8(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_8): + add $16, %edi + jg L(nibble_ashr_8) + +L(gobble_ashr_8): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $8, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_8) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $8, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_8) + + .p2align 4 +L(nibble_ashr_8): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xff00, %esi + jnz L(ashr_8_exittail) + +#ifdef USE_AS_STRNCMP + cmp $8, %ebp + jbe L(ashr_8_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_8) + + .p2align 4 +L(ashr_8_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $8, %xmm0 + psrldq $8, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_9 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(7~15) n - 7 8(15 +(n-7) - n) ashr_9 + */ + .p2align 4 +L(ashr_9): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $7, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -7(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $9, %ebx + lea 9(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_9): + add $16, %edi + jg L(nibble_ashr_9) + +L(gobble_ashr_9): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $9, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_9) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $9, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_9) + + .p2align 4 +L(nibble_ashr_9): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfe00, %esi + 
jnz L(ashr_9_exittail) + +#ifdef USE_AS_STRNCMP + cmp $7, %ebp + jbe L(ashr_9_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_9) + + .p2align 4 +L(ashr_9_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $9, %xmm0 + psrldq $9, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_10 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(6~15) n - 6 9(15 +(n-6) - n) ashr_10 + */ + .p2align 4 +L(ashr_10): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $6, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -6(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $10, %ebx + lea 10(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_10): + add $16, %edi + jg L(nibble_ashr_10) + +L(gobble_ashr_10): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $10, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_10) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $10, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_10) + + .p2align 4 +L(nibble_ashr_10): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfc00, %esi + jnz L(ashr_10_exittail) + +#ifdef USE_AS_STRNCMP + cmp $6, %ebp + jbe L(ashr_10_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_10) + + .p2align 4 +L(ashr_10_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $10, %xmm0 + psrldq $10, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_11 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(5~15) n - 5 10(15 +(n-5) - n) ashr_11 + */ + .p2align 4 +L(ashr_11): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $5, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -5(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $11, %ebx + lea 11(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_11): + add $16, %edi + jg L(nibble_ashr_11) + +L(gobble_ashr_11): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $11, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_11) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $11, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb 
%xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_11) + + .p2align 4 +L(nibble_ashr_11): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xf800, %esi + jnz L(ashr_11_exittail) + +#ifdef USE_AS_STRNCMP + cmp $5, %ebp + jbe L(ashr_11_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_11) + + .p2align 4 +L(ashr_11_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $11, %xmm0 + psrldq $11, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_12 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(4~15) n - 4 11(15 +(n-4) - n) ashr_12 + */ + .p2align 4 +L(ashr_12): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $4, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -4(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $12, %ebx + lea 12(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_12): + add $16, %edi + jg L(nibble_ashr_12) + +L(gobble_ashr_12): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $12, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_12) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $12, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_12) + + .p2align 4 +L(nibble_ashr_12): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xf000, %esi + jnz L(ashr_12_exittail) + +#ifdef USE_AS_STRNCMP + cmp $4, %ebp + jbe L(ashr_12_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_12) + + .p2align 4 +L(ashr_12_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $12, %xmm0 + psrldq $12, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_13 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(3~15) n - 3 12(15 +(n-3) - n) ashr_13 + */ + .p2align 4 +L(ashr_13): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $3, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -3(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $13, %ebx + lea 13(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_13): + add $16, %edi + jg L(nibble_ashr_13) + +L(gobble_ashr_13): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $13, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef 
USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_13) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $13, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_13) + + .p2align 4 +L(nibble_ashr_13): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xe000, %esi + jnz L(ashr_13_exittail) + +#ifdef USE_AS_STRNCMP + cmp $3, %ebp + jbe L(ashr_13_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_13) + + .p2align 4 +L(ashr_13_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $13, %xmm0 + psrldq $13, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_14 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(2~15) n - 2 13(15 +(n-2) - n) ashr_14 + */ + .p2align 4 +L(ashr_14): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $2, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -2(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $14, %ebx + lea 14(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_14): + add $16, %edi + jg L(nibble_ashr_14) + +L(gobble_ashr_14): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $14, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_14) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $14, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_14) + + .p2align 4 +L(nibble_ashr_14): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xc000, %esi + jnz L(ashr_14_exittail) + +#ifdef USE_AS_STRNCMP + cmp $2, %ebp + jbe L(ashr_14_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_14) + + .p2align 4 +L(ashr_14_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $14, %xmm0 + psrldq $14, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_15 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(1~15) n - 1 14(15 +(n-1) - n) ashr_15 + */ + + .p2align 4 +L(ashr_15): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $1, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -1(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $15, %ebx + lea 15(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + 
.p2align 4 +L(loop_ashr_15): + add $16, %edi + jg L(nibble_ashr_15) + +L(gobble_ashr_15): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $15, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_15) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $15, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_15) + + .p2align 4 +L(nibble_ashr_15): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0x8000, %esi + jnz L(ashr_15_exittail) + +#ifdef USE_AS_STRNCMP + cmp $1, %ebp + jbe L(ashr_15_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_15) + + .p2align 4 +L(ashr_15_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $15, %xmm0 + psrldq $15, %xmm3 + jmp L(aftertail) + + .p2align 4 +L(aftertail): + pcmpeqb %xmm3, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + not %esi +L(exit): + mov %ebx, %edi + and $0x1f, %edi + lea -16(%edi, %ecx), %edi +L(less32bytes): + add %edi, %edx + add %ecx, %eax + test $0x20, %ebx + jz L(ret2) + xchg %eax, %edx + + .p2align 4 +L(ret2): + mov %esi, %ecx + POP (%esi) + POP (%edi) + POP (%ebx) +L(less16bytes): + test %cl, %cl + jz L(2next_8_bytes) + + test $0x01, %cl + jnz L(Byte0) + + test $0x02, %cl + jnz L(Byte1) + + test $0x04, %cl + jnz L(Byte2) + + test $0x08, %cl + jnz L(Byte3) + + test $0x10, %cl + jnz L(Byte4) + + test $0x20, %cl + jnz L(Byte5) + + test $0x40, %cl + jnz L(Byte6) +#ifdef USE_AS_STRNCMP + cmp $7, %ebp + jbe L(eq) +#endif + + movzx 7(%eax), %ecx + movzx 7(%edx), %eax + + sub %ecx, %eax + RETURN + +L(Byte0): +#ifdef USE_AS_STRNCMP + cmp $0, %ebp + jbe L(eq) +#endif + movzx (%eax), %ecx + movzx (%edx), %eax + + sub %ecx, %eax + RETURN + +L(Byte1): +#ifdef USE_AS_STRNCMP + cmp $1, %ebp + jbe L(eq) +#endif + movzx 1(%eax), %ecx + movzx 1(%edx), %eax + + sub %ecx, %eax + RETURN + +L(Byte2): +#ifdef USE_AS_STRNCMP + cmp $2, %ebp + jbe L(eq) +#endif + movzx 2(%eax), %ecx + movzx 2(%edx), %eax + + sub %ecx, %eax + RETURN + +L(Byte3): +#ifdef USE_AS_STRNCMP + cmp $3, %ebp + jbe L(eq) +#endif + movzx 3(%eax), %ecx + movzx 3(%edx), %eax + + sub %ecx, %eax + RETURN + +L(Byte4): +#ifdef USE_AS_STRNCMP + cmp $4, %ebp + jbe L(eq) +#endif + movzx 4(%eax), %ecx + movzx 4(%edx), %eax + + sub %ecx, %eax + RETURN + +L(Byte5): +#ifdef USE_AS_STRNCMP + cmp $5, %ebp + jbe L(eq) +#endif + movzx 5(%eax), %ecx + movzx 5(%edx), %eax + + sub %ecx, %eax + RETURN + +L(Byte6): +#ifdef USE_AS_STRNCMP + cmp $6, %ebp + jbe L(eq) +#endif + movzx 6(%eax), %ecx + movzx 6(%edx), %eax + + sub %ecx, %eax + RETURN + +L(2next_8_bytes): + add $8, %eax + add $8, %edx +#ifdef USE_AS_STRNCMP + cmp $8, %ebp + lea -8(%ebp), %ebp + jbe L(eq) +#endif + + test $0x01, %ch + jnz L(Byte0) + + test $0x02, %ch + jnz L(Byte1) + + test $0x04, %ch + jnz L(Byte2) + + test $0x08, %ch + jnz L(Byte3) + + test $0x10, %ch + jnz L(Byte4) + + test $0x20, %ch + jnz L(Byte5) + + test $0x40, %ch + jnz L(Byte6) + +#ifdef USE_AS_STRNCMP + cmp $7, %ebp + jbe L(eq) +#endif + movzx 7(%eax), %ecx + movzx 7(%edx), %eax + 
+ sub %ecx, %eax + RETURN + +L(neq): + mov $1, %eax + ja L(neq_bigger) + neg %eax +L(neq_bigger): +#ifdef USE_AS_STRNCMP + POP (%ebp) +#endif + ret + +#ifdef USE_AS_STRNCMP + .p2align 4 + cfi_restore_state +L(more8byteseq): + POP (%esi) + POP (%edi) + POP (%ebx) +#endif + +L(eq): + +#ifdef USE_AS_STRNCMP + POP (%ebp) +#endif + xorl %eax, %eax + ret + +#ifdef USE_AS_STRNCMP + .p2align 4 + CFI_PUSH (%ebp) +L(less16bytes_sncmp): + test %ebp, %ebp + jz L(eq) + + movzbl (%eax), %ecx + cmpb %cl, (%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $1, %ebp + je L(eq) + + movzbl 1(%eax), %ecx + cmpb %cl, 1(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $2, %ebp + je L(eq) + + movzbl 2(%eax), %ecx + cmpb %cl, 2(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $3, %ebp + je L(eq) + + movzbl 3(%eax), %ecx + cmpb %cl, 3(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $4, %ebp + je L(eq) + + movzbl 4(%eax), %ecx + cmpb %cl, 4(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $5, %ebp + je L(eq) + + movzbl 5(%eax), %ecx + cmpb %cl, 5(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $6, %ebp + je L(eq) + + movzbl 6(%eax), %ecx + cmpb %cl, 6(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $7, %ebp + je L(eq) + + movzbl 7(%eax), %ecx + cmpb %cl, 7(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + + cmp $8, %ebp + je L(eq) + + movzbl 8(%eax), %ecx + cmpb %cl, 8(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $9, %ebp + je L(eq) + + movzbl 9(%eax), %ecx + cmpb %cl, 9(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $10, %ebp + je L(eq) + + movzbl 10(%eax), %ecx + cmpb %cl, 10(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $11, %ebp + je L(eq) + + movzbl 11(%eax), %ecx + cmpb %cl, 11(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + + cmp $12, %ebp + je L(eq) + + movzbl 12(%eax), %ecx + cmpb %cl, 12(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $13, %ebp + je L(eq) + + movzbl 13(%eax), %ecx + cmpb %cl, 13(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $14, %ebp + je L(eq) + + movzbl 14(%eax), %ecx + cmpb %cl, 14(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $15, %ebp + je L(eq) + + movzbl 15(%eax), %ecx + cmpb %cl, 15(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + POP (%ebp) + xor %eax, %eax + ret +#endif + +END (STRCMP) + +#endif diff --git a/sysdeps/i386/i686/multiarch/strcmp.S b/sysdeps/i386/i686/multiarch/strcmp.S new file mode 100644 index 0000000000..7136d47e85 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strcmp.S @@ -0,0 +1,115 @@ +/* Multiple versions of strcmp + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
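[Editorial aside: the L(less16bytes_sncmp) ladder above is a fully unrolled form of the short-count loop: stop at the first differing byte (sign via unsigned compare, exactly the L(neq) result), at a terminator, or when the count runs out. An equivalent C sketch, illustrative only:

static int
strncmp_tail (const unsigned char *s1, const unsigned char *s2,
              unsigned int n)
{
  for (unsigned int i = 0; i < n; i++)
    {
      if (s1[i] != s2[i])
        return s1[i] > s2[i] ? 1 : -1;  /* the L(neq) path */
      if (s1[i] == 0)
        return 0;                       /* NUL before n bytes: equal */
    }
  return 0;                             /* count exhausted: equal */
}
]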
*/ + +#include <sysdep.h> +#include <init-arch.h> + +#ifndef USE_AS_STRNCMP +# define STRCMP strcmp +# define __GI_STRCMP __GI_strcmp +# define __STRCMP_IA32 __strcmp_ia32 +# define __STRCMP_SSSE3 __strcmp_ssse3 +# define __STRCMP_SSE4_2 __strcmp_sse4_2 +#else +# define STRCMP strncmp +# define __GI_STRCMP __GI_strncmp +# define __STRCMP_IA32 __strncmp_ia32 +# define __STRCMP_SSSE3 __strncmp_ssse3 +# define __STRCMP_SSE4_2 __strncmp_sse4_2 +#endif + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strncmp in static library since we + need strncmp before the initialization happened. */ +#if (defined SHARED || !defined USE_AS_STRNCMP) && !defined NOT_IN_libc +# ifdef SHARED + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(STRCMP) + .type STRCMP, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __STRCMP_IA32@GOTOFF(%ebx), %eax + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __STRCMP_SSSE3@GOTOFF(%ebx), %eax + testl $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __STRCMP_SSE4_2@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + ret +END(STRCMP) +# else + .text +ENTRY(STRCMP) + .type STRCMP, @gnu_indirect_function + cmpl $0, KIND_OFFSET+__cpu_features + jne 1f + call __init_cpu_features +1: leal __STRCMP_IA32, %eax + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features + jz 2f + leal __STRCMP_SSSE3, %eax + testl $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features + jz 2f + leal __STRCMP_SSE4_2, %eax +2: ret +END(STRCMP) +# endif + +# undef ENTRY +# define ENTRY(name) \ + .type __STRCMP_IA32, @function; \ + .p2align 4; \ + __STRCMP_IA32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __STRCMP_IA32, .-__STRCMP_IA32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. 
*/ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCMP; __GI_STRCMP = __STRCMP_IA32 +# endif +#endif + +#ifndef USE_AS_STRNCMP +# include "../strcmp.S" +#endif diff --git a/sysdeps/i386/i686/multiarch/strncmp-c.c b/sysdeps/i386/i686/multiarch/strncmp-c.c new file mode 100644 index 0000000000..cc059da494 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strncmp-c.c @@ -0,0 +1,8 @@ +#ifdef SHARED +# define STRNCMP __strncmp_ia32 +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strncmp_ia32, __GI_strncmp, __strncmp_ia32); +#endif + +#include "string/strncmp.c" diff --git a/sysdeps/i386/i686/multiarch/strncmp-sse4.S b/sysdeps/i386/i686/multiarch/strncmp-sse4.S new file mode 100644 index 0000000000..cf14dfaf6c --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strncmp-sse4.S @@ -0,0 +1,5 @@ +#ifdef SHARED +# define USE_AS_STRNCMP +# define STRCMP __strncmp_sse4_2 +# include "strcmp-sse4.S" +#endif diff --git a/sysdeps/i386/i686/multiarch/strncmp-ssse3.S b/sysdeps/i386/i686/multiarch/strncmp-ssse3.S new file mode 100644 index 0000000000..536c8685f2 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strncmp-ssse3.S @@ -0,0 +1,5 @@ +#ifdef SHARED +# define USE_AS_STRNCMP +# define STRCMP __strncmp_ssse3 +# include "strcmp-ssse3.S" +#endif diff --git a/sysdeps/i386/i686/multiarch/strncmp.S b/sysdeps/i386/i686/multiarch/strncmp.S new file mode 100644 index 0000000000..b6814315fb --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strncmp.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCMP +#define STRCMP strncmp +#include "strcmp.S" diff --git a/sysdeps/i386/lshift.S b/sysdeps/i386/lshift.S index 536d9878eb..398cf038c7 100644 --- a/sysdeps/i386/lshift.S +++ b/sysdeps/i386/lshift.S @@ -1,5 +1,5 @@ /* i80386 __mpn_lshift -- - Copyright (C) 1992, 1994, 1997-2000, 2005 Free Software Foundation, Inc. + Copyright (C) 1992,1994,1997-2000,2005,2010 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify @@ -55,6 +55,7 @@ ENTRY (BP_SYM (__mpn_lshift)) movl (%esi,%edx,4),%ebx /* read most significant limb */ cfi_rel_offset (ebx, 0) + cfi_remember_state xorl %eax,%eax shldl %cl,%ebx,%eax /* compute carry limb */ decl %edx @@ -95,6 +96,7 @@ L(1): movl (%esi,%edx,4),%eax LEAVE ret + cfi_restore_state L(end): shll %cl,%ebx /* compute least significant limb */ movl %ebx,(%edi) /* store it */ diff --git a/sysdeps/i386/rshift.S b/sysdeps/i386/rshift.S index 3fd0afe822..332c4d09e7 100644 --- a/sysdeps/i386/rshift.S +++ b/sysdeps/i386/rshift.S @@ -1,5 +1,5 @@ /* i80386 __mpn_rshift -- - Copyright (C) 1992,1994,1997-2000,2005 Free Software Foundation, Inc. + Copyright (C) 1992,1994,1997-2000,2005,2010 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
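[Editorial aside: the one-line strncmp-ssse3.S and strncmp-sse4.S stubs above show the build pattern used throughout this patch: a single assembly source yields both strcmp and strncmp entry points, selected by predefining USE_AS_STRNCMP and STRCMP before inclusion. A hedged C rendering of the same single-source trick (the header name and the CMP_NAME macro are invented for illustration; the includer defines them):

/* cmp-skeleton.h: include once per entry point.  */
#ifdef USE_AS_STRNCMP
int CMP_NAME (const unsigned char *a, const unsigned char *b, unsigned long n)
#else
int CMP_NAME (const unsigned char *a, const unsigned char *b)
#endif
{
  for (unsigned long i = 0; ; i++)
    {
#ifdef USE_AS_STRNCMP
      if (i == n)
        return 0;              /* budget exhausted: equal so far */
#endif
      if (a[i] != b[i])
        return a[i] > b[i] ? 1 : -1;
      if (a[i] == 0)
        return 0;              /* common terminator */
    }
}
]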
The GNU MP Library is free software; you can redistribute it and/or modify @@ -57,6 +57,7 @@ ENTRY (BP_SYM (__mpn_rshift)) movl (%esi,%edx,4),%ebx /* read least significant limb */ cfi_rel_offset (ebx, 0) + cfi_remember_state xorl %eax,%eax shrdl %cl,%ebx,%eax /* compute carry limb */ incl %edx @@ -97,10 +98,7 @@ L(1): movl (%esi,%edx,4),%eax LEAVE ret - cfi_adjust_cfa_offset (12) - cfi_rel_offset (edi, 8) - cfi_rel_offset (esi, 4) - cfi_rel_offset (ebx, 0) + cfi_restore_state L(end): shrl %cl,%ebx /* compute most significant limb */ movl %ebx,(%edi) /* store it */ diff --git a/sysdeps/ia64/fpu/fegetenv.c b/sysdeps/ia64/fpu/fegetenv.c index 5446b16494..e240f75e43 100644 --- a/sysdeps/ia64/fpu/fegetenv.c +++ b/sysdeps/ia64/fpu/fegetenv.c @@ -27,3 +27,4 @@ fegetenv (fenv_t *envp) return 0; } +libm_hidden_def (fegetenv) diff --git a/sysdeps/powerpc/fpu/fegetenv.c b/sysdeps/powerpc/fpu/fegetenv.c index 53953454cb..3d21abb529 100644 --- a/sysdeps/powerpc/fpu/fegetenv.c +++ b/sysdeps/powerpc/fpu/fegetenv.c @@ -35,4 +35,5 @@ strong_alias (__fegetenv, __old_fegetenv) compat_symbol (libm, BP_SYM (__old_fegetenv), BP_SYM (fegetenv), GLIBC_2_1); #endif +libm_hidden_ver (__fegetenv, fegetenv) versioned_symbol (libm, BP_SYM (__fegetenv), BP_SYM (fegetenv), GLIBC_2_2); diff --git a/sysdeps/powerpc/powerpc32/configure b/sysdeps/powerpc/powerpc32/configure index 9b76c57886..da8ec0b87c 100644 --- a/sysdeps/powerpc/powerpc32/configure +++ b/sysdeps/powerpc/powerpc32/configure @@ -25,11 +25,10 @@ rm -f conftest* fi { $as_echo "$as_me:$LINENO: result: $libc_cv_ppc_rel16" >&5 $as_echo "$libc_cv_ppc_rel16" >&6; } -if test $libc_cv_ppc_rel16 = yes; then - cat >>confdefs.h <<\_ACEOF -#define HAVE_ASM_PPC_REL16 1 -_ACEOF - +if test $libc_cv_ppc_rel16 = no; then + { { $as_echo "$as_me:$LINENO: error: R_PPC_REL16 is not supported. Binutils is too old." >&5 +$as_echo "$as_me: error: R_PPC_REL16 is not supported. Binutils is too old." >&2;} + { (exit 1); exit 1; }; } fi # See whether GCC uses -msecure-plt. diff --git a/sysdeps/powerpc/powerpc32/configure.in b/sysdeps/powerpc/powerpc32/configure.in index 7219ad993e..21d3f5ee5b 100644 --- a/sysdeps/powerpc/powerpc32/configure.in +++ b/sysdeps/powerpc/powerpc32/configure.in @@ -13,8 +13,8 @@ else libc_cv_ppc_rel16=no fi rm -f conftest*]) -if test $libc_cv_ppc_rel16 = yes; then - AC_DEFINE(HAVE_ASM_PPC_REL16) +if test $libc_cv_ppc_rel16 = no; then + AC_MSG_ERROR(R_PPC_REL16 is not supported. Binutils is too old.) fi # See whether GCC uses -msecure-plt. diff --git a/sysdeps/powerpc/powerpc32/dl-machine.h b/sysdeps/powerpc/powerpc32/dl-machine.h index 6f8d0f506e..5351d9691d 100644 --- a/sysdeps/powerpc/powerpc32/dl-machine.h +++ b/sysdeps/powerpc/powerpc32/dl-machine.h @@ -41,16 +41,13 @@ static inline Elf32_Addr * __attribute__ ((const)) ppc_got (void) { Elf32_Addr *got; -#ifdef HAVE_ASM_PPC_REL16 + asm ("bcl 20,31,1f\n" "1: mflr %0\n" " addis %0,%0,_GLOBAL_OFFSET_TABLE_-1b@ha\n" " addi %0,%0,_GLOBAL_OFFSET_TABLE_-1b@l\n" : "=b" (got) : : "lr"); -#else - asm (" bl _GLOBAL_OFFSET_TABLE_-4@local" - : "=l" (got)); -#endif + return got; } diff --git a/sysdeps/powerpc/powerpc32/dl-start.S b/sysdeps/powerpc/powerpc32/dl-start.S index c77c4de198..ae41f47ede 100644 --- a/sysdeps/powerpc/powerpc32/dl-start.S +++ b/sysdeps/powerpc/powerpc32/dl-start.S @@ -47,15 +47,10 @@ _dl_start_user: passed by value!). 
*/ /* Put our GOT pointer in r31, */ -#ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r31 addis r31,r31,_GLOBAL_OFFSET_TABLE_-1b@ha addi r31,r31,_GLOBAL_OFFSET_TABLE_-1b@l -#else - bl _GLOBAL_OFFSET_TABLE_-4@local - mflr r31 -#endif /* the address of _start in r30, */ mr r30,r3 /* &_dl_argc in 29, &_dl_argv in 27, and _dl_loaded in 28. */ diff --git a/sysdeps/powerpc/powerpc32/elf/start.S b/sysdeps/powerpc/powerpc32/elf/start.S index a8abdca0c6..dc89a5e109 100644 --- a/sysdeps/powerpc/powerpc32/elf/start.S +++ b/sysdeps/powerpc/powerpc32/elf/start.S @@ -53,10 +53,6 @@ L(start_addresses): ASM_SIZE_DIRECTIVE(L(start_addresses)) .section ".text" -#if defined PIC && !defined HAVE_ASM_PPC_REL16 -L(start_addressesp): - .long L(start_addresses)-L(branch) -#endif ENTRY(_start) /* Save the stack pointer, in case we're statically linked under Linux. */ mr r9,r1 @@ -77,16 +73,10 @@ L(branch): start_addresses in r8. Also load the GOT pointer so that new PLT calls work, like the one to __libc_start_main. */ #ifdef PIC -# ifdef HAVE_ASM_PPC_REL16 addis r30,r13,_GLOBAL_OFFSET_TABLE_-L(branch)@ha addis r8,r13,L(start_addresses)-L(branch)@ha addi r30,r30,_GLOBAL_OFFSET_TABLE_-L(branch)@l lwzu r13,L(start_addresses)-L(branch)@l(r8) -# else - lwz r8,L(start_addressesp)-L(branch)(r13) - add r8,r13,r8 - lwz r13,0(r8) -# endif #else lis r8,L(start_addresses)@ha lwzu r13,L(start_addresses)@l(r8) diff --git a/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S b/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S index 04ed6da68b..e1ac064a59 100644 --- a/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S +++ b/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S @@ -34,15 +34,10 @@ ENTRY (BP_SYM (__longjmp)) # ifdef PIC mflr r6 cfi_register (lr,r6) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r5 addis r5,r5,_GLOBAL_OFFSET_TABLE_-1b@ha addi r5,r5,_GLOBAL_OFFSET_TABLE_-1b@l -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r5 -# endif # ifdef SHARED lwz r5,_rtld_global_ro@got(r5) mtlr r6 diff --git a/sysdeps/powerpc/powerpc32/fpu/s_ceil.S b/sysdeps/powerpc/powerpc32/fpu/s_ceil.S index bc74d302fb..80e72ca2bd 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_ceil.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_ceil.S @@ -31,17 +31,10 @@ ENTRY (__ceil) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha lfs fp13,.LC0-1b@l(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) - lfs fp13,0(r9) -# endif mtlr r11 cfi_same_value (lr) #else diff --git a/sysdeps/powerpc/powerpc32/fpu/s_ceilf.S b/sysdeps/powerpc/powerpc32/fpu/s_ceilf.S index 47a75ec0c3..ce6d71e4f8 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_ceilf.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_ceilf.S @@ -30,17 +30,10 @@ ENTRY (__ceilf) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha lfs fp13,.LC0-1b@l(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) - lfs fp13,0(r9) -# endif mtlr r11 cfi_same_value (lr) #else diff --git a/sysdeps/powerpc/powerpc32/fpu/s_floor.S b/sysdeps/powerpc/powerpc32/fpu/s_floor.S index a29e4791ea..0dd0dbe6c0 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_floor.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_floor.S @@ -31,17 +31,10 @@ ENTRY (__floor) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha lfs fp13,.LC0-1b@l(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) - lfs fp13,0(r9) 
-# endif mtlr r11 cfi_same_value (lr) #else diff --git a/sysdeps/powerpc/powerpc32/fpu/s_floorf.S b/sysdeps/powerpc/powerpc32/fpu/s_floorf.S index 99fbdc5f86..98a47458bc 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_floorf.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_floorf.S @@ -30,17 +30,10 @@ ENTRY (__floorf) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha lfs fp13,.LC0-1b@l(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) - lfs fp13,0(r9) -# endif mtlr r11 cfi_same_value (lr) #else diff --git a/sysdeps/powerpc/powerpc32/fpu/s_lround.S b/sysdeps/powerpc/powerpc32/fpu/s_lround.S index d73749e134..3bf1ffaea1 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_lround.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_lround.S @@ -45,17 +45,10 @@ ENTRY (__lround) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha lfs fp10,.LC0-1b@l(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) - lfs fp10,0(r9) -# endif mtlr r11 cfi_same_value (lr) #else diff --git a/sysdeps/powerpc/powerpc32/fpu/s_rint.S b/sysdeps/powerpc/powerpc32/fpu/s_rint.S index c8dca313ae..93133718ad 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_rint.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_rint.S @@ -33,17 +33,10 @@ ENTRY (__rint) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha lfs fp13,.LC0-1b@l(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) - lfs fp13,0(r9) -# endif mtlr r11 cfi_same_value (lr) #else diff --git a/sysdeps/powerpc/powerpc32/fpu/s_rintf.S b/sysdeps/powerpc/powerpc32/fpu/s_rintf.S index 7771cb2bc8..1e0fbb1f0d 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_rintf.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_rintf.S @@ -29,17 +29,10 @@ ENTRY (__rintf) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha lfs fp13,.LC0-1b@l(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) - lfs fp13,0(r9) -# endif mtlr r11 cfi_same_value (lr) #else diff --git a/sysdeps/powerpc/powerpc32/fpu/s_round.S b/sysdeps/powerpc/powerpc32/fpu/s_round.S index 590c87ad8c..48b346e651 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_round.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_round.S @@ -43,16 +43,10 @@ ENTRY (__round) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha addi r9,r9,.LC0-1b@l -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) -# endif mtlr r11 cfi_same_value (lr) lfs fp13,0(r9) diff --git a/sysdeps/powerpc/powerpc32/fpu/s_roundf.S b/sysdeps/powerpc/powerpc32/fpu/s_roundf.S index 7e99bca315..88125aad06 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_roundf.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_roundf.S @@ -42,16 +42,10 @@ ENTRY (__roundf ) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha addi r9,r9,.LC0-1b@l -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) -# endif mtlr r11 cfi_same_value (lr) lfs fp13,0(r9) diff --git a/sysdeps/powerpc/powerpc32/fpu/s_trunc.S b/sysdeps/powerpc/powerpc32/fpu/s_trunc.S index 5bc0856b9f..c3c021716a 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_trunc.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_trunc.S @@ -38,17 +38,10 @@ ENTRY (__trunc) #ifdef SHARED mflr r11 
cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha lfs fp13,.LC0-1b@l(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) - lfs fp13,0(r9) -# endif mtlr r11 cfi_same_value (lr) #else diff --git a/sysdeps/powerpc/powerpc32/fpu/s_truncf.S b/sysdeps/powerpc/powerpc32/fpu/s_truncf.S index e2e3bd6740..eddef070cd 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_truncf.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_truncf.S @@ -37,17 +37,10 @@ ENTRY (__truncf) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha lfs fp13,.LC0-1b@l(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) - lfs fp13,0(r9) -# endif mtlr r11 cfi_same_value (lr) #else diff --git a/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S b/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S index b7d1abc00d..131e7a332e 100644 --- a/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S +++ b/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S @@ -85,15 +85,10 @@ ENTRY (BP_SYM (__sigsetjmp)) # ifdef PIC mflr r6 cfi_register(lr,r6) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r5 addis r5,r5,_GLOBAL_OFFSET_TABLE_-1b@ha addi r5,r5,_GLOBAL_OFFSET_TABLE_-1b@l -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r5 -# endif mtlr r6 cfi_same_value (lr) # ifdef SHARED diff --git a/sysdeps/powerpc/powerpc32/memset.S b/sysdeps/powerpc/powerpc32/memset.S index 454abb2b65..b4ce218e24 100644 --- a/sysdeps/powerpc/powerpc32/memset.S +++ b/sysdeps/powerpc/powerpc32/memset.S @@ -256,17 +256,10 @@ L(checklinesize): beq L(medium) /* Establishes GOT addressability so we can load __cache_line_size from static. This value was set from the aux vector during startup. */ -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr rGOT addis rGOT,rGOT,__cache_line_size-1b@ha lwz rCLS,__cache_line_size-1b@l(rGOT) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr rGOT - lwz rGOT,__cache_line_size@got(rGOT) - lwz rCLS,0(rGOT) -# endif mtlr rTMP #else /* Load __cache_line_size from static. 
This value was set from the diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S b/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S index e10a37977a..b03e041d8a 100644 --- a/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S +++ b/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S @@ -53,16 +53,10 @@ ENTRY (__llround) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha addi r9,r9,.LC0-1b@l -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) -# endif mtlr r11 cfi_same_value (lr) lfd fp9,0(r9) diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S index 95a0b3915d..8be3cf1848 100644 --- a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S +++ b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S @@ -63,7 +63,6 @@ EALIGN (__sqrt, 5, 0) cfi_offset(lr,20-16) cfi_offset(r30,8-16) #ifdef SHARED -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,.LCF1 .LCF1: mflr r30 @@ -71,12 +70,6 @@ EALIGN (__sqrt, 5, 0) addi r30,r30,_GLOBAL_OFFSET_TABLE_-.LCF1@l lwz r9,_LIB_VERSION@got(30) lwz r0,0(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r30 - lwz r9,_LIB_VERSION@got(30) - lwz r0,0(r9) -# endif #else lis r9,_LIB_VERSION@ha lwz r0,_LIB_VERSION@l(r9) diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S index c31555194b..9fa282c162 100644 --- a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S +++ b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S @@ -63,7 +63,6 @@ EALIGN (__sqrtf, 5, 0) cfi_offset(lr,20-16) cfi_offset(r30,8-16) #ifdef SHARED -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,.LCF1 .LCF1: mflr r30 @@ -71,12 +70,6 @@ EALIGN (__sqrtf, 5, 0) addi r30,r30,_GLOBAL_OFFSET_TABLE_-.LCF1@l lwz r9,_LIB_VERSION@got(30) lwz r0,0(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r30 - lwz r9,_LIB_VERSION@got(30) - lwz r0,0(r9) -# endif #else lis r9,_LIB_VERSION@ha lwz r0,_LIB_VERSION@l(r9) diff --git a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S index 105b5912a1..27a1a0dcbb 100644 --- a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S +++ b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S @@ -63,7 +63,6 @@ EALIGN (__sqrt, 5, 0) cfi_offset(lr,20-16) cfi_offset(r30,8-16) #ifdef SHARED -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,.LCF1 .LCF1: mflr r30 @@ -71,12 +70,6 @@ EALIGN (__sqrt, 5, 0) addi r30,r30,_GLOBAL_OFFSET_TABLE_-.LCF1@l lwz r9,_LIB_VERSION@got(30) lwz r0,0(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r30 - lwz r9,_LIB_VERSION@got(30) - lwz r0,0(r9) -# endif #else lis r9,_LIB_VERSION@ha lwz r0,_LIB_VERSION@l(r9) diff --git a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S index 14bc0a2ceb..8914855542 100644 --- a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S +++ b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S @@ -63,7 +63,6 @@ EALIGN (__sqrtf, 5, 0) cfi_offset(lr,20-16) cfi_offset(r30,8-16) #ifdef SHARED -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,.LCF1 .LCF1: mflr r30 @@ -71,12 +70,6 @@ EALIGN (__sqrtf, 5, 0) addi r30,r30,_GLOBAL_OFFSET_TABLE_-.LCF1@l lwz r9,_LIB_VERSION@got(30) lwz r0,0(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r30 - lwz r9,_LIB_VERSION@got(30) - lwz r0,0(r9) -# endif #else lis r9,_LIB_VERSION@ha lwz r0,_LIB_VERSION@l(r9) diff --git a/sysdeps/powerpc/powerpc32/power7/Implies b/sysdeps/powerpc/powerpc32/power7/Implies deleted file mode 100644 index 03899d8a3c..0000000000 --- 
a/sysdeps/powerpc/powerpc32/power7/Implies +++ /dev/null @@ -1 +0,0 @@ -powerpc/powerpc32/power5 diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/Implies deleted file mode 100644 index 819a7d7979..0000000000 --- a/sysdeps/powerpc/powerpc32/power7/fpu/Implies +++ /dev/null @@ -1 +0,0 @@ -powerpc/powerpc32/power5/fpu diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S new file mode 100644 index 0000000000..5b0d950c74 --- /dev/null +++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S @@ -0,0 +1,89 @@ +/* finite(). PowerPC32/POWER7 version. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <math_ldbl_opt.h> + +/* int __finite(x) */ + .section .rodata.cst8,"aM",@progbits,8 + .align 3 +.LC0: /* 1.0 */ + .quad 0x3ff0000000000000 + + .section ".text" + .type __finite, @function + .machine power7 +ENTRY (__finite) +#ifdef SHARED + mflr r11 + cfi_register(lr,r11) + + bcl 20,31,1f +1: mflr r9 + addis r9,r9,.LC0-1b@ha + lfd fp0,.LC0-1b@l(r9) + + mtlr r11 + cfi_same_value (lr) +#else + lis r9,.LC0@ha + lfd fp0,.LC0@l(r9) +#endif + ftdiv cr7,fp1,fp0 + li r3,1 + bflr 30 + + /* We have -INF/+INF/NaN or a denormal. */ + + stwu r1,-16(r1) /* Allocate stack space. */ + stfd fp1,8(r1) /* Transfer FP to GPR's. */ + + ori 2,2,0 /* Force a new dispatch group. */ + lhz r0,8(r1) /* Fetch the upper portion of the high word of + the FP value (where the exponent and sign bits + are). */ + clrlwi r0,r0,17 /* r0 = abs(r0). */ + addi r1,r1,16 /* Reset the stack pointer. */ + cmpwi cr7,r0,0x7ff0 /* r0 == 0x7ff0? */ + bltlr cr7 /* LT means we have a denormal. */ + li r3,0 + blr + END (__finite) + +hidden_def (__finite) +weak_alias (__finite, finite) + +/* It turns out that the 'double' version will also always work for + single-precision. */ +strong_alias (__finite, __finitef) +hidden_def (__finitef) +weak_alias (__finitef, finitef) + +#ifdef NO_LONG_DOUBLE +strong_alias (__finite, __finitel) +weak_alias (__finite, finitel) +#endif + +#ifndef IS_IN_libm +# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) +compat_symbol (libc, __finite, __finitel, GLIBC_2_0); +compat_symbol (libc, finite, finitel, GLIBC_2_0); +# endif +#endif diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S new file mode 100644 index 0000000000..54bd94176d --- /dev/null +++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S @@ -0,0 +1 @@ +/* This function uses the same code as s_finite.S.
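In C terms, the test the assembly above performs once ftdiv has flagged a special value is just an exponent-field check: a double is finite iff its biased exponent is not all ones. A minimal sketch, assuming the standard IEEE-754 binary64 layout:

#include <stdint.h>
#include <string.h>

static int
finite_bits (double x)
{
  uint64_t u;
  memcpy (&u, &x, sizeof u);            /* well-defined type pun */
  return ((u >> 52) & 0x7ff) != 0x7ff;  /* exponent not all-ones */
}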
*/ diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S new file mode 100644 index 0000000000..2979534911 --- /dev/null +++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S @@ -0,0 +1,88 @@ +/* isinf(). PowerPC32/POWER7 version. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <math_ldbl_opt.h> + +/* int __isinf(x) */ + .section .rodata.cst8,"aM",@progbits,8 + .align 3 +.LC0: /* 1.0 */ + .quad 0x3ff0000000000000 + + .section ".text" + .type __isinf, @function + .machine power7 +ENTRY (__isinf) +#ifdef SHARED + mflr r11 + cfi_register(lr,r11) + + bcl 20,31,1f +1: mflr r9 + addis r9,r9,.LC0-1b@ha + lfd fp0,.LC0-1b@l(r9) + + mtlr r11 + cfi_same_value (lr) +#else + lis r9,.LC0@ha + lfd fp0,.LC0@l(r9) +#endif + ftdiv cr7,fp1,fp0 + li r3,0 + bflr 29 /* If not INF, return. */ + + /* Either we have -INF/+INF or a denormal. */ + + stwu r1,-16(r1) /* Allocate stack space. */ + stfd fp1,8(r1) /* Transfer FP to GPR's. */ + ori 2,2,0 /* Force a new dispatch group. */ + lhz r4,8(r1) /* Fetch the upper portion of the high word of + the FP value (where the exponent and sign bits + are). */ + addi r1,r1,16 /* Reset the stack pointer. */ + cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */ + li r3,1 + beqlr cr7 /* EQ means INF, otherwise -INF. */ + li r3,-1 + blr + END (__isinf) + +hidden_def (__isinf) +weak_alias (__isinf, isinf) + +/* It turns out that the 'double' version will also always work for + single-precision. */ +strong_alias (__isinf, __isinff) +hidden_def (__isinff) +weak_alias (__isinff, isinff) + +#ifdef NO_LONG_DOUBLE +strong_alias (__isinf, __isinfl) +weak_alias (__isinf, isinfl) +#endif + +#ifndef IS_IN_libm +# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) +compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0); +compat_symbol (libc, isinf, isinfl, GLIBC_2_0); +# endif +#endif diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S new file mode 100644 index 0000000000..be759e091e --- /dev/null +++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S @@ -0,0 +1 @@ +/* This function uses the same code as s_isinf.S. */ diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S new file mode 100644 index 0000000000..852539f24b --- /dev/null +++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S @@ -0,0 +1,92 @@ +/* isnan(). PowerPC32/POWER7 version. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <math_ldbl_opt.h> + +/* int __isnan(x) */ + .section .rodata.cst8,"aM",@progbits,8 + .align 3 +.LC0: /* 1.0 */ + .quad 0x3ff0000000000000 + + .section ".text" + .type __isnan, @function + .machine power7 +ENTRY (__isnan) +#ifdef SHARED + mflr r11 + cfi_register(lr,r11) + + bcl 20,31,1f +1: mflr r9 + addis r9,r9,.LC0-1b@ha + lfd fp0,.LC0-1b@l(r9) + + mtlr r11 + cfi_same_value (lr) +#else + lis r9,.LC0@ha + lfd fp0,.LC0@l(r9) +#endif + ftdiv cr7,fp1,fp0 + li r3,0 + bflr 30 /* If not NaN or Inf, finish. */ + + /* We have -INF/+INF/NaN or a denormal. */ + + stwu r1,-16(r1) /* Allocate stack space. */ + stfd fp1,8(r1) /* Transfer FP to GPR's. */ + ori 2,2,0 /* Force a new dispatch group. */ + lwz r4,8(r1) /* Load the upper half of the FP value. */ + lwz r5,12(r1) /* Load the lower half of the FP value. */ + addi r1,r1,16 /* Reset the stack pointer. */ + lis r0,0x7ff0 /* Load the upper portion for an INF/NaN. */ + clrlwi r4,r4,1 /* r4 = abs(r4). */ + cmpw cr7,r4,r0 /* if (abs(r4) <= inf). */ + cmpwi cr6,r5,0 /* r5 == 0x00000000? */ + bltlr cr7 /* LT means we have a denormal. */ + bgt cr7,L(NaN) /* GT means we have a NaN. */ + beqlr cr6 /* EQ means we have +/-INF. */ +L(NaN): + li r3,1 /* x == NaN? */ + blr + END (__isnan) + +hidden_def (__isnan) +weak_alias (__isnan, isnan) + +/* It turns out that the 'double' version will also always work for + single-precision. */ +strong_alias (__isnan, __isnanf) +hidden_def (__isnanf) +weak_alias (__isnanf, isnanf) + +#ifdef NO_LONG_DOUBLE +strong_alias (__isnan, __isnanl) +weak_alias (__isnan, isnanl) +#endif + +#ifndef IS_IN_libm +# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) +compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0); +compat_symbol (libc, isnan, isnanl, GLIBC_2_0); +# endif +#endif diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S new file mode 100644 index 0000000000..b48c85e0d3 --- /dev/null +++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S @@ -0,0 +1 @@ +/* This function uses the same code as s_isnan.S. 
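The corresponding C predicate for the isnan code above: clear the sign bit, and any bit pattern strictly above +Inf's is a NaN. A sketch under the same IEEE-754 binary64 assumption (the 32-bit assembly splits this into two word compares, but the predicate is identical):

#include <stdint.h>
#include <string.h>

static int
isnan_bits (double x)
{
  uint64_t u;
  memcpy (&u, &x, sizeof u);
  u &= ~(1ULL << 63);                   /* abs() on the bit pattern */
  return u > 0x7ff0000000000000ULL;     /* above +Inf means NaN */
}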
*/ diff --git a/sysdeps/powerpc/powerpc64/power7/Implies deleted file mode 100644 index 13b03309fb..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/Implies +++ /dev/null @@ -1 +0,0 @@ -powerpc/powerpc64/power5 diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/Implies deleted file mode 100644 index 13b03309fb..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/fpu/Implies +++ /dev/null @@ -1 +0,0 @@ -powerpc/powerpc64/power5 diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S new file mode 100644 index 0000000000..6763d1adc8 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S @@ -0,0 +1,68 @@ +/* finite(). PowerPC64/POWER7 version. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <math_ldbl_opt.h> + +/* int __finite(x) */ + .section ".toc","aw" +.LC0: /* 1.0 */ + .tc FD_ONE[TC],0x3ff0000000000000 + .section ".text" + .type __finite, @function + .machine power7 +EALIGN (__finite, 4, 0) + CALL_MCOUNT 0 + lfd fp0,.LC0@toc(r2) + ftdiv cr7,fp1,fp0 + li r3,1 + bflr 30 + + /* If we are here, we either have +/-INF, + NaN or denormal. */ + + stfd fp1,-16(r1) /* Transfer FP to GPR's. */ + ori 2,2,0 /* Force a new dispatch group. */ + + lhz r4,-16(r1) /* Fetch the upper portion of the high word of + the FP value (where the exponent and sign bits + are). */ + clrlwi r4,r4,17 /* r4 = abs(r4). */ + cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */ + bltlr cr7 /* LT means finite, otherwise non-finite. */ + li r3,0 + blr + END (__finite) + +hidden_def (__finite) +weak_alias (__finite, finite) + +/* It turns out that the 'double' version will also always work for + single-precision. */ +strong_alias (__finite, __finitef) +hidden_def (__finitef) +weak_alias (__finitef, finitef) + +#ifndef IS_IN_libm +# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) +compat_symbol (libc, __finite, __finitel, GLIBC_2_0); +compat_symbol (libc, finite, finitel, GLIBC_2_0); +# endif +#endif diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S new file mode 100644 index 0000000000..54bd94176d --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S @@ -0,0 +1 @@ +/* This function uses the same code as s_finite.S. */ diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S new file mode 100644 index 0000000000..f896d38026 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S @@ -0,0 +1,71 @@ +/* isinf(). PowerPC64/POWER7 version. + Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <math_ldbl_opt.h> + +/* int __isinf(x) */ + .section ".toc","aw" +.LC0: /* 1.0 */ + .tc FD_ONE[TC],0x3ff0000000000000 + .section ".text" + .type __isinf, @function + .machine power7 +EALIGN (__isinf, 4, 0) + CALL_MCOUNT 0 + lfd fp0,.LC0@toc(r2) + ftdiv cr7,fp1,fp0 + li r3,0 + bflr 29 /* If not INF, return. */ + + /* Either we have -INF/+INF or a denormal. */ + + stfd fp1,-16(r1) /* Transfer FP to GPR's. */ + ori 2,2,0 /* Force a new dispatch group. */ + lhz r4,-16(r1) /* Fetch the upper portion of the high word of + the FP value (where the exponent and sign bits + are). */ + cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */ + li r3,1 + beqlr cr7 /* EQ means INF, otherwise -INF. */ + li r3,-1 + blr + END (__isinf) + +hidden_def (__isinf) +weak_alias (__isinf, isinf) + +/* It turns out that the 'double' version will also always work for + single-precision. */ +strong_alias (__isinf, __isinff) +hidden_def (__isinff) +weak_alias (__isinff, isinff) + +#ifdef NO_LONG_DOUBLE +strong_alias (__isinf, __isinfl) +weak_alias (__isinf, isinfl) +#endif + +#ifndef IS_IN_libm +# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) +compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0); +compat_symbol (libc, isinf, isinfl, GLIBC_2_0); +# endif +#endif diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S new file mode 100644 index 0000000000..be759e091e --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S @@ -0,0 +1 @@ +/* This function uses the same code as s_isinf.S. */ diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S new file mode 100644 index 0000000000..8877012598 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S @@ -0,0 +1,69 @@ +/* isnan(). PowerPC64/POWER7 version. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
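Likewise for the POWER7 isinf above: once ftdiv has screened out ordinary values, only the exponent and sign decide the result. A C sketch of the same predicate, returning 1 for +Inf, -1 for -Inf and 0 otherwise (standard binary64 layout assumed):

#include <stdint.h>
#include <string.h>

static int
isinf_bits (double x)
{
  uint64_t u;
  memcpy (&u, &x, sizeof u);
  if ((u & ~(1ULL << 63)) != 0x7ff0000000000000ULL)
    return 0;                           /* finite or NaN */
  return (u >> 63) ? -1 : 1;            /* sign picks -Inf / +Inf */
}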
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <math_ldbl_opt.h> + +/* int __isnan(x) */ + .section ".toc","aw" +.LC0: /* 1.0 */ + .tc FD_ONE[TC],0x3ff0000000000000 + .section ".text" + .type __isnan, @function + .machine power7 +EALIGN (__isnan, 4, 0) + CALL_MCOUNT 0 + lfd fp0,.LC0@toc(r2) + ftdiv cr7,fp1,fp0 + li r3,0 + bflr 30 /* If not NaN, finish. */ + + stfd fp1,-16(r1) /* Transfer FP to GPR's. */ + ori 2,2,0 /* Force a new dispatch group. */ + ld r4,-16(r1) /* Load FP into GPR. */ + lis r0,0x7ff0 + sldi r0,r0,32 /* const long r0 0x7ff00000 00000000. */ + clrldi r4,r4,1 /* x = fabs(x) */ + cmpd cr7,r4,r0 /* if (fabs(x) <= inf) */ + blelr cr7 /* LE means not NaN. */ + li r3,1 /* else return 1 */ + blr + END (__isnan) + +hidden_def (__isnan) +weak_alias (__isnan, isnan) + +/* It turns out that the 'double' version will also always work for + single-precision. */ +strong_alias (__isnan, __isnanf) +hidden_def (__isnanf) +weak_alias (__isnanf, isnanf) + +#ifdef NO_LONG_DOUBLE +strong_alias (__isnan, __isnanl) +weak_alias (__isnan, isnanl) +#endif + +#ifndef IS_IN_libm +# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) +compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0); +compat_symbol (libc, isnan, isnanl, GLIBC_2_0); +# endif +#endif diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S new file mode 100644 index 0000000000..b48c85e0d3 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S @@ -0,0 +1 @@ +/* This function uses the same code as s_isnan.S. */ diff --git a/sysdeps/s390/fpu/fegetenv.c b/sysdeps/s390/fpu/fegetenv.c index a244f2ca8b..04da54c94c 100644 --- a/sysdeps/s390/fpu/fegetenv.c +++ b/sysdeps/s390/fpu/fegetenv.c @@ -20,10 +20,6 @@ #include <fenv_libc.h> #include <fpu_control.h> -#include <stddef.h> -#include <asm/ptrace.h> -#include <sys/ptrace.h> -#include <unistd.h> int fegetenv (fenv_t *envp) @@ -33,3 +29,4 @@ fegetenv (fenv_t *envp) /* Success. */ return 0; } +libm_hidden_def (fegetenv) diff --git a/sysdeps/s390/s390-64/utf16-utf32-z9.c b/sysdeps/s390/s390-64/utf16-utf32-z9.c index 868dea68ca..14daf2118f 100644 --- a/sysdeps/s390/s390-64/utf16-utf32-z9.c +++ b/sysdeps/s390/s390-64/utf16-utf32-z9.c @@ -203,7 +203,10 @@ gconv_end (struct __gconv_step *data) swapping). */ #define BODY \ { \ - if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + /* The hardware instruction currently fails to report an error for \ + isolated low surrogates so we have to disable the instruction \ + until this gets resolved. */ \ + if (0) /* (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) */ \ { \ HARDWARE_CONVERT ("cu24 %0, %1, 1"); \ if (inptr != inend) \ @@ -229,6 +232,12 @@ gconv_end (struct __gconv_step *data) } \ else \ { \ + /* An isolated low-surrogate was found. This has to be \ + considered ill-formed. */ \ + if (__builtin_expect (u1 >= 0xdc00, 0)) \ + { \ + STANDARD_FROM_LOOP_ERR_HANDLER (2); \ + } \ /* It's a surrogate character. At least the first word says \ it is. */ \ if (__builtin_expect (inptr + 4 > inend, 0)) \ diff --git a/sysdeps/s390/s390-64/utf8-utf16-z9.c b/sysdeps/s390/s390-64/utf8-utf16-z9.c index 531d3ebd4b..5f73f3c59e 100644 --- a/sysdeps/s390/s390-64/utf8-utf16-z9.c +++ b/sysdeps/s390/s390-64/utf8-utf16-z9.c @@ -345,9 +345,12 @@ gconv_end (struct __gconv_step *data) Operation. 
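The check added to the software path above, and repeated in the utf8-to-utf16 converter that follows, rejects an isolated low surrogate in lead position. In C terms the classification of a UTF-16 lead unit is (names illustrative only):

#include <stdint.h>

/* Classify a UTF-16 code unit encountered in lead position. */
static int
utf16_lead_class (uint16_t u)
{
  if (u >= 0xd800 && u <= 0xdbff)
    return 1;    /* high surrogate: a low surrogate must follow */
  if (u >= 0xdc00 && u <= 0xdfff)
    return -1;   /* isolated low surrogate: ill-formed input */
  return 0;      /* ordinary BMP scalar value */
}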
*/ #define BODY \ { \ - if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + /* The hardware instruction currently fails to report an error for \ + isolated low surrogates so we have to disable the instruction \ + until this gets resolved. */ \ + if (0) /* (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) */ \ { \ - HARDWARE_CONVERT ("cu21 %0, %1"); \ + HARDWARE_CONVERT ("cu21 %0, %1, 1"); \ if (inptr != inend) \ { \ /* Check if the third byte is \ @@ -388,7 +391,7 @@ gconv_end (struct __gconv_step *data) \ outptr += 2; \ } \ - else if (c >= 0x0800 && c <= 0xd7ff) \ + else if ((c >= 0x0800 && c <= 0xd7ff) || c > 0xdfff) \ { \ /* Three byte UTF-8 char. */ \ \ diff --git a/sysdeps/sh/sh4/fpu/fegetenv.c b/sysdeps/sh/sh4/fpu/fegetenv.c index c07b32af30..683939b52d 100644 --- a/sysdeps/sh/sh4/fpu/fegetenv.c +++ b/sysdeps/sh/sh4/fpu/fegetenv.c @@ -29,3 +29,4 @@ fegetenv (fenv_t *envp) return 0; } +libm_hidden_def (fegetenv) diff --git a/sysdeps/sparc/fpu/fegetenv.c b/sysdeps/sparc/fpu/fegetenv.c index 36486f5973..c606a9cac0 100644 --- a/sysdeps/sparc/fpu/fegetenv.c +++ b/sysdeps/sparc/fpu/fegetenv.c @@ -34,4 +34,5 @@ strong_alias (__fegetenv, __old_fegetenv) compat_symbol (libm, __old_fegetenv, fegetenv, GLIBC_2_1); #endif +libm_hidden_ver (__fegetenv, fegetenv) versioned_symbol (libm, __fegetenv, fegetenv, GLIBC_2_2); diff --git a/sysdeps/sparc/sparc32/dl-irel.h b/sysdeps/sparc/sparc32/dl-irel.h new file mode 100644 index 0000000000..1891938d6d --- /dev/null +++ b/sysdeps/sparc/sparc32/dl-irel.h @@ -0,0 +1,55 @@ +/* Machine-dependent ELF indirect relocation inline functions. + SPARC 32-bit version. + Copyright (C) 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#ifndef _DL_IREL_H +#define _DL_IREL_H + +#include <stdio.h> +#include <unistd.h> +#include <dl-plt.h> + +#define ELF_MACHINE_IRELA 1 + +static inline void +__attribute ((always_inline)) +elf_irela (const Elf32_Rela *reloc) +{ + unsigned int r_type = ELF32_R_TYPE (reloc->r_info); + + if (__builtin_expect (r_type == R_SPARC_IRELATIVE, 1)) + { + Elf32_Addr *const reloc_addr = (void *) reloc->r_offset; + Elf32_Addr value = ((Elf32_Addr (*) (void)) reloc->r_addend) (); + *reloc_addr = value; + } + else if (__builtin_expect (r_type == R_SPARC_JMP_IREL, 1)) + { + Elf32_Addr *const reloc_addr = (void *) reloc->r_offset; + Elf32_Addr value = ((Elf32_Addr (*) (void)) reloc->r_addend) (); + + sparc_fixup_plt (reloc, reloc_addr, value, 0, 1); + } + else if (r_type == R_SPARC_NONE) + ; + else + __libc_fatal ("unexpected reloc type in static binary"); +} + +#endif /* dl-irel.h */ diff --git a/sysdeps/sparc/sparc32/dl-machine.h b/sysdeps/sparc/sparc32/dl-machine.h index b3b7852d87..e1385f7aca 100644 --- a/sysdeps/sparc/sparc32/dl-machine.h +++ b/sysdeps/sparc/sparc32/dl-machine.h @@ -1,5 +1,5 @@ /* Machine-dependent ELF dynamic relocation inline functions. SPARC version. - Copyright (C) 1996-2003, 2004, 2005, 2006, 2007 + Copyright (C) 1996-2003, 2004, 2005, 2006, 2007, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -27,20 +27,13 @@ #include <sys/param.h> #include <ldsodefs.h> #include <tls.h> +#include <dl-plt.h> #ifndef VALIDX # define VALIDX(tag) (DT_NUM + DT_THISPROCNUM + DT_VERSIONTAGNUM \ + DT_EXTRANUM + DT_VALTAGIDX (tag)) #endif -/* Some SPARC opcodes we need to use for self-modifying code. */ -#define OPCODE_NOP 0x01000000 /* nop */ -#define OPCODE_CALL 0x40000000 /* call ?; add PC-rel word address */ -#define OPCODE_SETHI_G1 0x03000000 /* sethi ?, %g1; add value>>10 */ -#define OPCODE_JMP_G1 0x81c06000 /* jmp %g1+?; add lo 10 bits of value */ -#define OPCODE_SAVE_SP 0x9de3bfa8 /* save %sp, -(16+6)*4, %sp */ -#define OPCODE_BA 0x30800000 /* b,a ?; add PC-rel word address */ - /* Return nonzero iff ELF header is compatible with the running host. */ static inline int elf_machine_matches_host (const Elf32_Ehdr *ehdr) @@ -312,41 +305,6 @@ _dl_start_user:\n\ .size _dl_start_user, . - _dl_start_user\n\ .previous"); -static inline __attribute__ ((always_inline)) Elf32_Addr -sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr, - Elf32_Addr value, int t, int do_flush) -{ - Elf32_Sword disp = value - (Elf32_Addr) reloc_addr; - - if (0 && disp >= -0x800000 && disp < 0x800000) - { - /* Don't need to worry about thread safety. We're writing just one - instruction. */ - - reloc_addr[0] = OPCODE_BA | ((disp >> 2) & 0x3fffff); - if (do_flush) - __asm __volatile ("flush %0" : : "r"(reloc_addr)); - } - else - { - /* For thread safety, write the instructions from the bottom and - flush before we overwrite the critical "b,a". This of course - need not be done during bootstrapping, since there are no threads. - But we also can't tell if we _can_ use flush, so don't. 
*/ - - reloc_addr += t; - reloc_addr[1] = OPCODE_JMP_G1 | (value & 0x3ff); - if (do_flush) - __asm __volatile ("flush %0+4" : : "r"(reloc_addr)); - - reloc_addr[0] = OPCODE_SETHI_G1 | (value >> 10); - if (do_flush) - __asm __volatile ("flush %0" : : "r"(reloc_addr)); - } - - return value; -} - static inline Elf32_Addr elf_machine_fixup_plt (struct link_map *map, lookup_t t, const Elf32_Rela *reloc, @@ -433,6 +391,13 @@ elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc, value += reloc->r_addend; /* Assume copy relocs have zero addend. */ + if (sym != NULL + && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0) + && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1)) + { + value = ((Elf32_Addr (*) (void)) value) (); + } + switch (r_type) { #if !defined RTLD_BOOTSTRAP && !defined RESOLVE_CONFLICT_FIND_MAP @@ -460,6 +425,13 @@ elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc, case R_SPARC_32: *reloc_addr = value; break; + case R_SPARC_IRELATIVE: + value = ((Elf32_Addr (*) (void)) value) (); + *reloc_addr = value; + break; + case R_SPARC_JMP_IREL: + value = ((Elf32_Addr (*) (void)) value) (); + /* Fall thru */ case R_SPARC_JMP_SLOT: { #if !defined RTLD_BOOTSTRAP && !defined __sparc_v9__ @@ -578,16 +550,21 @@ __attribute__ ((always_inline)) elf_machine_lazy_rel (struct link_map *map, Elf32_Addr l_addr, const Elf32_Rela *reloc) { - switch (ELF32_R_TYPE (reloc->r_info)) + Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset); + const unsigned int r_type = ELF32_R_TYPE (reloc->r_info); + + if (__builtin_expect (r_type == R_SPARC_JMP_SLOT, 1)) + ; + else if (r_type == R_SPARC_JMP_IREL) { - case R_SPARC_NONE: - break; - case R_SPARC_JMP_SLOT: - break; - default: - _dl_reloc_bad_type (map, ELFW(R_TYPE) (reloc->r_info), 1); - break; + Elf32_Addr value = map->l_addr + reloc->r_addend; + value = ((Elf32_Addr (*) (void)) value) (); + sparc_fixup_plt (reloc, reloc_addr, value, 0, 1); } + else if (r_type == R_SPARC_NONE) + ; + else + _dl_reloc_bad_type (map, r_type, 1); } #endif /* RESOLVE_MAP */ diff --git a/sysdeps/sparc/sparc32/dl-plt.h b/sysdeps/sparc/sparc32/dl-plt.h new file mode 100644 index 0000000000..edcc5c1374 --- /dev/null +++ b/sysdeps/sparc/sparc32/dl-plt.h @@ -0,0 +1,62 @@ +/* PLT fixups. Sparc 32-bit version. + Copyright (C) 1996-2003, 2004, 2005, 2006, 2007, 2010 + Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +/* Some SPARC opcodes we need to use for self-modifying code. 
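The R_SPARC_IRELATIVE and R_SPARC_JMP_IREL cases added here (and again in the sparc64 files below) share one convention: the relocation value is not the final address but the address of a resolver function whose return value is what gets installed. A minimal illustration of that calling pattern, with deliberately simplified types:

typedef unsigned long addr_t;   /* stands in for Elf32_Addr/Elf64_Addr */

/* Given the resolver address carried by the relocation, call it and
   return the implementation address the relocation should install. */
static addr_t
resolve_indirect (addr_t resolver_value)
{
  addr_t (*resolver) (void) = (addr_t (*) (void)) resolver_value;
  return resolver ();
}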
*/ +#define OPCODE_NOP 0x01000000 /* nop */ +#define OPCODE_CALL 0x40000000 /* call ?; add PC-rel word address */ +#define OPCODE_SETHI_G1 0x03000000 /* sethi ?, %g1; add value>>10 */ +#define OPCODE_JMP_G1 0x81c06000 /* jmp %g1+?; add lo 10 bits of value */ +#define OPCODE_SAVE_SP 0x9de3bfa8 /* save %sp, -(16+6)*4, %sp */ +#define OPCODE_BA 0x30800000 /* b,a ?; add PC-rel word address */ + +static inline __attribute__ ((always_inline)) Elf32_Addr +sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr, + Elf32_Addr value, int t, int do_flush) +{ + Elf32_Sword disp = value - (Elf32_Addr) reloc_addr; + + if (0 && disp >= -0x800000 && disp < 0x800000) + { + /* Don't need to worry about thread safety. We're writing just one + instruction. */ + + reloc_addr[0] = OPCODE_BA | ((disp >> 2) & 0x3fffff); + if (do_flush) + __asm __volatile ("flush %0" : : "r"(reloc_addr)); + } + else + { + /* For thread safety, write the instructions from the bottom and + flush before we overwrite the critical "b,a". This of course + need not be done during bootstrapping, since there are no threads. + But we also can't tell if we _can_ use flush, so don't. */ + + reloc_addr += t; + reloc_addr[1] = OPCODE_JMP_G1 | (value & 0x3ff); + if (do_flush) + __asm __volatile ("flush %0+4" : : "r"(reloc_addr)); + + reloc_addr[0] = OPCODE_SETHI_G1 | (value >> 10); + if (do_flush) + __asm __volatile ("flush %0" : : "r"(reloc_addr)); + } + + return value; +} diff --git a/sysdeps/sparc/sparc64/dl-irel.h b/sysdeps/sparc/sparc64/dl-irel.h new file mode 100644 index 0000000000..1a2a0a3dd5 --- /dev/null +++ b/sysdeps/sparc/sparc64/dl-irel.h @@ -0,0 +1,58 @@ +/* Machine-dependent ELF indirect relocation inline functions. + SPARC 64-bit version. + Copyright (C) 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef _DL_IREL_H +#define _DL_IREL_H + +#include <stdio.h> +#include <unistd.h> +#include <dl-plt.h> + +#define ELF_MACHINE_IRELA 1 + +static inline void +__attribute ((always_inline)) +elf_irela (const Elf64_Rela *reloc) +{ + unsigned int r_type = (reloc->r_info & 0xff); + + if (__builtin_expect (r_type == R_SPARC_IRELATIVE, 1)) + { + Elf64_Addr *const reloc_addr = (void *) reloc->r_offset; + Elf64_Addr value = ((Elf64_Addr (*) (void)) reloc->r_addend) (); + *reloc_addr = value; + } + else if (__builtin_expect (r_type == R_SPARC_JMP_IREL, 1)) + { + Elf64_Addr *const reloc_addr = (void *) reloc->r_offset; + Elf64_Addr value = ((Elf64_Addr (*) (void)) reloc->r_addend) (); + struct link_map map = { .l_addr = 0 }; + + /* 'high' is always zero, for large PLT entries the linker + emits an R_SPARC_IRELATIVE. 
*/ + sparc64_fixup_plt (&map, reloc, reloc_addr, value, 0, 0); + } + else if (r_type == R_SPARC_NONE) + ; + else + __libc_fatal ("unexpected reloc type in static binary"); +} + +#endif /* dl-irel.h */ diff --git a/sysdeps/sparc/sparc64/dl-machine.h b/sysdeps/sparc/sparc64/dl-machine.h index 3eee672912..b4f43e9cf5 100644 --- a/sysdeps/sparc/sparc64/dl-machine.h +++ b/sysdeps/sparc/sparc64/dl-machine.h @@ -1,6 +1,6 @@ /* Machine-dependent ELF dynamic relocation inline functions. Sparc64 version. - Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006 - Free Software Foundation, Inc. + Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, + 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -27,6 +27,7 @@ #include <sys/param.h> #include <ldsodefs.h> #include <sysdep.h> +#include <dl-plt.h> #ifndef VALIDX # define VALIDX(tag) (DT_NUM + DT_THISPROCNUM + DT_VERSIONTAGNUM \ @@ -89,132 +90,6 @@ elf_machine_load_address (void) return (Elf64_Addr) got - *got + (Elf32_Sword) ((pc[2] - pc[3]) * 4) - 4; } -/* We have 4 cases to handle. And we code different code sequences - for each one. I love V9 code models... */ -static inline void __attribute__ ((always_inline)) -sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc, - Elf64_Addr *reloc_addr, Elf64_Addr value, - Elf64_Addr high, int t) -{ - unsigned int *insns = (unsigned int *) reloc_addr; - Elf64_Addr plt_vaddr = (Elf64_Addr) reloc_addr; - Elf64_Sxword disp = value - plt_vaddr; - - /* Now move plt_vaddr up to the call instruction. */ - plt_vaddr += ((t + 1) * 4); - - /* PLT entries .PLT32768 and above look always the same. */ - if (__builtin_expect (high, 0) != 0) - { - *reloc_addr = value - map->l_addr; - } - /* Near destination. */ - else if (disp >= -0x800000 && disp < 0x800000) - { - /* As this is just one instruction, it is thread safe and so - we can avoid the unnecessary sethi FOO, %g1. - b,a target */ - insns[0] = 0x30800000 | ((disp >> 2) & 0x3fffff); - __asm __volatile ("flush %0" : : "r" (insns)); - } - /* 32-bit Sparc style, the target is in the lower 32-bits of - address space. */ - else if (insns += t, (value >> 32) == 0) - { - /* sethi %hi(target), %g1 - jmpl %g1 + %lo(target), %g0 */ - - insns[1] = 0x81c06000 | (value & 0x3ff); - __asm __volatile ("flush %0 + 4" : : "r" (insns)); - - insns[0] = 0x03000000 | ((unsigned int)(value >> 10)); - __asm __volatile ("flush %0" : : "r" (insns)); - } - /* We can also get somewhat simple sequences if the distance between - the target and the PLT entry is within +/- 2GB. */ - else if ((plt_vaddr > value - && ((plt_vaddr - value) >> 31) == 0) - || (value > plt_vaddr - && ((value - plt_vaddr) >> 31) == 0)) - { - unsigned int displacement; - - if (plt_vaddr > value) - displacement = (0 - (plt_vaddr - value)); - else - displacement = value - plt_vaddr; - - /* mov %o7, %g1 - call displacement - mov %g1, %o7 */ - - insns[2] = 0x9e100001; - __asm __volatile ("flush %0 + 8" : : "r" (insns)); - - insns[1] = 0x40000000 | (displacement >> 2); - __asm __volatile ("flush %0 + 4" : : "r" (insns)); - - insns[0] = 0x8210000f; - __asm __volatile ("flush %0" : : "r" (insns)); - } - /* Worst case, ho hum... */ - else - { - unsigned int high32 = (value >> 32); - unsigned int low32 = (unsigned int) value; - - /* ??? Some tricks can be stolen from the sparc64 egcs backend - constant formation code I wrote. 
-DaveM */ - - if (__builtin_expect (high32 & 0x3ff, 0)) - { - /* sethi %hh(value), %g1 - sethi %lm(value), %g5 - or %g1, %hm(value), %g1 - or %g5, %lo(value), %g5 - sllx %g1, 32, %g1 - jmpl %g1 + %g5, %g0 - nop */ - - insns[5] = 0x81c04005; - __asm __volatile ("flush %0 + 20" : : "r" (insns)); - - insns[4] = 0x83287020; - __asm __volatile ("flush %0 + 16" : : "r" (insns)); - - insns[3] = 0x8a116000 | (low32 & 0x3ff); - __asm __volatile ("flush %0 + 12" : : "r" (insns)); - - insns[2] = 0x82106000 | (high32 & 0x3ff); - } - else - { - /* sethi %hh(value), %g1 - sethi %lm(value), %g5 - sllx %g1, 32, %g1 - or %g5, %lo(value), %g5 - jmpl %g1 + %g5, %g0 - nop */ - - insns[4] = 0x81c04005; - __asm __volatile ("flush %0 + 16" : : "r" (insns)); - - insns[3] = 0x8a116000 | (low32 & 0x3ff); - __asm __volatile ("flush %0 + 12" : : "r" (insns)); - - insns[2] = 0x83287020; - } - - __asm __volatile ("flush %0 + 8" : : "r" (insns)); - - insns[1] = 0x0b000000 | (low32 >> 10); - __asm __volatile ("flush %0 + 4" : : "r" (insns)); - - insns[0] = 0x03000000 | (high32 >> 10); - __asm __volatile ("flush %0" : : "r" (insns)); - } -} - static inline Elf64_Addr __attribute__ ((always_inline)) elf_machine_fixup_plt (struct link_map *map, lookup_t t, const Elf64_Rela *reloc, @@ -549,6 +424,11 @@ elf_machine_rela (struct link_map *map, const Elf64_Rela *reloc, value += reloc->r_addend; /* Assume copy relocs have zero addend. */ + if (sym != NULL + && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0) + && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1)) + value = ((Elf64_Addr (*) (void)) value) (); + switch (r_type) { #if !defined RTLD_BOOTSTRAP && !defined RESOLVE_CONFLICT_FIND_MAP @@ -576,6 +456,13 @@ elf_machine_rela (struct link_map *map, const Elf64_Rela *reloc, case R_SPARC_GLOB_DAT: *reloc_addr = value; break; + case R_SPARC_IRELATIVE: + value = ((Elf64_Addr (*) (void)) value) (); + *reloc_addr = value; + break; + case R_SPARC_JMP_IREL: + value = ((Elf64_Addr (*) (void)) value) (); + /* Fall thru */ case R_SPARC_JMP_SLOT: #ifdef RESOLVE_CONFLICT_FIND_MAP /* R_SPARC_JMP_SLOT conflicts against .plt[32768+] @@ -757,16 +644,29 @@ __attribute__ ((always_inline)) elf_machine_lazy_rel (struct link_map *map, Elf64_Addr l_addr, const Elf64_Rela *reloc) { - switch (ELF64_R_TYPE (reloc->r_info)) + Elf64_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset); + const unsigned int r_type = ELF64_R_TYPE (reloc->r_info); + + if (__builtin_expect (r_type == R_SPARC_JMP_SLOT, 1)) + ; + else if (r_type == R_SPARC_JMP_IREL + || r_type == R_SPARC_IRELATIVE) { - case R_SPARC_NONE: - break; - case R_SPARC_JMP_SLOT: - break; - default: - _dl_reloc_bad_type (map, ELFW(R_TYPE) (reloc->r_info), 1); - break; + Elf64_Addr value = map->l_addr + reloc->r_addend; + value = ((Elf64_Addr (*) (void)) value) (); + if (r_type == R_SPARC_JMP_IREL) + { + /* 'high' is always zero, for large PLT entries the linker + emits an R_SPARC_IRELATIVE. */ + sparc64_fixup_plt (map, reloc, reloc_addr, value, 0, 0); + } + else + *reloc_addr = value; } + else if (r_type == R_SPARC_NONE) + ; + else + _dl_reloc_bad_type (map, r_type, 1); } #endif /* RESOLVE_MAP */ diff --git a/sysdeps/sparc/sparc64/dl-plt.h b/sysdeps/sparc/sparc64/dl-plt.h new file mode 100644 index 0000000000..e06be43a0a --- /dev/null +++ b/sysdeps/sparc/sparc64/dl-plt.h @@ -0,0 +1,144 @@ +/* PLT fixups. Sparc 64-bit version. + Copyright (C) 1997-2006, 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +/* We have 4 cases to handle. And we code different code sequences + for each one. I love V9 code models... */ +static inline void __attribute__ ((always_inline)) +sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc, + Elf64_Addr *reloc_addr, Elf64_Addr value, + Elf64_Addr high, int t) +{ + unsigned int *insns = (unsigned int *) reloc_addr; + Elf64_Addr plt_vaddr = (Elf64_Addr) reloc_addr; + Elf64_Sxword disp = value - plt_vaddr; + + /* Now move plt_vaddr up to the call instruction. */ + plt_vaddr += ((t + 1) * 4); + + /* PLT entries .PLT32768 and above look always the same. */ + if (__builtin_expect (high, 0) != 0) + { + *reloc_addr = value - map->l_addr; + } + /* Near destination. */ + else if (disp >= -0x800000 && disp < 0x800000) + { + /* As this is just one instruction, it is thread safe and so + we can avoid the unnecessary sethi FOO, %g1. + b,a target */ + insns[0] = 0x30800000 | ((disp >> 2) & 0x3fffff); + __asm __volatile ("flush %0" : : "r" (insns)); + } + /* 32-bit Sparc style, the target is in the lower 32-bits of + address space. */ + else if (insns += t, (value >> 32) == 0) + { + /* sethi %hi(target), %g1 + jmpl %g1 + %lo(target), %g0 */ + + insns[1] = 0x81c06000 | (value & 0x3ff); + __asm __volatile ("flush %0 + 4" : : "r" (insns)); + + insns[0] = 0x03000000 | ((unsigned int)(value >> 10)); + __asm __volatile ("flush %0" : : "r" (insns)); + } + /* We can also get somewhat simple sequences if the distance between + the target and the PLT entry is within +/- 2GB. */ + else if ((plt_vaddr > value + && ((plt_vaddr - value) >> 31) == 0) + || (value > plt_vaddr + && ((value - plt_vaddr) >> 31) == 0)) + { + unsigned int displacement; + + if (plt_vaddr > value) + displacement = (0 - (plt_vaddr - value)); + else + displacement = value - plt_vaddr; + + /* mov %o7, %g1 + call displacement + mov %g1, %o7 */ + + insns[2] = 0x9e100001; + __asm __volatile ("flush %0 + 8" : : "r" (insns)); + + insns[1] = 0x40000000 | (displacement >> 2); + __asm __volatile ("flush %0 + 4" : : "r" (insns)); + + insns[0] = 0x8210000f; + __asm __volatile ("flush %0" : : "r" (insns)); + } + /* Worst case, ho hum... */ + else + { + unsigned int high32 = (value >> 32); + unsigned int low32 = (unsigned int) value; + + /* ??? Some tricks can be stolen from the sparc64 egcs backend + constant formation code I wrote. 
-DaveM */ + + if (__builtin_expect (high32 & 0x3ff, 0)) + { + /* sethi %hh(value), %g1 + sethi %lm(value), %g5 + or %g1, %hm(value), %g1 + or %g5, %lo(value), %g5 + sllx %g1, 32, %g1 + jmpl %g1 + %g5, %g0 + nop */ + + insns[5] = 0x81c04005; + __asm __volatile ("flush %0 + 20" : : "r" (insns)); + + insns[4] = 0x83287020; + __asm __volatile ("flush %0 + 16" : : "r" (insns)); + + insns[3] = 0x8a116000 | (low32 & 0x3ff); + __asm __volatile ("flush %0 + 12" : : "r" (insns)); + + insns[2] = 0x82106000 | (high32 & 0x3ff); + } + else + { + /* sethi %hh(value), %g1 + sethi %lm(value), %g5 + sllx %g1, 32, %g1 + or %g5, %lo(value), %g5 + jmpl %g1 + %g5, %g0 + nop */ + + insns[4] = 0x81c04005; + __asm __volatile ("flush %0 + 16" : : "r" (insns)); + + insns[3] = 0x8a116000 | (low32 & 0x3ff); + __asm __volatile ("flush %0 + 12" : : "r" (insns)); + + insns[2] = 0x83287020; + } + + __asm __volatile ("flush %0 + 8" : : "r" (insns)); + + insns[1] = 0x0b000000 | (low32 >> 10); + __asm __volatile ("flush %0 + 4" : : "r" (insns)); + + insns[0] = 0x03000000 | (high32 >> 10); + __asm __volatile ("flush %0" : : "r" (insns)); + } +} diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S index 4cb968505d..cfd9864f63 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S @@ -28,19 +28,12 @@ #define __longjmp ____longjmp_chk #ifdef PIC -# ifdef HAVE_ASM_PPC_REL16 # define LOAD_ARG \ bcl 20,31,1f; \ 1: mflr r3; \ addis r3,r3,_GLOBAL_OFFSET_TABLE_-1b@ha; \ addi r3,r3,_GLOBAL_OFFSET_TABLE_-1b@l; \ lwz r3,.LC0@got(r3) -# else -# define LOAD_ARG \ - bl _GLOBAL_OFFSET_TABLE_-4@local; \ - mflr r3; \ - lwz r3,.LC0@got(r3) -# endif #else # define LOAD_ARG \ lis r3,.LC0@ha; \ diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/brk.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/brk.S index e945834945..4c8c6b433b 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/brk.S +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/brk.S @@ -36,17 +36,10 @@ ENTRY (BP_SYM (__brk)) DO_CALL(SYS_ify(brk)) lwz r6,8(r1) #ifdef PIC -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r5 addis r5,r5,__curbrk-1b@ha stw r3,__curbrk-1b@l(r5) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r5 - lwz r5,__curbrk@got(r5) - stw r3,0(r5) -# endif #else lis r4,__curbrk@ha stw r3,__curbrk@l(r4) diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S index 63e1773e22..27285ed4a5 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S @@ -145,15 +145,10 @@ ENTRY(__CONTEXT_FUNC_NAME) # ifdef __CONTEXT_ENABLE_VRS # ifdef PIC mflr r8 -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r7 addis r7,r7,_GLOBAL_OFFSET_TABLE_-1b@ha addi r7,r7,_GLOBAL_OFFSET_TABLE_-1b@l -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r7 -# endif # ifdef SHARED lwz r7,_rtld_global_ro@got(r7) mtlr r8 diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies index d379a2dd12..af946119aa 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies @@ -1,3 +1,4 @@ # Make sure this comes before the powerpc/powerpc32/fpu that's # listed in unix/sysv/linux/powerpc/powerpc32/fpu/Implies. 
+powerpc/powerpc32/power7/fpu powerpc/powerpc32/power5/fpu diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S index 127c9e4581..f304090868 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S @@ -73,15 +73,10 @@ ENTRY(__CONTEXT_FUNC_NAME) #ifdef PIC mflr r8 -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r7 addis r7,r7,_GLOBAL_OFFSET_TABLE_-1b@ha addi r7,r7,_GLOBAL_OFFSET_TABLE_-1b@l -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r7 -# endif # ifdef SHARED lwz r7,_rtld_global_ro@got(r7) mtlr r8 diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S index 89b1a61954..62efee2dce 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S @@ -146,15 +146,10 @@ ENTRY(__CONTEXT_FUNC_NAME) # ifdef PIC mflr r8 -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r7 addis r7,r7,_GLOBAL_OFFSET_TABLE_-1b@ha addi r7,r7,_GLOBAL_OFFSET_TABLE_-1b@l -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r7 -# endif # ifdef SHARED lwz r7,_rtld_global_ro@got(r7) mtlr r8 diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies index c46b3d42af..ca112208d1 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies @@ -1,3 +1,4 @@ # Make sure this comes before the powerpc/powerpc64/fpu that's # listed in unix/sysv/linux/powerpc/powerpc64/fpu/Implies. +powerpc/powerpc64/power7/fpu powerpc/powerpc64/power5/fpu diff --git a/sysdeps/x86_64/fpu/fegetenv.c b/sysdeps/x86_64/fpu/fegetenv.c index fa5a8dadcb..2159a1fab1 100644 --- a/sysdeps/x86_64/fpu/fegetenv.c +++ b/sysdeps/x86_64/fpu/fegetenv.c @@ -28,3 +28,4 @@ fegetenv (fenv_t *envp) /* Success. */ return 0; } +libm_hidden_def (fegetenv)
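The fegetenv changes scattered through this patch only add hidden aliases so that internal libm callers bind directly instead of going through the PLT; the exported interface remains the standard C99 one. Typical usage, for reference:

#include <fenv.h>

/* Save the floating-point environment, run work that may change
   rounding or exception state, then restore it. */
int
with_saved_fenv (void (*work) (void))
{
  fenv_t env;
  if (fegetenv (&env) != 0)
    return -1;
  work ();
  return fesetenv (&env);
}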