Diffstat (limited to 'sysdeps')
31 files changed, 1569 insertions, 723 deletions
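The bulk of this commit adds unrolled SSE2 strlen implementations for i386 and x86-64 (plus a Slow_BSF CPU-feature bit so Atom is steered to a variant that avoids bsf), factors the repeated 16-way _mm_srli_si128 switches in the SSE4.2 string functions into a shared varshift helper, and replaces the old MMX/naive x86-64 mpn assembly. As a reader aid, here is a minimal C-intrinsics sketch of the 16-bytes-at-a-time scan that the new strlen assembly unrolls. It is an illustration under stated assumptions, not the glibc code: strlen_sse2_sketch is a hypothetical name, and __builtin_ctz is the compiler counterpart of the bsf instruction used in the assembly exit paths.

#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stddef.h>
#include <stdint.h>

/* Illustrative only -- not the glibc code.  Assumes an x86 target,
   where an aligned 16-byte load never crosses a page boundary, so
   rounding the pointer down with "& ~15" (the assembly's "and $-16")
   is safe even though it may read bytes before the string.  */
static size_t
strlen_sse2_sketch (const char *s)
{
  const __m128i zero = _mm_setzero_si128 ();
  const char *p = (const char *) ((uintptr_t) s & ~(uintptr_t) 15);
  /* First, possibly partial, block: compare all 16 bytes against NUL,
     then drop the match bits that belong to bytes before S.  */
  unsigned int mask =
    _mm_movemask_epi8 (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) p),
				       zero));
  mask >>= (uintptr_t) s & 15;
  if (mask != 0)
    return __builtin_ctz (mask);	/* bit index == byte index */
  for (;;)
    {
      p += 16;
      mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) p),
						zero));
      if (mask != 0)			/* some byte in this block is NUL */
	return (size_t) (p - s) + __builtin_ctz (mask);
    }
}

The assembly below unrolls this loop four blocks (64 bytes) deep with four xmm accumulators; the final __builtin_ctz is where bsf appears, and it is exactly what the new __strlen_no_bsf / __strlen_sse2 (i386) variants replace with per-byte tests when the Slow_BSF feature bit is set.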
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index 12bcfc273f..32286d8d38 100644 --- a/sysdeps/i386/i686/multiarch/Makefile +++ b/sysdeps/i386/i686/multiarch/Makefile @@ -9,9 +9,11 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \ memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \ memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \ strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \ - memcmp-ssse3 memcmp-sse4 strcasestr-nonascii + memcmp-ssse3 memcmp-sse4 strcasestr-nonascii varshift \ + strlen-sse2 strlen-sse2-bsf ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c +CFLAGS-varshift.c += -msse4 CFLAGS-strcspn-c.c += -msse4 CFLAGS-strpbrk-c.c += -msse4 CFLAGS-strspn-c.c += -msse4 diff --git a/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S b/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S new file mode 100644 index 0000000000..0dc651f017 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S @@ -0,0 +1,127 @@ +/* strlen with SSE2 and BSF + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#if defined SHARED && !defined NOT_IN_libc + +#include <sysdep.h> +#include "asm-syntax.h" + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) +#define PARMS 4 + 8 /* Preserve ESI and EDI. 
*/ +#define STR PARMS +#define ENTRANCE PUSH (%esi); PUSH (%edi); cfi_remember_state +#define RETURN POP (%edi); POP (%esi); ret; \ + cfi_restore_state; cfi_remember_state + + .text +ENTRY ( __strlen_sse2_bsf) + ENTRANCE + mov STR(%esp), %edi + xor %eax, %eax + mov %edi, %ecx + and $0x3f, %ecx + pxor %xmm0, %xmm0 + cmp $0x30, %ecx + ja L(next) + movdqu (%edi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit_less16) + mov %edi, %eax + and $-16, %eax + jmp L(align16_start) +L(next): + + mov %edi, %eax + and $-16, %eax + pcmpeqb (%eax), %xmm0 + mov $-1, %esi + sub %eax, %ecx + shl %cl, %esi + pmovmskb %xmm0, %edx + and %esi, %edx + jnz L(exit) +L(align16_start): + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + .p2align 4 +L(align16_loop): + pcmpeqb 16(%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%eax), %xmm3 + pmovmskb %xmm3, %edx + lea 64(%eax), %eax + test %edx, %edx + jz L(align16_loop) +L(exit): + sub %edi, %eax +L(exit_less16): + bsf %edx, %edx + add %edx, %eax + RETURN +L(exit16): + sub %edi, %eax + bsf %edx, %edx + add %edx, %eax + add $16, %eax + RETURN +L(exit32): + sub %edi, %eax + bsf %edx, %edx + add %edx, %eax + add $32, %eax + RETURN +L(exit48): + sub %edi, %eax + bsf %edx, %edx + add %edx, %eax + add $48, %eax + POP (%edi) + POP (%esi) + ret + +END ( __strlen_sse2_bsf) + +#endif diff --git a/sysdeps/i386/i686/multiarch/strlen-sse2.S b/sysdeps/i386/i686/multiarch/strlen-sse2.S new file mode 100644 index 0000000000..65809d985b --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strlen-sse2.S @@ -0,0 +1,347 @@ +/* strlen with SSE2 + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#if defined SHARED && !defined NOT_IN_libc + +#include <sysdep.h> +#include "asm-syntax.h" + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) +#define PARMS 4 +#define STR PARMS +#define ENTRANCE +#define RETURN ret + + .text +ENTRY (__strlen_sse2) + ENTRANCE + mov STR(%esp), %edx + xor %eax, %eax + cmpb $0, (%edx) + jz L(exit_tail0) + cmpb $0, 1(%edx) + jz L(exit_tail1) + cmpb $0, 2(%edx) + jz L(exit_tail2) + cmpb $0, 3(%edx) + jz L(exit_tail3) + cmpb $0, 4(%edx) + jz L(exit_tail4) + cmpb $0, 5(%edx) + jz L(exit_tail5) + cmpb $0, 6(%edx) + jz L(exit_tail6) + cmpb $0, 7(%edx) + jz L(exit_tail7) + cmpb $0, 8(%edx) + jz L(exit_tail8) + cmpb $0, 9(%edx) + jz L(exit_tail9) + cmpb $0, 10(%edx) + jz L(exit_tail10) + cmpb $0, 11(%edx) + jz L(exit_tail11) + cmpb $0, 12(%edx) + jz L(exit_tail12) + cmpb $0, 13(%edx) + jz L(exit_tail13) + cmpb $0, 14(%edx) + jz L(exit_tail14) + cmpb $0, 15(%edx) + jz L(exit_tail15) + pxor %xmm0, %xmm0 + mov %edx, %eax + mov %edx, %ecx + and $-16, %eax + add $16, %ecx + add $16, %eax + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + + pcmpeqb (%eax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + and $-0x40, %eax + PUSH (%esi) + PUSH (%edi) + PUSH (%ebx) + PUSH (%ebp) + xor %ebp, %ebp +L(aligned_64): + pcmpeqb (%eax), %xmm0 + pcmpeqb 16(%eax), %xmm1 + pcmpeqb 32(%eax), %xmm2 + pcmpeqb 48(%eax), %xmm3 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %esi + pmovmskb %xmm2, %edi + pmovmskb %xmm3, %ebx + or %edx, %ebp + or %esi, %ebp + or %edi, %ebp + or %ebx, %ebp + lea 64(%eax), %eax + jz L(aligned_64) +L(48leave): + test %edx, %edx + jnz L(aligned_64_exit_16) + test %esi, %esi + jnz L(aligned_64_exit_32) + test %edi, %edi + jnz L(aligned_64_exit_48) + mov %ebx, %edx + lea (%eax), %eax + jmp L(aligned_64_exit) 
+L(aligned_64_exit_48): + lea -16(%eax), %eax + mov %edi, %edx + jmp L(aligned_64_exit) +L(aligned_64_exit_32): + lea -32(%eax), %eax + mov %esi, %edx + jmp L(aligned_64_exit) +L(aligned_64_exit_16): + lea -48(%eax), %eax +L(aligned_64_exit): + POP (%ebp) + POP (%ebx) + POP (%edi) + POP (%esi) +L(exit): + sub %ecx, %eax + test %dl, %dl + jz L(exit_high) + test $0x01, %dl + jnz L(exit_tail0) + + test $0x02, %dl + jnz L(exit_tail1) + + test $0x04, %dl + jnz L(exit_tail2) + + test $0x08, %dl + jnz L(exit_tail3) + + test $0x10, %dl + jnz L(exit_tail4) + + test $0x20, %dl + jnz L(exit_tail5) + + test $0x40, %dl + jnz L(exit_tail6) + add $7, %eax +L(exit_tail0): + RETURN + +L(exit_high): + add $8, %eax + test $0x01, %dh + jnz L(exit_tail0) + + test $0x02, %dh + jnz L(exit_tail1) + + test $0x04, %dh + jnz L(exit_tail2) + + test $0x08, %dh + jnz L(exit_tail3) + + test $0x10, %dh + jnz L(exit_tail4) + + test $0x20, %dh + jnz L(exit_tail5) + + test $0x40, %dh + jnz L(exit_tail6) + add $7, %eax + RETURN + + .p2align 4 +L(exit_tail1): + add $1, %eax + RETURN + +L(exit_tail2): + add $2, %eax + RETURN + +L(exit_tail3): + add $3, %eax + RETURN + +L(exit_tail4): + add $4, %eax + RETURN + +L(exit_tail5): + add $5, %eax + RETURN + +L(exit_tail6): + add $6, %eax + RETURN + +L(exit_tail7): + add $7, %eax + RETURN + +L(exit_tail8): + add $8, %eax + RETURN + +L(exit_tail9): + add $9, %eax + RETURN + +L(exit_tail10): + add $10, %eax + RETURN + +L(exit_tail11): + add $11, %eax + RETURN + +L(exit_tail12): + add $12, %eax + RETURN + +L(exit_tail13): + add $13, %eax + RETURN + +L(exit_tail14): + add $14, %eax + RETURN + +L(exit_tail15): + add $15, %eax + ret + +END (__strlen_sse2) + +#endif diff --git a/sysdeps/i386/i686/multiarch/strlen.S b/sysdeps/i386/i686/multiarch/strlen.S index 9786add745..9d465c8454 100644 --- a/sysdeps/i386/i686/multiarch/strlen.S +++ b/sysdeps/i386/i686/multiarch/strlen.S @@ -1,5 +1,5 @@ /* Multiple versions of strlen - Copyright (C) 2009 Free Software Foundation, Inc. + Copyright (C) 2009, 2010 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -48,6 +48,9 @@ ENTRY(strlen) 1: leal __strlen_ia32@GOTOFF(%ebx), %eax testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx) jz 2f + leal __strlen_sse2_bsf@GOTOFF(%ebx), %eax + testl $bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx) + jz 2f leal __strlen_sse2@GOTOFF(%ebx), %eax 2: popl %ebx cfi_adjust_cfa_offset (-4); @@ -55,84 +58,6 @@ ENTRY(strlen) ret END(strlen) -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) - -#define RETURN popl %esi; CFI_POP (esi); ret - - .text -ENTRY (__strlen_sse2) -/* - * This implementation uses SSE instructions to compare up to 16 bytes - * at a time looking for the end of string (null char). - */ - pushl %esi - cfi_adjust_cfa_offset (4) - cfi_rel_offset (%esi, 0) - mov 8(%esp), %eax - mov %eax, %ecx - pxor %xmm0, %xmm0 /* 16 null chars */ - mov %eax, %esi - and $15, %ecx - jz 1f /* string is 16 byte aligned */ - - /* - * Unaligned case. Round down to 16-byte boundary before comparing - * 16 bytes for a null char. The code then compensates for any extra chars - * preceding the start of the string. 
- */ - and $-16, %esi - - pcmpeqb (%esi), %xmm0 - lea 16(%eax), %esi - pmovmskb %xmm0, %edx - - shr %cl, %edx /* Compensate for bytes preceding the string */ - test %edx, %edx - jnz 2f - sub %ecx, %esi /* no null, adjust to next 16-byte boundary */ - pxor %xmm0, %xmm0 /* clear xmm0, may have been changed... */ - - .p2align 4 -1: /* 16 byte aligned */ - pcmpeqb (%esi), %xmm0 /* look for null bytes */ - pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx */ - - add $16, %esi /* prepare to search next 16 bytes */ - test %edx, %edx /* if no null byte, %edx must be 0 */ - jnz 2f /* found a null */ - - pcmpeqb (%esi), %xmm0 - pmovmskb %xmm0, %edx - add $16, %esi - test %edx, %edx - jnz 2f - - pcmpeqb (%esi), %xmm0 - pmovmskb %xmm0, %edx - add $16, %esi - test %edx, %edx - jnz 2f - - pcmpeqb (%esi), %xmm0 - pmovmskb %xmm0, %edx - add $16, %esi - test %edx, %edx - jz 1b - -2: - neg %eax - lea -16(%eax, %esi), %eax /* calculate exact offset */ - bsf %edx, %ecx /* Least significant 1 bit is index of null */ - add %ecx, %eax - popl %esi - cfi_adjust_cfa_offset (-4) - cfi_restore (%esi) - ret - -END (__strlen_sse2) - # undef ENTRY # define ENTRY(name) \ .type __strlen_ia32, @function; \ diff --git a/sysdeps/i386/i686/multiarch/varshift.c b/sysdeps/i386/i686/multiarch/varshift.c new file mode 100644 index 0000000000..7760b966e2 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/varshift.c @@ -0,0 +1 @@ +#include <sysdeps/x86_64/multiarch/varshift.c> diff --git a/sysdeps/i386/i686/multiarch/varshift.h b/sysdeps/i386/i686/multiarch/varshift.h new file mode 100644 index 0000000000..7c72c70d67 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/varshift.h @@ -0,0 +1 @@ +#include <sysdeps/x86_64/multiarch/varshift.h> diff --git a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h index 0892b1d9df..901a44e4b3 100644 --- a/sysdeps/powerpc/powerpc64/dl-machine.h +++ b/sysdeps/powerpc/powerpc64/dl-machine.h @@ -1,6 +1,6 @@ /* Machine-dependent ELF dynamic relocation inline functions. PowerPC64 version. - Copyright 1995-2005, 2006, 2008 Free Software Foundation, Inc. + Copyright 1995-2005, 2006, 2008, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/syscalls.list b/sysdeps/unix/sysv/linux/powerpc/powerpc32/syscalls.list index 30122853a1..aaa3557831 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/syscalls.list +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/syscalls.list @@ -8,3 +8,4 @@ oldsetrlimit EXTRA setrlimit i:ip __old_setrlimit setrlimit@GLIBC_2.0 readahead - readahead i:iiiii __readahead readahead prlimit64 EXTRA prlimit64 i:iipp prlimit64 +fanotify_mark EXTRA fanotify_mark i:iiiiis fanotify_mark diff --git a/sysdeps/unix/sysv/linux/sparc/sys/epoll.h b/sysdeps/unix/sysv/linux/sparc/sys/epoll.h index cc0ddef69e..9943f21383 100644 --- a/sysdeps/unix/sysv/linux/sparc/sys/epoll.h +++ b/sysdeps/unix/sysv/linux/sparc/sys/epoll.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2006, 2007, 2008 Free Software Foundation, Inc. +/* Copyright (C) 2002-2006, 2007, 2008, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -31,7 +31,7 @@ typedef __sigset_t sigset_t; #endif -/* Flags to be passed to epoll_create2. */ +/* Flags to be passed to epoll_create1. 
*/ enum { EPOLL_CLOEXEC = 0x400000, @@ -101,8 +101,9 @@ __BEGIN_DECLS returned by epoll_create() should be closed with close(). */ extern int epoll_create (int __size) __THROW; -/* Same as epoll_create but with an additional FLAGS parameter. */ -extern int epoll_create2 (int __size, int __flags) __THROW; +/* Same as epoll_create but with an FLAGS parameter. The unused SIZE + parameter has been dropped. */ +extern int epoll_create1 (int __flags) __THROW; /* Manipulate an epoll instance "epfd". Returns 0 in case of success, diff --git a/sysdeps/unix/sysv/linux/x86_64/sys/epoll.h b/sysdeps/unix/sysv/linux/x86_64/sys/epoll.h index 234798e4b8..d799c845c1 100644 --- a/sysdeps/unix/sysv/linux/x86_64/sys/epoll.h +++ b/sysdeps/unix/sysv/linux/x86_64/sys/epoll.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Free Software Foundation, Inc. +/* Copyright (C) 2002-2008, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -31,7 +31,7 @@ typedef __sigset_t sigset_t; #endif -/* Flags to be passed to epoll_create2. */ +/* Flags to be passed to epoll_create1. */ enum { EPOLL_CLOEXEC = 02000000, diff --git a/sysdeps/x86_64/add_n.S b/sysdeps/x86_64/add_n.S index 7883f6c840..f0b4c3f78c 100644 --- a/sysdeps/x86_64/add_n.S +++ b/sysdeps/x86_64/add_n.S @@ -1,6 +1,6 @@ -/* Add two limb vectors of the same length > 0 and store sum in a third - limb vector. - Copyright (C) 2004 Free Software Foundation, Inc. +/* x86-64 __mpn_add_n -- Add two limb vectors of the same length > 0 and store + sum in a third limb vector. + Copyright (C) 2006, 2007 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify @@ -21,22 +21,81 @@ #include "sysdep.h" #include "asm-syntax.h" +#define rp %rdi +#define up %rsi +#define vp %rdx +#define n %rcx +#define cy %r8 + +#ifndef func +# define func __mpn_add_n +# define ADCSBB adc +#endif + .text -ENTRY (__mpn_add_n) - leaq (%rsi,%rcx,8), %rsi - leaq (%rdi,%rcx,8), %rdi - leaq (%rdx,%rcx,8), %rdx - negq %rcx - xorl %eax, %eax # clear cy - .p2align 2 -L(loop): - movq (%rsi,%rcx,8), %rax - movq (%rdx,%rcx,8), %r10 - adcq %r10, %rax - movq %rax, (%rdi,%rcx,8) - incq %rcx - jne L(loop) - movq %rcx, %rax # zero %rax - adcq %rax, %rax +ENTRY (func) + xor %r8, %r8 + mov (up), %r10 + mov (vp), %r11 + + lea -8(up,n,8), up + lea -8(vp,n,8), vp + lea -16(rp,n,8), rp + mov %ecx, %eax + neg n + and $3, %eax + je L(b00) + add %rax, n /* clear low rcx bits for jrcxz */ + cmp $2, %eax + jl L(b01) + je L(b10) + +L(b11): shr %r8 /* set cy */ + jmp L(e11) + +L(b00): shr %r8 /* set cy */ + mov %r10, %r8 + mov %r11, %r9 + lea 4(n), n + jmp L(e00) + +L(b01): shr %r8 /* set cy */ + jmp L(e01) + +L(b10): shr %r8 /* set cy */ + mov %r10, %r8 + mov %r11, %r9 + jmp L(e10) + +L(end): ADCSBB %r11, %r10 + mov %r10, 8(rp) + mov %ecx, %eax /* clear eax, ecx contains 0 */ + adc %eax, %eax ret -END (__mpn_add_n) + + .p2align 4 +L(top): + mov -24(up,n,8), %r8 + mov -24(vp,n,8), %r9 + ADCSBB %r11, %r10 + mov %r10, -24(rp,n,8) +L(e00): + mov -16(up,n,8), %r10 + mov -16(vp,n,8), %r11 + ADCSBB %r9, %r8 + mov %r8, -16(rp,n,8) +L(e11): + mov -8(up,n,8), %r8 + mov -8(vp,n,8), %r9 + ADCSBB %r11, %r10 + mov %r10, -8(rp,n,8) +L(e10): + mov (up,n,8), %r10 + mov (vp,n,8), %r11 + ADCSBB %r9, %r8 + mov %r8, (rp,n,8) +L(e01): + jrcxz L(end) + lea 4(n), n + jmp L(top) +END (func) diff --git a/sysdeps/x86_64/addmul_1.S 
b/sysdeps/x86_64/addmul_1.S index bdb5226a33..e997896703 100644 --- a/sysdeps/x86_64/addmul_1.S +++ b/sysdeps/x86_64/addmul_1.S @@ -1,6 +1,6 @@ -/* AMD64 __mpn_addmul_1 -- Multiply a limb vector with a limb and add +/* x86-64 __mpn_addmul_1 -- Multiply a limb vector with a limb and add the result to a second limb vector. - Copyright (C) 2004 Free Software Foundation, Inc. + Copyright (C) 2003,2004,2005,2007,2008,2009 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify @@ -21,26 +21,95 @@ #include "sysdep.h" #include "asm-syntax.h" +#define rp %rdi +#define up %rsi +#define n %rdx +#define v0 %rcx + +#ifndef func +# define func __mpn_addmul_1 +# define ADDSUB add +#endif + .text -ENTRY (__mpn_addmul_1) - movq %rdx, %r11 - leaq (%rsi,%rdx,8), %rsi - leaq (%rdi,%rdx,8), %rdi - negq %r11 - xorl %r8d, %r8d - xorl %r10d, %r10d - .p2align 2 -L(loop): - movq (%rsi,%r11,8), %rax - mulq %rcx - addq (%rdi,%r11,8), %rax - adcq %r10, %rdx - addq %r8, %rax - movq %r10, %r8 - movq %rax, (%rdi,%r11,8) - adcq %rdx, %r8 - incq %r11 - jne L(loop) - movq %r8, %rax +ENTRY (func) + push %rbx + push %rbp + lea (%rdx), %rbx + neg %rbx + + mov (up), %rax + mov (rp), %r10 + + lea -16(rp,%rdx,8), rp + lea (up,%rdx,8), up + mul %rcx + + bt $0, %ebx + jc L(odd) + + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + lea (%rdx), %rbp + mul %rcx + add $2, %rbx + jns L(n2) + + lea (%rax), %r8 + mov (up,%rbx,8), %rax + lea (%rdx), %r9 + jmp L(mid) + +L(odd): add $1, %rbx + jns L(n1) + + lea (%rax), %r8 + mov (up,%rbx,8), %rax + lea (%rdx), %r9 + mul %rcx + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + lea (%rdx), %rbp + jmp L(e) + + .p2align 4 +L(top): mul %rcx + ADDSUB %r8, %r10 + lea (%rax), %r8 + mov (up,%rbx,8), %rax + adc %r9, %r11 + mov %r10, -8(rp,%rbx,8) + mov (rp,%rbx,8), %r10 + lea (%rdx), %r9 + adc $0, %rbp +L(mid): mul %rcx + ADDSUB %r11, %r10 + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + adc %rbp, %r8 + mov %r10, (rp,%rbx,8) + mov 8(rp,%rbx,8), %r10 + lea (%rdx), %rbp + adc $0, %r9 +L(e): add $2, %rbx + js L(top) + + mul %rcx + ADDSUB %r8, %r10 + adc %r9, %r11 + mov %r10, -8(rp) + adc $0, %rbp +L(n2): mov (rp), %r10 + ADDSUB %r11, %r10 + adc %rbp, %rax + mov %r10, (rp) + adc $0, %rdx +L(n1): mov 8(rp), %r10 + ADDSUB %rax, %r10 + mov %r10, 8(rp) + mov %ebx, %eax /* zero rax */ + adc %rdx, %rax + pop %rbp + pop %rbx ret -END (__mpn_addmul_1) +END (func) diff --git a/sysdeps/x86_64/lshift.S b/sysdeps/x86_64/lshift.S index 5ac66f0a36..f89d3e09b3 100644 --- a/sysdeps/x86_64/lshift.S +++ b/sysdeps/x86_64/lshift.S @@ -1,5 +1,5 @@ -/* AMD64 __mpn_lshift -- - Copyright 2004, 2006 Free Software Foundation, Inc. +/* x86-64 __mpn_lshift -- + Copyright (C) 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify @@ -20,41 +20,98 @@ #include "sysdep.h" #include "asm-syntax.h" +#define rp %rdi +#define up %rsi +#define n %rdx +#define cnt %cl .text ENTRY (__mpn_lshift) - movq -8(%rsi,%rdx,8), %mm7 - movd %ecx, %mm1 - movl $64, %eax - subl %ecx, %eax - movd %eax, %mm0 - movq %mm7, %mm3 - psrlq %mm0, %mm7 - movd %mm7, %rax - subq $2, %rdx - jl L(endo) - .p2align 2 -L(loop): - movq (%rsi,%rdx,8), %mm6 - movq %mm6, %mm2 - psrlq %mm0, %mm6 - psllq %mm1, %mm3 - por %mm6, %mm3 - movq %mm3, 8(%rdi,%rdx,8) - je L(ende) - movq -8(%rsi,%rdx,8), %mm7 - movq %mm7, %mm3 - psrlq %mm0, %mm7 - psllq %mm1, %mm2 - por %mm7, %mm2 - movq %mm2, (%rdi,%rdx,8) - subq $2, %rdx - jge L(loop) -L(endo): - movq %mm3, %mm2 -L(ende): - psllq %mm1, %mm2 - movq %mm2, (%rdi) - emms + lea -8(rp,n,8), rp + lea -8(up,n,8), up + + mov %edx, %eax + and $3, %eax + jne L(nb00) +L(b00): /* n = 4, 8, 12, ... */ + mov (up), %r10 + mov -8(up), %r11 + xor %eax, %eax + shld %cl, %r10, %rax + mov -16(up), %r8 + lea 24(rp), rp + sub $4, n + jmp L(00) + +L(nb00):/* n = 1, 5, 9, ... */ + cmp $2, %eax + jae L(nb01) +L(b01): mov (up), %r9 + xor %eax, %eax + shld %cl, %r9, %rax + sub $2, n + jb L(le1) + mov -8(up), %r10 + mov -16(up), %r11 + lea -8(up), up + lea 16(rp), rp + jmp L(01) +L(le1): shl %cl, %r9 + mov %r9, (rp) + ret + +L(nb01):/* n = 2, 6, 10, ... */ + jne L(b11) +L(b10): mov (up), %r8 + mov -8(up), %r9 + xor %eax, %eax + shld %cl, %r8, %rax + sub $3, n + jb L(le2) + mov -16(up), %r10 + lea -16(up), up + lea 8(rp), rp + jmp L(10) +L(le2): shld %cl, %r9, %r8 + mov %r8, (rp) + shl %cl, %r9 + mov %r9, -8(rp) + ret + + .p2align 4 /* performance critical! */ +L(b11): /* n = 3, 7, 11, ... */ + mov (up), %r11 + mov -8(up), %r8 + xor %eax, %eax + shld %cl, %r11, %rax + mov -16(up), %r9 + lea -24(up), up + sub $4, n + jb L(end) + + .p2align 4 +L(top): shld %cl, %r8, %r11 + mov (up), %r10 + mov %r11, (rp) +L(10): shld %cl, %r9, %r8 + mov -8(up), %r11 + mov %r8, -8(rp) +L(01): shld %cl, %r10, %r9 + mov -16(up), %r8 + mov %r9, -16(rp) +L(00): shld %cl, %r11, %r10 + mov -24(up), %r9 + mov %r10, -24(rp) + add $-32, up + lea -32(rp), rp + sub $4, n + jnc L(top) + +L(end): shld %cl, %r8, %r11 + mov %r11, (rp) + shld %cl, %r9, %r8 + mov %r8, -8(rp) + shl %cl, %r9 + mov %r9, -16(rp) ret END (__mpn_lshift) diff --git a/sysdeps/x86_64/mul_1.S b/sysdeps/x86_64/mul_1.S index 978916b72c..676afd1755 100644 --- a/sysdeps/x86_64/mul_1.S +++ b/sysdeps/x86_64/mul_1.S @@ -1,6 +1,6 @@ /* AMD64 __mpn_mul_1 -- Multiply a limb vector with a limb and store the result in a second limb vector. - Copyright (C) 2004 Free Software Foundation, Inc. + Copyright (C) 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify @@ -21,22 +21,109 @@ #include <sysdep.h> #include "asm-syntax.h" +#define rp %rdi +#define up %rsi +#define n_param %rdx +#define vl %rcx + +#define n %r11 + .text ENTRY (__mpn_mul_1) - movq %rdx, %r11 - leaq (%rsi,%rdx,8), %rsi - leaq (%rdi,%rdx,8), %rdi - negq %r11 - xorl %r8d, %r8d -L(loop): - movq (%rsi,%r11,8), %rax - mulq %rcx - addq %r8, %rax - movl $0, %r8d - adcq %rdx, %r8 - movq %rax, (%rdi,%r11,8) - incq %r11 - jne L(loop) - movq %r8, %rax + push %rbx + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbx, 0) + xor %r10, %r10 + mov (up), %rax /* read first u limb early */ + mov n_param, %rbx /* move away n from rdx, mul uses it */ + mul vl + mov %rbx, %r11 + + add %r10, %rax + adc $0, %rdx + + and $3, %ebx + jz L(b0) + cmp $2, %ebx + jz L(b2) + jg L(b3) + +L(b1): dec n + jne L(gt1) + mov %rax, (rp) + jmp L(ret) +L(gt1): lea 8(up,n,8), up + lea -8(rp,n,8), rp + neg n + xor %r10, %r10 + xor %ebx, %ebx + mov %rax, %r9 + mov (up,n,8), %rax + mov %rdx, %r8 + jmp L(L1) + +L(b0): lea (up,n,8), up + lea -16(rp,n,8), rp + neg n + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp L(L0) + +L(b3): lea -8(up,n,8), up + lea -24(rp,n,8), rp + neg n + mov %rax, %rbx + mov %rdx, %r10 + jmp L(L3) + +L(b2): lea -16(up,n,8), up + lea -32(rp,n,8), rp + neg n + xor %r8, %r8 + xor %ebx, %ebx + mov %rax, %r10 + mov 24(up,n,8), %rax + mov %rdx, %r9 + jmp L(L2) + + .p2align 4 +L(top): mov %r10, (rp,n,8) + add %rax, %r9 + mov (up,n,8), %rax + adc %rdx, %r8 + mov $0, %r10d +L(L1): mul vl + mov %r9, 8(rp,n,8) + add %rax, %r8 + adc %rdx, %rbx +L(L0): mov 8(up,n,8), %rax + mul vl + mov %r8, 16(rp,n,8) + add %rax, %rbx + adc %rdx, %r10 +L(L3): mov 16(up,n,8), %rax + mul vl + mov %rbx, 24(rp,n,8) + mov $0, %r8d # zero + mov %r8, %rbx # zero + add %rax, %r10 + mov 24(up,n,8), %rax + mov %r8, %r9 # zero + adc %rdx, %r9 +L(L2): mul vl + add $4, n + js L(top) + + mov %r10, (rp,n,8) + add %rax, %r9 + adc %r8, %rdx + mov %r9, 8(rp,n,8) + add %r8, %rdx +L(ret): mov %rdx, %rax + + pop %rbx + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbx) ret END (__mpn_mul_1) diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index b124524b2e..5d2e34ebc8 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -8,9 +8,10 @@ sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \ - strncase_l-ssse3 + strncase_l-ssse3 strlen-sse4 strlen-no-bsf ifeq (yes,$(config-cflags-sse4)) -sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c +sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift +CFLAGS-varshift.c += -msse4 CFLAGS-strcspn-c.c += -msse4 CFLAGS-strpbrk-c.c += -msse4 CFLAGS-strspn-c.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c index 55c9f54f96..786466d5fd 100644 --- a/sysdeps/x86_64/multiarch/init-arch.c +++ b/sysdeps/x86_64/multiarch/init-arch.c @@ -72,6 +72,12 @@ __init_cpu_features (void) model += extended_model; switch (model) { + case 0x1c: + case 0x26: + /* BSF is slow on Atom. 
*/ + __cpu_features.feature[index_Slow_BSF] |= bit_Slow_BSF; + break; + case 0x1a: case 0x1e: case 0x1f: diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h index 4a211c0864..783b02015e 100644 --- a/sysdeps/x86_64/multiarch/init-arch.h +++ b/sysdeps/x86_64/multiarch/init-arch.h @@ -18,6 +18,7 @@ #define bit_Fast_Rep_String (1 << 0) #define bit_Fast_Copy_Backward (1 << 1) +#define bit_Slow_BSF (1 << 2) #ifdef __ASSEMBLER__ @@ -35,6 +36,7 @@ # define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE # define index_Fast_Copy_Backward FEATURE_INDEX_1*FEATURE_SIZE +# define index_Slow_BSF FEATURE_INDEX_1*FEATURE_SIZE #else /* __ASSEMBLER__ */ @@ -106,6 +108,7 @@ extern const struct cpu_features *__get_cpu_features (void) # define index_Fast_Rep_String FEATURE_INDEX_1 # define index_Fast_Copy_Backward FEATURE_INDEX_1 +# define index_Slow_BSF FEATURE_INDEX_1 #define HAS_ARCH_FEATURE(idx, bit) \ ((__get_cpu_features ()->feature[idx] & (bit)) != 0) @@ -116,4 +119,7 @@ extern const struct cpu_features *__get_cpu_features (void) #define HAS_FAST_COPY_BACKWARD \ HAS_ARCH_FEATURE (index_Fast_Copy_Backward, bit_Fast_Copy_Backward) +#define HAS_SLOW_BSF \ + HAS_ARCH_FEATURE (index_Slow_BSF, bit_Slow_BSF) + #endif /* __ASSEMBLER__ */ diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c index bbe96273be..379862722b 100644 --- a/sysdeps/x86_64/multiarch/memmove.c +++ b/sysdeps/x86_64/multiarch/memmove.c @@ -1,3 +1,24 @@ +/* Multiple versions of memmove. + Copyright (C) 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <string.h> + #ifndef NOT_IN_libc #include "init-arch.h" @@ -9,13 +30,13 @@ #endif #endif +extern __typeof (memmove) __memmove_sse2 attribute_hidden; +extern __typeof (memmove) __memmove_ssse3 attribute_hidden; +extern __typeof (memmove) __memmove_ssse3_back attribute_hidden; + #include "string/memmove.c" #ifndef NOT_IN_libc -extern __typeof (__memmove_sse2) __memmove_sse2 attribute_hidden; -extern __typeof (__memmove_sse2) __memmove_ssse3 attribute_hidden; -extern __typeof (__memmove_sse2) __memmove_ssse3_back attribute_hidden; - libc_ifunc (memmove, HAS_SSSE3 ? (HAS_FAST_COPY_BACKWARD diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c index a474f5f5ee..962501d450 100644 --- a/sysdeps/x86_64/multiarch/memmove_chk.c +++ b/sysdeps/x86_64/multiarch/memmove_chk.c @@ -1,12 +1,32 @@ +/* Multiple versions of __memmove_chk. + Copyright (C) 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <string.h> #include "init-arch.h" #define MEMMOVE_CHK __memmove_chk_sse2 -#include "debug/memmove_chk.c" +extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden; +extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden; +extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden; -extern __typeof (__memmove_chk_sse2) __memmove_chk_sse2 attribute_hidden; -extern __typeof (__memmove_chk_sse2) __memmove_chk_ssse3 attribute_hidden; -extern __typeof (__memmove_chk_sse2) __memmove_chk_ssse3_back attribute_hidden; +#include "debug/memmove_chk.c" libc_ifunc (__memmove_chk, HAS_SSSE3 diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c index daeebe1bf5..0b2ce76926 100644 --- a/sysdeps/x86_64/multiarch/strcspn-c.c +++ b/sysdeps/x86_64/multiarch/strcspn-c.c @@ -1,5 +1,5 @@ /* strcspn with SSE4.2 intrinsics - Copyright (C) 2009 Free Software Foundation, Inc. + Copyright (C) 2009, 2010 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -20,6 +20,7 @@ #include <nmmintrin.h> #include <string.h> +#include "varshift.h" /* We use 0x2: _SIDD_SBYTE_OPS @@ -86,8 +87,6 @@ STRCSPN_SSE42 (const char *s, const char *a) const char *aligned; __m128i mask; - /* Fake initialization. gcc otherwise will warn. */ - asm ("" : "=xm" (mask)); int offset = (int) ((size_t) a & 15); if (offset != 0) { @@ -95,54 +94,7 @@ STRCSPN_SSE42 (const char *s, const char *a) aligned = (const char *) ((size_t) a & -16L); __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); - switch (offset) - { - case 1: - mask = _mm_srli_si128 (mask0, 1); - break; - case 2: - mask = _mm_srli_si128 (mask0, 2); - break; - case 3: - mask = _mm_srli_si128 (mask0, 3); - break; - case 4: - mask = _mm_srli_si128 (mask0, 4); - break; - case 5: - mask = _mm_srli_si128 (mask0, 5); - break; - case 6: - mask = _mm_srli_si128 (mask0, 6); - break; - case 7: - mask = _mm_srli_si128 (mask0, 7); - break; - case 8: - mask = _mm_srli_si128 (mask0, 8); - break; - case 9: - mask = _mm_srli_si128 (mask0, 9); - break; - case 10: - mask = _mm_srli_si128 (mask0, 10); - break; - case 11: - mask = _mm_srli_si128 (mask0, 11); - break; - case 12: - mask = _mm_srli_si128 (mask0, 12); - break; - case 13: - mask = _mm_srli_si128 (mask0, 13); - break; - case 14: - mask = _mm_srli_si128 (mask0, 14); - break; - case 15: - mask = _mm_srli_si128 (mask0, 15); - break; - } + mask = __m128i_shift_right (mask0, offset); /* Find where the NULL terminator is. */ int length = _mm_cmpistri (mask, mask, 0x3a); @@ -159,55 +111,10 @@ STRCSPN_SSE42 (const char *s, const char *a) if (index != 0) { - /* Combine mask0 and mask1. 
*/ - switch (offset) - { - case 1: - mask = _mm_alignr_epi8 (mask1, mask0, 1); - break; - case 2: - mask = _mm_alignr_epi8 (mask1, mask0, 2); - break; - case 3: - mask = _mm_alignr_epi8 (mask1, mask0, 3); - break; - case 4: - mask = _mm_alignr_epi8 (mask1, mask0, 4); - break; - case 5: - mask = _mm_alignr_epi8 (mask1, mask0, 5); - break; - case 6: - mask = _mm_alignr_epi8 (mask1, mask0, 6); - break; - case 7: - mask = _mm_alignr_epi8 (mask1, mask0, 7); - break; - case 8: - mask = _mm_alignr_epi8 (mask1, mask0, 8); - break; - case 9: - mask = _mm_alignr_epi8 (mask1, mask0, 9); - break; - case 10: - mask = _mm_alignr_epi8 (mask1, mask0, 10); - break; - case 11: - mask = _mm_alignr_epi8 (mask1, mask0, 11); - break; - case 12: - mask = _mm_alignr_epi8 (mask1, mask0, 12); - break; - case 13: - mask = _mm_alignr_epi8 (mask1, mask0, 13); - break; - case 14: - mask = _mm_alignr_epi8 (mask1, mask0, 14); - break; - case 15: - mask = _mm_alignr_epi8 (mask1, mask0, 15); - break; - } + /* Combine mask0 and mask1. We could play games with + palignr, but frankly this data should be in L1 now + so do the merge via an unaligned load. */ + mask = _mm_loadu_si128 ((__m128i *) a); } } } @@ -234,54 +141,7 @@ STRCSPN_SSE42 (const char *s, const char *a) aligned = (const char *) ((size_t) s & -16L); __m128i value = _mm_load_si128 ((__m128i *) aligned); - switch (offset) - { - case 1: - value = _mm_srli_si128 (value, 1); - break; - case 2: - value = _mm_srli_si128 (value, 2); - break; - case 3: - value = _mm_srli_si128 (value, 3); - break; - case 4: - value = _mm_srli_si128 (value, 4); - break; - case 5: - value = _mm_srli_si128 (value, 5); - break; - case 6: - value = _mm_srli_si128 (value, 6); - break; - case 7: - value = _mm_srli_si128 (value, 7); - break; - case 8: - value = _mm_srli_si128 (value, 8); - break; - case 9: - value = _mm_srli_si128 (value, 9); - break; - case 10: - value = _mm_srli_si128 (value, 10); - break; - case 11: - value = _mm_srli_si128 (value, 11); - break; - case 12: - value = _mm_srli_si128 (value, 12); - break; - case 13: - value = _mm_srli_si128 (value, 13); - break; - case 14: - value = _mm_srli_si128 (value, 14); - break; - case 15: - value = _mm_srli_si128 (value, 15); - break; - } + value = __m128i_shift_right (value, offset); int length = _mm_cmpistri (mask, value, 0x2); /* No need to check ZFlag since ZFlag is always 1. */ diff --git a/sysdeps/x86_64/multiarch/strlen-no-bsf.S b/sysdeps/x86_64/multiarch/strlen-no-bsf.S new file mode 100644 index 0000000000..3e52f8165c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-no-bsf.S @@ -0,0 +1,309 @@ +/* strlen without BSF + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#if defined SHARED && !defined NOT_IN_libc + +#include <sysdep.h> + + .section .text.slow,"ax",@progbits +ENTRY (__strlen_no_bsf) + xor %eax, %eax + cmpb $0, (%rdi) + jz L(exit_tail0) + cmpb $0, 1(%rdi) + jz L(exit_tail1) + cmpb $0, 2(%rdi) + jz L(exit_tail2) + cmpb $0, 3(%rdi) + jz L(exit_tail3) + cmpb $0, 4(%rdi) + jz L(exit_tail4) + cmpb $0, 5(%rdi) + jz L(exit_tail5) + cmpb $0, 6(%rdi) + jz L(exit_tail6) + cmpb $0, 7(%rdi) + jz L(exit_tail7) + cmpb $0, 8(%rdi) + jz L(exit_tail8) + cmpb $0, 9(%rdi) + jz L(exit_tail9) + cmpb $0, 10(%rdi) + jz L(exit_tail10) + cmpb $0, 11(%rdi) + jz L(exit_tail11) + cmpb $0, 12(%rdi) + jz L(exit_tail12) + cmpb $0, 13(%rdi) + jz L(exit_tail13) + cmpb $0, 14(%rdi) + jz L(exit_tail14) + cmpb $0, 15(%rdi) + jz L(exit_tail15) + pxor %xmm0, %xmm0 + mov %rdi, %rcx + mov %rdi, %rax + and $-16, %rax + add $16, %rax + add $16, %rcx + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + and $-0x40, %rax + xor %r8d, %r8d +L(aligned_64): + pcmpeqb (%rax), %xmm0 + pcmpeqb 16(%rax), %xmm1 + pcmpeqb 32(%rax), %xmm2 + pcmpeqb 48(%rax), %xmm3 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %esi + pmovmskb %xmm2, %edi + pmovmskb %xmm3, %r9d + or %edx, %r8d + or %esi, %r8d + or %edi, %r8d + or %r9d, %r8d + lea 64(%rax), %rax + jz L(aligned_64) + + test %edx, %edx + jnz L(aligned_64_exit_16) + test %esi, %esi + jnz L(aligned_64_exit_32) + test %edi, %edi + jnz L(aligned_64_exit_48) +L(aligned_64_exit_64): + mov %r9d, %edx + jmp L(aligned_64_exit) +L(aligned_64_exit_48): + lea -16(%rax), %rax + mov %edi, %edx + jmp L(aligned_64_exit) +L(aligned_64_exit_32): + lea -32(%rax), %rax + mov %esi, %edx + jmp L(aligned_64_exit) +L(aligned_64_exit_16): + lea -48(%rax), %rax +L(aligned_64_exit): +L(exit): + sub %rcx, %rax + test %dl, %dl + jz L(exit_high) + test $0x01, %dl + jnz L(exit_tail0) + + test $0x02, %dl + jnz L(exit_tail1) + + test $0x04, %dl + jnz L(exit_tail2) + + 
test $0x08, %dl + jnz L(exit_tail3) + + test $0x10, %dl + jnz L(exit_tail4) + + test $0x20, %dl + jnz L(exit_tail5) + + test $0x40, %dl + jnz L(exit_tail6) + add $7, %eax +L(exit_tail0): + ret + +L(exit_high): + add $8, %eax + test $0x01, %dh + jnz L(exit_tail0) + + test $0x02, %dh + jnz L(exit_tail1) + + test $0x04, %dh + jnz L(exit_tail2) + + test $0x08, %dh + jnz L(exit_tail3) + + test $0x10, %dh + jnz L(exit_tail4) + + test $0x20, %dh + jnz L(exit_tail5) + + test $0x40, %dh + jnz L(exit_tail6) + add $7, %eax + ret + .p2align 4 +L(exit_tail1): + add $1, %eax + ret + +L(exit_tail2): + add $2, %eax + ret + +L(exit_tail3): + add $3, %eax + ret + +L(exit_tail4): + add $4, %eax + ret + +L(exit_tail5): + add $5, %eax + ret +L(exit_tail6): + add $6, %eax + ret +L(exit_tail7): + add $7, %eax + ret +L(exit_tail8): + add $8, %eax + ret +L(exit_tail9): + add $9, %eax + ret +L(exit_tail10): + add $10, %eax + ret +L(exit_tail11): + add $11, %eax + ret +L(exit_tail12): + add $12, %eax + ret +L(exit_tail13): + add $13, %eax + ret +L(exit_tail14): + add $14, %eax + ret +L(exit_tail15): + add $15, %eax + ret +END (__strlen_no_bsf) + +#endif diff --git a/sysdeps/x86_64/multiarch/strlen-sse4.S b/sysdeps/x86_64/multiarch/strlen-sse4.S new file mode 100644 index 0000000000..6b16ea7fa6 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-sse4.S @@ -0,0 +1,85 @@ +/* strlen with SSE4 + Copyright (C) 2009, 2010 Free Software Foundation, Inc. + Contributed by Ulrich Drepper <drepper@redhat.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#if defined SHARED && !defined NOT_IN_libc + +#include <sysdep.h> + + .section .text.sse4.2,"ax",@progbits +ENTRY (__strlen_sse42) + pxor %xmm1, %xmm1 + movl %edi, %ecx + movq %rdi, %r8 + andq $~15, %rdi + xor %edi, %ecx + pcmpeqb (%rdi), %xmm1 + pmovmskb %xmm1, %edx + shrl %cl, %edx + shll %cl, %edx + andl %edx, %edx + jnz L(less16bytes) + pxor %xmm1, %xmm1 + + .p2align 4 +L(more64bytes_loop): + pcmpistri $0x08, 16(%rdi), %xmm1 + jz L(more32bytes) + + pcmpistri $0x08, 32(%rdi), %xmm1 + jz L(more48bytes) + + pcmpistri $0x08, 48(%rdi), %xmm1 + jz L(more64bytes) + + add $64, %rdi + pcmpistri $0x08, (%rdi), %xmm1 + jnz L(more64bytes_loop) + leaq (%rdi,%rcx), %rax + subq %r8, %rax + ret + + .p2align 4 +L(more32bytes): + leaq 16(%rdi,%rcx, 1), %rax + subq %r8, %rax + ret + + .p2align 4 +L(more48bytes): + leaq 32(%rdi,%rcx, 1), %rax + subq %r8, %rax + ret + + .p2align 4 +L(more64bytes): + leaq 48(%rdi,%rcx, 1), %rax + subq %r8, %rax + ret + + .p2align 4 +L(less16bytes): + subq %r8, %rdi + bsfl %edx, %eax + addq %rdi, %rax + ret + +END (__strlen_sse42) + +#endif diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S index f9641131fa..83a88ecd90 100644 --- a/sysdeps/x86_64/multiarch/strlen.S +++ b/sysdeps/x86_64/multiarch/strlen.S @@ -1,5 +1,5 @@ /* strlen(str) -- determine the length of the string STR. - Copyright (C) 2009 Free Software Foundation, Inc. + Copyright (C) 2009, 2010 Free Software Foundation, Inc. Contributed by Ulrich Drepper <drepper@redhat.com>. This file is part of the GNU C Library. @@ -36,74 +36,12 @@ ENTRY(strlen) testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) jz 2f leaq __strlen_sse42(%rip), %rax -2: ret -END(strlen) - - - .section .text.sse4.2,"ax",@progbits - .align 16 - .type __strlen_sse42, @function -__strlen_sse42: - cfi_startproc - CALL_MCOUNT - pxor %xmm1, %xmm1 - movl %edi, %ecx - movq %rdi, %r8 - andq $~15, %rdi - xor %edi, %ecx - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %edx - shrl %cl, %edx - shll %cl, %edx - andl %edx, %edx - jnz L(less16bytes) - pxor %xmm1, %xmm1 - - .p2align 4 -L(more64bytes_loop): - pcmpistri $0x08, 16(%rdi), %xmm1 - jz L(more32bytes) - - pcmpistri $0x08, 32(%rdi), %xmm1 - jz L(more48bytes) - - pcmpistri $0x08, 48(%rdi), %xmm1 - jz L(more64bytes) - - add $64, %rdi - pcmpistri $0x08, (%rdi), %xmm1 - jnz L(more64bytes_loop) - leaq (%rdi,%rcx), %rax - subq %r8, %rax - ret - - .p2align 4 -L(more32bytes): - leaq 16(%rdi,%rcx, 1), %rax - subq %r8, %rax - ret - - .p2align 4 -L(more48bytes): - leaq 32(%rdi,%rcx, 1), %rax - subq %r8, %rax - ret - - .p2align 4 -L(more64bytes): - leaq 48(%rdi,%rcx, 1), %rax - subq %r8, %rax ret - - .p2align 4 -L(less16bytes): - subq %r8, %rdi - bsfl %edx, %eax - addq %rdi, %rax - ret - cfi_endproc - .size __strlen_sse42, .-__strlen_sse42 - +2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip) + jz 3f + leaq __strlen_no_bsf(%rip), %rax +3: ret +END(strlen) # undef ENTRY # define ENTRY(name) \ diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c index be9e8ac0a8..6faa259fd7 100644 --- a/sysdeps/x86_64/multiarch/strspn-c.c +++ b/sysdeps/x86_64/multiarch/strspn-c.c @@ -1,5 +1,5 @@ /* strspn with SSE4.2 intrinsics - Copyright (C) 2009 Free Software Foundation, Inc. + Copyright (C) 2009, 2010 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. 
@@ -20,6 +20,7 @@ #include <nmmintrin.h> #include <string.h> +#include "varshift.h" /* We use 0x12: _SIDD_SBYTE_OPS @@ -71,54 +72,7 @@ __strspn_sse42 (const char *s, const char *a) aligned = (const char *) ((size_t) a & -16L); __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); - switch (offset) - { - case 1: - mask = _mm_srli_si128 (mask0, 1); - break; - case 2: - mask = _mm_srli_si128 (mask0, 2); - break; - case 3: - mask = _mm_srli_si128 (mask0, 3); - break; - case 4: - mask = _mm_srli_si128 (mask0, 4); - break; - case 5: - mask = _mm_srli_si128 (mask0, 5); - break; - case 6: - mask = _mm_srli_si128 (mask0, 6); - break; - case 7: - mask = _mm_srli_si128 (mask0, 7); - break; - case 8: - mask = _mm_srli_si128 (mask0, 8); - break; - case 9: - mask = _mm_srli_si128 (mask0, 9); - break; - case 10: - mask = _mm_srli_si128 (mask0, 10); - break; - case 11: - mask = _mm_srli_si128 (mask0, 11); - break; - case 12: - mask = _mm_srli_si128 (mask0, 12); - break; - case 13: - mask = _mm_srli_si128 (mask0, 13); - break; - case 14: - mask = _mm_srli_si128 (mask0, 14); - break; - case 15: - mask = _mm_srli_si128 (mask0, 15); - break; - } + mask = __m128i_shift_right (mask0, offset); /* Find where the NULL terminator is. */ int length = _mm_cmpistri (mask, mask, 0x3a); @@ -135,55 +89,10 @@ __strspn_sse42 (const char *s, const char *a) if (index != 0) { - /* Combine mask0 and mask1. */ - switch (offset) - { - case 1: - mask = _mm_alignr_epi8 (mask1, mask0, 1); - break; - case 2: - mask = _mm_alignr_epi8 (mask1, mask0, 2); - break; - case 3: - mask = _mm_alignr_epi8 (mask1, mask0, 3); - break; - case 4: - mask = _mm_alignr_epi8 (mask1, mask0, 4); - break; - case 5: - mask = _mm_alignr_epi8 (mask1, mask0, 5); - break; - case 6: - mask = _mm_alignr_epi8 (mask1, mask0, 6); - break; - case 7: - mask = _mm_alignr_epi8 (mask1, mask0, 7); - break; - case 8: - mask = _mm_alignr_epi8 (mask1, mask0, 8); - break; - case 9: - mask = _mm_alignr_epi8 (mask1, mask0, 9); - break; - case 10: - mask = _mm_alignr_epi8 (mask1, mask0, 10); - break; - case 11: - mask = _mm_alignr_epi8 (mask1, mask0, 11); - break; - case 12: - mask = _mm_alignr_epi8 (mask1, mask0, 12); - break; - case 13: - mask = _mm_alignr_epi8 (mask1, mask0, 13); - break; - case 14: - mask = _mm_alignr_epi8 (mask1, mask0, 14); - break; - case 15: - mask = _mm_alignr_epi8 (mask1, mask0, 15); - break; - } + /* Combine mask0 and mask1. We could play games with + palignr, but frankly this data should be in L1 now + so do the merge via an unaligned load. 
*/ + mask = _mm_loadu_si128 ((__m128i *) a); } } } @@ -210,54 +119,7 @@ __strspn_sse42 (const char *s, const char *a) aligned = (const char *) ((size_t) s & -16L); __m128i value = _mm_load_si128 ((__m128i *) aligned); - switch (offset) - { - case 1: - value = _mm_srli_si128 (value, 1); - break; - case 2: - value = _mm_srli_si128 (value, 2); - break; - case 3: - value = _mm_srli_si128 (value, 3); - break; - case 4: - value = _mm_srli_si128 (value, 4); - break; - case 5: - value = _mm_srli_si128 (value, 5); - break; - case 6: - value = _mm_srli_si128 (value, 6); - break; - case 7: - value = _mm_srli_si128 (value, 7); - break; - case 8: - value = _mm_srli_si128 (value, 8); - break; - case 9: - value = _mm_srli_si128 (value, 9); - break; - case 10: - value = _mm_srli_si128 (value, 10); - break; - case 11: - value = _mm_srli_si128 (value, 11); - break; - case 12: - value = _mm_srli_si128 (value, 12); - break; - case 13: - value = _mm_srli_si128 (value, 13); - break; - case 14: - value = _mm_srli_si128 (value, 14); - break; - case 15: - value = _mm_srli_si128 (value, 15); - break; - } + value = __m128i_shift_right (value, offset); int length = _mm_cmpistri (mask, value, 0x12); /* No need to check CFlag since it is always 1. */ diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c index 45d7a550ac..b408b752fa 100644 --- a/sysdeps/x86_64/multiarch/strstr.c +++ b/sysdeps/x86_64/multiarch/strstr.c @@ -19,6 +19,7 @@ 02111-1307 USA. */ #include <nmmintrin.h> +#include "varshift.h" #ifndef STRSTR_SSE42 # define STRSTR_SSE42 __strstr_sse42 @@ -82,67 +83,6 @@ 5. failed string compare, go back to scanning */ -/* Fix-up of removal of unneeded data due to 16B aligned load - parameters: - value: 16B data loaded from 16B aligned address. - offset: Offset of target data address relative to 16B aligned load - address. - */ - -static __inline__ __m128i -__m128i_shift_right (__m128i value, int offset) -{ - switch (offset) - { - case 1: - value = _mm_srli_si128 (value, 1); - break; - case 2: - value = _mm_srli_si128 (value, 2); - break; - case 3: - value = _mm_srli_si128 (value, 3); - break; - case 4: - value = _mm_srli_si128 (value, 4); - break; - case 5: - value = _mm_srli_si128 (value, 5); - break; - case 6: - value = _mm_srli_si128 (value, 6); - break; - case 7: - value = _mm_srli_si128 (value, 7); - break; - case 8: - value = _mm_srli_si128 (value, 8); - break; - case 9: - value = _mm_srli_si128 (value, 9); - break; - case 10: - value = _mm_srli_si128 (value, 10); - break; - case 11: - value = _mm_srli_si128 (value, 11); - break; - case 12: - value = _mm_srli_si128 (value, 12); - break; - case 13: - value = _mm_srli_si128 (value, 13); - break; - case 14: - value = _mm_srli_si128 (value, 14); - break; - case 15: - value = _mm_srli_si128 (value, 15); - break; - } - return value; -} - /* Simple replacement of movdqu to address 4KB boundary cross issue. If EOS occurs within less than 16B before 4KB boundary, we don't cross to next page. */ diff --git a/sysdeps/x86_64/multiarch/varshift.c b/sysdeps/x86_64/multiarch/varshift.c new file mode 100644 index 0000000000..46d72fe3d0 --- /dev/null +++ b/sysdeps/x86_64/multiarch/varshift.c @@ -0,0 +1,26 @@ +/* Helper for variable shifts of SSE registers. + Copyright (C) 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include "varshift.h" + +const int8_t ___m128i_shift_right[31] attribute_hidden = + { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; diff --git a/sysdeps/x86_64/multiarch/varshift.h b/sysdeps/x86_64/multiarch/varshift.h new file mode 100644 index 0000000000..9554f2ddf9 --- /dev/null +++ b/sysdeps/x86_64/multiarch/varshift.h @@ -0,0 +1,31 @@ +/* Helper for variable shifts of SSE registers. + Copyright (C) 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <stdint.h> +#include <tmmintrin.h> + +extern const int8_t ___m128i_shift_right[31] attribute_hidden; + +static __inline__ __m128i +__m128i_shift_right (__m128i value, unsigned long int offset) +{ + return _mm_shuffle_epi8 (value, + _mm_loadu_si128 ((__m128i *) (___m128i_shift_right + + offset))); +} diff --git a/sysdeps/x86_64/rshift.S b/sysdeps/x86_64/rshift.S index ee0c8aa15c..8ff055169a 100644 --- a/sysdeps/x86_64/rshift.S +++ b/sysdeps/x86_64/rshift.S @@ -1,5 +1,5 @@ -/* AMD64 __mpn_rshift -- - Copyright (C) 2004, 2006 Free Software Foundation, Inc. +/* x86-64 __mpn_rshift -- + Copyright (C) 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify @@ -20,43 +20,96 @@ #include "sysdep.h" #include "asm-syntax.h" +#define rp %rdi +#define up %rsi +#define n %rdx +#define cnt %cl + .text ENTRY (__mpn_rshift) - movq (%rsi), %mm7 - movd %ecx, %mm1 - movl $64, %eax - subl %ecx, %eax - movd %eax, %mm0 - movq %mm7, %mm3 - psllq %mm0, %mm7 - movd %mm7, %rax - leaq (%rsi,%rdx,8), %rsi - leaq (%rdi,%rdx,8), %rdi - negq %rdx - addq $2, %rdx - jg L(endo) - .p2align 2 -L(loop): - movq -8(%rsi,%rdx,8), %mm6 - movq %mm6, %mm2 - psllq %mm0, %mm6 - psrlq %mm1, %mm3 - por %mm6, %mm3 - movq %mm3, -16(%rdi,%rdx,8) - je L(ende) - movq (%rsi,%rdx,8), %mm7 - movq %mm7, %mm3 - psllq %mm0, %mm7 - psrlq %mm1, %mm2 - por %mm7, %mm2 - movq %mm2, -8(%rdi,%rdx,8) - addq $2, %rdx - jle L(loop) -L(endo): - movq %mm3, %mm2 -L(ende): - psrlq %mm1, %mm2 - movq %mm2, -8(%rdi) - emms + mov %edx, %eax + and $3, %eax + jne L(nb00) +L(b00): /* n = 4, 8, 12, ... */ + mov (up), %r10 + mov 8(up), %r11 + xor %eax, %eax + shrd %cl, %r10, %rax + mov 16(up), %r8 + lea 8(up), up + lea -24(rp), rp + sub $4, n + jmp L(00) + +L(nb00):/* n = 1, 5, 9, ... */ + cmp $2, %eax + jae L(nb01) +L(b01): mov (up), %r9 + xor %eax, %eax + shrd %cl, %r9, %rax + sub $2, n + jb L(le1) + mov 8(up), %r10 + mov 16(up), %r11 + lea 16(up), up + lea -16(rp), rp + jmp L(01) +L(le1): shr %cl, %r9 + mov %r9, (rp) + ret + +L(nb01):/* n = 2, 6, 10, ... */ + jne L(b11) +L(b10): mov (up), %r8 + mov 8(up), %r9 + xor %eax, %eax + shrd %cl, %r8, %rax + sub $3, n + jb L(le2) + mov 16(up), %r10 + lea 24(up), up + lea -8(rp), rp + jmp L(10) +L(le2): shrd %cl, %r9, %r8 + mov %r8, (rp) + shr %cl, %r9 + mov %r9, 8(rp) + ret + + .p2align 4 +L(b11): /* n = 3, 7, 11, ... */ + mov (up), %r11 + mov 8(up), %r8 + xor %eax, %eax + shrd %cl, %r11, %rax + mov 16(up), %r9 + lea 32(up), up + sub $4, n + jb L(end) + + .p2align 4 +L(top): shrd %cl, %r8, %r11 + mov -8(up), %r10 + mov %r11, (rp) +L(10): shrd %cl, %r9, %r8 + mov (up), %r11 + mov %r8, 8(rp) +L(01): shrd %cl, %r10, %r9 + mov 8(up), %r8 + mov %r9, 16(rp) +L(00): shrd %cl, %r11, %r10 + mov 16(up), %r9 + mov %r10, 24(rp) + add $32, up + lea 32(rp), rp + sub $4, n + jnc L(top) + +L(end): shrd %cl, %r8, %r11 + mov %r11, (rp) + shrd %cl, %r9, %r8 + mov %r8, 8(rp) + shr %cl, %r9 + mov %r9, 16(rp) ret END (__mpn_rshift) diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S index 93aee6bef1..28f828780e 100644 --- a/sysdeps/x86_64/strlen.S +++ b/sysdeps/x86_64/strlen.S @@ -1,5 +1,5 @@ /* strlen(str) -- determine the length of the string STR. - Copyright (C) 2009 Free Software Foundation, Inc. + Copyright (C) 2009, 2010 Free Software Foundation, Inc. Contributed by Ulrich Drepper <drepper@redhat.com>. This file is part of the GNU C Library. 
@@ -23,29 +23,80 @@ .text ENTRY(strlen) - pxor %xmm2, %xmm2 - movq %rdi, %rcx - movq %rdi, %r8 - andq $~15, %rdi - movdqa %xmm2, %xmm1 - pcmpeqb (%rdi), %xmm2 - orl $0xffffffff, %esi - subq %rdi, %rcx - shll %cl, %esi - pmovmskb %xmm2, %edx - andl %esi, %edx - jnz 1f - -2: movdqa 16(%rdi), %xmm0 - leaq 16(%rdi), %rdi + xor %rax, %rax + mov %edi, %ecx + and $0x3f, %ecx + pxor %xmm0, %xmm0 + cmp $0x30, %ecx + ja L(next) + movdqu (%rdi), %xmm1 pcmpeqb %xmm1, %xmm0 pmovmskb %xmm0, %edx - testl %edx, %edx - jz 2b + test %edx, %edx + jnz L(exit_less16) + mov %rdi, %rax + and $-16, %rax + jmp L(align16_start) +L(next): + mov %rdi, %rax + and $-16, %rax + pcmpeqb (%rax), %xmm0 + mov $-1, %esi + sub %rax, %rcx + shl %cl, %esi + pmovmskb %xmm0, %edx + and %esi, %edx + jnz L(exit) +L(align16_start): + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + .p2align 4 +L(align16_loop): + pcmpeqb 16(%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) -1: subq %r8, %rdi - bsfl %edx, %eax - addq %rdi, %rax + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + lea 64(%rax), %rax + test %edx, %edx + jz L(align16_loop) +L(exit): + sub %rdi, %rax +L(exit_less16): + bsf %rdx, %rdx + add %rdx, %rax + ret + .p2align 4 +L(exit16): + sub %rdi, %rax + bsf %rdx, %rdx + lea 16(%rdx,%rax), %rax + ret + .p2align 4 +L(exit32): + sub %rdi, %rax + bsf %rdx, %rdx + lea 32(%rdx,%rax), %rax + ret + .p2align 4 +L(exit48): + sub %rdi, %rax + bsf %rdx, %rdx + lea 48(%rdx,%rax), %rax ret END(strlen) libc_hidden_builtin_def (strlen) diff --git a/sysdeps/x86_64/sub_n.S b/sysdeps/x86_64/sub_n.S index 48e1a2e0f4..60c15fc3e1 100644 --- a/sysdeps/x86_64/sub_n.S +++ b/sysdeps/x86_64/sub_n.S @@ -1,6 +1,6 @@ -/* AMD64 __mpn_sub_n -- Add two limb vectors of the same length > 0 and store +/* x86-64 __mpn_sub_n -- Add two limb vectors of the same length > 0 and store sum in a third limb vector. - Copyright (C) 2004 Free Software Foundation, Inc. + Copyright (C) 2006, 2007 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify @@ -18,25 +18,7 @@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -#include "sysdep.h" -#include "asm-syntax.h" +#define func __mpn_sub_n +#define ADCSBB sbb - .text -ENTRY (__mpn_sub_n) - leaq (%rsi,%rcx,8), %rsi - leaq (%rdi,%rcx,8), %rdi - leaq (%rdx,%rcx,8), %rdx - negq %rcx - xorl %eax, %eax # clear cy - .p2align 2 -L(loop): - movq (%rsi,%rcx,8), %rax - movq (%rdx,%rcx,8), %r10 - sbbq %r10, %rax - movq %rax, (%rdi,%rcx,8) - incq %rcx - jne L(loop) - movq %rcx, %rax # zero %rax - adcq %rax, %rax - ret -END (__mpn_sub_n) +#include "add_n.S" diff --git a/sysdeps/x86_64/submul_1.S b/sysdeps/x86_64/submul_1.S index e94c9a7bee..150a92762f 100644 --- a/sysdeps/x86_64/submul_1.S +++ b/sysdeps/x86_64/submul_1.S @@ -1,6 +1,6 @@ -/* AMD64 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract +/* x86-64 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract the result from a second limb vector. - Copyright (C) 2004 Free Software Foundation, Inc. + Copyright (C) 2003,2004,2005,2007,2008,2009 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify @@ -18,29 +18,7 @@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -#include "sysdep.h" -#include "asm-syntax.h" +#define func __mpn_submul_1 +#define ADDSUB sub - .text -ENTRY (__mpn_submul_1) - movq %rdx, %r11 - leaq (%rsi,%r11,8), %rsi - leaq (%rdi,%r11,8), %rdi - negq %r11 - xorl %r8d, %r8d - .p2align 3 -L(loop): - movq (%rsi,%r11,8), %rax - movq (%rdi,%r11,8), %r10 - mulq %rcx - subq %r8, %r10 - movl $0, %r8d - adcl %r8d, %r8d - subq %rax, %r10 - adcq %rdx, %r8 - movq %r10, (%rdi,%r11,8) - incq %r11 - jne L(loop) - movq %r8, %rax - ret -END (__mpn_submul_1) +#include "addmul_1.S"
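For reference, the varshift helper introduced above (sysdeps/x86_64/multiarch/varshift.{c,h}, reused by i386 via wrapper files) can be read as the following self-contained C sketch. The names shift_tab and shift_right are local stand-ins for glibc's ___m128i_shift_right table and __m128i_shift_right inline: one pshufb through a sliding 31-byte index table replaces each of the old 16-case switches, which were needed because _mm_srli_si128 only accepts a compile-time shift count.

#include <stdint.h>
#include <tmmintrin.h>	/* SSSE3: _mm_shuffle_epi8 */

/* Sliding window: entry OFFSET+i selects source byte OFFSET+i, and
   entries with the high bit set (-1) make pshufb write a zero.  */
static const int8_t shift_tab[31] =
  {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
  };

/* Shift VALUE right by OFFSET bytes (0 <= OFFSET <= 15), filling with
   zeros from the top, using a runtime-variable count.  */
static inline __m128i
shift_right (__m128i value, unsigned int offset)
{
  return _mm_shuffle_epi8 (value,
			   _mm_loadu_si128 ((const __m128i *)
					    (shift_tab + offset)));
}

Usage mirrors the call sites in strcspn-c.c, strspn-c.c, and strstr.c: load the aligned 16-byte block containing the start of the data, then shift_right (value, offset) discards the offset stray bytes that precede it.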