diff options
-rw-r--r-- | ChangeLog | 68 | ||||
-rw-r--r-- | elf/dl-load.c | 14 | ||||
-rw-r--r-- | elf/elf.h | 1 | ||||
-rw-r--r-- | libio/iovdprintf.c | 5 | ||||
-rw-r--r-- | malloc/malloc.c | 16 | ||||
-rw-r--r-- | manual/charset.texi | 10 | ||||
-rw-r--r-- | manual/errno.texi | 2 | ||||
-rw-r--r-- | manual/getopt.texi | 2 | ||||
-rw-r--r-- | manual/math.texi | 2 | ||||
-rw-r--r-- | manual/memory.texi | 2 | ||||
-rw-r--r-- | manual/message.texi | 2 | ||||
-rw-r--r-- | manual/resource.texi | 2 | ||||
-rw-r--r-- | manual/stdio.texi | 2 | ||||
-rw-r--r-- | manual/time.texi | 2 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S | 246 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/memcpy-ssse3.S | 113 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/memset-sse2-rep.S | 14 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/memset-sse2.S | 19 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/strcmp-sse4.S | 4 | ||||
-rw-r--r-- | sysdeps/x86_64/Implies | 1 |
20 files changed, 337 insertions, 190 deletions
diff --git a/ChangeLog b/ChangeLog index f21abc5c3e..b8118dd8f0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2010-02-28 Roland McGrath <roland@redhat.com> + + * elf/elf.h (NT_X86_XSTATE): New macro. + 2010-02-25 David S. Miller <davem@davemloft.net> * sysdeps/sparc/sparc64/Makefile: Add align-cpy rule. @@ -55,6 +59,70 @@ * sysdeps/sparc/sparc64/multiarch/memcpy.S: New file. * sysdeps/sparc/sparc64/multiarch/memset.S: New file. +2010-02-20 H.J. Lu <hongjiu.lu@intel.com> + + [BZ #11332] + * sysdeps/i386/i686/multiarch/strcmp-sse4.S: Use cfi_remember_state + and cfi_restore_state only if USE_AS_STRNCMP is defined. + +2010-02-24 Ulrich Drepper <drepper@redhat.com> + + * sysdeps/x86_64/Implies: Add ieee754/dbl-64/wordsize-64 entry. + +2010-02-24 H.J. Lu <hongjiu.lu@intel.com> + + * sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S + (bk_write_less32bytes_2): Renamed to ... + (bk_write_less48bytes): This. + Use unsigned conditional jumps. + Correct unwind info. + Use add/sub instead of lea if possible. + (shl_0_gobble_cache_loop_tail): Removed. + (large_page): Properly adjust ECX. + + * sysdeps/i386/i686/multiarch/memcpy-ssse3.S: Use unsigned + conditional jumps. + Correct unwind info. + + * sysdeps/i386/i686/multiarch/memset-sse2-rep.S: Remove redundant + punpcklbw. + Use unsigned conditional jumps. + (128bytesormore_nt): Renamed to ... + (128bytesormore_endof_L1): This. + Use add instead of lea if possible. + Correct unwind info. + * sysdeps/i386/i686/multiarch/memset-sse2.S: Remove redundant + punpcklbw. + Use unsigned conditional jumps. + Use add instead of lea if possible. + Correct unwind info. + +2010-02-24 Ulrich Drepper <drepper@redhat.com> + + [BZ #11319] + * libio/iovdprintf.c (_IO_vdprintf): Explicitly flush stream before + undoing the stream because _IO_FINISH doesn't report failures. + + [BZ #5553] + * malloc/malloc.c (public_vALLOc): Set ar_ptr when trying main_arena. + (public_pVALLOc): Likewise. + Patch by Petr Baudis. + +2010-02-22 Jim Meyering <meyering@redhat.com> + + * manual/math.texi (BSD Random): Fix a typo: s/are/is/ + + * manual/charset.texi: Adjust grammar. + + * manual/errno.texi (Error Messages): Fix doubled-words and typos. + * manual/charset.texi (Selecting the Conversion): Likewise. + * manual/getopt.texi (Getopt Long Options): Likewise. + * manual/memory.texi (Resizing the Data Segment): Likewise. + * manual/message.texi (GUI program problems): Likewise. + * manual/resource.texi (CPU Affinity): Likewise. + * manual/stdio.texi (Streams and Threads): Likewise. + * manual/time.texi (High Accuracy Clock): Likewise. + 2009-02-20 David S. Miller <davem@davemloft.net> * sysdeps/sparc/sparc32/dl-machine.h (elf_machine_runtime_setup): diff --git a/elf/dl-load.c b/elf/dl-load.c index 597193c043..e8c7be55f7 100644 --- a/elf/dl-load.c +++ b/elf/dl-load.c @@ -1,5 +1,5 @@ /* Map in a shared object's segments from the file. - Copyright (C) 1995-2005, 2006, 2007, 2009 Free Software Foundation, Inc. + Copyright (C) 1995-2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -313,7 +313,7 @@ static char * expand_dynamic_string_token (struct link_map *l, const char *s) { /* We make two runs over the string. First we determine how large the - resulting string is and then we copy it over. Since this is now + resulting string is and then we copy it over. Since this is no frequently executed operation we are looking here not for performance but rather for code size. */ size_t cnt; @@ -391,7 +391,7 @@ fillin_rpath (char *rpath, struct r_search_path_elem **result, const char *sep, size_t len = strlen (cp); /* `strsep' can pass an empty string. This has to be - interpreted as `use the current directory'. */ + interpreted as `use the current directory'. */ if (len == 0) { static const char curwd[] = "./"; @@ -1519,7 +1519,7 @@ cannot enable executable stack as shared object requires"); /* Print search path. */ static void print_search_path (struct r_search_path_elem **list, - const char *what, const char *name) + const char *what, const char *name) { char buf[max_dirnamelen + max_capstrlen]; int first = 1; @@ -2044,7 +2044,7 @@ _dl_map_object (struct link_map *loader, const char *name, int preloaded, fd = -1; /* When the object has the RUNPATH information we don't use any - RPATHs. */ + RPATHs. */ if (loader == NULL || loader->l_info[DT_RUNPATH] == NULL) { /* This is the executable's map (if there is one). Make sure that @@ -2067,7 +2067,7 @@ _dl_map_object (struct link_map *loader, const char *name, int preloaded, } /* If dynamically linked, try the DT_RPATH of the executable - itself. NB: we do this for lookups in any namespace. */ + itself. NB: we do this for lookups in any namespace. */ if (fd == -1 && !did_main_map && main_map != NULL && main_map->l_type != lt_loaded && cache_rpath (main_map, &main_map->l_rpath_dirs, DT_RPATH, @@ -2164,7 +2164,7 @@ _dl_map_object (struct link_map *loader, const char *name, int preloaded, /* Add another newline when we are tracing the library loading. */ if (__builtin_expect (GLRO(dl_debug_mask) & DL_DEBUG_LIBS, 0)) - _dl_debug_printf ("\n"); + _dl_debug_printf ("\n"); } else { diff --git a/elf/elf.h b/elf/elf.h index 204a0f9e19..b6b6410c51 100644 --- a/elf/elf.h +++ b/elf/elf.h @@ -619,6 +619,7 @@ typedef struct #define NT_PPC_VSX 0x102 /* PowerPC VSX registers */ #define NT_386_TLS 0x200 /* i386 TLS slots (struct user_desc) */ #define NT_386_IOPERM 0x201 /* x86 io permission bitmap (1=deny) */ +#define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ /* Legal values for the note segment descriptor types for object files. */ diff --git a/libio/iovdprintf.c b/libio/iovdprintf.c index edab849a44..5284ff8938 100644 --- a/libio/iovdprintf.c +++ b/libio/iovdprintf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1995, 1997-2000, 2001, 2002, 2003, 2006 +/* Copyright (C) 1995, 1997-2000, 2001, 2002, 2003, 2006, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -60,6 +60,9 @@ _IO_vdprintf (d, format, arg) done = INTUSE(_IO_vfprintf) (&tmpfil.file, format, arg); + if (done != EOF && _IO_do_flush (&tmpfil.file) == EOF) + done = EOF; + _IO_FINISH (&tmpfil.file); return done; diff --git a/malloc/malloc.c b/malloc/malloc.c index b43e454f6e..763852ea3b 100644 --- a/malloc/malloc.c +++ b/malloc/malloc.c @@ -1,5 +1,5 @@ /* Malloc implementation for multiple threads without lock contention. - Copyright (C) 1996-2006, 2007, 2008, 2009 Free Software Foundation, Inc. + Copyright (C) 1996-2009, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Wolfram Gloger <wg@malloc.de> and Doug Lea <dl@cs.oswego.edu>, 2001. @@ -3933,9 +3933,10 @@ public_vALLOc(size_t bytes) if(!p) { /* Maybe the failure is due to running out of mmapped areas. */ if(ar_ptr != &main_arena) { - (void)mutex_lock(&main_arena.mutex); - p = _int_memalign(&main_arena, pagesz, bytes); - (void)mutex_unlock(&main_arena.mutex); + ar_ptr = &main_arena; + (void)mutex_lock(&ar_ptr->mutex); + p = _int_memalign(ar_ptr, pagesz, bytes); + (void)mutex_unlock(&ar_ptr->mutex); } else { #if USE_ARENAS /* ... or sbrk() has failed and there is still a chance to mmap() */ @@ -3978,9 +3979,10 @@ public_pVALLOc(size_t bytes) if(!p) { /* Maybe the failure is due to running out of mmapped areas. */ if(ar_ptr != &main_arena) { - (void)mutex_lock(&main_arena.mutex); - p = _int_memalign(&main_arena, pagesz, rounded_bytes); - (void)mutex_unlock(&main_arena.mutex); + ar_ptr = &main_arena; + (void)mutex_lock(&ar_ptr->mutex); + p = _int_memalign(ar_ptr, pagesz, rounded_bytes); + (void)mutex_unlock(&ar_ptr->mutex); } else { #if USE_ARENAS /* ... or sbrk() has failed and there is still a chance to mmap() */ diff --git a/manual/charset.texi b/manual/charset.texi index 79854e50bf..808469b8c1 100644 --- a/manual/charset.texi +++ b/manual/charset.texi @@ -393,7 +393,7 @@ We already said above that the currently selected locale for the by the functions we are about to describe. Each locale uses its own character set (given as an argument to @code{localedef}) and this is the one assumed as the external multibyte encoding. The wide character -character set always is UCS-4, at least on GNU systems. +set is always UCS-4, at least on GNU systems. A characteristic of each multibyte character set is the maximum number of bytes that can be necessary to represent one character. This @@ -577,8 +577,8 @@ The @code{btowc} function was introduced in @w{Amendment 1} to @w{ISO C90} and is declared in @file{wchar.h}. @end deftypefun -Despite the limitation that the single byte value always is interpreted -in the initial state this function is actually useful most of the time. +Despite the limitation that the single byte value is always interpreted +in the initial state, this function is actually useful most of the time. Most characters are either entirely single-byte character sets or they are extension to ASCII. But then it is possible to write code like this (not that this specific example is very useful): @@ -607,10 +607,10 @@ that there is no guarantee that one can perform this kind of arithmetic on the character of the character set used for @code{wchar_t} representation. In other situations the bytes are not constant at compile time and so the compiler cannot do the work. In situations like -this it is necessary @code{btowc}. +this, using @code{btowc} is required. @noindent -There also is a function for the conversion in the other direction. +There is also a function for the conversion in the other direction. @comment wchar.h @comment ISO diff --git a/manual/errno.texi b/manual/errno.texi index 03a868e457..3b0af0c384 100644 --- a/manual/errno.texi +++ b/manual/errno.texi @@ -1425,7 +1425,7 @@ available on all systems implementing @w{ISO C}. But often the text @code{perror} generates is not what is wanted and there is no way to extend or change what @code{perror} does. The GNU coding standard, for instance, requires error messages to be preceded by the program name and -programs which read some input files should should provide information +programs which read some input files should provide information about the input file name and the line number in case an error is encountered while reading the file. For these occasions there are two functions available which are widely used throughout the GNU project. diff --git a/manual/getopt.texi b/manual/getopt.texi index 8c9bd20d6d..77045157ef 100644 --- a/manual/getopt.texi +++ b/manual/getopt.texi @@ -269,7 +269,7 @@ When @code{getopt_long} has no more options to handle, it returns @var{argv} of the next remaining argument. @end deftypefun -Since long option names were used before before the @code{getopt_long} +Since long option names were used before the @code{getopt_long} options was invented there are program interfaces which require programs to recognize options like @w{@samp{-option value}} instead of @w{@samp{--option value}}. To enable these programs to use the GNU diff --git a/manual/math.texi b/manual/math.texi index 50e087c487..95e3378c9e 100644 --- a/manual/math.texi +++ b/manual/math.texi @@ -1421,7 +1421,7 @@ pseudo-random number generator. The GNU C library contains four additional functions which contain the state as an explicit parameter and therefore make it possible to handle -thread-local PRNGs. Beside this there are no difference. In fact, the +thread-local PRNGs. Beside this there is no difference. In fact, the four functions already discussed are implemented internally using the following interfaces. diff --git a/manual/memory.texi b/manual/memory.texi index 43afc7bf95..59ea1ee342 100644 --- a/manual/memory.texi +++ b/manual/memory.texi @@ -2379,7 +2379,7 @@ exceed the process' data storage limit. @c The Brk system call in Linux (as opposed to the GNU C Library function) @c is considerably different. It always returns the new end of the data @c segment, whether it succeeds or fails. The GNU C library Brk determines -@c it's a failure if and only if if the system call returns an address less +@c it's a failure if and only if the system call returns an address less @c than the address requested. @end deftypefun diff --git a/manual/message.texi b/manual/message.texi index e772b2de1f..e44545a311 100644 --- a/manual/message.texi +++ b/manual/message.texi @@ -1466,7 +1466,7 @@ have this problem. But there is a very simple and powerful method to handle these kind of problems with the @code{gettext} functions. @noindent -As as example consider the following fictional situation. A GUI program +As an example consider the following fictional situation. A GUI program has a menu bar with the following entries: @smallexample diff --git a/manual/resource.texi b/manual/resource.texi index 4a814c9e4a..05495722f9 100644 --- a/manual/resource.texi +++ b/manual/resource.texi @@ -1288,7 +1288,7 @@ protected from concurrent accesses from different processors. The POSIX standard up to this date is of not much help to solve this problem. The Linux kernel provides a set of interfaces to allow specifying @emph{affinity sets} for a process. The scheduler will -schedule the thread or process on on CPUs specified by the affinity +schedule the thread or process on CPUs specified by the affinity masks. The interfaces which the GNU C library define follow to some extend the Linux kernel interface. diff --git a/manual/stdio.texi b/manual/stdio.texi index 6748513549..9fb209a473 100644 --- a/manual/stdio.texi +++ b/manual/stdio.texi @@ -574,7 +574,7 @@ operation itself is avoided. More importantly, functions like introduction of threads) were implemented as macros which are very fast if the buffer is not empty. With the addition of locking requirements these functions are no longer implemented as macros since they would -would expand to too much code. +expand to too much code. But these macros are still available with the same functionality under the new names @code{putc_unlocked} and @code{getc_unlocked}. This possibly huge difference of speed also suggests the use of the @code{_unlocked} diff --git a/manual/time.texi b/manual/time.texi index 393bccd99f..f1f4254e90 100644 --- a/manual/time.texi +++ b/manual/time.texi @@ -972,7 +972,7 @@ This counter represents the number of calibration errors (caused by large offsets or jitter). @item long int stbcnt -This counter denotes the number of of calibrations where the stability +This counter denotes the number of calibrations where the stability exceeded the threshold. @end table @end deftp diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S index b26037d279..48a109ccd6 100644 --- a/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S +++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S @@ -127,10 +127,8 @@ ENTRY (MEMCPY) cmp %eax, %edx jb L(copy_forward) je L(fwd_write_0bytes) - cmp $32, %ecx - jge L(memmove_bwd) - jmp L(bk_write_less32bytes_2) -L(memmove_bwd): + cmp $48, %ecx + jb L(bk_write_less48bytes) add %ecx, %eax cmp %eax, %edx movl SRC(%esp), %eax @@ -139,12 +137,12 @@ L(memmove_bwd): L(copy_forward): #endif cmp $48, %ecx - jge L(48bytesormore) + jae L(48bytesormore) L(fwd_write_less32bytes): #ifndef USE_AS_MEMMOVE cmp %dl, %al - jl L(bk_write) + jb L(bk_write) #endif add %ecx, %edx add %ecx, %eax @@ -162,6 +160,7 @@ L(48bytesormore): movl %edx, %edi and $-16, %edx PUSH (%esi) + cfi_remember_state add $16, %edx movl %edi, %esi sub %edx, %edi @@ -181,7 +180,7 @@ L(48bytesormore): #endif mov %eax, %edi - jge L(large_page) + jae L(large_page) and $0xf, %edi jz L(shl_0) @@ -201,7 +200,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi - jl L(shl_0_end) + jb L(shl_0_end) movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -209,7 +208,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi - jl L(shl_0_end) + jb L(shl_0_end) movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -217,7 +216,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi - jl L(shl_0_end) + jb L(shl_0_end) movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -234,6 +233,8 @@ L(shl_0_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state L(shl_0_gobble): #ifdef DATA_CACHE_SIZE_HALF @@ -251,8 +252,8 @@ L(shl_0_gobble): shr $3, %esi sub %esi, %edi cmp %edi, %ecx - jge L(shl_0_gobble_mem_start) - lea -128(%ecx), %ecx + jae L(shl_0_gobble_mem_start) + sub $128, %ecx ALIGN (4) L(shl_0_gobble_cache_loop): movdqa (%eax), %xmm0 @@ -275,11 +276,10 @@ L(shl_0_gobble_cache_loop): movaps %xmm7, 0x70(%edx) lea 0x80(%edx), %edx - jge L(shl_0_gobble_cache_loop) -L(shl_0_gobble_cache_loop_tail): - cmp $-0x40, %ecx - lea 0x80(%ecx), %ecx - jl L(shl_0_cache_less_64bytes) + jae L(shl_0_gobble_cache_loop) + add $0x80, %ecx + cmp $0x40, %ecx + jb L(shl_0_cache_less_64bytes) movdqa (%eax), %xmm0 sub $0x40, %ecx @@ -297,7 +297,7 @@ L(shl_0_gobble_cache_loop_tail): add $0x40, %edx L(shl_0_cache_less_64bytes): cmp $0x20, %ecx - jl L(shl_0_cache_less_32bytes) + jb L(shl_0_cache_less_32bytes) movdqa (%eax), %xmm0 sub $0x20, %ecx movdqa 0x10(%eax), %xmm1 @@ -307,7 +307,7 @@ L(shl_0_cache_less_64bytes): add $0x20, %edx L(shl_0_cache_less_32bytes): cmp $0x10, %ecx - jl L(shl_0_cache_less_16bytes) + jb L(shl_0_cache_less_16bytes) sub $0x10, %ecx movdqa (%eax), %xmm0 add $0x10, %eax @@ -320,12 +320,13 @@ L(shl_0_cache_less_16bytes): POP (%edi) BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_0_gobble_mem_start): cmp %al, %dl je L(copy_page_by_rep) - lea -128(%ecx), %ecx + sub $128, %ecx L(shl_0_gobble_mem_loop): prefetchnta 0x1c0(%eax) prefetchnta 0x280(%eax) @@ -352,10 +353,10 @@ L(shl_0_gobble_mem_loop): movaps %xmm7, 0x70(%edx) lea 0x80(%edx), %edx - jge L(shl_0_gobble_mem_loop) - cmp $-0x40, %ecx - lea 0x80(%ecx), %ecx - jl L(shl_0_mem_less_64bytes) + jae L(shl_0_gobble_mem_loop) + add $0x80, %ecx + cmp $0x40, %ecx + jb L(shl_0_mem_less_64bytes) movdqa (%eax), %xmm0 sub $0x40, %ecx @@ -373,7 +374,7 @@ L(shl_0_gobble_mem_loop): add $0x40, %edx L(shl_0_mem_less_64bytes): cmp $0x20, %ecx - jl L(shl_0_mem_less_32bytes) + jb L(shl_0_mem_less_32bytes) movdqa (%eax), %xmm0 sub $0x20, %ecx movdqa 0x10(%eax), %xmm1 @@ -383,7 +384,7 @@ L(shl_0_mem_less_64bytes): add $0x20, %edx L(shl_0_mem_less_32bytes): cmp $0x10, %ecx - jl L(shl_0_mem_less_16bytes) + jb L(shl_0_mem_less_16bytes) sub $0x10, %ecx movdqa (%eax), %xmm0 add $0x10, %eax @@ -396,14 +397,15 @@ L(shl_0_mem_less_16bytes): POP (%edi) BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_1): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -1(%eax), %eax + sub $1, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_1_loop): @@ -418,7 +420,7 @@ L(shl_1_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_1_end) + jb L(shl_1_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -433,20 +435,22 @@ L(shl_1_loop): jae L(shl_1_loop) L(shl_1_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 1(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_2): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -2(%eax), %eax + sub $2, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_2_loop): @@ -461,7 +465,7 @@ L(shl_2_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_2_end) + jb L(shl_2_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -476,20 +480,22 @@ L(shl_2_loop): jae L(shl_2_loop) L(shl_2_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 2(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_3): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -3(%eax), %eax + sub $3, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_3_loop): @@ -504,7 +510,7 @@ L(shl_3_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_3_end) + jb L(shl_3_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -519,20 +525,22 @@ L(shl_3_loop): jae L(shl_3_loop) L(shl_3_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 3(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_4): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -4(%eax), %eax + sub $4, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_4_loop): @@ -547,7 +555,7 @@ L(shl_4_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_4_end) + jb L(shl_4_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -562,20 +570,22 @@ L(shl_4_loop): jae L(shl_4_loop) L(shl_4_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 4(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_5): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -5(%eax), %eax + sub $5, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_5_loop): @@ -590,7 +600,7 @@ L(shl_5_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_5_end) + jb L(shl_5_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -605,21 +615,22 @@ L(shl_5_loop): jae L(shl_5_loop) L(shl_5_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 5(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_6): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -6(%eax), %eax + sub $6, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_6_loop): @@ -634,7 +645,7 @@ L(shl_6_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_6_end) + jb L(shl_6_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -649,20 +660,22 @@ L(shl_6_loop): jae L(shl_6_loop) L(shl_6_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 6(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_7): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -7(%eax), %eax + sub $7, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_7_loop): @@ -677,7 +690,7 @@ L(shl_7_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_7_end) + jb L(shl_7_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -692,20 +705,22 @@ L(shl_7_loop): jae L(shl_7_loop) L(shl_7_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 7(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_8): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -8(%eax), %eax + sub $8, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_8_loop): @@ -720,7 +735,7 @@ L(shl_8_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_8_end) + jb L(shl_8_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -735,20 +750,22 @@ L(shl_8_loop): jae L(shl_8_loop) L(shl_8_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 8(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_9): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -9(%eax), %eax + sub $9, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_9_loop): @@ -763,7 +780,7 @@ L(shl_9_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_9_end) + jb L(shl_9_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -778,20 +795,22 @@ L(shl_9_loop): jae L(shl_9_loop) L(shl_9_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 9(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_10): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -10(%eax), %eax + sub $10, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_10_loop): @@ -806,7 +825,7 @@ L(shl_10_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_10_end) + jb L(shl_10_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -821,20 +840,22 @@ L(shl_10_loop): jae L(shl_10_loop) L(shl_10_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 10(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_11): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -11(%eax), %eax + sub $11, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_11_loop): @@ -849,7 +870,7 @@ L(shl_11_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_11_end) + jb L(shl_11_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -864,20 +885,22 @@ L(shl_11_loop): jae L(shl_11_loop) L(shl_11_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 11(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_12): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -12(%eax), %eax + sub $12, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_12_loop): @@ -892,7 +915,7 @@ L(shl_12_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_12_end) + jb L(shl_12_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -907,20 +930,22 @@ L(shl_12_loop): jae L(shl_12_loop) L(shl_12_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 12(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_13): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -13(%eax), %eax + sub $13, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_13_loop): @@ -935,7 +960,7 @@ L(shl_13_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_13_end) + jb L(shl_13_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -950,20 +975,22 @@ L(shl_13_loop): jae L(shl_13_loop) L(shl_13_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 13(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_14): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -14(%eax), %eax + sub $14, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_14_loop): @@ -978,7 +1005,7 @@ L(shl_14_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_14_end) + jb L(shl_14_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -993,21 +1020,22 @@ L(shl_14_loop): jae L(shl_14_loop) L(shl_14_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 14(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_15): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -15(%eax), %eax + sub $15, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_15_loop): @@ -1022,7 +1050,7 @@ L(shl_15_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_15_end) + jb L(shl_15_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -1037,7 +1065,7 @@ L(shl_15_loop): jae L(shl_15_loop) L(shl_15_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 15(%edi, %eax), %eax @@ -1241,20 +1269,23 @@ L(fwd_write_3bytes): movl DEST(%esp), %eax # endif #endif - RETURN + RETURN_END + cfi_restore_state + cfi_remember_state ALIGN (4) L(large_page): movdqu (%eax), %xmm1 - lea 16(%eax), %eax movdqu %xmm0, (%esi) movntdq %xmm1, (%edx) - lea 16(%edx), %edx + add $0x10, %eax + add $0x10, %edx + sub $0x10, %ecx cmp %al, %dl je L(copy_page_by_rep) L(large_page_loop_init): POP (%esi) - lea -0x90(%ecx), %ecx + sub $0x80, %ecx POP (%edi) L(large_page_loop): prefetchnta 0x1c0(%eax) @@ -1280,9 +1311,9 @@ L(large_page_loop): movntdq %xmm7, 0x70(%edx) lea 0x80(%edx), %edx jae L(large_page_loop) - cmp $-0x40, %ecx - lea 0x80(%ecx), %ecx - jl L(large_page_less_64bytes) + add $0x80, %ecx + cmp $0x40, %ecx + jb L(large_page_less_64bytes) movdqu (%eax), %xmm0 movdqu 0x10(%eax), %xmm1 @@ -1298,7 +1329,7 @@ L(large_page_loop): sub $0x40, %ecx L(large_page_less_64bytes): cmp $32, %ecx - jl L(large_page_less_32bytes) + jb L(large_page_less_32bytes) movdqu (%eax), %xmm0 movdqu 0x10(%eax), %xmm1 lea 0x20(%eax), %eax @@ -1312,6 +1343,8 @@ L(large_page_less_32bytes): sfence BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(copy_page_by_rep): mov %eax, %esi @@ -1658,18 +1691,18 @@ L(table_48_bytes_bwd): L(copy_backward): PUSH (%esi) movl %eax, %esi - lea (%ecx,%edx,1),%edx - lea (%ecx,%esi,1),%esi + add %ecx, %edx + add %ecx, %esi testl $0x3, %edx jnz L(bk_align) L(bk_aligned_4): cmp $64, %ecx - jge L(bk_write_more64bytes) + jae L(bk_write_more64bytes) L(bk_write_64bytesless): cmp $32, %ecx - jl L(bk_write_less32bytes) + jb L(bk_write_less32bytes) L(bk_write_more32bytes): /* Copy 32 bytes at a time. */ @@ -1698,13 +1731,14 @@ L(bk_write_less32bytes): sub %ecx, %edx sub %ecx, %eax POP (%esi) -L(bk_write_less32bytes_2): +L(bk_write_less48bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) + CFI_PUSH (%esi) ALIGN (4) L(bk_align): cmp $8, %ecx - jle L(bk_write_less32bytes) + jbe L(bk_write_less32bytes) testl $1, %edx /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, then (EDX & 2) must be != 0. */ @@ -1760,7 +1794,7 @@ L(bk_ssse3_align): L(bk_ssse3_cpy_pre): cmp $64, %ecx - jl L(bk_write_more32bytes) + jb L(bk_write_more32bytes) L(bk_ssse3_cpy): sub $64, %esi @@ -1775,7 +1809,7 @@ L(bk_ssse3_cpy): movdqu (%esi), %xmm0 movdqa %xmm0, (%edx) cmp $64, %ecx - jge L(bk_ssse3_cpy) + jae L(bk_ssse3_cpy) jmp L(bk_write_64bytesless) #endif diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S index 749c82d379..ec9eeb95e4 100644 --- a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S +++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S @@ -128,7 +128,7 @@ ENTRY (MEMCPY) jb L(copy_forward) je L(fwd_write_0bytes) cmp $32, %ecx - jge L(memmove_bwd) + jae L(memmove_bwd) jmp L(bk_write_less32bytes_2) L(memmove_bwd): add %ecx, %eax @@ -139,12 +139,12 @@ L(memmove_bwd): L(copy_forward): #endif cmp $48, %ecx - jge L(48bytesormore) + jae L(48bytesormore) L(fwd_write_less32bytes): #ifndef USE_AS_MEMMOVE cmp %dl, %al - jl L(bk_write) + jb L(bk_write) #endif add %ecx, %edx add %ecx, %eax @@ -162,6 +162,7 @@ L(48bytesormore): movl %edx, %edi and $-16, %edx PUSH (%esi) + cfi_remember_state add $16, %edx movl %edi, %esi sub %edx, %edi @@ -181,12 +182,14 @@ L(48bytesormore): #endif mov %eax, %edi - jge L(large_page) + jae L(large_page) and $0xf, %edi jz L(shl_0) BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_0): movdqu %xmm0, (%esi) @@ -202,7 +205,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi - jl L(shl_0_end) + jb L(shl_0_end) movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -210,7 +213,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi - jl L(shl_0_end) + jb L(shl_0_end) movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -218,7 +221,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi - jl L(shl_0_end) + jb L(shl_0_end) movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -234,6 +237,7 @@ L(shl_0_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + CFI_PUSH (%edi) L(shl_0_gobble): #ifdef DATA_CACHE_SIZE_HALF @@ -250,7 +254,7 @@ L(shl_0_gobble): POP (%edi) lea -128(%ecx), %ecx - jge L(shl_0_gobble_mem_loop) + jae L(shl_0_gobble_mem_loop) L(shl_0_gobble_cache_loop): movdqa (%eax), %xmm0 movdqa 0x10(%eax), %xmm1 @@ -272,8 +276,7 @@ L(shl_0_gobble_cache_loop): movdqa %xmm7, 0x70(%edx) lea 0x80(%edx), %edx - jge L(shl_0_gobble_cache_loop) -L(shl_0_gobble_cache_loop_tail): + jae L(shl_0_gobble_cache_loop) cmp $-0x40, %ecx lea 0x80(%ecx), %ecx jl L(shl_0_cache_less_64bytes) @@ -294,7 +297,7 @@ L(shl_0_gobble_cache_loop_tail): add $0x40, %edx L(shl_0_cache_less_64bytes): cmp $0x20, %ecx - jl L(shl_0_cache_less_32bytes) + jb L(shl_0_cache_less_32bytes) movdqa (%eax), %xmm0 sub $0x20, %ecx movdqa 0x10(%eax), %xmm1 @@ -304,7 +307,7 @@ L(shl_0_cache_less_64bytes): add $0x20, %edx L(shl_0_cache_less_32bytes): cmp $0x10, %ecx - jl L(shl_0_cache_less_16bytes) + jb L(shl_0_cache_less_16bytes) sub $0x10, %ecx movdqa (%eax), %xmm0 add $0x10, %eax @@ -342,7 +345,7 @@ L(shl_0_gobble_mem_loop): movdqa %xmm7, 0x70(%edx) lea 0x80(%edx), %edx - jge L(shl_0_gobble_mem_loop) + jae L(shl_0_gobble_mem_loop) cmp $-0x40, %ecx lea 0x80(%ecx), %ecx jl L(shl_0_mem_less_64bytes) @@ -363,7 +366,7 @@ L(shl_0_gobble_mem_loop): add $0x40, %edx L(shl_0_mem_less_64bytes): cmp $0x20, %ecx - jl L(shl_0_mem_less_32bytes) + jb L(shl_0_mem_less_32bytes) movdqa (%eax), %xmm0 sub $0x20, %ecx movdqa 0x10(%eax), %xmm1 @@ -373,7 +376,7 @@ L(shl_0_mem_less_64bytes): add $0x20, %edx L(shl_0_mem_less_32bytes): cmp $0x10, %ecx - jl L(shl_0_mem_less_16bytes) + jb L(shl_0_mem_less_16bytes) sub $0x10, %ecx movdqa (%eax), %xmm0 add $0x10, %eax @@ -384,7 +387,8 @@ L(shl_0_mem_less_16bytes): add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_1): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -406,7 +410,7 @@ L(shl_1_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_1_end) + jb L(shl_1_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -428,6 +432,8 @@ L(shl_1_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_2): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -449,7 +455,7 @@ L(shl_2_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_2_end) + jb L(shl_2_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -471,6 +477,8 @@ L(shl_2_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_3): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -492,7 +500,7 @@ L(shl_3_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_3_end) + jb L(shl_3_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -514,6 +522,8 @@ L(shl_3_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_4): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -535,7 +545,7 @@ L(shl_4_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_4_end) + jb L(shl_4_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -557,6 +567,8 @@ L(shl_4_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_5): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -578,7 +590,7 @@ L(shl_5_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_5_end) + jb L(shl_5_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -600,7 +612,8 @@ L(shl_5_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_6): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -622,7 +635,7 @@ L(shl_6_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_6_end) + jb L(shl_6_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -644,6 +657,8 @@ L(shl_6_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_7): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -665,7 +680,7 @@ L(shl_7_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_7_end) + jb L(shl_7_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -687,6 +702,8 @@ L(shl_7_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_8): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -708,7 +725,7 @@ L(shl_8_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_8_end) + jb L(shl_8_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -730,6 +747,8 @@ L(shl_8_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_9): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -751,7 +770,7 @@ L(shl_9_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_9_end) + jb L(shl_9_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -773,6 +792,8 @@ L(shl_9_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_10): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -794,7 +815,7 @@ L(shl_10_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_10_end) + jb L(shl_10_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -816,6 +837,8 @@ L(shl_10_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_11): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -837,7 +860,7 @@ L(shl_11_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_11_end) + jb L(shl_11_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -859,6 +882,8 @@ L(shl_11_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_12): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -880,7 +905,7 @@ L(shl_12_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_12_end) + jb L(shl_12_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -902,6 +927,8 @@ L(shl_12_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_13): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -923,7 +950,7 @@ L(shl_13_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_13_end) + jb L(shl_13_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -945,6 +972,8 @@ L(shl_13_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_14): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -966,7 +995,7 @@ L(shl_14_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_14_end) + jb L(shl_14_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -988,7 +1017,8 @@ L(shl_14_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_15): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -1010,7 +1040,7 @@ L(shl_15_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_15_end) + jb L(shl_15_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -1229,8 +1259,10 @@ L(fwd_write_3bytes): movl DEST(%esp), %eax # endif #endif - RETURN + RETURN_END + cfi_restore_state + cfi_remember_state ALIGN (4) L(large_page): movdqu (%eax), %xmm1 @@ -1281,7 +1313,7 @@ L(large_page_loop): sub $0x40, %ecx L(large_page_less_64bytes): cmp $32, %ecx - jl L(large_page_less_32bytes) + jb L(large_page_less_32bytes) movdqu (%eax), %xmm0 movdqu 0x10(%eax), %xmm1 lea 0x20(%eax), %eax @@ -1617,11 +1649,11 @@ L(copy_backward): L(bk_aligned_4): cmp $64, %ecx - jge L(bk_write_more64bytes) + jae L(bk_write_more64bytes) L(bk_write_64bytesless): cmp $32, %ecx - jl L(bk_write_less32bytes) + jb L(bk_write_less32bytes) L(bk_write_more32bytes): /* Copy 32 bytes at a time. */ @@ -1653,10 +1685,11 @@ L(bk_write_less32bytes): L(bk_write_less32bytes_2): BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) + CFI_PUSH (%esi) ALIGN (4) L(bk_align): cmp $8, %ecx - jle L(bk_write_less32bytes) + jbe L(bk_write_less32bytes) testl $1, %edx /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, then (EDX & 2) must be != 0. */ @@ -1712,7 +1745,7 @@ L(bk_ssse3_align): L(bk_ssse3_cpy_pre): cmp $64, %ecx - jl L(bk_write_more32bytes) + jb L(bk_write_more32bytes) L(bk_ssse3_cpy): sub $64, %esi @@ -1727,7 +1760,7 @@ L(bk_ssse3_cpy): movdqu (%esi), %xmm0 movdqa %xmm0, (%edx) cmp $64, %ecx - jge L(bk_ssse3_cpy) + jae L(bk_ssse3_cpy) jmp L(bk_write_64bytesless) #endif diff --git a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S index 84afffeb66..f9a0b13d0c 100644 --- a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S +++ b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S @@ -243,7 +243,6 @@ L(32bytesormore): pxor %xmm0, %xmm0 #else movd %eax, %xmm0 - punpcklbw %xmm0, %xmm0 pshufd $0, %xmm0, %xmm0 #endif testl $0xf, %edx @@ -261,7 +260,7 @@ L(not_aligned_16): ALIGN (4) L(aligned_16): cmp $128, %ecx - jge L(128bytesormore) + jae L(128bytesormore) L(aligned_16_less128bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) @@ -293,7 +292,7 @@ L(128bytesormore): * fast string will prefetch and combine data efficiently. */ cmp %edi, %ecx - jae L(128bytesormore_nt) + jae L(128bytesormore_endof_L1) subl $128, %ecx L(128bytesormore_normal): sub $128, %ecx @@ -306,7 +305,7 @@ L(128bytesormore_normal): movdqa %xmm0, 0x60(%edx) movdqa %xmm0, 0x70(%edx) lea 128(%edx), %edx - jl L(128bytesless_normal) + jb L(128bytesless_normal) sub $128, %ecx @@ -319,15 +318,16 @@ L(128bytesormore_normal): movdqa %xmm0, 0x60(%edx) movdqa %xmm0, 0x70(%edx) lea 128(%edx), %edx - jge L(128bytesormore_normal) + jae L(128bytesormore_normal) L(128bytesless_normal): POP (%edi) - lea 128(%ecx), %ecx + add $128, %ecx BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + CFI_PUSH (%edi) ALIGN (4) -L(128bytesormore_nt): +L(128bytesormore_endof_L1): mov %edx, %edi mov %ecx, %edx shr $2, %ecx diff --git a/sysdeps/i386/i686/multiarch/memset-sse2.S b/sysdeps/i386/i686/multiarch/memset-sse2.S index b2b979193e..92ad601bf2 100644 --- a/sysdeps/i386/i686/multiarch/memset-sse2.S +++ b/sysdeps/i386/i686/multiarch/memset-sse2.S @@ -243,7 +243,6 @@ L(32bytesormore): pxor %xmm0, %xmm0 #else movd %eax, %xmm0 - punpcklbw %xmm0, %xmm0 pshufd $0, %xmm0, %xmm0 #endif testl $0xf, %edx @@ -261,7 +260,7 @@ L(not_aligned_16): ALIGN (4) L(aligned_16): cmp $128, %ecx - jge L(128bytesormore) + jae L(128bytesormore) L(aligned_16_less128bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) @@ -287,14 +286,17 @@ L(128bytesormore): #ifdef DATA_CACHE_SIZE POP (%ebx) +# define RESTORE_EBX_STATE CFI_PUSH (%ebx) cmp $DATA_CACHE_SIZE, %ecx #else # ifdef SHARED +# define RESTORE_EBX_STATE call __i686.get_pc_thunk.bx add $_GLOBAL_OFFSET_TABLE_, %ebx cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx # else POP (%ebx) +# define RESTORE_EBX_STATE CFI_PUSH (%ebx) cmp __x86_data_cache_size, %ecx # endif #endif @@ -312,7 +314,7 @@ L(128bytesormore_normal): movdqa %xmm0, 0x60(%edx) movdqa %xmm0, 0x70(%edx) lea 128(%edx), %edx - jl L(128bytesless_normal) + jb L(128bytesless_normal) sub $128, %ecx @@ -325,10 +327,10 @@ L(128bytesormore_normal): movdqa %xmm0, 0x60(%edx) movdqa %xmm0, 0x70(%edx) lea 128(%edx), %edx - jge L(128bytesormore_normal) + jae L(128bytesormore_normal) L(128bytesless_normal): - lea 128(%ecx), %ecx + add $128, %ecx BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) ALIGN (4) @@ -346,11 +348,12 @@ L(128bytes_L2_normal): movaps %xmm0, 0x70(%edx) add $128, %edx cmp $128, %ecx - jge L(128bytes_L2_normal) + jae L(128bytes_L2_normal) L(128bytesless_L2_normal): BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + RESTORE_EBX_STATE L(128bytesormore_nt_start): sub %ebx, %ecx ALIGN (4) @@ -368,7 +371,7 @@ L(128bytesormore_shared_cache_loop): movdqa %xmm0, 0x70(%edx) add $0x80, %edx cmp $0x80, %ebx - jge L(128bytesormore_shared_cache_loop) + jae L(128bytesormore_shared_cache_loop) cmp $0x80, %ecx jb L(shared_cache_loop_end) ALIGN (4) @@ -384,7 +387,7 @@ L(128bytesormore_nt): movntdq %xmm0, 0x70(%edx) add $0x80, %edx cmp $0x80, %ecx - jge L(128bytesormore_nt) + jae L(128bytesormore_nt) sfence L(shared_cache_loop_end): #if defined DATA_CACHE_SIZE || !defined SHARED diff --git a/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/sysdeps/i386/i686/multiarch/strcmp-sse4.S index d5fd23e15c..81d6ec66f7 100644 --- a/sysdeps/i386/i686/multiarch/strcmp-sse4.S +++ b/sysdeps/i386/i686/multiarch/strcmp-sse4.S @@ -178,7 +178,9 @@ L(first4bytes): PUSH (%ebx) PUSH (%edi) PUSH (%esi) +#ifdef USE_AS_STRNCMP cfi_remember_state +#endif mov %edx, %edi mov %eax, %esi xorl %eax, %eax @@ -246,8 +248,8 @@ L(ret): ret .p2align 4 - cfi_restore_state #ifdef USE_AS_STRNCMP + cfi_restore_state L(more16byteseq): POP (%esi) POP (%edi) diff --git a/sysdeps/x86_64/Implies b/sysdeps/x86_64/Implies index 2b8412b0b6..2e0a323e13 100644 --- a/sysdeps/x86_64/Implies +++ b/sysdeps/x86_64/Implies @@ -1,4 +1,5 @@ wordsize-64 ieee754/ldbl-96 +ieee754/dbl-64/wordsize-64 ieee754/dbl-64 ieee754/flt-32 |