20 files changed, 337 insertions, 190 deletions
diff --git a/ChangeLog b/ChangeLog
index f21abc5c3e..b8118dd8f0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2010-02-28  Roland McGrath  <roland@redhat.com>
+
+	* elf/elf.h (NT_X86_XSTATE): New macro.
+
 2010-02-25  David S. Miller  <davem@davemloft.net>
 
 	* sysdeps/sparc/sparc64/Makefile: Add align-cpy rule.
@@ -55,6 +59,70 @@
 	* sysdeps/sparc/sparc64/multiarch/memcpy.S: New file.
 	* sysdeps/sparc/sparc64/multiarch/memset.S: New file.
 
+2010-02-20  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #11332]
+	* sysdeps/i386/i686/multiarch/strcmp-sse4.S: Use cfi_remember_state
+	and cfi_restore_state only if USE_AS_STRNCMP is defined.
+
+2010-02-24  Ulrich Drepper  <drepper@redhat.com>
+
+	* sysdeps/x86_64/Implies: Add ieee754/dbl-64/wordsize-64 entry.
+
+2010-02-24  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
+	(bk_write_less32bytes_2): Renamed to ...
+	(bk_write_less48bytes): This.
+	Use unsigned conditional jumps.
+	Correct unwind info.
+	Use add/sub instead of lea if possible.
+	(shl_0_gobble_cache_loop_tail): Removed.
+	(large_page): Properly adjust ECX.
+
+	* sysdeps/i386/i686/multiarch/memcpy-ssse3.S: Use unsigned
+	conditional jumps.
+	Correct unwind info.
+
+	* sysdeps/i386/i686/multiarch/memset-sse2-rep.S: Remove redundant
+	punpcklbw.
+	Use unsigned conditional jumps.
+	(128bytesormore_nt): Renamed to ...
+	(128bytesormore_endof_L1): This.
+	Use add instead of lea if possible.
+	Correct unwind info.
+	* sysdeps/i386/i686/multiarch/memset-sse2.S: Remove redundant
+	punpcklbw.
+	Use unsigned conditional jumps.
+	Use add instead of lea if possible.
+	Correct unwind info.
+
+2010-02-24  Ulrich Drepper  <drepper@redhat.com>
+
+	[BZ #11319]
+	* libio/iovdprintf.c (_IO_vdprintf): Explicitly flush stream before
+	undoing the stream because _IO_FINISH doesn't report failures.
+
+	[BZ #5553]
+	* malloc/malloc.c (public_vALLOc): Set ar_ptr when trying main_arena.
+	(public_pVALLOc): Likewise.
+	Patch by Petr Baudis.
+
+2010-02-22  Jim Meyering  <meyering@redhat.com>
+
+	* manual/math.texi (BSD Random): Fix a typo: s/are/is/
+
+	* manual/charset.texi: Adjust grammar.
+
+	* manual/errno.texi (Error Messages): Fix doubled-words and typos.
+	* manual/charset.texi (Selecting the Conversion): Likewise.
+	* manual/getopt.texi (Getopt Long Options): Likewise.
+	* manual/memory.texi (Resizing the Data Segment): Likewise.
+	* manual/message.texi (GUI program problems): Likewise.
+	* manual/resource.texi (CPU Affinity): Likewise.
+	* manual/stdio.texi (Streams and Threads): Likewise.
+	* manual/time.texi (High Accuracy Clock): Likewise.
+
 2009-02-20  David S. Miller  <davem@davemloft.net>
 
 	* sysdeps/sparc/sparc32/dl-machine.h (elf_machine_runtime_setup):
diff --git a/elf/dl-load.c b/elf/dl-load.c
index 597193c043..e8c7be55f7 100644
--- a/elf/dl-load.c
+++ b/elf/dl-load.c
@@ -1,5 +1,5 @@
 /* Map in a shared object's segments from the file.
-   Copyright (C) 1995-2005, 2006, 2007, 2009 Free Software Foundation, Inc.
+   Copyright (C) 1995-2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -313,7 +313,7 @@ static char *
 expand_dynamic_string_token (struct link_map *l, const char *s)
 {
   /* We make two runs over the string.  First we determine how large the
-     resulting string is and then we copy it over.  Since this is now
+     resulting string is and then we copy it over.  Since this is no
      frequently executed operation we are looking here not for performance
      but rather for code size.  */
   size_t cnt;
@@ -391,7 +391,7 @@ fillin_rpath (char *rpath, struct r_search_path_elem **result, const char *sep,
       size_t len = strlen (cp);
 
       /* `strsep' can pass an empty string.  This has to be
-         interpreted as `use the current directory'. */
+	 interpreted as `use the current directory'. */
       if (len == 0)
 	{
 	  static const char curwd[] = "./";
@@ -1519,7 +1519,7 @@ cannot enable executable stack as shared object requires");
 /* Print search path.  */
 static void
 print_search_path (struct r_search_path_elem **list,
-                   const char *what, const char *name)
+		   const char *what, const char *name)
 {
   char buf[max_dirnamelen + max_capstrlen];
   int first = 1;
@@ -2044,7 +2044,7 @@ _dl_map_object (struct link_map *loader, const char *name, int preloaded,
       fd = -1;
 
       /* When the object has the RUNPATH information we don't use any
-         RPATHs.  */
+	 RPATHs.  */
       if (loader == NULL || loader->l_info[DT_RUNPATH] == NULL)
 	{
 	  /* This is the executable's map (if there is one).  Make sure that
@@ -2067,7 +2067,7 @@ _dl_map_object (struct link_map *loader, const char *name, int preloaded,
 	      }
 
 	  /* If dynamically linked, try the DT_RPATH of the executable
-             itself.  NB: we do this for lookups in any namespace.  */
+	     itself.  NB: we do this for lookups in any namespace.  */
 	  if (fd == -1 && !did_main_map
 	      && main_map != NULL && main_map->l_type != lt_loaded
 	      && cache_rpath (main_map, &main_map->l_rpath_dirs, DT_RPATH,
@@ -2164,7 +2164,7 @@ _dl_map_object (struct link_map *loader, const char *name, int preloaded,
 
       /* Add another newline when we are tracing the library loading.  */
       if (__builtin_expect (GLRO(dl_debug_mask) & DL_DEBUG_LIBS, 0))
-        _dl_debug_printf ("\n");
+	_dl_debug_printf ("\n");
     }
   else
     {
diff --git a/elf/elf.h b/elf/elf.h
index 204a0f9e19..b6b6410c51 100644
--- a/elf/elf.h
+++ b/elf/elf.h
@@ -619,6 +619,7 @@ typedef struct
 #define NT_PPC_VSX	0x102		/* PowerPC VSX registers */
 #define NT_386_TLS	0x200		/* i386 TLS slots (struct user_desc) */
 #define NT_386_IOPERM	0x201		/* x86 io permission bitmap (1=deny) */
+#define NT_X86_XSTATE	0x202		/* x86 extended state using xsave */
 
 /* Legal values for the note segment descriptor types for object files.  */
 
diff --git a/libio/iovdprintf.c b/libio/iovdprintf.c
index edab849a44..5284ff8938 100644
--- a/libio/iovdprintf.c
+++ b/libio/iovdprintf.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1995, 1997-2000, 2001, 2002, 2003, 2006
+/* Copyright (C) 1995, 1997-2000, 2001, 2002, 2003, 2006, 2010
    Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -60,6 +60,9 @@ _IO_vdprintf (d, format, arg)
 
   done = INTUSE(_IO_vfprintf) (&tmpfil.file, format, arg);
 
+  if (done != EOF && _IO_do_flush (&tmpfil.file) == EOF)
+    done = EOF;
+
   _IO_FINISH (&tmpfil.file);
 
   return done;
diff --git a/malloc/malloc.c b/malloc/malloc.c
index b43e454f6e..763852ea3b 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -1,5 +1,5 @@
 /* Malloc implementation for multiple threads without lock contention.
-   Copyright (C) 1996-2006, 2007, 2008, 2009 Free Software Foundation, Inc.
+   Copyright (C) 1996-2009, 2010 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Wolfram Gloger <wg@malloc.de>
    and Doug Lea <dl@cs.oswego.edu>, 2001.
@@ -3933,9 +3933,10 @@ public_vALLOc(size_t bytes)
   if(!p) {
     /* Maybe the failure is due to running out of mmapped areas. */
     if(ar_ptr != &main_arena) {
-      (void)mutex_lock(&main_arena.mutex);
-      p = _int_memalign(&main_arena, pagesz, bytes);
-      (void)mutex_unlock(&main_arena.mutex);
+      ar_ptr = &main_arena;
+      (void)mutex_lock(&ar_ptr->mutex);
+      p = _int_memalign(ar_ptr, pagesz, bytes);
+      (void)mutex_unlock(&ar_ptr->mutex);
     } else {
 #if USE_ARENAS
       /* ... or sbrk() has failed and there is still a chance to mmap() */
@@ -3978,9 +3979,10 @@ public_pVALLOc(size_t bytes)
   if(!p) {
     /* Maybe the failure is due to running out of mmapped areas. */
     if(ar_ptr != &main_arena) {
-      (void)mutex_lock(&main_arena.mutex);
-      p = _int_memalign(&main_arena, pagesz, rounded_bytes);
-      (void)mutex_unlock(&main_arena.mutex);
+      ar_ptr = &main_arena;
+      (void)mutex_lock(&ar_ptr->mutex);
+      p = _int_memalign(ar_ptr, pagesz, rounded_bytes);
+      (void)mutex_unlock(&ar_ptr->mutex);
     } else {
 #if USE_ARENAS
       /* ... or sbrk() has failed and there is still a chance to mmap() */
diff --git a/manual/charset.texi b/manual/charset.texi
index 79854e50bf..808469b8c1 100644
--- a/manual/charset.texi
+++ b/manual/charset.texi
@@ -393,7 +393,7 @@ We already said above that the currently selected locale for the
 by the functions we are about to describe.  Each locale uses its own
 character set (given as an argument to @code{localedef}) and this is the
 one assumed as the external multibyte encoding.  The wide character
-character set always is UCS-4, at least on GNU systems.
+set is always UCS-4, at least on GNU systems.
 
 A characteristic of each multibyte character set is the maximum number
 of bytes that can be necessary to represent one character.  This
@@ -577,8 +577,8 @@ The @code{btowc} function was introduced in @w{Amendment 1} to @w{ISO C90}
 and is declared in @file{wchar.h}.
 @end deftypefun
 
-Despite the limitation that the single byte value always is interpreted
-in the initial state this function is actually useful most of the time.
+Despite the limitation that the single byte value is always interpreted
+in the initial state, this function is actually useful most of the time.
 Most characters are either entirely single-byte character sets or they
 are extension to ASCII.  But then it is possible to write code like this
 (not that this specific example is very useful):
@@ -607,10 +607,10 @@ that there is no guarantee that one can perform this kind of arithmetic
 on the character of the character set used for @code{wchar_t}
 representation.  In other situations the bytes are not constant at
 compile time and so the compiler cannot do the work.  In situations like
-this it is necessary @code{btowc}.
+this, using @code{btowc} is required.
 
 @noindent
-There also is a function for the conversion in the other direction.
+There is also a function for the conversion in the other direction.
 
 @comment wchar.h
 @comment ISO
diff --git a/manual/errno.texi b/manual/errno.texi
index 03a868e457..3b0af0c384 100644
--- a/manual/errno.texi
+++ b/manual/errno.texi
@@ -1425,7 +1425,7 @@ available on all systems implementing @w{ISO C}.  But often the text
 @code{perror} generates is not what is wanted and there is no way to
 extend or change what @code{perror} does.  The GNU coding standard, for
 instance, requires error messages to be preceded by the program name and
-programs which read some input files should should provide information
+programs which read some input files should provide information
 about the input file name and the line number in case an error is
 encountered while reading the file.  For these occasions there are two
 functions available which are widely used throughout the GNU project.
diff --git a/manual/getopt.texi b/manual/getopt.texi
index 8c9bd20d6d..77045157ef 100644
--- a/manual/getopt.texi
+++ b/manual/getopt.texi
@@ -269,7 +269,7 @@ When @code{getopt_long} has no more options to handle, it returns
 @var{argv} of the next remaining argument.
 @end deftypefun
 
-Since long option names were used before before the @code{getopt_long}
+Since long option names were used before the @code{getopt_long}
 options was invented there are program interfaces which require programs
 to recognize options like @w{@samp{-option value}} instead of
 @w{@samp{--option value}}.  To enable these programs to use the GNU
diff --git a/manual/math.texi b/manual/math.texi
index 50e087c487..95e3378c9e 100644
--- a/manual/math.texi
+++ b/manual/math.texi
@@ -1421,7 +1421,7 @@ pseudo-random number generator.
 
 The GNU C library contains four additional functions which contain the
 state as an explicit parameter and therefore make it possible to handle
-thread-local PRNGs.  Beside this there are no difference.  In fact, the
+thread-local PRNGs.  Beside this there is no difference.  In fact, the
 four functions already discussed are implemented internally using the
 following interfaces.
 
diff --git a/manual/memory.texi b/manual/memory.texi
index 43afc7bf95..59ea1ee342 100644
--- a/manual/memory.texi
+++ b/manual/memory.texi
@@ -2379,7 +2379,7 @@ exceed the process' data storage limit.
 @c The Brk system call in Linux (as opposed to the GNU C Library function)
 @c is considerably different.  It always returns the new end of the data
 @c segment, whether it succeeds or fails.  The GNU C library Brk determines
-@c it's a failure if and only if if the system call returns an address less
+@c it's a failure if and only if the system call returns an address less
 @c than the address requested.
 
 @end deftypefun
diff --git a/manual/message.texi b/manual/message.texi
index e772b2de1f..e44545a311 100644
--- a/manual/message.texi
+++ b/manual/message.texi
@@ -1466,7 +1466,7 @@ have this problem.  But there is a very simple and powerful method to
 handle these kind of problems with the @code{gettext} functions.
 
 @noindent
-As as example consider the following fictional situation.  A GUI program
+As an example consider the following fictional situation.  A GUI program
 has a menu bar with the following entries:
 
 @smallexample
diff --git a/manual/resource.texi b/manual/resource.texi
index 4a814c9e4a..05495722f9 100644
--- a/manual/resource.texi
+++ b/manual/resource.texi
@@ -1288,7 +1288,7 @@ protected from concurrent accesses from different processors.
 The POSIX standard up to this date is of not much help to solve this
 problem.  The Linux kernel provides a set of interfaces to allow
 specifying @emph{affinity sets} for a process.  The scheduler will
-schedule the thread or process on on CPUs specified by the affinity
+schedule the thread or process on CPUs specified by the affinity
 masks.  The interfaces which the GNU C library define follow to some
 extend the Linux kernel interface.
 
diff --git a/manual/stdio.texi b/manual/stdio.texi
index 6748513549..9fb209a473 100644
--- a/manual/stdio.texi
+++ b/manual/stdio.texi
@@ -574,7 +574,7 @@ operation itself is avoided.  More importantly, functions like
 introduction of threads) were implemented as macros which are very fast
 if the buffer is not empty.  With the addition of locking requirements
 these functions are no longer implemented as macros since they would
-would expand to too much code.
+expand to too much code.
 But these macros are still available with the same functionality under the new
 names @code{putc_unlocked} and @code{getc_unlocked}.  This possibly huge
 difference of speed also suggests the use of the @code{_unlocked}
diff --git a/manual/time.texi b/manual/time.texi
index 393bccd99f..f1f4254e90 100644
--- a/manual/time.texi
+++ b/manual/time.texi
@@ -972,7 +972,7 @@ This counter represents the number of calibration errors (caused by
 large offsets or jitter).
 
 @item long int stbcnt
-This counter denotes the number of of calibrations where the stability
+This counter denotes the number of calibrations where the stability
 exceeded the threshold.
 @end table
 @end deftp
diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
index b26037d279..48a109ccd6 100644
--- a/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
+++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
@@ -127,10 +127,8 @@ ENTRY (MEMCPY)
 	cmp	%eax, %edx
 	jb	L(copy_forward)
 	je	L(fwd_write_0bytes)
-	cmp	$32, %ecx
-	jge	L(memmove_bwd)
-	jmp	L(bk_write_less32bytes_2)
-L(memmove_bwd):
+	cmp	$48, %ecx
+	jb	L(bk_write_less48bytes)
 	add	%ecx, %eax
 	cmp	%eax, %edx
 	movl	SRC(%esp), %eax
@@ -139,12 +137,12 @@ L(memmove_bwd):
 L(copy_forward):
 #endif
 	cmp	$48, %ecx
-	jge	L(48bytesormore)
+	jae	L(48bytesormore)
 
 L(fwd_write_less32bytes):
 #ifndef USE_AS_MEMMOVE
 	cmp	%dl, %al
-	jl	L(bk_write)
+	jb	L(bk_write)
 #endif
 	add	%ecx, %edx
 	add	%ecx, %eax
@@ -162,6 +160,7 @@ L(48bytesormore):
 	movl	%edx, %edi
 	and	$-16, %edx
 	PUSH (%esi)
+	cfi_remember_state
 	add	$16, %edx
 	movl	%edi, %esi
 	sub	%edx, %edi
@@ -181,7 +180,7 @@ L(48bytesormore):
 #endif
 
 	mov	%eax, %edi
-	jge	L(large_page)
+	jae	L(large_page)
 	and	$0xf, %edi
 	jz	L(shl_0)
 
@@ -201,7 +200,7 @@ L(shl_0_loop):
 	movdqa	%xmm0, (%edx, %edi)
 	movdqa	%xmm1, 16(%edx, %edi)
 	lea	32(%edi), %edi
-	jl	L(shl_0_end)
+	jb	L(shl_0_end)
 
 	movdqa	(%eax, %edi), %xmm0
 	movdqa	16(%eax, %edi), %xmm1
@@ -209,7 +208,7 @@ L(shl_0_loop):
 	movdqa	%xmm0, (%edx, %edi)
 	movdqa	%xmm1, 16(%edx, %edi)
 	lea	32(%edi), %edi
-	jl	L(shl_0_end)
+	jb	L(shl_0_end)
 
 	movdqa	(%eax, %edi), %xmm0
 	movdqa	16(%eax, %edi), %xmm1
@@ -217,7 +216,7 @@ L(shl_0_loop):
 	movdqa	%xmm0, (%edx, %edi)
 	movdqa	%xmm1, 16(%edx, %edi)
 	lea	32(%edi), %edi
-	jl	L(shl_0_end)
+	jb	L(shl_0_end)
 
 	movdqa	(%eax, %edi), %xmm0
 	movdqa	16(%eax, %edi), %xmm1
@@ -234,6 +233,8 @@ L(shl_0_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 L(shl_0_gobble):
 
 #ifdef DATA_CACHE_SIZE_HALF
@@ -251,8 +252,8 @@ L(shl_0_gobble):
 	shr	$3, %esi
 	sub	%esi, %edi
 	cmp	%edi, %ecx
-	jge	L(shl_0_gobble_mem_start)
-	lea	-128(%ecx), %ecx
+	jae	L(shl_0_gobble_mem_start)
+	sub	$128, %ecx
 	ALIGN (4)
 L(shl_0_gobble_cache_loop):
 	movdqa	(%eax), %xmm0
@@ -275,11 +276,10 @@ L(shl_0_gobble_cache_loop):
 	movaps	%xmm7, 0x70(%edx)
 	lea	0x80(%edx), %edx
 
-	jge	L(shl_0_gobble_cache_loop)
-L(shl_0_gobble_cache_loop_tail):
-	cmp	$-0x40, %ecx
-	lea	0x80(%ecx), %ecx
-	jl	L(shl_0_cache_less_64bytes)
+	jae	L(shl_0_gobble_cache_loop)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(shl_0_cache_less_64bytes)
 
 	movdqa	(%eax), %xmm0
 	sub	$0x40, %ecx
@@ -297,7 +297,7 @@ L(shl_0_gobble_cache_loop_tail):
 	add	$0x40, %edx
 L(shl_0_cache_less_64bytes):
 	cmp	$0x20, %ecx
-	jl	L(shl_0_cache_less_32bytes)
+	jb	L(shl_0_cache_less_32bytes)
 	movdqa	(%eax), %xmm0
 	sub	$0x20, %ecx
 	movdqa	0x10(%eax), %xmm1
@@ -307,7 +307,7 @@ L(shl_0_cache_less_64bytes):
 	add	$0x20, %edx
 L(shl_0_cache_less_32bytes):
 	cmp	$0x10, %ecx
-	jl	L(shl_0_cache_less_16bytes)
+	jb	L(shl_0_cache_less_16bytes)
 	sub	$0x10, %ecx
 	movdqa	(%eax), %xmm0
 	add	$0x10, %eax
@@ -320,12 +320,13 @@ L(shl_0_cache_less_16bytes):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
 
-
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_0_gobble_mem_start):
 	cmp	%al, %dl
 	je	L(copy_page_by_rep)
-	lea	-128(%ecx), %ecx
+	sub	$128, %ecx
 L(shl_0_gobble_mem_loop):
 	prefetchnta 0x1c0(%eax)
 	prefetchnta 0x280(%eax)
@@ -352,10 +353,10 @@ L(shl_0_gobble_mem_loop):
 	movaps	%xmm7, 0x70(%edx)
 	lea	0x80(%edx), %edx
 
-	jge	L(shl_0_gobble_mem_loop)
-	cmp	$-0x40, %ecx
-	lea	0x80(%ecx), %ecx
-	jl	L(shl_0_mem_less_64bytes)
+	jae	L(shl_0_gobble_mem_loop)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(shl_0_mem_less_64bytes)
 
 	movdqa	(%eax), %xmm0
 	sub	$0x40, %ecx
@@ -373,7 +374,7 @@ L(shl_0_gobble_mem_loop):
 	add	$0x40, %edx
 L(shl_0_mem_less_64bytes):
 	cmp	$0x20, %ecx
-	jl	L(shl_0_mem_less_32bytes)
+	jb	L(shl_0_mem_less_32bytes)
 	movdqa	(%eax), %xmm0
 	sub	$0x20, %ecx
 	movdqa	0x10(%eax), %xmm1
@@ -383,7 +384,7 @@ L(shl_0_mem_less_64bytes):
 	add	$0x20, %edx
 L(shl_0_mem_less_32bytes):
 	cmp	$0x10, %ecx
-	jl	L(shl_0_mem_less_16bytes)
+	jb	L(shl_0_mem_less_16bytes)
 	sub	$0x10, %ecx
 	movdqa	(%eax), %xmm0
 	add	$0x10, %eax
@@ -396,14 +397,15 @@ L(shl_0_mem_less_16bytes):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
 
-
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_1):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-1(%eax), %eax
+	sub	$1, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_1_loop):
@@ -418,7 +420,7 @@ L(shl_1_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_1_end)
+	jb	L(shl_1_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -433,20 +435,22 @@ L(shl_1_loop):
 	jae	L(shl_1_loop)
 
 L(shl_1_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	1(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_2):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-2(%eax), %eax
+	sub	$2, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_2_loop):
@@ -461,7 +465,7 @@ L(shl_2_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_2_end)
+	jb	L(shl_2_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -476,20 +480,22 @@ L(shl_2_loop):
 	jae	L(shl_2_loop)
 
 L(shl_2_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	2(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_3):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-3(%eax), %eax
+	sub	$3, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_3_loop):
@@ -504,7 +510,7 @@ L(shl_3_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_3_end)
+	jb	L(shl_3_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -519,20 +525,22 @@ L(shl_3_loop):
 	jae	L(shl_3_loop)
 
 L(shl_3_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	3(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_4):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-4(%eax), %eax
+	sub	$4, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_4_loop):
@@ -547,7 +555,7 @@ L(shl_4_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_4_end)
+	jb	L(shl_4_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -562,20 +570,22 @@ L(shl_4_loop):
 	jae	L(shl_4_loop)
 
 L(shl_4_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	4(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_5):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-5(%eax), %eax
+	sub	$5, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_5_loop):
@@ -590,7 +600,7 @@ L(shl_5_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_5_end)
+	jb	L(shl_5_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -605,21 +615,22 @@ L(shl_5_loop):
 	jae	L(shl_5_loop)
 
 L(shl_5_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	5(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
-
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_6):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-6(%eax), %eax
+	sub	$6, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_6_loop):
@@ -634,7 +645,7 @@ L(shl_6_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_6_end)
+	jb	L(shl_6_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -649,20 +660,22 @@ L(shl_6_loop):
 	jae	L(shl_6_loop)
 
 L(shl_6_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	6(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_7):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-7(%eax), %eax
+	sub	$7, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_7_loop):
@@ -677,7 +690,7 @@ L(shl_7_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_7_end)
+	jb	L(shl_7_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -692,20 +705,22 @@ L(shl_7_loop):
 	jae	L(shl_7_loop)
 
 L(shl_7_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	7(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_8):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-8(%eax), %eax
+	sub	$8, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_8_loop):
@@ -720,7 +735,7 @@ L(shl_8_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_8_end)
+	jb	L(shl_8_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -735,20 +750,22 @@ L(shl_8_loop):
 	jae	L(shl_8_loop)
 
 L(shl_8_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	8(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_9):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-9(%eax), %eax
+	sub	$9, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_9_loop):
@@ -763,7 +780,7 @@ L(shl_9_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_9_end)
+	jb	L(shl_9_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -778,20 +795,22 @@ L(shl_9_loop):
 	jae	L(shl_9_loop)
 
 L(shl_9_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	9(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_10):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-10(%eax), %eax
+	sub	$10, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_10_loop):
@@ -806,7 +825,7 @@ L(shl_10_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_10_end)
+	jb	L(shl_10_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -821,20 +840,22 @@ L(shl_10_loop):
 	jae	L(shl_10_loop)
 
 L(shl_10_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	10(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_11):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-11(%eax), %eax
+	sub	$11, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_11_loop):
@@ -849,7 +870,7 @@ L(shl_11_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_11_end)
+	jb	L(shl_11_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -864,20 +885,22 @@ L(shl_11_loop):
 	jae	L(shl_11_loop)
 
 L(shl_11_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	11(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_12):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-12(%eax), %eax
+	sub	$12, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_12_loop):
@@ -892,7 +915,7 @@ L(shl_12_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_12_end)
+	jb	L(shl_12_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -907,20 +930,22 @@ L(shl_12_loop):
 	jae	L(shl_12_loop)
 
 L(shl_12_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	12(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_13):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-13(%eax), %eax
+	sub	$13, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_13_loop):
@@ -935,7 +960,7 @@ L(shl_13_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_13_end)
+	jb	L(shl_13_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -950,20 +975,22 @@ L(shl_13_loop):
 	jae	L(shl_13_loop)
 
 L(shl_13_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	13(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_14):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-14(%eax), %eax
+	sub	$14, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_14_loop):
@@ -978,7 +1005,7 @@ L(shl_14_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_14_end)
+	jb	L(shl_14_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -993,21 +1020,22 @@ L(shl_14_loop):
 	jae	L(shl_14_loop)
 
 L(shl_14_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	14(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
-
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_15):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-15(%eax), %eax
+	sub	$15, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_15_loop):
@@ -1022,7 +1050,7 @@ L(shl_15_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_15_end)
+	jb	L(shl_15_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -1037,7 +1065,7 @@ L(shl_15_loop):
 	jae	L(shl_15_loop)
 
 L(shl_15_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	15(%edi, %eax), %eax
@@ -1241,20 +1269,23 @@ L(fwd_write_3bytes):
 	movl	DEST(%esp), %eax
 # endif
 #endif
-	RETURN
+	RETURN_END
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(large_page):
 	movdqu	(%eax), %xmm1
-	lea	16(%eax), %eax
 	movdqu	%xmm0, (%esi)
 	movntdq	%xmm1, (%edx)
-	lea	16(%edx), %edx
+	add	$0x10, %eax
+	add	$0x10, %edx
+	sub	$0x10, %ecx
 	cmp	%al, %dl
 	je	L(copy_page_by_rep)
 L(large_page_loop_init):
 	POP (%esi)
-	lea	-0x90(%ecx), %ecx
+	sub	$0x80, %ecx
 	POP (%edi)
 L(large_page_loop):
 	prefetchnta	0x1c0(%eax)
@@ -1280,9 +1311,9 @@ L(large_page_loop):
 	movntdq	%xmm7, 0x70(%edx)
 	lea	0x80(%edx), %edx
 	jae	L(large_page_loop)
-	cmp	$-0x40, %ecx
-	lea	0x80(%ecx), %ecx
-	jl	L(large_page_less_64bytes)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(large_page_less_64bytes)
 
 	movdqu	(%eax), %xmm0
 	movdqu	0x10(%eax), %xmm1
@@ -1298,7 +1329,7 @@ L(large_page_loop):
 	sub	$0x40, %ecx
 L(large_page_less_64bytes):
 	cmp	$32, %ecx
-	jl	L(large_page_less_32bytes)
+	jb	L(large_page_less_32bytes)
 	movdqu	(%eax), %xmm0
 	movdqu	0x10(%eax), %xmm1
 	lea	0x20(%eax), %eax
@@ -1312,6 +1343,8 @@ L(large_page_less_32bytes):
 	sfence
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(copy_page_by_rep):
 	mov	%eax, %esi
@@ -1658,18 +1691,18 @@ L(table_48_bytes_bwd):
 L(copy_backward):
 	PUSH (%esi)
 	movl	%eax, %esi
-	lea	(%ecx,%edx,1),%edx
-	lea	(%ecx,%esi,1),%esi
+	add	%ecx, %edx
+	add	%ecx, %esi
 	testl	$0x3, %edx
 	jnz	L(bk_align)
 
 L(bk_aligned_4):
 	cmp	$64, %ecx
-	jge	L(bk_write_more64bytes)
+	jae	L(bk_write_more64bytes)
 
 L(bk_write_64bytesless):
 	cmp	$32, %ecx
-	jl	L(bk_write_less32bytes)
+	jb	L(bk_write_less32bytes)
 
 L(bk_write_more32bytes):
 	/* Copy 32 bytes at a time.  */
@@ -1698,13 +1731,14 @@ L(bk_write_less32bytes):
 	sub	%ecx, %edx
 	sub	%ecx, %eax
 	POP (%esi)
-L(bk_write_less32bytes_2):
+L(bk_write_less48bytes):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
 
+	CFI_PUSH (%esi)
 	ALIGN (4)
 L(bk_align):
 	cmp	$8, %ecx
-	jle	L(bk_write_less32bytes)
+	jbe	L(bk_write_less32bytes)
 	testl	$1, %edx
 	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
 	   then (EDX & 2) must be != 0.  */
@@ -1760,7 +1794,7 @@ L(bk_ssse3_align):
 
 L(bk_ssse3_cpy_pre):
 	cmp	$64, %ecx
-	jl	L(bk_write_more32bytes)
+	jb	L(bk_write_more32bytes)
 
 L(bk_ssse3_cpy):
 	sub	$64, %esi
@@ -1775,7 +1809,7 @@ L(bk_ssse3_cpy):
 	movdqu	(%esi), %xmm0
 	movdqa	%xmm0, (%edx)
 	cmp	$64, %ecx
-	jge	L(bk_ssse3_cpy)
+	jae	L(bk_ssse3_cpy)
 	jmp	L(bk_write_64bytesless)
 
 #endif
diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
index 749c82d379..ec9eeb95e4 100644
--- a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
+++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
@@ -128,7 +128,7 @@ ENTRY (MEMCPY)
 	jb	L(copy_forward)
 	je	L(fwd_write_0bytes)
 	cmp	$32, %ecx
-	jge	L(memmove_bwd)
+	jae	L(memmove_bwd)
 	jmp	L(bk_write_less32bytes_2)
 L(memmove_bwd):
 	add	%ecx, %eax
@@ -139,12 +139,12 @@ L(memmove_bwd):
 L(copy_forward):
 #endif
 	cmp	$48, %ecx
-	jge	L(48bytesormore)
+	jae	L(48bytesormore)
 
 L(fwd_write_less32bytes):
 #ifndef USE_AS_MEMMOVE
 	cmp	%dl, %al
-	jl	L(bk_write)
+	jb	L(bk_write)
 #endif
 	add	%ecx, %edx
 	add	%ecx, %eax
@@ -162,6 +162,7 @@ L(48bytesormore):
 	movl	%edx, %edi
 	and	$-16, %edx
 	PUSH (%esi)
+	cfi_remember_state
 	add	$16, %edx
 	movl	%edi, %esi
 	sub	%edx, %edi
@@ -181,12 +182,14 @@ L(48bytesormore):
 #endif
 
 	mov	%eax, %edi
-	jge	L(large_page)
+	jae	L(large_page)
 	and	$0xf, %edi
 	jz	L(shl_0)
 
 	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_0):
 	movdqu	%xmm0, (%esi)
@@ -202,7 +205,7 @@ L(shl_0_loop):
 	movdqa	%xmm0, (%edx, %edi)
 	movdqa	%xmm1, 16(%edx, %edi)
 	lea	32(%edi), %edi
-	jl	L(shl_0_end)
+	jb	L(shl_0_end)
 
 	movdqa	(%eax, %edi), %xmm0
 	movdqa	16(%eax, %edi), %xmm1
@@ -210,7 +213,7 @@ L(shl_0_loop):
 	movdqa	%xmm0, (%edx, %edi)
 	movdqa	%xmm1, 16(%edx, %edi)
 	lea	32(%edi), %edi
-	jl	L(shl_0_end)
+	jb	L(shl_0_end)
 
 	movdqa	(%eax, %edi), %xmm0
 	movdqa	16(%eax, %edi), %xmm1
@@ -218,7 +221,7 @@ L(shl_0_loop):
 	movdqa	%xmm0, (%edx, %edi)
 	movdqa	%xmm1, 16(%edx, %edi)
 	lea	32(%edi), %edi
-	jl	L(shl_0_end)
+	jb	L(shl_0_end)
 
 	movdqa	(%eax, %edi), %xmm0
 	movdqa	16(%eax, %edi), %xmm1
@@ -234,6 +237,7 @@ L(shl_0_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
 
+	CFI_PUSH (%edi)
 L(shl_0_gobble):
 
 #ifdef DATA_CACHE_SIZE_HALF
@@ -250,7 +254,7 @@ L(shl_0_gobble):
 
 	POP (%edi)
 	lea	-128(%ecx), %ecx
-	jge	L(shl_0_gobble_mem_loop)
+	jae	L(shl_0_gobble_mem_loop)
 L(shl_0_gobble_cache_loop):
 	movdqa	(%eax), %xmm0
 	movdqa	0x10(%eax), %xmm1
@@ -272,8 +276,7 @@ L(shl_0_gobble_cache_loop):
 	movdqa	%xmm7, 0x70(%edx)
 	lea	0x80(%edx), %edx
 
-	jge	L(shl_0_gobble_cache_loop)
-L(shl_0_gobble_cache_loop_tail):
+	jae	L(shl_0_gobble_cache_loop)
 	cmp	$-0x40, %ecx
 	lea	0x80(%ecx), %ecx
 	jl	L(shl_0_cache_less_64bytes)
@@ -294,7 +297,7 @@ L(shl_0_gobble_cache_loop_tail):
 	add	$0x40, %edx
 L(shl_0_cache_less_64bytes):
 	cmp	$0x20, %ecx
-	jl	L(shl_0_cache_less_32bytes)
+	jb	L(shl_0_cache_less_32bytes)
 	movdqa	(%eax), %xmm0
 	sub	$0x20, %ecx
 	movdqa	0x10(%eax), %xmm1
@@ -304,7 +307,7 @@ L(shl_0_cache_less_64bytes):
 	add	$0x20, %edx
 L(shl_0_cache_less_32bytes):
 	cmp	$0x10, %ecx
-	jl	L(shl_0_cache_less_16bytes)
+	jb	L(shl_0_cache_less_16bytes)
 	sub	$0x10, %ecx
 	movdqa	(%eax), %xmm0
 	add	$0x10, %eax
@@ -342,7 +345,7 @@ L(shl_0_gobble_mem_loop):
 	movdqa	%xmm7, 0x70(%edx)
 	lea	0x80(%edx), %edx
 
-	jge	L(shl_0_gobble_mem_loop)
+	jae	L(shl_0_gobble_mem_loop)
 	cmp	$-0x40, %ecx
 	lea	0x80(%ecx), %ecx
 	jl	L(shl_0_mem_less_64bytes)
@@ -363,7 +366,7 @@ L(shl_0_gobble_mem_loop):
 	add	$0x40, %edx
 L(shl_0_mem_less_64bytes):
 	cmp	$0x20, %ecx
-	jl	L(shl_0_mem_less_32bytes)
+	jb	L(shl_0_mem_less_32bytes)
 	movdqa	(%eax), %xmm0
 	sub	$0x20, %ecx
 	movdqa	0x10(%eax), %xmm1
@@ -373,7 +376,7 @@ L(shl_0_mem_less_64bytes):
 	add	$0x20, %edx
 L(shl_0_mem_less_32bytes):
 	cmp	$0x10, %ecx
-	jl	L(shl_0_mem_less_16bytes)
+	jb	L(shl_0_mem_less_16bytes)
 	sub	$0x10, %ecx
 	movdqa	(%eax), %xmm0
 	add	$0x10, %eax
@@ -384,7 +387,8 @@ L(shl_0_mem_less_16bytes):
 	add	%ecx, %eax
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
 
-
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_1):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -406,7 +410,7 @@ L(shl_1_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_1_end)
+	jb	L(shl_1_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -428,6 +432,8 @@ L(shl_1_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_2):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -449,7 +455,7 @@ L(shl_2_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_2_end)
+	jb	L(shl_2_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -471,6 +477,8 @@ L(shl_2_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_3):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -492,7 +500,7 @@ L(shl_3_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_3_end)
+	jb	L(shl_3_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -514,6 +522,8 @@ L(shl_3_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_4):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -535,7 +545,7 @@ L(shl_4_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_4_end)
+	jb	L(shl_4_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -557,6 +567,8 @@ L(shl_4_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_5):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -578,7 +590,7 @@ L(shl_5_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_5_end)
+	jb	L(shl_5_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -600,7 +612,8 @@ L(shl_5_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
-
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_6):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -622,7 +635,7 @@ L(shl_6_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_6_end)
+	jb	L(shl_6_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -644,6 +657,8 @@ L(shl_6_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_7):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -665,7 +680,7 @@ L(shl_7_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_7_end)
+	jb	L(shl_7_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -687,6 +702,8 @@ L(shl_7_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_8):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -708,7 +725,7 @@ L(shl_8_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_8_end)
+	jb	L(shl_8_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -730,6 +747,8 @@ L(shl_8_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_9):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -751,7 +770,7 @@ L(shl_9_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_9_end)
+	jb	L(shl_9_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -773,6 +792,8 @@ L(shl_9_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_10):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -794,7 +815,7 @@ L(shl_10_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_10_end)
+	jb	L(shl_10_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -816,6 +837,8 @@ L(shl_10_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_11):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -837,7 +860,7 @@ L(shl_11_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_11_end)
+	jb	L(shl_11_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -859,6 +882,8 @@ L(shl_11_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_12):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -880,7 +905,7 @@ L(shl_12_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_12_end)
+	jb	L(shl_12_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -902,6 +927,8 @@ L(shl_12_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_13):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -923,7 +950,7 @@ L(shl_13_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_13_end)
+	jb	L(shl_13_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -945,6 +972,8 @@ L(shl_13_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_14):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -966,7 +995,7 @@ L(shl_14_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_14_end)
+	jb	L(shl_14_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -988,7 +1017,8 @@ L(shl_14_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
-
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_15):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -1010,7 +1040,7 @@ L(shl_15_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_15_end)
+	jb	L(shl_15_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -1229,8 +1259,10 @@ L(fwd_write_3bytes):
 	movl	DEST(%esp), %eax
 # endif
 #endif
-	RETURN
+	RETURN_END
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(large_page):
 	movdqu	(%eax), %xmm1
@@ -1281,7 +1313,7 @@ L(large_page_loop):
 	sub	$0x40, %ecx
 L(large_page_less_64bytes):
 	cmp	$32, %ecx
-	jl	L(large_page_less_32bytes)
+	jb	L(large_page_less_32bytes)
 	movdqu	(%eax), %xmm0
 	movdqu	0x10(%eax), %xmm1
 	lea	0x20(%eax), %eax
@@ -1617,11 +1649,11 @@ L(copy_backward):
 
 L(bk_aligned_4):
 	cmp	$64, %ecx
-	jge	L(bk_write_more64bytes)
+	jae	L(bk_write_more64bytes)
 
 L(bk_write_64bytesless):
 	cmp	$32, %ecx
-	jl	L(bk_write_less32bytes)
+	jb	L(bk_write_less32bytes)
 
 L(bk_write_more32bytes):
 	/* Copy 32 bytes at a time.  */
@@ -1653,10 +1685,11 @@ L(bk_write_less32bytes):
 L(bk_write_less32bytes_2):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
 
+	CFI_PUSH (%esi)
 	ALIGN (4)
 L(bk_align):
 	cmp	$8, %ecx
-	jle	L(bk_write_less32bytes)
+	jbe	L(bk_write_less32bytes)
 	testl	$1, %edx
 	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
 	   then (EDX & 2) must be != 0.  */
@@ -1712,7 +1745,7 @@ L(bk_ssse3_align):
 
 L(bk_ssse3_cpy_pre):
 	cmp	$64, %ecx
-	jl	L(bk_write_more32bytes)
+	jb	L(bk_write_more32bytes)
 
 L(bk_ssse3_cpy):
 	sub	$64, %esi
@@ -1727,7 +1760,7 @@ L(bk_ssse3_cpy):
 	movdqu	(%esi), %xmm0
 	movdqa	%xmm0, (%edx)
 	cmp	$64, %ecx
-	jge	L(bk_ssse3_cpy)
+	jae	L(bk_ssse3_cpy)
 	jmp	L(bk_write_64bytesless)
 
 #endif
diff --git a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
index 84afffeb66..f9a0b13d0c 100644
--- a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
+++ b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
@@ -243,7 +243,6 @@ L(32bytesormore):
 	pxor	%xmm0, %xmm0
 #else
 	movd	%eax, %xmm0
-	punpcklbw %xmm0, %xmm0
 	pshufd	$0, %xmm0, %xmm0
 #endif
 	testl	$0xf, %edx
@@ -261,7 +260,7 @@ L(not_aligned_16):
 	ALIGN (4)
 L(aligned_16):
 	cmp	$128, %ecx
-	jge	L(128bytesormore)
+	jae	L(128bytesormore)
 
 L(aligned_16_less128bytes):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
@@ -293,7 +292,7 @@ L(128bytesormore):
  * fast string will prefetch and combine data efficiently.
  */
 	cmp	%edi, %ecx
-	jae	L(128bytesormore_nt)
+	jae	L(128bytesormore_endof_L1)
 	subl	$128, %ecx
 L(128bytesormore_normal):
 	sub	$128, %ecx
@@ -306,7 +305,7 @@ L(128bytesormore_normal):
 	movdqa	%xmm0, 0x60(%edx)
 	movdqa	%xmm0, 0x70(%edx)
 	lea	128(%edx), %edx
-	jl	L(128bytesless_normal)
+	jb	L(128bytesless_normal)
 
 
 	sub	$128, %ecx
@@ -319,15 +318,16 @@ L(128bytesormore_normal):
 	movdqa	%xmm0, 0x60(%edx)
 	movdqa	%xmm0, 0x70(%edx)
 	lea	128(%edx), %edx
-	jge	L(128bytesormore_normal)
+	jae	L(128bytesormore_normal)
 
 L(128bytesless_normal):
 	POP (%edi)
-	lea	128(%ecx), %ecx
+	add	$128, %ecx
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
 
+	CFI_PUSH (%edi)
 	ALIGN (4)
-L(128bytesormore_nt):
+L(128bytesormore_endof_L1):
 	mov	%edx, %edi
 	mov	%ecx, %edx
 	shr	$2, %ecx
diff --git a/sysdeps/i386/i686/multiarch/memset-sse2.S b/sysdeps/i386/i686/multiarch/memset-sse2.S
index b2b979193e..92ad601bf2 100644
--- a/sysdeps/i386/i686/multiarch/memset-sse2.S
+++ b/sysdeps/i386/i686/multiarch/memset-sse2.S
@@ -243,7 +243,6 @@ L(32bytesormore):
 	pxor	%xmm0, %xmm0
 #else
 	movd	%eax, %xmm0
-	punpcklbw %xmm0, %xmm0
 	pshufd	$0, %xmm0, %xmm0
 #endif
 	testl	$0xf, %edx
@@ -261,7 +260,7 @@ L(not_aligned_16):
 	ALIGN (4)
 L(aligned_16):
 	cmp	$128, %ecx
-	jge	L(128bytesormore)
+	jae	L(128bytesormore)
 
 L(aligned_16_less128bytes):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
@@ -287,14 +286,17 @@ L(128bytesormore):
 
 #ifdef DATA_CACHE_SIZE
 	POP (%ebx)
+# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
 	cmp	$DATA_CACHE_SIZE, %ecx
 #else
 # ifdef SHARED
+#  define RESTORE_EBX_STATE
 	call	__i686.get_pc_thunk.bx
 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
 # else
 	POP (%ebx)
+#  define RESTORE_EBX_STATE CFI_PUSH (%ebx)
 	cmp	__x86_data_cache_size, %ecx
 # endif
 #endif
@@ -312,7 +314,7 @@ L(128bytesormore_normal):
 	movdqa	%xmm0, 0x60(%edx)
 	movdqa	%xmm0, 0x70(%edx)
 	lea	128(%edx), %edx
-	jl	L(128bytesless_normal)
+	jb	L(128bytesless_normal)
 
 
 	sub	$128, %ecx
@@ -325,10 +327,10 @@ L(128bytesormore_normal):
 	movdqa	%xmm0, 0x60(%edx)
 	movdqa	%xmm0, 0x70(%edx)
 	lea	128(%edx), %edx
-	jge	L(128bytesormore_normal)
+	jae	L(128bytesormore_normal)
 
 L(128bytesless_normal):
-	lea	128(%ecx), %ecx
+	add	$128, %ecx
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
 
 	ALIGN (4)
@@ -346,11 +348,12 @@ L(128bytes_L2_normal):
 	movaps	%xmm0, 0x70(%edx)
 	add	$128, %edx
 	cmp	$128, %ecx
-	jge	L(128bytes_L2_normal)
+	jae	L(128bytes_L2_normal)
 
 L(128bytesless_L2_normal):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
 
+	RESTORE_EBX_STATE
 L(128bytesormore_nt_start):
 	sub	%ebx, %ecx
 	ALIGN (4)
@@ -368,7 +371,7 @@ L(128bytesormore_shared_cache_loop):
 	movdqa	%xmm0, 0x70(%edx)
 	add	$0x80, %edx
 	cmp	$0x80, %ebx
-	jge	L(128bytesormore_shared_cache_loop)
+	jae	L(128bytesormore_shared_cache_loop)
 	cmp	$0x80, %ecx
 	jb	L(shared_cache_loop_end)
 	ALIGN (4)
@@ -384,7 +387,7 @@ L(128bytesormore_nt):
 	movntdq	%xmm0, 0x70(%edx)
 	add	$0x80, %edx
 	cmp	$0x80, %ecx
-	jge	L(128bytesormore_nt)
+	jae	L(128bytesormore_nt)
 	sfence
 L(shared_cache_loop_end):
 #if defined DATA_CACHE_SIZE || !defined SHARED
diff --git a/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
index d5fd23e15c..81d6ec66f7 100644
--- a/sysdeps/i386/i686/multiarch/strcmp-sse4.S
+++ b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
@@ -178,7 +178,9 @@ L(first4bytes):
 	PUSH	(%ebx)
 	PUSH	(%edi)
 	PUSH	(%esi)
+#ifdef USE_AS_STRNCMP
 	cfi_remember_state
+#endif
 	mov	%edx, %edi
 	mov	%eax, %esi
 	xorl	%eax, %eax
@@ -246,8 +248,8 @@ L(ret):
 	ret
 
 	.p2align 4
-	cfi_restore_state
 #ifdef USE_AS_STRNCMP
+	cfi_restore_state
 L(more16byteseq):
 	POP	(%esi)
 	POP	(%edi)
diff --git a/sysdeps/x86_64/Implies b/sysdeps/x86_64/Implies
index 2b8412b0b6..2e0a323e13 100644
--- a/sysdeps/x86_64/Implies
+++ b/sysdeps/x86_64/Implies
@@ -1,4 +1,5 @@
 wordsize-64
 ieee754/ldbl-96
+ieee754/dbl-64/wordsize-64
 ieee754/dbl-64
 ieee754/flt-32