summary refs log tree commit diff
diff options
context:
space:
mode:
authorAndreas Schwab <schwab@redhat.com>2010-04-16 12:49:09 +0200
committerAndreas Schwab <schwab@redhat.com>2010-04-16 12:49:09 +0200
commit54627f3c15f4a6e2ff02e60afacdb5f8008d399a (patch)
treeca0a25e66c8e6d48c7db07e6152a0549c27a1916
parent1ef17274decb5381a50a2146cb3444630f67b527 (diff)
parent1cdb2151fbad6bff650e85a0476972881bbc027b (diff)
downloadglibc-54627f3c15f4a6e2ff02e60afacdb5f8008d399a.tar.gz
glibc-54627f3c15f4a6e2ff02e60afacdb5f8008d399a.tar.xz
glibc-54627f3c15f4a6e2ff02e60afacdb5f8008d399a.zip
Merge remote branch 'origin/master' into fedora/master
-rw-r--r--ChangeLog66
-rw-r--r--elf/dl-version.c12
-rw-r--r--elf/elf.h24
-rw-r--r--stdlib/tst-makecontext3.c62
-rw-r--r--string/test-strncmp.c39
-rw-r--r--sysdeps/i386/configure25
-rw-r--r--sysdeps/i386/configure.in11
-rw-r--r--sysdeps/i386/i686/multiarch/Makefile6
-rw-r--r--sysdeps/i386/i686/multiarch/Versions5
-rw-r--r--sysdeps/i386/i686/multiarch/s_fma-fma.c30
-rw-r--r--sysdeps/i386/i686/multiarch/s_fma.c36
-rw-r--r--sysdeps/i386/i686/multiarch/s_fmaf-fma.c30
-rw-r--r--sysdeps/i386/i686/multiarch/s_fmaf.c36
-rw-r--r--sysdeps/i386/i686/multiarch/strcmp-sse4.S4
-rw-r--r--sysdeps/i386/i686/multiarch/strcmp-ssse3.S11
-rw-r--r--sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c64
-rw-r--r--sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c62
-rw-r--r--sysdeps/x86_64/elf/configure26
-rw-r--r--sysdeps/x86_64/elf/configure.in11
-rw-r--r--sysdeps/x86_64/multiarch/Makefile2
-rw-r--r--sysdeps/x86_64/multiarch/init-arch.h38
-rw-r--r--sysdeps/x86_64/multiarch/memcmp-sse4.S1635
-rw-r--r--sysdeps/x86_64/multiarch/memcmp.S59
-rw-r--r--sysdeps/x86_64/multiarch/rtld-memcmp.c1
24 files changed, 2133 insertions, 162 deletions
diff --git a/ChangeLog b/ChangeLog
index bed6dfea8c..c22e562774 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,69 @@
+2010-04-15  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* string/test-strncmp.c (check_result): New function.
+	(do_one_test): Use it.
+	(check1): New function.
+	(test_main): Use it.
+	* sysdeps/i386/i686/multiarch/strcmp-sse4.S (crosspage): Properly
+	update source and destination.
+	* sysdeps/i386/i686/multiarch/strcmp-ssse3.S (gobble_ashr_12):
+	Properly check and update counter.
+
+2010-04-14  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/x86_64/elf/configure.in: Move AVX test to ....
+	* sysdeps/i386/configure.in: ...here.
+	* sysdeps/i386/i686/multiarch/Makefile (libm-sysdep_routines): Define.
+	(CFLAGS-s_fma-fma.c): Define.
+	(CFLAGS-s_fmaf-fma.c): Define.
+	* sysdeps/i386/i686/multiarch/Versions: New file.
+	* sysdeps/i386/i686/multiarch/s_fma-fma.c: New file.
+	* sysdeps/i386/i686/multiarch/s_fma.c: New file.
+	* sysdeps/i386/i686/multiarch/s_fmaf-fma.c: New file.
+	* sysdeps/i386/i686/multiarch/s_fmaf.c: New file.
+
+	* sysdeps/x86_64/multiarch/memcmp-sse4.S: Check
+	DATA_CACHE_SIZE_HALF instead of SHARED_CACHE_SIZE_HALF.
+
+2010-04-14  Andreas Schwab  <schwab@redhat.com>
+
+	* elf/dl-version.c (_dl_check_map_versions): Avoid index overflow
+	when dependencies are missing.
+
+2010-04-14  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/x86_64/multiarch/memcmp-sse4.S: Optimized for unaligned
+	data.
+
+2010-04-12  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+	memcmp-sse4.
+	* sysdeps/x86_64/multiarch/memcmp-sse4.S: New file.
+	* sysdeps/x86_64/multiarch/memcmp.S: New file.
+	* sysdeps/x86_64/multiarch/rtld-memcmp.c: New file.
+
+2010-04-13  Ulrich Drepper  <drepper@redhat.com>
+
+	* sysdeps/x86_64/multiarch/init-arch.h: Pretty printing.
+	Add SSE 4.1 macros.
+
+2010-04-10  Matt Fleming  <matt@console-pimps.org>
+
+	* elf/elf.h: Add SH specific ELF header flags.
+
+2010-04-13  Andreas Schwab  <schwab@redhat.com>
+
+	* sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c: Fix setup of
+	overflow area.
+	* sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c: Likewise.
+
+2010-04-12  Andreas Schwab  <schwab@redhat.com>
+
+	* stdlib/tst-makecontext3.c (main): Initialize ucontext_t objects
+	only with getcontext.  Test for unimplemented makecontext by
+	checking errno.
+
 2010-04-09  Ulrich Drepper  <drepper@redhat.com>
 
 	* nscd/aicache.c (addhstaiX): Correct passing memory to address
diff --git a/elf/dl-version.c b/elf/dl-version.c
index 9e881162a6..c59a6c3cd3 100644
--- a/elf/dl-version.c
+++ b/elf/dl-version.c
@@ -322,10 +322,14 @@ _dl_check_map_versions (struct link_map *map, int verbose, int trace_mode)
 	      while (1)
 		{
 		  ElfW(Half) ndx = aux->vna_other & 0x7fff;
-		  map->l_versions[ndx].hash = aux->vna_hash;
-		  map->l_versions[ndx].hidden = aux->vna_other & 0x8000;
-		  map->l_versions[ndx].name = &strtab[aux->vna_name];
-		  map->l_versions[ndx].filename = &strtab[ent->vn_file];
+		  /* In trace mode, dependencies may be missing.  */
+		  if (__builtin_expect (ndx < map->l_nversions, 1))
+		    {
+		      map->l_versions[ndx].hash = aux->vna_hash;
+		      map->l_versions[ndx].hidden = aux->vna_other & 0x8000;
+		      map->l_versions[ndx].name = &strtab[aux->vna_name];
+		      map->l_versions[ndx].filename = &strtab[ent->vn_file];
+		    }
 
 		  if (aux->vna_next == 0)
 		    /* No more symbols.  */
diff --git a/elf/elf.h b/elf/elf.h
index 1efe359119..a9558a3ddd 100644
--- a/elf/elf.h
+++ b/elf/elf.h
@@ -2477,6 +2477,30 @@ typedef Elf32_Addr Elf32_Conflict;
 
 /* SH specific declarations */
 
+/* Processor specific flags for the ELF header e_flags field.  */
+#define EF_SH_MACH_MASK		0x1f
+#define EF_SH_UNKNOWN		0x0
+#define EF_SH1			0x1
+#define EF_SH2			0x2
+#define EF_SH3			0x3
+#define EF_SH_DSP		0x4
+#define EF_SH3_DSP		0x5
+#define EF_SH4AL_DSP		0x6
+#define EF_SH3E			0x8
+#define EF_SH4			0x9
+#define EF_SH2E			0xb
+#define EF_SH4A			0xc
+#define EF_SH2A			0xd
+#define EF_SH4_NOFPU		0x10
+#define EF_SH4A_NOFPU		0x11
+#define EF_SH4_NOMMU_NOFPU	0x12
+#define EF_SH2A_NOFPU		0x13
+#define EF_SH3_NOMMU		0x14
+#define EF_SH2A_SH4_NOFPU	0x15
+#define EF_SH2A_SH3_NOFPU	0x16
+#define EF_SH2A_SH4		0x17
+#define EF_SH2A_SH3E		0x18
+
 /* SH relocs.  */
 #define	R_SH_NONE		0
 #define	R_SH_DIR32		1
diff --git a/stdlib/tst-makecontext3.c b/stdlib/tst-makecontext3.c
index f127c6a579..a44169ae36 100644
--- a/stdlib/tst-makecontext3.c
+++ b/stdlib/tst-makecontext3.c
@@ -136,38 +136,42 @@ main (void)
       exit (1);
     }
 
-  ctx[1] = ctx[0];
+  if (getcontext (&ctx[1]) != 0)
+    {
+      printf ("%s: getcontext: %m\n", __FUNCTION__);
+      exit (1);
+    }
+
   ctx[1].uc_stack.ss_sp = st1;
   ctx[1].uc_stack.ss_size = sizeof st1;
   ctx[1].uc_link = &ctx[0];
-  {
-    ucontext_t tempctx = ctx[1];
-    makecontext (&ctx[1], (void (*) (void)) f1, 33,
-		 0x00000001 << flag, 0x00000004 << flag,
-		 0x00000012 << flag, 0x00000048 << flag,
-		 0x00000123 << flag, 0x0000048d << flag,
-		 0x00001234 << flag, 0x000048d1 << flag,
-		 0x00012345 << flag, 0x00048d15 << flag,
-		 0x00123456 << flag, 0x0048d159 << flag,
-		 0x01234567 << flag, 0x048d159e << flag,
-		 0x12345678 << flag, 0x48d159e2 << flag,
-		 0x23456789 << flag, 0x8d159e26 << flag,
-		 0x3456789a << flag, 0xd159e26a << flag,
-		 0x456789ab << flag, 0x159e26af << flag,
-		 0x56789abc << flag, 0x59e26af3 << flag,
-		 0x6789abcd << flag, 0x9e26af37 << flag,
-		 0x789abcde << flag, 0xe26af37b << flag,
-		 0x89abcdef << flag, 0x26af37bc << flag,
-		 0x9abcdef0 << flag, 0x6af37bc3 << flag,
-		 0xabcdef0f << flag);
-
-    /* Without this check, a stub makecontext can make us spin forever.  */
-    if (memcmp (&tempctx, &ctx[1], sizeof ctx[1]) == 0)
-      {
-	puts ("makecontext was a no-op, presuming not implemented");
-	return 0;
-      }
-  }
+  errno = 0;
+  makecontext (&ctx[1], (void (*) (void)) f1, 33,
+	       0x00000001 << flag, 0x00000004 << flag,
+	       0x00000012 << flag, 0x00000048 << flag,
+	       0x00000123 << flag, 0x0000048d << flag,
+	       0x00001234 << flag, 0x000048d1 << flag,
+	       0x00012345 << flag, 0x00048d15 << flag,
+	       0x00123456 << flag, 0x0048d159 << flag,
+	       0x01234567 << flag, 0x048d159e << flag,
+	       0x12345678 << flag, 0x48d159e2 << flag,
+	       0x23456789 << flag, 0x8d159e26 << flag,
+	       0x3456789a << flag, 0xd159e26a << flag,
+	       0x456789ab << flag, 0x159e26af << flag,
+	       0x56789abc << flag, 0x59e26af3 << flag,
+	       0x6789abcd << flag, 0x9e26af37 << flag,
+	       0x789abcde << flag, 0xe26af37b << flag,
+	       0x89abcdef << flag, 0x26af37bc << flag,
+	       0x9abcdef0 << flag, 0x6af37bc3 << flag,
+	       0xabcdef0f << flag);
+
+  /* Without this check, a stub makecontext can make us spin forever.  */
+  if (errno == ENOSYS)
+    {
+      puts ("makecontext not implemented");
+      back_in_main = 1;
+      return 0;
+    }
 
   /* Play some tricks with this context.  */
   if (++global == 1)
diff --git a/string/test-strncmp.c b/string/test-strncmp.c
index 5adf0eb311..3687879c25 100644
--- a/string/test-strncmp.c
+++ b/string/test-strncmp.c
@@ -1,5 +1,5 @@
 /* Test and measure strncmp functions.
-   Copyright (C) 1999, 2002, 2003 Free Software Foundation, Inc.
+   Copyright (C) 1999, 2002, 2003, 2010 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Written by Jakub Jelinek <jakub@redhat.com>, 1999.
 
@@ -51,8 +51,8 @@ stupid_strncmp (const char *s1, const char *s2, size_t n)
   return ret;
 }
 
-static void
-do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n,
+static int
+check_result (impl_t *impl, const char *s1, const char *s2, size_t n,
 	     int exp_result)
 {
   int result = CALL (impl, s1, s2, n);
@@ -63,9 +63,19 @@ do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n,
       error (0, 0, "Wrong result in function %s %d %d", impl->name,
 	     result, exp_result);
       ret = 1;
-      return;
+      return -1;
     }
 
+  return 0;
+}
+
+static void
+do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n,
+	     int exp_result)
+{
+  if (check_result (impl, s1, s2, n, exp_result) < 0)
+    return;
+
   if (HP_TIMING_AVAIL)
     {
       hp_timing_t start __attribute ((unused));
@@ -283,6 +293,25 @@ do_random_tests (void)
     }
 }
 
+static void
+check1 (void)
+{
+  char *s1 = (char *)(buf1 + 0xb2c);
+  char *s2 = (char *)(buf1 + 0xfd8);
+  size_t i;
+  int exp_result;
+
+  strcpy(s1, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrs");
+  strcpy(s2, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijkLMNOPQRSTUV");
+
+  for (i = 0; i < 80; i++)
+    {
+      exp_result = simple_strncmp (s1, s2, i);
+      FOR_EACH_IMPL (impl, 0)
+	 check_result (impl, s1, s2, i, exp_result);
+    }
+}
+
 int
 test_main (void)
 {
@@ -290,6 +319,8 @@ test_main (void)
 
   test_init ();
 
+  check1 ();
+
   printf ("%23s", "");
   FOR_EACH_IMPL (impl, 0)
     printf ("\t%s", impl->name);
diff --git a/sysdeps/i386/configure b/sysdeps/i386/configure
index 7814b3b313..21225cd9c9 100644
--- a/sysdeps/i386/configure
+++ b/sysdeps/i386/configure
@@ -656,3 +656,28 @@ fi
 fi
 { $as_echo "$as_me:$LINENO: result: $libc_cv_as_i686" >&5
 $as_echo "$libc_cv_as_i686" >&6; }
+
+{ $as_echo "$as_me:$LINENO: checking for AVX support" >&5
+$as_echo_n "checking for AVX support... " >&6; }
+if test "${libc_cv_cc_avx+set}" = set; then
+  $as_echo_n "(cached) " >&6
+else
+  if { ac_try='${CC-cc} -mavx -xc /dev/null -S -o /dev/null'
+  { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  libc_cv_cc_avx=yes
+else
+  libc_cv_cc_avx=no
+fi
+fi
+{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_avx" >&5
+$as_echo "$libc_cv_cc_avx" >&6; }
+if test $libc_cv_cc_avx = yes; then
+  cat >>confdefs.h <<\_ACEOF
+#define HAVE_AVX_SUPPORT 1
+_ACEOF
+
+fi
diff --git a/sysdeps/i386/configure.in b/sysdeps/i386/configure.in
index 9fc7fa59fe..d8dd648f80 100644
--- a/sysdeps/i386/configure.in
+++ b/sysdeps/i386/configure.in
@@ -55,3 +55,14 @@ if AC_TRY_COMMAND([${CC-cc} -Wa,-mtune=i686 -xc /dev/null -S -o /dev/null]); the
 else
   libc_cv_as_i686=no
 fi])
+
+dnl Check if -mavx works.
+AC_CACHE_CHECK(for AVX support, libc_cv_cc_avx, [dnl
+if AC_TRY_COMMAND([${CC-cc} -mavx -xc /dev/null -S -o /dev/null]); then
+  libc_cv_cc_avx=yes
+else
+  libc_cv_cc_avx=no
+fi])
+if test $libc_cv_cc_avx = yes; then
+  AC_DEFINE(HAVE_AVX_SUPPORT)
+fi
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index e8847d6fc4..124595068d 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -19,3 +19,9 @@ CFLAGS-strstr.c += -msse4
 CFLAGS-strcasestr.c += -msse4
 endif
 endif
+
+ifeq (mathyes,$(subdir)$(config-cflags-avx))
+libm-sysdep_routines += s_fma-fma s_fmaf-fma
+CFLAGS-s_fma-fma.c += -mavx -mfpmath=sse
+CFLAGS-s_fmaf-fma.c += -mavx -mfpmath=sse
+endif
diff --git a/sysdeps/i386/i686/multiarch/Versions b/sysdeps/i386/i686/multiarch/Versions
new file mode 100644
index 0000000000..59b185ac8d
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/Versions
@@ -0,0 +1,5 @@
+libc {
+  GLIBC_PRIVATE {
+    __get_cpu_features;
+  }
+}
diff --git a/sysdeps/i386/i686/multiarch/s_fma-fma.c b/sysdeps/i386/i686/multiarch/s_fma-fma.c
new file mode 100644
index 0000000000..e6f77aec77
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/s_fma-fma.c
@@ -0,0 +1,30 @@
+/* FMA version of fma.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_AVX_SUPPORT
+double
+__fma_fma (double x, double y, double z)
+{
+  asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+  return x;
+}
+#endif
diff --git a/sysdeps/i386/i686/multiarch/s_fma.c b/sysdeps/i386/i686/multiarch/s_fma.c
new file mode 100644
index 0000000000..d9291b0be8
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/s_fma.c
@@ -0,0 +1,36 @@
+/* Multiple versions of fma.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_AVX_SUPPORT
+#include <math.h>
+#include <init-arch.h>
+
+extern double __fma_ia32 (double x, double y, double z) attribute_hidden;
+extern double __fma_fma (double x, double y, double z) attribute_hidden;
+
+libm_ifunc (__fma, HAS_FMA ? __fma_fma : __fma_ia32);
+weak_alias (__fma, fma)
+
+# define __fma __fma_ia32
+#endif
+
+#include <math/s_fma.c>
diff --git a/sysdeps/i386/i686/multiarch/s_fmaf-fma.c b/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
new file mode 100644
index 0000000000..887e9c3829
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
@@ -0,0 +1,30 @@
+/* FMA version of fmaf.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_AVX_SUPPORT
+float
+__fmaf_fma (float x, float y, float z)
+{
+  asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+  return x;
+}
+#endif
diff --git a/sysdeps/i386/i686/multiarch/s_fmaf.c b/sysdeps/i386/i686/multiarch/s_fmaf.c
new file mode 100644
index 0000000000..4ea9be48ac
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/s_fmaf.c
@@ -0,0 +1,36 @@
+/* Multiple versions of fmaf.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_AVX_SUPPORT
+#include <math.h>
+#include <init-arch.h>
+
+extern float __fmaf_ia32 (float x, float y, float z) attribute_hidden;
+extern float __fmaf_fma (float x, float y, float z) attribute_hidden;
+
+libm_ifunc (__fmaf, HAS_FMA ? __fmaf_fma : __fmaf_ia32);
+weak_alias (__fmaf, fmaf)
+
+# define __fmaf __fmaf_ia32
+#endif
+
+#include <math/s_fmaf.c>
diff --git a/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
index 81d6ec66f7..0de0a113c0 100644
--- a/sysdeps/i386/i686/multiarch/strcmp-sse4.S
+++ b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
@@ -223,8 +223,8 @@ L(crosspage):
 	inc	%edx
 	cmp	$15, %edx
 	jle	L(crosspage)
-	add	$16, %edi
-	add	$16, %esi
+	add	%edx, %edi
+	add	%edx, %esi
 	jmp	L(check_offset)
 
 	.p2align 4
diff --git a/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
index 40994c05b1..a4de2259d2 100644
--- a/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
+++ b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
@@ -1484,17 +1484,18 @@ L(gobble_ashr_12):
 	sub	$0xffff, %esi
 	jnz	L(exit)
 
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
 	add	$16, %ecx
 	movdqa	%xmm4, %xmm3
 
 	add	$16, %edi
 	jg	L(nibble_ashr_12)
 
-#ifdef USE_AS_STRNCMP
-	cmp	$16, %ebp
-	lea	-16(%ebp), %ebp
-	jbe	L(more8byteseq)
-#endif
 	movdqa	(%eax, %ecx), %xmm1
 	movdqa	(%edx, %ecx), %xmm2
 	movdqa	%xmm2, %xmm4
diff --git a/sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c b/sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c
index 94760e0c2b..0e309c3e29 100644
--- a/sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c
+++ b/sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2001 Free Software Foundation, Inc.
+/* Copyright (C) 2001, 2010 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Martin Schwidefsky (schwidefsky@de.ibm.com).
 
@@ -28,15 +28,15 @@
    double, complex and structure with sizes 0, 2, 4 or 8
    won't work.
    makecontext sets up a stack and the registers for the
-   context. The stack looks like this:
-           size                         offset
+   user context. The stack looks like this:
+	   size                         offset
     %r15 ->    +-----------------------+
-             4 | back chain (zero)     |  0
-             4 | reserved              |  4
-            88 | save area for (*func) |  8
-               +-----------------------+
-             n | overflow parameters   | 96
-               +-----------------------+
+	     4 | back chain (zero)     |  0
+	     4 | reserved              |  4
+	    88 | save area for (*func) |  8
+	       +-----------------------+
+	     n | overflow parameters   | 96
+	       +-----------------------+
    The registers are set up like this:
      %r2-%r6: parameters 1 to 5
      %r7    : (*func) pointer
@@ -54,27 +54,27 @@ void
 __makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...)
 {
   extern void __makecontext_ret (void);
-  unsigned long *sp;
+  unsigned long int *sp;
   va_list ap;
-  int i;
 
-  sp = (unsigned long *) (((unsigned long) ucp->uc_stack.ss_sp
-			   + ucp->uc_stack.ss_size) & -8L);
+  sp = (unsigned long int *) (((unsigned long int) ucp->uc_stack.ss_sp
+			       + ucp->uc_stack.ss_size) & -8L);
 
   /* Set the return address to trampoline.  */
-  ucp->uc_mcontext.gregs[14] = (long) __makecontext_ret;
+  ucp->uc_mcontext.gregs[14] = (long int) __makecontext_ret;
 
   /* Set register parameters.  */
   va_start (ap, argc);
-  for (i = 0; (i < argc) && (i < 5); i++)
-    ucp->uc_mcontext.gregs[2+i] = va_arg (ap, long);
+  for (int i = 0; i < argc && i < 5; ++i)
+    ucp->uc_mcontext.gregs[2 + i] = va_arg (ap, long int);
 
   /* The remaining arguments go to the overflow area.  */
-  if (argc > 5) {
-    sp -= argc - 5;
-    for (i = 5; i < argc; i++)
-      sp[i] = va_arg(ap, long);
-  }
+  if (argc > 5)
+    {
+      sp -= argc - 5;
+      for (int i = 5; i < argc; ++i)
+	sp[i - 5] = va_arg (ap, long int);
+    }
   va_end (ap);
 
   /* Make room for the save area and set the backchain.  */
@@ -82,24 +82,24 @@ __makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...)
   *sp = 0;
 
   /* Pass (*func) to __start_context in %r7.  */
-  ucp->uc_mcontext.gregs[7] = (long) func;
+  ucp->uc_mcontext.gregs[7] = (long int) func;
 
   /* Pass ucp->uc_link to __start_context in %r8.  */
-  ucp->uc_mcontext.gregs[8] = (long) ucp->uc_link;
+  ucp->uc_mcontext.gregs[8] = (long int) ucp->uc_link;
 
   /* Pass address of setcontext in %r9.  */
-  ucp->uc_mcontext.gregs[9] = (long) &setcontext;
+  ucp->uc_mcontext.gregs[9] = (long int) &setcontext;
 
   /* Set stack pointer.  */
-  ucp->uc_mcontext.gregs[15] = (long) sp;
+  ucp->uc_mcontext.gregs[15] = (long int) sp;
 }
 
-asm(".text\n"
-    ".type __makecontext_ret,@function\n"
-    "__makecontext_ret:\n"
-    "      basr  %r14,%r7\n"
-    "      lr    %r2,%r8\n"
-    "      br    %r9\n"
-    ".size __makecontext_ret, .-__makecontext_ret");
+asm (".text\n"
+     ".type __makecontext_ret,@function\n"
+     "__makecontext_ret:\n"
+     "      basr  %r14,%r7\n"
+     "      lr    %r2,%r8\n"
+     "      br    %r9\n"
+     ".size __makecontext_ret, .-__makecontext_ret");
 
 weak_alias (__makecontext, makecontext)
diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c b/sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c
index b08f1b4047..40ff3eefb4 100644
--- a/sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c
+++ b/sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2001 Free Software Foundation, Inc.
+/* Copyright (C) 2001, 2010 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Martin Schwidefsky (schwidefsky@de.ibm.com).
 
@@ -29,14 +29,14 @@
    won't work.
    makecontext sets up a stack and the registers for the
    user context. The stack looks like this:
-           size                         offset
+	   size                         offset
     %r15 ->    +-----------------------+
-             8 | back chain (zero)     |  0
-             8 | reserved              |  8
-           144 | save area for (*func) | 16
-               +-----------------------+
-             n | overflow parameters   | 160
-               +-----------------------+
+	     8 | back chain (zero)     |  0
+	     8 | reserved              |  8
+	   144 | save area for (*func) | 16
+	       +-----------------------+
+	     n | overflow parameters   | 160
+	       +-----------------------+
    The registers are set up like this:
      %r2-%r6: parameters 1 to 5
      %r7    : (*func) pointer
@@ -54,27 +54,27 @@ void
 __makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...)
 {
   extern void __makecontext_ret (void);
-  unsigned long *sp;
+  unsigned long int *sp;
   va_list ap;
-  int i;
 
-  sp = (unsigned long *) (((unsigned long) ucp->uc_stack.ss_sp
-			   + ucp->uc_stack.ss_size) & -8L);
+  sp = (unsigned long int *) (((unsigned long int) ucp->uc_stack.ss_sp
+			       + ucp->uc_stack.ss_size) & -8L);
 
   /* Set the return address to trampoline.  */
-  ucp->uc_mcontext.gregs[14] = (long) __makecontext_ret;
+  ucp->uc_mcontext.gregs[14] = (long int) __makecontext_ret;
 
   /* Set register parameters.  */
   va_start (ap, argc);
-  for (i = 0; (i < argc) && (i < 5); i++)
-    ucp->uc_mcontext.gregs[2+i] = va_arg (ap, long);
+  for (int i = 0; i < argc && i < 5; ++i)
+    ucp->uc_mcontext.gregs[2 + i] = va_arg (ap, long int);
 
   /* The remaining arguments go to the overflow area.  */
-  if (argc > 5) {
-    sp -= argc - 5;
-    for (i = 5; i < argc; i++)
-      sp[i] = va_arg(ap, long);
-  }
+  if (argc > 5)
+    {
+      sp -= argc - 5;
+      for (int i = 5; i < argc; ++i)
+	sp[i - 5] = va_arg (ap, long int);
+    }
   va_end (ap);
 
   /* Make room for the save area and set the backchain.  */
@@ -82,24 +82,24 @@ __makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...)
   *sp = 0;
 
   /* Pass (*func) to __start_context in %r7.  */
-  ucp->uc_mcontext.gregs[7] = (long) func;
+  ucp->uc_mcontext.gregs[7] = (long int) func;
 
   /* Pass ucp->uc_link to __start_context in %r8.  */
-  ucp->uc_mcontext.gregs[8] = (long) ucp->uc_link;
+  ucp->uc_mcontext.gregs[8] = (long int) ucp->uc_link;
 
   /* Pass address of setcontext in %r9.  */
-  ucp->uc_mcontext.gregs[9] = (long) &setcontext;
+  ucp->uc_mcontext.gregs[9] = (long int) &setcontext;
 
   /* Set stack pointer.  */
-  ucp->uc_mcontext.gregs[15] = (long) sp;
+  ucp->uc_mcontext.gregs[15] = (long int) sp;
 }
 
-asm(".text\n"
-    ".type __makecontext_ret,@function\n"
-    "__makecontext_ret:\n"
-    "      basr  %r14,%r7\n"
-    "      lgr   %r2,%r8\n"
-    "      br    %r9\n"
-    ".size __makecontext_ret, .-__makecontext_ret");
+asm (".text\n"
+     ".type __makecontext_ret,@function\n"
+     "__makecontext_ret:\n"
+     "      basr  %r14,%r7\n"
+     "      lgr   %r2,%r8\n"
+     "      br    %r9\n"
+     ".size __makecontext_ret, .-__makecontext_ret");
 
 weak_alias (__makecontext, makecontext)
diff --git a/sysdeps/x86_64/elf/configure b/sysdeps/x86_64/elf/configure
index 0b93b0424e..f722b9e600 100644
--- a/sysdeps/x86_64/elf/configure
+++ b/sysdeps/x86_64/elf/configure
@@ -46,29 +46,3 @@ fi
 cat >>confdefs.h <<\_ACEOF
 #define PI_STATIC_AND_HIDDEN 1
 _ACEOF
-
-
-{ $as_echo "$as_me:$LINENO: checking for AVX support" >&5
-$as_echo_n "checking for AVX support... " >&6; }
-if test "${libc_cv_cc_avx+set}" = set; then
-  $as_echo_n "(cached) " >&6
-else
-  if { ac_try='${CC-cc} -mavx -xc /dev/null -S -o /dev/null'
-  { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
-  (exit $ac_status); }; }; then
-  libc_cv_cc_avx=yes
-else
-  libc_cv_cc_avx=no
-fi
-fi
-{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_avx" >&5
-$as_echo "$libc_cv_cc_avx" >&6; }
-if test $libc_cv_cc_avx = yes; then
-  cat >>confdefs.h <<\_ACEOF
-#define HAVE_AVX_SUPPORT 1
-_ACEOF
-
-fi
diff --git a/sysdeps/x86_64/elf/configure.in b/sysdeps/x86_64/elf/configure.in
index 14d1875302..9cb59d009c 100644
--- a/sysdeps/x86_64/elf/configure.in
+++ b/sysdeps/x86_64/elf/configure.in
@@ -32,14 +32,3 @@ fi
 dnl It is always possible to access static and hidden symbols in an
 dnl position independent way.
 AC_DEFINE(PI_STATIC_AND_HIDDEN)
-
-dnl Check if -mavx works.
-AC_CACHE_CHECK(for AVX support, libc_cv_cc_avx, [dnl
-if AC_TRY_COMMAND([${CC-cc} -mavx -xc /dev/null -S -o /dev/null]); then
-  libc_cv_cc_avx=yes
-else
-  libc_cv_cc_avx=no
-fi])
-if test $libc_cv_cc_avx = yes; then
-  AC_DEFINE(HAVE_AVX_SUPPORT)
-fi
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 364e7bbbd2..c61cf70345 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -5,7 +5,7 @@ endif
 
 ifeq ($(subdir),string)
 sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
-		   strend-sse4
+		   strend-sse4 memcmp-sse4
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-strcspn-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index 5c73813404..b2f2de3796 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -20,21 +20,23 @@
 
 #ifdef	__ASSEMBLER__
 
-#include <ifunc-defines.h>
+# include <ifunc-defines.h>
 
-#define bit_SSE2	(1 << 26)
-#define bit_SSSE3	(1 << 9)
-#define bit_SSE4_2	(1 << 20)
+# define bit_SSE2	(1 << 26)
+# define bit_SSSE3	(1 << 9)
+# define bit_SSE4_1	(1 << 19)
+# define bit_SSE4_2	(1 << 20)
 
-#define index_SSE2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET
-#define index_SSSE3	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-#define index_SSE4_2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
+# define index_SSE2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET
+# define index_SSSE3	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
+# define index_SSE4_1	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
+# define index_SSE4_2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
 
 #define index_Fast_Rep_String	FEATURE_INDEX_1*FEATURE_SIZE
 
 #else	/* __ASSEMBLER__ */
 
-#include <sys/param.h>
+# include <sys/param.h>
 
 enum
   {
@@ -84,20 +86,22 @@ extern void __init_cpu_features (void) attribute_hidden;
 extern const struct cpu_features *__get_cpu_features (void)
      __attribute__ ((const));
 
-#ifndef NOT_IN_libc
-# define __get_cpu_features()	(&__cpu_features)
-#endif
+# ifndef NOT_IN_libc
+#  define __get_cpu_features()	(&__cpu_features)
+# endif
 
-#define HAS_CPU_FEATURE(idx, reg, bit) \
+# define HAS_CPU_FEATURE(idx, reg, bit) \
   ((__get_cpu_features ()->cpuid[idx].reg & (1 << (bit))) != 0)
 
 /* Following are the feature tests used throughout libc.  */
 
-#define HAS_SSE2	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, edx, 26)
-#define HAS_POPCOUNT	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 23)
-#define HAS_SSE4_2	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20)
-#define HAS_FMA		HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12)
+# define HAS_SSE2	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, edx, 26)
+# define HAS_POPCOUNT	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 23)
+# define HAS_SSSE3	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 9)
+# define HAS_SSE4_1	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 19)
+# define HAS_SSE4_2	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20)
+# define HAS_FMA	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12)
 
-#define index_Fast_Rep_String	FEATURE_INDEX_1
+# define index_Fast_Rep_String	FEATURE_INDEX_1
 
 #endif	/* __ASSEMBLER__ */
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
new file mode 100644
index 0000000000..fc439bb013
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -0,0 +1,1635 @@
+/* memcmp with SSE4.1
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef MEMCMP
+# define MEMCMP		__memcmp_sse4_1
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n)	.p2align n
+#endif
+
+#define JMPTBL(I, B)	(I - B)
+
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+  lea		TABLE(%rip), %r11;				\
+  movslq	(%r11, INDEX, SCALE), %rcx;			\
+  add		%r11, %rcx;					\
+  jmp		*%rcx;						\
+  ud2
+
+	.section .text.sse4.1,"ax",@progbits
+ENTRY (MEMCMP)
+	pxor	%xmm0, %xmm0
+	cmp	$79, %rdx
+	ja	L(79bytesormore)
+	cmp	$1, %rdx
+	je	L(firstbyte)
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+	ALIGN (4)
+L(firstbyte):
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	sub	%ecx, %eax
+	ret
+
+	ALIGN (4)
+L(79bytesormore):
+	movdqu	(%rsi), %xmm1
+	movdqu	(%rdi), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+	mov	%rsi, %rcx
+	and	$-16, %rsi
+	add	$16, %rsi
+	sub	%rsi, %rcx
+
+	sub	%rcx, %rdi
+	add	%rcx, %rdx
+	test	$0xf, %rdi
+	jz	L(2aligned)
+
+	cmp	$128, %rdx
+	ja	L(128bytesormore)
+L(less128bytes):
+	sub	$64, %rdx
+
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqu	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqu	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+	cmp	$32, %rdx
+	jb	L(less32bytesin64)
+
+	movdqu	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqu	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin64):
+	add	$64, %rdi
+	add	$64, %rsi
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(128bytesormore):
+	cmp	$512, %rdx
+	ja	L(512bytesormore)
+	cmp	$256, %rdx
+	ja	L(less512bytes)
+L(less256bytes):
+	sub	$128, %rdx
+
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqu	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqu	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+
+	movdqu	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqu	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+
+	movdqu	96(%rdi), %xmm2
+	pxor	96(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(112bytesin256)
+
+	movdqu	112(%rdi), %xmm2
+	pxor	112(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(128bytesin256)
+
+	add	$128, %rsi
+	add	$128, %rdi
+
+	cmp	$64, %rdx
+	jae	L(less128bytes)
+
+	cmp	$32, %rdx
+	jb	L(less32bytesin128)
+
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin128):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(less512bytes):
+	sub	$256, %rdx
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqu	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqu	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+
+	movdqu	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqu	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+
+	movdqu	96(%rdi), %xmm2
+	pxor	96(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(112bytesin256)
+
+	movdqu	112(%rdi), %xmm2
+	pxor	112(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(128bytesin256)
+
+	movdqu	128(%rdi), %xmm2
+	pxor	128(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(144bytesin256)
+
+	movdqu	144(%rdi), %xmm2
+	pxor	144(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(160bytesin256)
+
+	movdqu	160(%rdi), %xmm2
+	pxor	160(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(176bytesin256)
+
+	movdqu	176(%rdi), %xmm2
+	pxor	176(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(192bytesin256)
+
+	movdqu	192(%rdi), %xmm2
+	pxor	192(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(208bytesin256)
+
+	movdqu	208(%rdi), %xmm2
+	pxor	208(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(224bytesin256)
+
+	movdqu	224(%rdi), %xmm2
+	pxor	224(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(240bytesin256)
+
+	movdqu	240(%rdi), %xmm2
+	pxor	240(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(256bytesin256)
+
+	add	$256, %rsi
+	add	$256, %rdi
+
+	cmp	$128, %rdx
+	jae	L(less256bytes)
+
+	cmp	$64, %rdx
+	jae	L(less128bytes)
+
+	cmp	$32, %rdx
+	jb	L(less32bytesin256)
+
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin256):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+	ALIGN (4)
+L(512bytesormore):
+#ifdef DATA_CACHE_SIZE_HALF
+	mov	$DATA_CACHE_SIZE_HALF, %r8
+#else
+	mov	__x86_64_data_cache_size_half(%rip), %r8
+#endif
+	mov	%r8, %r9
+	shr	$1, %r8
+	add	%r9, %r8
+	cmp	%r8, %rdx
+	ja	L(L2_L3_cache_unaglined)
+	sub	$64, %rdx
+	ALIGN (4)
+L(64bytesormore_loop):
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	movdqa	%xmm2, %xmm1
+
+	movdqu	16(%rdi), %xmm3
+	pxor	16(%rsi), %xmm3
+	por	%xmm3, %xmm1
+
+	movdqu	32(%rdi), %xmm4
+	pxor	32(%rsi), %xmm4
+	por	%xmm4, %xmm1
+
+	movdqu	48(%rdi), %xmm5
+	pxor	48(%rsi), %xmm5
+	por	%xmm5, %xmm1
+
+	ptest	%xmm1, %xmm0
+	jnc	L(64bytesormore_loop_end)
+	add	$64, %rsi
+	add	$64, %rdi
+	sub	$64, %rdx
+	jae	L(64bytesormore_loop)
+
+	add	$64, %rdx
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(L2_L3_cache_unaglined):
+	sub	$64, %rdx
+	ALIGN (4)
+L(L2_L3_unaligned_128bytes_loop):
+	prefetchnta 0x1c0(%rdi)
+	prefetchnta 0x1c0(%rsi)
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	movdqa	%xmm2, %xmm1
+
+	movdqu	16(%rdi), %xmm3
+	pxor	16(%rsi), %xmm3
+	por	%xmm3, %xmm1
+
+	movdqu	32(%rdi), %xmm4
+	pxor	32(%rsi), %xmm4
+	por	%xmm4, %xmm1
+
+	movdqu	48(%rdi), %xmm5
+	pxor	48(%rsi), %xmm5
+	por	%xmm5, %xmm1
+
+	ptest	%xmm1, %xmm0
+	jnc	L(64bytesormore_loop_end)
+	add	$64, %rsi
+	add	$64, %rdi
+	sub	$64, %rdx
+	jae	L(L2_L3_unaligned_128bytes_loop)
+
+	add	$64, %rdx
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+/*
+ * This case is for machines which are sensitive to unaligned instructions.
+ */
+	ALIGN (4)
+L(2aligned):
+	cmp	$128, %rdx
+	ja	L(128bytesormorein2aligned)
+L(less128bytesin2aligned):
+	sub	$64, %rdx
+
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqa	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqa	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqa	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+	cmp	$32, %rdx
+	jb	L(less32bytesin64in2alinged)
+
+	movdqa	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqa	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin64in2alinged):
+	add	$64, %rdi
+	add	$64, %rsi
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+	ALIGN (4)
+L(128bytesormorein2aligned):
+	cmp	$512, %rdx
+	ja	L(512bytesormorein2aligned)
+	cmp	$256, %rdx
+	ja	L(256bytesormorein2aligned)
+L(less256bytesin2alinged):
+	sub	$128, %rdx
+
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqa	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqa	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqa	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+
+	movdqa	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqa	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+
+	movdqa	96(%rdi), %xmm2
+	pxor	96(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(112bytesin256)
+
+	movdqa	112(%rdi), %xmm2
+	pxor	112(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(128bytesin256)
+
+	add	$128, %rsi
+	add	$128, %rdi
+
+	cmp	$64, %rdx
+	jae	L(less128bytesin2aligned)
+
+	cmp	$32, %rdx
+	jb	L(less32bytesin128in2aligned)
+
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin128in2aligned):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+	ALIGN (4)
+L(256bytesormorein2aligned):
+
+	sub	$256, %rdx
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqa	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqa	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqa	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+
+	movdqa	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqa	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+
+	movdqa	96(%rdi), %xmm2
+	pxor	96(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(112bytesin256)
+
+	movdqa	112(%rdi), %xmm2
+	pxor	112(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(128bytesin256)
+
+	movdqa	128(%rdi), %xmm2
+	pxor	128(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(144bytesin256)
+
+	movdqa	144(%rdi), %xmm2
+	pxor	144(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(160bytesin256)
+
+	movdqa	160(%rdi), %xmm2
+	pxor	160(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(176bytesin256)
+
+	movdqa	176(%rdi), %xmm2
+	pxor	176(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(192bytesin256)
+
+	movdqa	192(%rdi), %xmm2
+	pxor	192(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(208bytesin256)
+
+	movdqa	208(%rdi), %xmm2
+	pxor	208(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(224bytesin256)
+
+	movdqa	224(%rdi), %xmm2
+	pxor	224(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(240bytesin256)
+
+	movdqa	240(%rdi), %xmm2
+	pxor	240(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(256bytesin256)
+
+	add	$256, %rsi
+	add	$256, %rdi
+
+	cmp	$128, %rdx
+	jae	L(less256bytesin2alinged)
+
+	cmp	$64, %rdx
+	jae	L(less128bytesin2aligned)
+
+	cmp	$32, %rdx
+	jb	L(less32bytesin256in2alinged)
+
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqa	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin256in2alinged):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+	ALIGN (4)
+L(512bytesormorein2aligned):
+#ifdef DATA_CACHE_SIZE_HALF
+	mov	$DATA_CACHE_SIZE_HALF, %r8
+#else
+	mov	__x86_64_data_cache_size_half(%rip), %r8
+#endif
+	mov	%r8, %r9
+	shr	$1, %r8
+	add	%r9, %r8
+	cmp	%r8, %rdx
+	ja	L(L2_L3_cache_aglined)
+
+	sub	$64, %rdx
+	ALIGN (4)
+L(64bytesormore_loopin2aligned):
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	movdqa	%xmm2, %xmm1
+
+	movdqa	16(%rdi), %xmm3
+	pxor	16(%rsi), %xmm3
+	por	%xmm3, %xmm1
+
+	movdqa	32(%rdi), %xmm4
+	pxor	32(%rsi), %xmm4
+	por	%xmm4, %xmm1
+
+	movdqa	48(%rdi), %xmm5
+	pxor	48(%rsi), %xmm5
+	por	%xmm5, %xmm1
+
+	ptest	%xmm1, %xmm0
+	jnc	L(64bytesormore_loop_end)
+	add	$64, %rsi
+	add	$64, %rdi
+	sub	$64, %rdx
+	jae	L(64bytesormore_loopin2aligned)
+
+	add	$64, %rdx
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+L(L2_L3_cache_aglined):
+	sub	$64, %rdx
+	ALIGN (4)
+L(L2_L3_aligned_128bytes_loop):
+	prefetchnta 0x1c0(%rdi)
+	prefetchnta 0x1c0(%rsi)
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	movdqa	%xmm2, %xmm1
+
+	movdqa	16(%rdi), %xmm3
+	pxor	16(%rsi), %xmm3
+	por	%xmm3, %xmm1
+
+	movdqa	32(%rdi), %xmm4
+	pxor	32(%rsi), %xmm4
+	por	%xmm4, %xmm1
+
+	movdqa	48(%rdi), %xmm5
+	pxor	48(%rsi), %xmm5
+	por	%xmm5, %xmm1
+
+	ptest	%xmm1, %xmm0
+	jnc	L(64bytesormore_loop_end)
+	add	$64, %rsi
+	add	$64, %rdi
+	sub	$64, %rdx
+	jae	L(L2_L3_aligned_128bytes_loop)
+
+	add	$64, %rdx
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+
+	ALIGN (4)
+L(64bytesormore_loop_end):
+	add	$16, %rdi
+	add	$16, %rsi
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytes)
+
+	add	$16, %rdi
+	add	$16, %rsi
+	ptest	%xmm3, %xmm0
+	jnc	L(16bytes)
+
+	add	$16, %rdi
+	add	$16, %rsi
+	ptest	%xmm4, %xmm0
+	jnc	L(16bytes)
+
+	add	$16, %rdi
+	add	$16, %rsi
+	jmp	L(16bytes)
+
+L(256bytesin256):
+	add	$256, %rdi
+	add	$256, %rsi
+	jmp	L(16bytes)
+L(240bytesin256):
+	add	$240, %rdi
+	add	$240, %rsi
+	jmp	L(16bytes)
+L(224bytesin256):
+	add	$224, %rdi
+	add	$224, %rsi
+	jmp	L(16bytes)
+L(208bytesin256):
+	add	$208, %rdi
+	add	$208, %rsi
+	jmp	L(16bytes)
+L(192bytesin256):
+	add	$192, %rdi
+	add	$192, %rsi
+	jmp	L(16bytes)
+L(176bytesin256):
+	add	$176, %rdi
+	add	$176, %rsi
+	jmp	L(16bytes)
+L(160bytesin256):
+	add	$160, %rdi
+	add	$160, %rsi
+	jmp	L(16bytes)
+L(144bytesin256):
+	add	$144, %rdi
+	add	$144, %rsi
+	jmp	L(16bytes)
+L(128bytesin256):
+	add	$128, %rdi
+	add	$128, %rsi
+	jmp	L(16bytes)
+L(112bytesin256):
+	add	$112, %rdi
+	add	$112, %rsi
+	jmp	L(16bytes)
+L(96bytesin256):
+	add	$96, %rdi
+	add	$96, %rsi
+	jmp	L(16bytes)
+L(80bytesin256):
+	add	$80, %rdi
+	add	$80, %rsi
+	jmp	L(16bytes)
+L(64bytesin256):
+	add	$64, %rdi
+	add	$64, %rsi
+	jmp	L(16bytes)
+L(48bytesin256):
+	add	$16, %rdi
+	add	$16, %rsi
+L(32bytesin256):
+	add	$16, %rdi
+	add	$16, %rsi
+L(16bytesin256):
+	add	$16, %rdi
+	add	$16, %rsi
+L(16bytes):
+	mov	-16(%rdi), %rax
+	mov	-16(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(8bytes):
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(12bytes):
+	mov	-12(%rdi), %rax
+	mov	-12(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(4bytes):
+	mov	-4(%rsi), %ecx
+	mov	-4(%rdi), %eax
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+L(0bytes):
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(65bytes):
+	movdqu	-65(%rdi), %xmm1
+	movdqu	-65(%rsi), %xmm2
+	mov	$-65, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(49bytes):
+	movdqu	-49(%rdi), %xmm1
+	movdqu	-49(%rsi), %xmm2
+	mov	$-49, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(33bytes):
+	movdqu	-33(%rdi), %xmm1
+	movdqu	-33(%rsi), %xmm2
+	mov	$-33, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(17bytes):
+	mov	-17(%rdi), %rax
+	mov	-17(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(9bytes):
+	mov	-9(%rdi), %rax
+	mov	-9(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	movzbl	-1(%rdi), %eax
+	movzbl	-1(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	ALIGN (4)
+L(13bytes):
+	mov	-13(%rdi), %rax
+	mov	-13(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(5bytes):
+	mov	-5(%rdi), %eax
+	mov	-5(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	movzbl	-1(%rdi), %eax
+	movzbl	-1(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	ALIGN (4)
+L(66bytes):
+	movdqu	-66(%rdi), %xmm1
+	movdqu	-66(%rsi), %xmm2
+	mov	$-66, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(50bytes):
+	movdqu	-50(%rdi), %xmm1
+	movdqu	-50(%rsi), %xmm2
+	mov	$-50, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(34bytes):
+	movdqu	-34(%rdi), %xmm1
+	movdqu	-34(%rsi), %xmm2
+	mov	$-34, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(18bytes):
+	mov	-18(%rdi), %rax
+	mov	-18(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(10bytes):
+	mov	-10(%rdi), %rax
+	mov	-10(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	movzwl	-2(%rdi), %eax
+	movzwl	-2(%rsi), %ecx
+	cmp	%cl, %al
+	jne	L(end)
+	and	$0xffff, %eax
+	and	$0xffff, %ecx
+	sub	%ecx, %eax
+	ret
+
+	ALIGN (4)
+L(14bytes):
+	mov	-14(%rdi), %rax
+	mov	-14(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(6bytes):
+	mov	-6(%rdi), %eax
+	mov	-6(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+L(2bytes):
+	movzwl	-2(%rsi), %ecx
+	movzwl	-2(%rdi), %eax
+	cmp	%cl, %al
+	jne	L(end)
+	and	$0xffff, %eax
+	and	$0xffff, %ecx
+	sub	%ecx, %eax
+	ret
+
+	ALIGN (4)
+L(67bytes):
+	movdqu	-67(%rdi), %xmm2
+	movdqu	-67(%rsi), %xmm1
+	mov	$-67, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(51bytes):
+	movdqu	-51(%rdi), %xmm2
+	movdqu	-51(%rsi), %xmm1
+	mov	$-51, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(35bytes):
+	movdqu	-35(%rsi), %xmm1
+	movdqu	-35(%rdi), %xmm2
+	mov	$-35, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(19bytes):
+	mov	-19(%rdi), %rax
+	mov	-19(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(11bytes):
+	mov	-11(%rdi), %rax
+	mov	-11(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-4(%rdi), %eax
+	mov	-4(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(15bytes):
+	mov	-15(%rdi), %rax
+	mov	-15(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(7bytes):
+	mov	-7(%rdi), %eax
+	mov	-7(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	mov	-4(%rdi), %eax
+	mov	-4(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(3bytes):
+	movzwl	-3(%rdi), %eax
+	movzwl	-3(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin2bytes)
+L(1bytes):
+	movzbl	-1(%rdi), %eax
+	movzbl	-1(%rsi), %ecx
+	sub	%ecx, %eax
+	ret
+
+	ALIGN (4)
+L(68bytes):
+	movdqu	-68(%rdi), %xmm2
+	movdqu	-68(%rsi), %xmm1
+	mov	$-68, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(52bytes):
+	movdqu	-52(%rdi), %xmm2
+	movdqu	-52(%rsi), %xmm1
+	mov	$-52, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(36bytes):
+	movdqu	-36(%rdi), %xmm2
+	movdqu	-36(%rsi), %xmm1
+	mov	$-36, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(20bytes):
+	movdqu	-20(%rdi), %xmm2
+	movdqu	-20(%rsi), %xmm1
+	mov	$-20, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-4(%rdi), %eax
+	mov	-4(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(69bytes):
+	movdqu	-69(%rsi), %xmm1
+	movdqu	-69(%rdi), %xmm2
+	mov	$-69, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(53bytes):
+	movdqu	-53(%rsi), %xmm1
+	movdqu	-53(%rdi), %xmm2
+	mov	$-53, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(37bytes):
+	movdqu	-37(%rsi), %xmm1
+	movdqu	-37(%rdi), %xmm2
+	mov	$-37, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(21bytes):
+	movdqu	-21(%rsi), %xmm1
+	movdqu	-21(%rdi), %xmm2
+	mov	$-21, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(70bytes):
+	movdqu	-70(%rsi), %xmm1
+	movdqu	-70(%rdi), %xmm2
+	mov	$-70, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(54bytes):
+	movdqu	-54(%rsi), %xmm1
+	movdqu	-54(%rdi), %xmm2
+	mov	$-54, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(38bytes):
+	movdqu	-38(%rsi), %xmm1
+	movdqu	-38(%rdi), %xmm2
+	mov	$-38, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(22bytes):
+	movdqu	-22(%rsi), %xmm1
+	movdqu	-22(%rdi), %xmm2
+	mov	$-22, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(71bytes):
+	movdqu	-71(%rsi), %xmm1
+	movdqu	-71(%rdi), %xmm2
+	mov	$-71, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(55bytes):
+	movdqu	-55(%rdi), %xmm2
+	movdqu	-55(%rsi), %xmm1
+	mov	$-55, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(39bytes):
+	movdqu	-39(%rdi), %xmm2
+	movdqu	-39(%rsi), %xmm1
+	mov	$-39, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(23bytes):
+	movdqu	-23(%rdi), %xmm2
+	movdqu	-23(%rsi), %xmm1
+	mov	$-23, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(72bytes):
+	movdqu	-72(%rsi), %xmm1
+	movdqu	-72(%rdi), %xmm2
+	mov	$-72, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(56bytes):
+	movdqu	-56(%rdi), %xmm2
+	movdqu	-56(%rsi), %xmm1
+	mov	$-56, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(40bytes):
+	movdqu	-40(%rdi), %xmm2
+	movdqu	-40(%rsi), %xmm1
+	mov	$-40, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(24bytes):
+	movdqu	-24(%rdi), %xmm2
+	movdqu	-24(%rsi), %xmm1
+	mov	$-24, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(73bytes):
+	movdqu	-73(%rsi), %xmm1
+	movdqu	-73(%rdi), %xmm2
+	mov	$-73, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(57bytes):
+	movdqu	-57(%rdi), %xmm2
+	movdqu	-57(%rsi), %xmm1
+	mov	$-57, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(41bytes):
+	movdqu	-41(%rdi), %xmm2
+	movdqu	-41(%rsi), %xmm1
+	mov	$-41, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(25bytes):
+	movdqu	-25(%rdi), %xmm2
+	movdqu	-25(%rsi), %xmm1
+	mov	$-25, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-9(%rdi), %rax
+	mov	-9(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	movzbl	-1(%rdi), %eax
+	movzbl	-1(%rsi), %ecx
+	sub	%ecx, %eax
+	ret
+
+	ALIGN (4)
+L(74bytes):
+	movdqu	-74(%rsi), %xmm1
+	movdqu	-74(%rdi), %xmm2
+	mov	$-74, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(58bytes):
+	movdqu	-58(%rdi), %xmm2
+	movdqu	-58(%rsi), %xmm1
+	mov	$-58, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(42bytes):
+	movdqu	-42(%rdi), %xmm2
+	movdqu	-42(%rsi), %xmm1
+	mov	$-42, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(26bytes):
+	movdqu	-26(%rdi), %xmm2
+	movdqu	-26(%rsi), %xmm1
+	mov	$-26, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-10(%rdi), %rax
+	mov	-10(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	movzwl	-2(%rdi), %eax
+	movzwl	-2(%rsi), %ecx
+	jmp	L(diffin2bytes)
+
+	ALIGN (4)
+L(75bytes):
+	movdqu	-75(%rsi), %xmm1
+	movdqu	-75(%rdi), %xmm2
+	mov	$-75, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(59bytes):
+	movdqu	-59(%rdi), %xmm2
+	movdqu	-59(%rsi), %xmm1
+	mov	$-59, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(43bytes):
+	movdqu	-43(%rdi), %xmm2
+	movdqu	-43(%rsi), %xmm1
+	mov	$-43, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(27bytes):
+	movdqu	-27(%rdi), %xmm2
+	movdqu	-27(%rsi), %xmm1
+	mov	$-27, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-11(%rdi), %rax
+	mov	-11(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-4(%rdi), %eax
+	mov	-4(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(76bytes):
+	movdqu	-76(%rsi), %xmm1
+	movdqu	-76(%rdi), %xmm2
+	mov	$-76, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(60bytes):
+	movdqu	-60(%rdi), %xmm2
+	movdqu	-60(%rsi), %xmm1
+	mov	$-60, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(44bytes):
+	movdqu	-44(%rdi), %xmm2
+	movdqu	-44(%rsi), %xmm1
+	mov	$-44, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(28bytes):
+	movdqu	-28(%rdi), %xmm2
+	movdqu	-28(%rsi), %xmm1
+	mov	$-28, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-12(%rdi), %rax
+	mov	-12(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-4(%rdi), %eax
+	mov	-4(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(77bytes):
+	movdqu	-77(%rsi), %xmm1
+	movdqu	-77(%rdi), %xmm2
+	mov	$-77, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(61bytes):
+	movdqu	-61(%rdi), %xmm2
+	movdqu	-61(%rsi), %xmm1
+	mov	$-61, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(45bytes):
+	movdqu	-45(%rdi), %xmm2
+	movdqu	-45(%rsi), %xmm1
+	mov	$-45, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(29bytes):
+	movdqu	-29(%rdi), %xmm2
+	movdqu	-29(%rsi), %xmm1
+	mov	$-29, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-13(%rdi), %rax
+	mov	-13(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(78bytes):
+	movdqu	-78(%rsi), %xmm1
+	movdqu	-78(%rdi), %xmm2
+	mov	$-78, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(62bytes):
+	movdqu	-62(%rdi), %xmm2
+	movdqu	-62(%rsi), %xmm1
+	mov	$-62, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(46bytes):
+	movdqu	-46(%rdi), %xmm2
+	movdqu	-46(%rsi), %xmm1
+	mov	$-46, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(30bytes):
+	movdqu	-30(%rdi), %xmm2
+	movdqu	-30(%rsi), %xmm1
+	mov	$-30, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-14(%rdi), %rax
+	mov	-14(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(79bytes):
+	movdqu	-79(%rsi), %xmm1
+	movdqu	-79(%rdi), %xmm2
+	mov	$-79, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(63bytes):
+	movdqu	-63(%rdi), %xmm2
+	movdqu	-63(%rsi), %xmm1
+	mov	$-63, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(47bytes):
+	movdqu	-47(%rdi), %xmm2
+	movdqu	-47(%rsi), %xmm1
+	mov	$-47, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(31bytes):
+	movdqu	-31(%rdi), %xmm2
+	movdqu	-31(%rsi), %xmm1
+	mov	$-31, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-15(%rdi), %rax
+	mov	-15(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(64bytes):
+	movdqu	-64(%rdi), %xmm2
+	movdqu	-64(%rsi), %xmm1
+	mov	$-64, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(48bytes):
+	movdqu	-48(%rdi), %xmm2
+	movdqu	-48(%rsi), %xmm1
+	mov	$-48, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(32bytes):
+	movdqu	-32(%rdi), %xmm2
+	movdqu	-32(%rsi), %xmm1
+	mov	$-32, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-16(%rdi), %rax
+	mov	-16(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+/*
+ * Aligned to 8 bytes to avoid 2 "taken" branches in one 16-byte aligned code block.
+ */
+	ALIGN (3)
+L(less16bytes):
+	movsbq	%dl, %rdx
+	mov	(%rsi, %rdx), %rcx
+	mov	(%rdi, %rdx), %rax
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	8(%rsi, %rdx), %rcx
+	mov	8(%rdi, %rdx), %rax
+L(diffin8bytes):
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	shr	$32, %rcx
+	shr	$32, %rax
+L(diffin4bytes):
+	cmp	%cx, %ax
+	jne	L(diffin2bytes)
+	shr	$16, %ecx
+	shr	$16, %eax
+L(diffin2bytes):
+	cmp	%cl, %al
+	jne	L(end)
+	and	$0xffff, %eax
+	and	$0xffff, %ecx
+	sub	%ecx, %eax
+	ret
+
+	ALIGN (4)
+L(end):
+	and	$0xff, %eax
+	and	$0xff, %ecx
+	sub	%ecx, %eax
+	ret
+
+END (MEMCMP)
+
+	.section .rodata.sse4.1,"a",@progbits
+	ALIGN (3)
+L(table_64bytes):
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(1bytes), L(table_64bytes))
+	.int	JMPTBL (L(2bytes), L(table_64bytes))
+	.int	JMPTBL (L(3bytes), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(5bytes), L(table_64bytes))
+	.int	JMPTBL (L(6bytes), L(table_64bytes))
+	.int	JMPTBL (L(7bytes), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(9bytes), L(table_64bytes))
+	.int	JMPTBL (L(10bytes), L(table_64bytes))
+	.int	JMPTBL (L(11bytes), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(13bytes), L(table_64bytes))
+	.int	JMPTBL (L(14bytes), L(table_64bytes))
+	.int	JMPTBL (L(15bytes), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(17bytes), L(table_64bytes))
+	.int	JMPTBL (L(18bytes), L(table_64bytes))
+	.int	JMPTBL (L(19bytes), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(21bytes), L(table_64bytes))
+	.int	JMPTBL (L(22bytes), L(table_64bytes))
+	.int	JMPTBL (L(23bytes), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(25bytes), L(table_64bytes))
+	.int	JMPTBL (L(26bytes), L(table_64bytes))
+	.int	JMPTBL (L(27bytes), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(29bytes), L(table_64bytes))
+	.int	JMPTBL (L(30bytes), L(table_64bytes))
+	.int	JMPTBL (L(31bytes), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(33bytes), L(table_64bytes))
+	.int	JMPTBL (L(34bytes), L(table_64bytes))
+	.int	JMPTBL (L(35bytes), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(37bytes), L(table_64bytes))
+	.int	JMPTBL (L(38bytes), L(table_64bytes))
+	.int	JMPTBL (L(39bytes), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(41bytes), L(table_64bytes))
+	.int	JMPTBL (L(42bytes), L(table_64bytes))
+	.int	JMPTBL (L(43bytes), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(45bytes), L(table_64bytes))
+	.int	JMPTBL (L(46bytes), L(table_64bytes))
+	.int	JMPTBL (L(47bytes), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(49bytes), L(table_64bytes))
+	.int	JMPTBL (L(50bytes), L(table_64bytes))
+	.int	JMPTBL (L(51bytes), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(53bytes), L(table_64bytes))
+	.int	JMPTBL (L(54bytes), L(table_64bytes))
+	.int	JMPTBL (L(55bytes), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(57bytes), L(table_64bytes))
+	.int	JMPTBL (L(58bytes), L(table_64bytes))
+	.int	JMPTBL (L(59bytes), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(61bytes), L(table_64bytes))
+	.int	JMPTBL (L(62bytes), L(table_64bytes))
+	.int	JMPTBL (L(63bytes), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+	.int	JMPTBL (L(65bytes), L(table_64bytes))
+	.int	JMPTBL (L(66bytes), L(table_64bytes))
+	.int	JMPTBL (L(67bytes), L(table_64bytes))
+	.int	JMPTBL (L(68bytes), L(table_64bytes))
+	.int	JMPTBL (L(69bytes), L(table_64bytes))
+	.int	JMPTBL (L(70bytes), L(table_64bytes))
+	.int	JMPTBL (L(71bytes), L(table_64bytes))
+	.int	JMPTBL (L(72bytes), L(table_64bytes))
+	.int	JMPTBL (L(73bytes), L(table_64bytes))
+	.int	JMPTBL (L(74bytes), L(table_64bytes))
+	.int	JMPTBL (L(75bytes), L(table_64bytes))
+	.int	JMPTBL (L(76bytes), L(table_64bytes))
+	.int	JMPTBL (L(77bytes), L(table_64bytes))
+	.int	JMPTBL (L(78bytes), L(table_64bytes))
+	.int	JMPTBL (L(79bytes), L(table_64bytes))
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
new file mode 100644
index 0000000000..301ab287f5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -0,0 +1,59 @@
+/* Multiple versions of memcmp
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+	.text
+ENTRY(memcmp)
+	.type	memcmp, @gnu_indirect_function	/* IFUNC resolver: runs once at relocation time, returns the chosen implementation in %rax.  */
+	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)	/* CPU feature detection already done?  */
+	jne	1f
+	call	__init_cpu_features	/* No: fill in __cpu_features first.  */
+1:	leaq	__memcmp_sse2(%rip), %rax	/* Default choice: the SSE2 version.  */
+	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)	/* SSE4.1 supported by this CPU?  */
+	jz	2f
+	leaq	__memcmp_sse4_1(%rip), %rax	/* Yes: prefer the SSE4.1 version.  */
+2:	ret	/* %rax holds the selected memcmp entry point.  */
+END(memcmp)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memcmp_sse2, @function; \
+	.p2align 4; \
+	__memcmp_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memcmp; __GI_memcmp = __memcmp_sse2
+# endif
+#endif
+
+#include "../memcmp.S"
diff --git a/sysdeps/x86_64/multiarch/rtld-memcmp.c b/sysdeps/x86_64/multiarch/rtld-memcmp.c
new file mode 100644
index 0000000000..0f271356c2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-memcmp.c
@@ -0,0 +1 @@
+#include "../rtld-memcmp.c"