summary refs log tree commit diff
diff options
context:
space:
mode:
authorUlrich Drepper <drepper@redhat.com>2009-03-13 23:53:18 +0000
committerUlrich Drepper <drepper@redhat.com>2009-03-13 23:53:18 +0000
commit425ce2edb9d11cc1ff650fac16dfbc450241896a (patch)
treeb3bd9971ea82766a895ab549ff194d97bcc9d51e
parente7f110cdbd6e9c5a48b378ba7b30a3ad1dc04314 (diff)
downloadglibc-425ce2edb9d11cc1ff650fac16dfbc450241896a.tar.gz
glibc-425ce2edb9d11cc1ff650fac16dfbc450241896a.tar.xz
glibc-425ce2edb9d11cc1ff650fac16dfbc450241896a.zip
* config.h.in (USE_MULTIARCH): Define.
	* configure.in: Handle --enable-multi-arch.
	* elf/dl-runtime.c (_dl_fixup): Handle STT_GNU_IFUNC.
	(_dl_fixup_profile): Likewise.
	* elf/do-lookup.c (dl_lookup_x): Likewise.
	* sysdeps/x86_64/dl-machine.h: Handle STT_GNU_IFUNC.
	* elf/elf.h (STT_GNU_IFUNC): Define.
	* include/libc-symbols.h (libc_ifunc): Define.
	* sysdeps/x86_64/cacheinfo.c: If USE_MULTIARCH is defined, use the
	framework in init-arch.h to get CPUID values.
	* sysdeps/x86_64/multiarch/Makefile: New file.
	* sysdeps/x86_64/multiarch/init-arch.c: New file.
	* sysdeps/x86_64/multiarch/init-arch.h: New file.
	* sysdeps/x86_64/multiarch/sched_cpucount.c: New file.

	* config.make.in (experimental-malloc): Define.
	* configure.in: Handle --enable-experimental-malloc.
	* malloc/Makefile: Handle experimental-malloc flag.
	* malloc/malloc.c: Implement PER_THREAD and ATOMIC_FASTBINS features.
	* malloc/arena.c: Likewise.
	* malloc/hooks.c: Likewise.
	* malloc/malloc.h: Define M_ARENA_TEST and M_ARENA_MAX.
-rw-r--r--ChangeLog25
-rw-r--r--NEWS4
-rw-r--r--config.h.in3
-rw-r--r--config.make.in3
-rwxr-xr-xconfigure70
-rw-r--r--configure.in43
-rw-r--r--elf/dl-runtime.c14
-rw-r--r--elf/do-lookup.h12
-rw-r--r--elf/elf.h1
-rw-r--r--include/libc-symbols.h14
-rw-r--r--malloc/Makefile5
-rw-r--r--malloc/arena.c203
-rw-r--r--malloc/hooks.c15
-rw-r--r--malloc/malloc.c260
-rw-r--r--malloc/malloc.h4
-rw-r--r--sysdeps/x86_64/cacheinfo.c32
-rw-r--r--sysdeps/x86_64/multiarch/Makefile3
-rw-r--r--sysdeps/x86_64/multiarch/init-arch.c65
-rw-r--r--sysdeps/x86_64/multiarch/init-arch.h70
-rw-r--r--sysdeps/x86_64/multiarch/sched_cpucount.c42
20 files changed, 813 insertions, 75 deletions
diff --git a/ChangeLog b/ChangeLog
index 933e81cb4f..954285ff0f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,28 @@
+2009-03-13  Ulrich Drepper  <drepper@redhat.com>
+
+	* config.h.in (USE_MULTIARCH): Define.
+	* configure.in: Handle --enable-multi-arch.
+	* elf/dl-runtime.c (_dl_fixup): Handle STT_GNU_IFUNC.
+	(_dl_fixup_profile): Likewise.
+	* elf/do-lookup.c (dl_lookup_x): Likewise.
+	* sysdeps/x86_64/dl-machine.h: Handle STT_GNU_IFUNC.
+	* elf/elf.h (STT_GNU_IFUNC): Define.
+	* include/libc-symbols.h (libc_ifunc): Define.
+	* sysdeps/x86_64/cacheinfo.c: If USE_MULTIARCH is defined, use the
+	framework in init-arch.h to get CPUID values.
+	* sysdeps/x86_64/multiarch/Makefile: New file.
+	* sysdeps/x86_64/multiarch/init-arch.c: New file.
+	* sysdeps/x86_64/multiarch/init-arch.h: New file.
+	* sysdeps/x86_64/multiarch/sched_cpucount.c: New file.
+
+	* config.make.in (experimental-malloc): Define.
+	* configure.in: Handle --enable-experimental-malloc.
+	* malloc/Makefile: Handle experimental-malloc flag.
+	* malloc/malloc.c: Implement PER_THREAD and ATOMIC_FASTBINS features.
+	* malloc/arena.c: Likewise.
+	* malloc/hooks.c: Likewise.
+	* malloc/malloc.h: Define M_ARENA_TEST and M_ARENA_MAX.
+
 2009-03-11  Ulrich Drepper  <drepper@redhat.com>
 
 	* sysdeps/x86_64/dl-machine.h (elf_machine_rela): Add branch
diff --git a/NEWS b/NEWS
index 2689db304e..55ba54469e 100644
--- a/NEWS
+++ b/NEWS
@@ -1,4 +1,4 @@
-GNU C Library NEWS -- history of user-visible changes.  2009-3-8
+GNU C Library NEWS -- history of user-visible changes.  2009-3-12
 Copyright (C) 1992-2008, 2009 Free Software Foundation, Inc.
 See the end for copying conditions.
 
@@ -7,7 +7,7 @@ using `glibc' in the "product" field.
 
 Version 2.10
 
-* New Linux interface: accept4
+* New Linux interfaces: accept4, fallocate, fallocate64.
 
 * Correct declarations of string function when used in C++ code.  This
   could lead to compile error for invalid C++ code.
diff --git a/config.h.in b/config.h.in
index b5abb10187..8dbc224a7d 100644
--- a/config.h.in
+++ b/config.h.in
@@ -189,6 +189,9 @@
 /* Define if __stack_chk_guard canary should be randomized at program startup.  */
 #undef ENABLE_STACKGUARD_RANDOMIZE
 
+/* Define if multi-arch DSOs should be generated.  */
+#undef USE_MULTIARCH
+
 /*
  */
 
diff --git a/config.make.in b/config.make.in
index aa73466713..6da6362f24 100644
--- a/config.make.in
+++ b/config.make.in
@@ -70,6 +70,7 @@ versioning = @VERSIONING@
 oldest-abi = @oldest_abi@
 no-whole-archive = @no_whole_archive@
 exceptions = @exceptions@
+multi-arch = @multi_arch@
 
 mach-interface-list = @mach_interface_list@
 
@@ -78,6 +79,8 @@ have-ksh = @libc_cv_have_ksh@
 
 sizeof-long-double = @sizeof_long_double@
 
+experimental-malloc = @experimental_malloc@
+
 # Configuration options.
 build-static = @static@
 build-shared = @shared@
diff --git a/configure b/configure
index c5c6dc4789..69af8525be 100755
--- a/configure
+++ b/configure
@@ -660,6 +660,8 @@ oldest_abi
 bindnow
 force_install
 all_warnings
+multi_arch
+experimental_malloc
 build
 build_cpu
 build_vendor
@@ -1380,6 +1382,10 @@ Optional Features:
   --enable-kernel=VERSION compile for compatibility with kernel not older than
                           VERSION
   --enable-all-warnings   enable all useful warnings gcc can issue
+  --enable-multi-arch     enable single DSO with optimizations for multiple
+                          architectures
+  --enable-experimental-malloc
+                          enable experimental malloc features
 
 Optional Packages:
   --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
@@ -2173,6 +2179,29 @@ fi
 
 
 
+# Check whether --enable-multi-arch was given.
+if test "${enable_multi_arch+set}" = set; then
+  enableval=$enable_multi_arch; multi_arch=$enableval
+else
+  multi_arch=no
+fi
+
+if test x"$multi_arch" = xyes; then
+  cat >>confdefs.h <<\_ACEOF
+#define USE_MULTIARCH 1
+_ACEOF
+
+  multi_arch_d=/multiarch
+fi
+
+
+# Check whether --enable-experimental-malloc was given.
+if test "${enable_experimental_malloc+set}" = set; then
+  enableval=$enable_experimental_malloc; experimental_malloc=$enableval
+fi
+
+
+
 # Make sure we can run config.sub.
 $SHELL "$ac_aux_dir/config.sub" sun4 >/dev/null 2>&1 ||
   { { echo "$as_me:$LINENO: error: cannot run $SHELL $ac_aux_dir/config.sub" >&5
@@ -2627,7 +2656,7 @@ for b in $base ''; do
       test "$v" = / && continue
       for o in /$ostry ''; do
 	test "$o" = / && continue
-	for m in $mach ''; do
+	for m in $multi_arch_d $mach ''; do
 	  for d in $add_ons_pfx ''; do
 	    for a in $add_ons_sfx ''; do
 	      if test -n "$m0$m0sub$b$v$o$m$msub"; then
@@ -5684,6 +5713,37 @@ _ACEOF
 
 fi
 
+# For the multi-arch option we need support in the assembler.
+if test "$multi_arch" = yes; then
+  if test "x$libc_cv_asm_type_prefix" != xno; then
+{ echo "$as_me:$LINENO: checking for assembler gnu_indirect_function symbol type support" >&5
+echo $ECHO_N "checking for assembler gnu_indirect_function symbol type support... $ECHO_C" >&6; }
+if test "${libc_cv_asm_gnu_indirect_function+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  cat > conftest.s <<EOF
+.type foo,%gnu_indirect_function
+EOF
+if ${CC-cc} -c $ASFLAGS conftest.s 1>&5 2>&5;
+then
+  libc_cv_asm_gnu_indirect_function=yes
+else
+  libc_cv_asm_gnu_indirect_function=no
+fi
+rm -f conftest*
+fi
+{ echo "$as_me:$LINENO: result: $libc_cv_asm_gnu_indirect_function" >&5
+echo "${ECHO_T}$libc_cv_asm_gnu_indirect_function" >&6; }
+  else
+    libc_cv_asm_gnu_indirect_function=no
+  fi
+  if test x"$libc_cv_asm_gnu_indirect_function" != xyes; then
+    { { echo "$as_me:$LINENO: error: --enable-multi-arch support requires assembler and linker support" >&5
+echo "$as_me: error: --enable-multi-arch support requires assembler and linker support" >&2;}
+   { (exit 1); exit 1; }; }
+  fi
+fi
+
 { echo "$as_me:$LINENO: checking for .symver assembler directive" >&5
 echo $ECHO_N "checking for .symver assembler directive... $ECHO_C" >&6; }
 if test "${libc_cv_asm_symver_directive+set}" = set; then
@@ -9184,6 +9244,8 @@ oldest_abi!$oldest_abi$ac_delim
 bindnow!$bindnow$ac_delim
 force_install!$force_install$ac_delim
 all_warnings!$all_warnings$ac_delim
+multi_arch!$multi_arch$ac_delim
+experimental_malloc!$experimental_malloc$ac_delim
 build!$build$ac_delim
 build_cpu!$build_cpu$ac_delim
 build_vendor!$build_vendor$ac_delim
@@ -9235,8 +9297,6 @@ libc_cv_have_bash2!$libc_cv_have_bash2$ac_delim
 KSH!$KSH$ac_delim
 libc_cv_have_ksh!$libc_cv_have_ksh$ac_delim
 AWK!$AWK$ac_delim
-PERL!$PERL$ac_delim
-INSTALL_INFO!$INSTALL_INFO$ac_delim
 _ACEOF
 
   if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 97; then
@@ -9278,6 +9338,8 @@ _ACEOF
 ac_delim='%!_!# '
 for ac_last_try in false false false false false :; do
   cat >conf$$subs.sed <<_ACEOF
+PERL!$PERL$ac_delim
+INSTALL_INFO!$INSTALL_INFO$ac_delim
 BISON!$BISON$ac_delim
 VERSIONING!$VERSIONING$ac_delim
 libc_cv_cc_with_libunwind!$libc_cv_cc_with_libunwind$ac_delim
@@ -9334,7 +9396,7 @@ LIBOBJS!$LIBOBJS$ac_delim
 LTLIBOBJS!$LTLIBOBJS$ac_delim
 _ACEOF
 
-  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 54; then
+  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 56; then
     break
   elif $ac_last_try; then
     { { echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
diff --git a/configure.in b/configure.in
index d7b22f3a3b..4015722d5a 100644
--- a/configure.in
+++ b/configure.in
@@ -254,6 +254,24 @@ AC_ARG_ENABLE([all-warnings],
 	      [])
 AC_SUBST(all_warnings)
 
+AC_ARG_ENABLE([multi-arch],
+	      AC_HELP_STRING([--enable-multi-arch],
+			     [enable single DSO with optimizations for multiple architectures]),
+	      [multi_arch=$enableval],
+	      [multi_arch=no])
+if test x"$multi_arch" = xyes; then
+  AC_DEFINE(USE_MULTIARCH)
+  multi_arch_d=/multiarch
+fi
+AC_SUBST(multi_arch)
+
+AC_ARG_ENABLE([experimental-malloc],
+	      AC_HELP_STRING([--enable-experimental-malloc],
+			     [enable experimental malloc features]),
+	      [experimental_malloc=$enableval],
+	      [])
+AC_SUBST(experimental_malloc)
+
 AC_CANONICAL_HOST
 
 # The way shlib-versions is used to generate soversions.mk uses a
@@ -608,7 +626,7 @@ for b in $base ''; do
       test "$v" = / && continue
       for o in /$ostry ''; do
 	test "$o" = / && continue
-	for m in $mach ''; do
+	for m in $multi_arch_d $mach ''; do
 	  for d in $add_ons_pfx ''; do
 	    for a in $add_ons_sfx ''; do
 	      if test -n "$m0$m0sub$b$v$o$m$msub"; then
@@ -1157,6 +1175,29 @@ if test "x$libc_cv_asm_type_prefix" != xno; then
   AC_DEFINE_UNQUOTED(ASM_TYPE_DIRECTIVE_PREFIX, ${libc_cv_asm_type_prefix})
 fi
 
+# For the multi-arch option we need support in the assembler.
+if test "$multi_arch" = yes; then
+  if test "x$libc_cv_asm_type_prefix" != xno; then
+AC_CACHE_CHECK([for assembler gnu_indirect_function symbol type support],
+               libc_cv_asm_gnu_indirect_function, [dnl
+cat > conftest.s <<EOF
+.type foo,%gnu_indirect_function
+EOF
+if ${CC-cc} -c $ASFLAGS conftest.s 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD;
+then
+  libc_cv_asm_gnu_indirect_function=yes
+else
+  libc_cv_asm_gnu_indirect_function=no
+fi
+rm -f conftest*])
+  else
+    libc_cv_asm_gnu_indirect_function=no
+  fi
+  if test x"$libc_cv_asm_gnu_indirect_function" != xyes; then
+    AC_MSG_ERROR([--enable-multi-arch support requires assembler and linker support])
+  fi
+fi
+
 AC_CACHE_CHECK(for .symver assembler directive, libc_cv_asm_symver_directive,
 [cat > conftest.s <<EOF
 ${libc_cv_dot_text}
diff --git a/elf/dl-runtime.c b/elf/dl-runtime.c
index 968e293409..962f47de64 100644
--- a/elf/dl-runtime.c
+++ b/elf/dl-runtime.c
@@ -1,5 +1,5 @@
 /* On-demand PLT fixup for shared objects.
-   Copyright (C) 1995-2006, 2007 Free Software Foundation, Inc.
+   Copyright (C) 1995-2006, 2007, 2008 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -130,6 +130,9 @@ _dl_fixup (
   /* And now perhaps the relocation addend.  */
   value = elf_machine_plt_value (l, reloc, value);
 
+  if (__builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0))
+    value = ((DL_FIXUP_VALUE_TYPE (*) (void)) value) ();
+
   /* Finally, fix up the plt itself.  */
   if (__builtin_expect (GLRO(dl_bind_not), 0))
     return value;
@@ -215,12 +218,21 @@ _dl_profile_fixup (
 				       defsym != NULL
 				       ? LOOKUP_VALUE_ADDRESS (result)
 					 + defsym->st_value : 0);
+
+	  if (__builtin_expect (ELFW(ST_TYPE) (defsym->st_info)
+				== STT_GNU_IFUNC, 0))
+	    value = ((DL_FIXUP_VALUE_TYPE (*) (void)) value) ();
 	}
       else
 	{
 	  /* We already found the symbol.  The module (and therefore its load
 	     address) is also known.  */
 	  value = DL_FIXUP_MAKE_VALUE (l, l->l_addr + refsym->st_value);
+
+	  if (__builtin_expect (ELFW(ST_TYPE) (refsym->st_info)
+				== STT_GNU_IFUNC, 0))
+	    value = ((DL_FIXUP_VALUE_TYPE (*) (void)) value) ();
+
 	  result = l;
 	}
       /* And now perhaps the relocation addend.  */
diff --git a/elf/do-lookup.h b/elf/do-lookup.h
index ebb9ed5b47..41e5fc137c 100644
--- a/elf/do-lookup.h
+++ b/elf/do-lookup.h
@@ -1,5 +1,5 @@
 /* Look up a symbol in the loaded objects.
-   Copyright (C) 1995-2004, 2005, 2006, 2007 Free Software Foundation, Inc.
+   Copyright (C) 1995-2007, 2008 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -88,10 +88,12 @@ do_lookup_x (const char *undef_name, uint_fast32_t new_hash,
 
 	if (__builtin_expect (ELFW(ST_TYPE) (sym->st_info) > STT_FUNC
 			      && ELFW(ST_TYPE) (sym->st_info) != STT_COMMON
-			      && ELFW(ST_TYPE) (sym->st_info) != STT_TLS, 0))
-	  /* Ignore all but STT_NOTYPE, STT_OBJECT, STT_FUNC, and STT_COMMON
-	     entries (and STT_TLS if TLS is supported) since these
-	     are no code/data definitions.  */
+			      && ELFW(ST_TYPE) (sym->st_info) != STT_TLS
+			      && ELFW(ST_TYPE) (sym->st_info) != STT_GNU_IFUNC,
+			      0))
+	  /* Ignore all but STT_NOTYPE, STT_OBJECT, STT_FUNC, STT_COMMON,
+	     STT_TLS, and STT_GNU_IFUNC since these are no code/data
+	     definitions.  */
 	  return NULL;
 
 	if (sym != ref && strcmp (strtab + sym->st_name, undef_name))
diff --git a/elf/elf.h b/elf/elf.h
index 2792820c35..cd74d510ee 100644
--- a/elf/elf.h
+++ b/elf/elf.h
@@ -459,6 +459,7 @@ typedef struct
 #define STT_TLS		6		/* Symbol is thread-local data object*/
 #define	STT_NUM		7		/* Number of defined types.  */
 #define STT_LOOS	10		/* Start of OS-specific */
+#define STT_GNU_IFUNC	10		/* Symbol is indirect code object */
 #define STT_HIOS	12		/* End of OS-specific */
 #define STT_LOPROC	13		/* Start of processor-specific */
 #define STT_HIPROC	15		/* End of processor-specific */
diff --git a/include/libc-symbols.h b/include/libc-symbols.h
index a2faeafb32..d53bcb9b77 100644
--- a/include/libc-symbols.h
+++ b/include/libc-symbols.h
@@ -1,7 +1,6 @@
 /* Support macros for making weak and strong aliases for symbols,
    and for using symbol sets and linker warnings with GNU ld.
-   Copyright (C) 1995-1998,2000-2003,2004,2005,2006
-	Free Software Foundation, Inc.
+   Copyright (C) 1995-1998, 2000-2006, 2008 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -835,4 +834,15 @@ for linking")
 # define compat_data_section .section ".data.compat", "aw";
 #endif
 
+/* Marker used for indirection function symbols.  */
+#define libc_ifunc(name, expr)						\
+  extern void *name##_ifunc (void) __asm__ (#name);			\
+  void *name##_ifunc (void)						\
+  {									\
+    INIT_ARCH ();							\
+    __typeof (name) *res = expr;					\
+    return res;								\
+  }									\
+  __asm__ (".type " #name ", %gnu_indirect_function");
+
 #endif /* libc-symbols.h */
diff --git a/malloc/Makefile b/malloc/Makefile
index 22b14eac77..1099335fff 100644
--- a/malloc/Makefile
+++ b/malloc/Makefile
@@ -1,4 +1,4 @@
-# Copyright (C) 1991-1999, 2000, 2001, 2002, 2003, 2005, 2006, 2007
+# Copyright (C) 1991-1999, 2000, 2001, 2002, 2003, 2005, 2006, 2007, 2009
 # Free Software Foundation, Inc.
 # This file is part of the GNU C Library.
 
@@ -124,6 +124,9 @@ endif
 
 tst-mcheck-ENV = MALLOC_CHECK_=3
 
+ifeq ($(experimental-malloc),yes)
+CPPFLAGS-malloc.c += -DPER_THREAD -DATOMIC_FASTBINS
+endif
 # Uncomment this for test releases.  For public releases it is too expensive.
 #CPPFLAGS-malloc.o += -DMALLOC_DEBUG=1
 
diff --git a/malloc/arena.c b/malloc/arena.c
index cc03dc4a5b..f280d38811 100644
--- a/malloc/arena.c
+++ b/malloc/arena.c
@@ -1,5 +1,5 @@
 /* Malloc implementation for multiple threads without lock contention.
-   Copyright (C) 2001,2002,2003,2004,2005,2006,2007
+   Copyright (C) 2001,2002,2003,2004,2005,2006,2007,2009
    Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Wolfram Gloger <wg@malloc.de>, 2001.
@@ -78,6 +78,10 @@ extern int sanity_check_heap_info_alignment[(sizeof (heap_info)
 
 static tsd_key_t arena_key;
 static mutex_t list_lock;
+#ifdef PER_THREAD
+static size_t narenas;
+static mstate free_list;
+#endif
 
 #if THREAD_STATS
 static int stat_n_heaps;
@@ -105,13 +109,30 @@ int __malloc_initialized = -1;
    in the new arena. */
 
 #define arena_get(ptr, size) do { \
+  arena_lookup(ptr); \
+  arena_lock(ptr, size); \
+} while(0)
+
+#define arena_lookup(ptr) do { \
   Void_t *vptr = NULL; \
   ptr = (mstate)tsd_getspecific(arena_key, vptr); \
+} while(0)
+
+#ifdef PER_THREAD
+#define arena_lock(ptr, size) do { \
+  if(ptr) \
+    (void)mutex_lock(&ptr->mutex); \
+  else \
+    ptr = arena_get2(ptr, (size)); \
+} while(0)
+#else
+#define arena_lock(ptr, size) do { \
   if(ptr && !mutex_trylock(&ptr->mutex)) { \
     THREAD_STAT(++(ptr->stat_lock_direct)); \
   } else \
     ptr = arena_get2(ptr, (size)); \
 } while(0)
+#endif
 
 /* find the heap and corresponding arena for a given ptr */
 
@@ -219,6 +240,11 @@ free_atfork(Void_t* mem, const Void_t *caller)
   }
 #endif
 
+#ifdef ATOMIC_FASTBINS
+  ar_ptr = arena_for_chunk(p);
+  tsd_getspecific(arena_key, vptr);
+  _int_free(ar_ptr, p, vptr == ATFORK_ARENA_PTR);
+#else
   ar_ptr = arena_for_chunk(p);
   tsd_getspecific(arena_key, vptr);
   if(vptr != ATFORK_ARENA_PTR)
@@ -226,6 +252,7 @@ free_atfork(Void_t* mem, const Void_t *caller)
   _int_free(ar_ptr, p);
   if(vptr != ATFORK_ARENA_PTR)
     (void)mutex_unlock(&ar_ptr->mutex);
+#endif
 }
 
 
@@ -312,8 +339,17 @@ ptmalloc_unlock_all2 (void)
   __malloc_hook = save_malloc_hook;
   __free_hook = save_free_hook;
 #endif
+#ifdef PER_THREAD
+  free_list = NULL;
+#endif
   for(ar_ptr = &main_arena;;) {
     mutex_init(&ar_ptr->mutex);
+#ifdef PER_THREAD
+    if (ar_ptr != save_arena) {
+      ar_ptr->next_free = free_list;
+      free_list = ar_ptr;
+    }
+#endif
     ar_ptr = ar_ptr->next;
     if(ar_ptr == &main_arena) break;
   }
@@ -377,6 +413,11 @@ ptmalloc_init_minimal (void)
   mp_.mmap_threshold = DEFAULT_MMAP_THRESHOLD;
   mp_.trim_threshold = DEFAULT_TRIM_THRESHOLD;
   mp_.pagesize       = malloc_getpagesize;
+#ifdef PER_THREAD
+# define NARENAS_FROM_NCORES(n) ((n) * (sizeof(long) == 4 ? 2 : 8))
+  mp_.arena_test     = NARENAS_FROM_NCORES (1);
+  narenas = 1;
+#endif
 }
 
 
@@ -529,9 +570,25 @@ ptmalloc_init (void)
 		}
 	      break;
 	    case 9:
-	      if (! secure && memcmp (envline, "MMAP_MAX_", 9) == 0)
-		mALLOPt(M_MMAP_MAX, atoi(&envline[10]));
+	      if (! secure)
+		{
+		  if (memcmp (envline, "MMAP_MAX_", 9) == 0)
+		    mALLOPt(M_MMAP_MAX, atoi(&envline[10]));
+#ifdef PER_THREAD
+		  else if (memcmp (envline, "ARENA_MAX", 9) == 0)
+		    mALLOPt(M_ARENA_MAX, atoi(&envline[10]));
+#endif
+		}
 	      break;
+#ifdef PER_THREAD
+	    case 10:
+	      if (! secure)
+		{
+		  if (memcmp (envline, "ARENA_TEST", 10) == 0)
+		    mALLOPt(M_ARENA_TEST, atoi(&envline[11]));
+		}
+	      break;
+#endif
 	    case 15:
 	      if (! secure)
 		{
@@ -875,9 +932,110 @@ _int_new_arena(size_t size)
   top(a) = (mchunkptr)ptr;
   set_head(top(a), (((char*)h + h->size) - ptr) | PREV_INUSE);
 
+  tsd_setspecific(arena_key, (Void_t *)a);
+  mutex_init(&a->mutex);
+  (void)mutex_lock(&a->mutex);
+
+#ifdef PER_THREAD
+  (void)mutex_lock(&list_lock);
+#endif
+
+  /* Add the new arena to the global list.  */
+  a->next = main_arena.next;
+  atomic_write_barrier ();
+  main_arena.next = a;
+
+#ifdef PER_THREAD
+  ++narenas;
+
+  (void)mutex_unlock(&list_lock);
+#endif
+
+  THREAD_STAT(++(a->stat_lock_loop));
+
   return a;
 }
 
+
+#ifdef PER_THREAD
+static mstate
+get_free_list (void)
+{
+  mstate result = free_list;
+  if (result != NULL)
+    {
+      (void)mutex_lock(&list_lock);
+      result = free_list;
+      if (result != NULL)
+	free_list = result->next_free;
+      (void)mutex_unlock(&list_lock);
+
+      if (result != NULL)
+	{
+	  (void)mutex_lock(&result->mutex);
+	  tsd_setspecific(arena_key, (Void_t *)result);
+	  THREAD_STAT(++(result->stat_lock_loop));
+	}
+    }
+
+  return result;
+}
+
+
+static mstate
+reused_arena (void)
+{
+  if (narenas <= mp_.arena_test)
+    return NULL;
+
+  static int narenas_limit;
+  if (narenas_limit == 0)
+    {
+      if (mp_.arena_max != 0)
+	narenas_limit = mp_.arena_max;
+      else
+	{
+	  int n  = __get_nprocs ();
+
+	  if (n >= 1)
+	    narenas_limit = NARENAS_FROM_NCORES (n);
+	  else
+	    /* We have no information about the system.  Assume two
+	       cores.  */
+	    narenas_limit = NARENAS_FROM_NCORES (2);
+	}
+    }
+
+  if (narenas < narenas_limit)
+    return NULL;
+
+  mstate result;
+  static mstate next_to_use;
+  if (next_to_use == NULL)
+    next_to_use = &main_arena;
+
+  result = next_to_use;
+  do
+    {
+      if (!mutex_trylock(&result->mutex))
+	goto out;
+
+      result = result->next;
+    }
+  while (result != next_to_use);
+
+  /* No arena available.  Wait for the next in line.  */
+  (void)mutex_lock(&result->mutex);
+
+ out:
+  tsd_setspecific(arena_key, (Void_t *)result);
+  THREAD_STAT(++(result->stat_lock_loop));
+  next_to_use = result->next;
+
+  return result;
+}
+#endif
+
 static mstate
 internal_function
 #if __STD_C
@@ -888,6 +1046,12 @@ arena_get2(a_tsd, size) mstate a_tsd; size_t size;
 {
   mstate a;
 
+#ifdef PER_THREAD
+  if ((a = get_free_list ()) == NULL
+      && (a = reused_arena ()) == NULL)
+    /* Nothing immediately available, so generate a new arena.  */
+    a = _int_new_arena(size);
+#else
   if(!a_tsd)
     a = a_tsd = &main_arena;
   else {
@@ -930,24 +1094,31 @@ arena_get2(a_tsd, size) mstate a_tsd; size_t size;
 
   /* Nothing immediately available, so generate a new arena.  */
   a = _int_new_arena(size);
-  if(a)
-    {
-      tsd_setspecific(arena_key, (Void_t *)a);
-      mutex_init(&a->mutex);
-      mutex_lock(&a->mutex); /* remember result */
-
-      /* Add the new arena to the global list.  */
-      a->next = main_arena.next;
-      atomic_write_barrier ();
-      main_arena.next = a;
-
-      THREAD_STAT(++(a->stat_lock_loop));
-    }
   (void)mutex_unlock(&list_lock);
+#endif
 
   return a;
 }
 
+#ifdef PER_THREAD
+static void __attribute__ ((section ("__libc_thread_freeres_fn")))
+arena_thread_freeres (void)
+{
+  Void_t *vptr = NULL;
+  mstate a = tsd_getspecific(arena_key, vptr);
+  tsd_setspecific(arena_key, NULL);
+
+  if (a != NULL)
+    {
+      (void)mutex_lock(&list_lock);
+      a->next_free = free_list;
+      free_list = a;
+      (void)mutex_unlock(&list_lock);
+    }
+}
+text_set_element (__libc_thread_subfreeres, arena_thread_freeres);
+#endif
+
 #endif /* USE_ARENAS */
 
 /*
diff --git a/malloc/hooks.c b/malloc/hooks.c
index 9659ec5fbe..fe89db83f4 100644
--- a/malloc/hooks.c
+++ b/malloc/hooks.c
@@ -275,17 +275,13 @@ free_check(mem, caller) Void_t* mem; const Void_t *caller;
   mchunkptr p;
 
   if(!mem) return;
-  (void)mutex_lock(&main_arena.mutex);
   p = mem2chunk_check(mem, NULL);
   if(!p) {
-    (void)mutex_unlock(&main_arena.mutex);
-
     malloc_printerr(check_action, "free(): invalid pointer", mem);
     return;
   }
 #if HAVE_MMAP
   if (chunk_is_mmapped(p)) {
-    (void)mutex_unlock(&main_arena.mutex);
     munmap_chunk(p);
     return;
   }
@@ -293,8 +289,13 @@ free_check(mem, caller) Void_t* mem; const Void_t *caller;
 #if 0 /* Erase freed memory. */
   memset(mem, 0, chunksize(p) - (SIZE_SZ+1));
 #endif
+#ifdef ATOMIC_FASTBINS
+  _int_free(&main_arena, p, 0);
+#else
+  (void)mutex_lock(&main_arena.mutex);
   _int_free(&main_arena, p);
   (void)mutex_unlock(&main_arena.mutex);
+#endif
 }
 
 static Void_t*
@@ -472,7 +473,11 @@ free_starter(mem, caller) Void_t* mem; const Void_t *caller;
     return;
   }
 #endif
+#ifdef ATOMIC_FASTBINS
+  _int_free(&main_arena, p, 1);
+#else
   _int_free(&main_arena, p);
+#endif
 }
 
 # endif	/* !defiend NO_STARTER */
@@ -584,7 +589,7 @@ public_sET_STATe(Void_t* msptr)
   clear_fastchunks(&main_arena);
   set_max_fast(DEFAULT_MXFAST);
   for (i=0; i<NFASTBINS; ++i)
-    main_arena.fastbins[i] = 0;
+    fastbin (&main_arena, i) = 0;
   for (i=0; i<BINMAPSIZE; ++i)
     main_arena.binmap[i] = 0;
   top(&main_arena) = ms->av[2];
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 12e23b0f9b..bb7ea36c80 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -208,7 +208,7 @@
 
     Tuning options that are also dynamically changeable via mallopt:
 
-    DEFAULT_MXFAST             64
+    DEFAULT_MXFAST             64 (for 32bit), 128 (for 64bit)
     DEFAULT_TRIM_THRESHOLD     128 * 1024
     DEFAULT_TOP_PAD            0
     DEFAULT_MMAP_THRESHOLD     128 * 1024
@@ -254,8 +254,12 @@
 #include <malloc-machine.h>
 
 #ifdef _LIBC
+#ifdef ATOMIC_FASTBINS
+#include <atomic.h>
+#endif
 #include <stdio-common/_itoa.h>
 #include <bits/wordsize.h>
+#include <sys/sysinfo.h>
 #endif
 
 #ifdef __cplusplus
@@ -321,12 +325,7 @@ extern "C" {
   or other mallocs available that do this.
 */
 
-#if MALLOC_DEBUG
 #include <assert.h>
-#else
-#undef	assert
-#define assert(x) ((void)0)
-#endif
 
 
 /*
@@ -1308,7 +1307,7 @@ int      __posix_memalign(void **, size_t, size_t);
 #endif
 
 #ifndef DEFAULT_MXFAST
-#define DEFAULT_MXFAST     64
+#define DEFAULT_MXFAST     (64 * SIZE_SZ / 4)
 #endif
 
 
@@ -1582,7 +1581,11 @@ typedef struct malloc_chunk* mchunkptr;
 #if __STD_C
 
 static Void_t*  _int_malloc(mstate, size_t);
+#ifdef ATOMIC_FASTBINS
+static void     _int_free(mstate, mchunkptr, int);
+#else
 static void     _int_free(mstate, mchunkptr);
+#endif
 static Void_t*  _int_realloc(mstate, mchunkptr, INTERNAL_SIZE_T);
 static Void_t*  _int_memalign(mstate, size_t, size_t);
 static Void_t*  _int_valloc(mstate, size_t);
@@ -2239,12 +2242,15 @@ typedef struct malloc_chunk* mbinptr;
 */
 
 typedef struct malloc_chunk* mfastbinptr;
+#define fastbin(ar_ptr, idx) ((ar_ptr)->fastbinsY[idx])
 
 /* offset 2 to use otherwise unindexable first 2 bins */
-#define fastbin_index(sz)        ((((unsigned int)(sz)) >> 3) - 2)
+#define fastbin_index(sz) \
+  ((((unsigned int)(sz)) >> (SIZE_SZ == 8 ? 4 : 3)) - 2)
+
 
 /* The maximum fastbin request size we support */
-#define MAX_FAST_SIZE     80
+#define MAX_FAST_SIZE     (80 * SIZE_SZ / 4)
 
 #define NFASTBINS  (fastbin_index(request2size(MAX_FAST_SIZE))+1)
 
@@ -2279,8 +2285,13 @@ typedef struct malloc_chunk* mfastbinptr;
 #define FASTCHUNKS_BIT        (1U)
 
 #define have_fastchunks(M)     (((M)->flags &  FASTCHUNKS_BIT) == 0)
+#ifdef ATOMIC_FASTBINS
+#define clear_fastchunks(M)    catomic_or (&(M)->flags, FASTCHUNKS_BIT)
+#define set_fastchunks(M)      catomic_and (&(M)->flags, ~FASTCHUNKS_BIT)
+#else
 #define clear_fastchunks(M)    ((M)->flags |=  FASTCHUNKS_BIT)
 #define set_fastchunks(M)      ((M)->flags &= ~FASTCHUNKS_BIT)
+#endif
 
 /*
   NONCONTIGUOUS_BIT indicates that MORECORE does not return contiguous
@@ -2327,7 +2338,7 @@ struct malloc_state {
 #endif
 
   /* Fastbins */
-  mfastbinptr      fastbins[NFASTBINS];
+  mfastbinptr      fastbinsY[NFASTBINS];
 
   /* Base of the topmost chunk -- not otherwise kept in a bin */
   mchunkptr        top;
@@ -2344,6 +2355,11 @@ struct malloc_state {
   /* Linked list */
   struct malloc_state *next;
 
+#ifdef PER_THREAD
+  /* Linked list for free arenas.  */
+  struct malloc_state *next_free;
+#endif
+
   /* Memory allocated from the system in this arena.  */
   INTERNAL_SIZE_T system_mem;
   INTERNAL_SIZE_T max_system_mem;
@@ -2354,6 +2370,10 @@ struct malloc_par {
   unsigned long    trim_threshold;
   INTERNAL_SIZE_T  top_pad;
   INTERNAL_SIZE_T  mmap_threshold;
+#ifdef PER_THREAD
+  INTERNAL_SIZE_T  arena_test;
+  INTERNAL_SIZE_T  arena_max;
+#endif
 
   /* Memory map support */
   int              n_mmaps;
@@ -2391,6 +2411,13 @@ static struct malloc_state main_arena;
 static struct malloc_par mp_;
 
 
+#ifdef PER_THREAD
+/*  Non public mallopt parameters.  */
+#define M_ARENA_TEST -7
+#define M_ARENA_MAX  -8
+#endif
+
+
 /* Maximum size of memory handled in fastbins.  */
 static INTERNAL_SIZE_T global_max_fast;
 
@@ -3037,8 +3064,10 @@ static Void_t* sYSMALLOc(nb, av) INTERNAL_SIZE_T nb; mstate av;
   /* Precondition: not enough current space to satisfy nb request */
   assert((unsigned long)(old_size) < (unsigned long)(nb + MINSIZE));
 
+#ifndef ATOMIC_FASTBINS
   /* Precondition: all fastbins are consolidated */
   assert(!have_fastchunks(av));
+#endif
 
 
   if (av != &main_arena) {
@@ -3084,7 +3113,11 @@ static Void_t* sYSMALLOc(nb, av) INTERNAL_SIZE_T nb; mstate av;
 	set_head(chunk_at_offset(old_top, old_size), (2*SIZE_SZ)|PREV_INUSE);
 	set_foot(chunk_at_offset(old_top, old_size), (2*SIZE_SZ));
 	set_head(old_top, old_size|PREV_INUSE|NON_MAIN_ARENA);
+#ifdef ATOMIC_FASTBINS
+	_int_free(av, old_top, 1);
+#else
 	_int_free(av, old_top);
+#endif
       } else {
 	set_head(old_top, (old_size + 2*SIZE_SZ)|PREV_INUSE);
 	set_foot(old_top, (old_size + 2*SIZE_SZ));
@@ -3323,7 +3356,11 @@ static Void_t* sYSMALLOc(nb, av) INTERNAL_SIZE_T nb; mstate av;
 
           /* If possible, release the rest. */
           if (old_size >= MINSIZE) {
+#ifdef ATOMIC_FASTBINS
+            _int_free(av, old_top, 1);
+#else
             _int_free(av, old_top);
+#endif
           }
 
         }
@@ -3545,7 +3582,40 @@ public_mALLOc(size_t bytes)
   if (__builtin_expect (hook != NULL, 0))
     return (*hook)(bytes, RETURN_ADDRESS (0));
 
-  arena_get(ar_ptr, bytes);
+  arena_lookup(ar_ptr);
+#if 0
+  // XXX We need double-word CAS and fastbins must be extended to also
+  // XXX hold a generation counter for each entry.
+  if (ar_ptr) {
+    INTERNAL_SIZE_T nb;               /* normalized request size */
+    checked_request2size(bytes, nb);
+    if (nb <= get_max_fast ()) {
+      long int idx = fastbin_index(nb);
+      mfastbinptr* fb = &fastbin (ar_ptr, idx);
+      mchunkptr pp = *fb;
+      mchunkptr v;
+      do
+	{
+	  v = pp;
+	  if (v == NULL)
+	    break;
+	}
+      while ((pp = catomic_compare_and_exchange_val_acq (fb, v->fd, v)) != v);
+      if (v != 0) {
+	if (__builtin_expect (fastbin_index (chunksize (v)) != idx, 0))
+	  malloc_printerr (check_action, "malloc(): memory corruption (fast)",
+			   chunk2mem (v));
+	check_remalloced_chunk(ar_ptr, v, nb);
+	void *p = chunk2mem(v);
+	if (__builtin_expect (perturb_byte, 0))
+	  alloc_perturb (p, bytes);
+	return p;
+      }
+    }
+  }
+#endif
+
+  arena_lock(ar_ptr, bytes);
   if(!ar_ptr)
     return 0;
   victim = _int_malloc(ar_ptr, bytes);
@@ -3612,18 +3682,22 @@ public_fREe(Void_t* mem)
 #endif
 
   ar_ptr = arena_for_chunk(p);
-#if THREAD_STATS
+#ifdef ATOMIC_FASTBINS
+  _int_free(ar_ptr, p, 0);
+#else
+# if THREAD_STATS
   if(!mutex_trylock(&ar_ptr->mutex))
     ++(ar_ptr->stat_lock_direct);
   else {
     (void)mutex_lock(&ar_ptr->mutex);
     ++(ar_ptr->stat_lock_wait);
   }
-#else
+# else
   (void)mutex_lock(&ar_ptr->mutex);
-#endif
+# endif
   _int_free(ar_ptr, p);
   (void)mutex_unlock(&ar_ptr->mutex);
+#endif
 }
 #ifdef libc_hidden_def
 libc_hidden_def (public_fREe)
@@ -3699,7 +3773,7 @@ public_rEALLOc(Void_t* oldmem, size_t bytes)
   (void)mutex_lock(&ar_ptr->mutex);
 #endif
 
-#ifndef NO_THREADS
+#if !defined NO_THREADS && !defined PER_THREAD
   /* As in malloc(), remember this arena for the next allocation. */
   tsd_setspecific(arena_key, (Void_t *)ar_ptr);
 #endif
@@ -3717,18 +3791,22 @@ public_rEALLOc(Void_t* oldmem, size_t bytes)
       if (newp != NULL)
 	{
 	  MALLOC_COPY (newp, oldmem, oldsize - SIZE_SZ);
-#if THREAD_STATS
+#ifdef ATOMIC_FASTBINS
+	  _int_free(ar_ptr, oldp, 0);
+#else
+# if THREAD_STATS
 	  if(!mutex_trylock(&ar_ptr->mutex))
 	    ++(ar_ptr->stat_lock_direct);
 	  else {
 	    (void)mutex_lock(&ar_ptr->mutex);
 	    ++(ar_ptr->stat_lock_wait);
 	  }
-#else
+# else
 	  (void)mutex_lock(&ar_ptr->mutex);
-#endif
+# endif
 	  _int_free(ar_ptr, oldp);
 	  (void)mutex_unlock(&ar_ptr->mutex);
+#endif
 	}
     }
 
@@ -4130,7 +4208,6 @@ _int_malloc(mstate av, size_t bytes)
   INTERNAL_SIZE_T nb;               /* normalized request size */
   unsigned int    idx;              /* associated bin index */
   mbinptr         bin;              /* associated bin */
-  mfastbinptr*    fb;               /* associated fastbin */
 
   mchunkptr       victim;           /* inspected/selected chunk */
   INTERNAL_SIZE_T size;             /* its size */
@@ -4164,13 +4241,28 @@ _int_malloc(mstate av, size_t bytes)
   */
 
   if ((unsigned long)(nb) <= (unsigned long)(get_max_fast ())) {
-    long int idx = fastbin_index(nb);
-    fb = &(av->fastbins[idx]);
-    if ( (victim = *fb) != 0) {
+    idx = fastbin_index(nb);
+    mfastbinptr* fb = &fastbin (av, idx);
+#ifdef ATOMIC_FASTBINS
+    mchunkptr pp = *fb;
+    do
+      {
+	victim = pp;
+	if (victim == NULL)
+	  break;
+      }
+    while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim))
+	   != victim);
+#else
+    victim = *fb;
+#endif
+    if (victim != 0) {
       if (__builtin_expect (fastbin_index (chunksize (victim)) != idx, 0))
 	malloc_printerr (check_action, "malloc(): memory corruption (fast)",
 			 chunk2mem (victim));
+#ifndef ATOMIC_FASTBINS
       *fb = victim->fd;
+#endif
       check_remalloced_chunk(av, victim, nb);
       void *p = chunk2mem(victim);
       if (__builtin_expect (perturb_byte, 0))
@@ -4560,6 +4652,18 @@ _int_malloc(mstate av, size_t bytes)
       return p;
     }
 
+#ifdef ATOMIC_FASTBINS
+    /* When we are using atomic ops to free fast chunks we can get
+       here for all block sizes.  */
+    else if (have_fastchunks(av)) {
+      malloc_consolidate(av);
+      /* restore original bin index */
+      if (in_smallbin_range(nb))
+	idx = smallbin_index(nb);
+      else
+	idx = largebin_index(nb);
+    }
+#else
     /*
       If there is space available in fastbins, consolidate and retry,
       to possibly avoid expanding memory. This can occur only if nb is
@@ -4571,6 +4675,7 @@ _int_malloc(mstate av, size_t bytes)
       malloc_consolidate(av);
       idx = smallbin_index(nb); /* restore original bin index */
     }
+#endif
 
     /*
        Otherwise, relay to handle system-dependent cases
@@ -4589,7 +4694,11 @@ _int_malloc(mstate av, size_t bytes)
 */
 
 static void
+#ifdef ATOMIC_FASTBINS
+_int_free(mstate av, mchunkptr p, int have_lock)
+#else
 _int_free(mstate av, mchunkptr p)
+#endif
 {
   INTERNAL_SIZE_T size;        /* its size */
   mfastbinptr*    fb;          /* associated fastbin */
@@ -4601,6 +4710,9 @@ _int_free(mstate av, mchunkptr p)
   mchunkptr       fwd;         /* misc temp for linking */
 
   const char *errstr = NULL;
+#ifdef ATOMIC_FASTBINS
+  int locked = 0;
+#endif
 
   size = chunksize(p);
 
@@ -4613,6 +4725,10 @@ _int_free(mstate av, mchunkptr p)
     {
       errstr = "free(): invalid pointer";
     errout:
+#ifdef ATOMIC_FASTBINS
+      if (! have_lock && locked)
+	(void)mutex_unlock(&av->mutex);
+#endif
       malloc_printerr (check_action, errstr, chunk2mem(p));
       return;
     }
@@ -4649,8 +4765,28 @@ _int_free(mstate av, mchunkptr p)
 	goto errout;
       }
 
+    if (__builtin_expect (perturb_byte, 0))
+      free_perturb (chunk2mem(p), size - SIZE_SZ);
+
     set_fastchunks(av);
-    fb = &(av->fastbins[fastbin_index(size)]);
+    fb = &fastbin (av, fastbin_index(size));
+
+#ifdef ATOMIC_FASTBINS
+    mchunkptr fd;
+    mchunkptr old = *fb;
+    do
+      {
+	/* Another simple check: make sure the top of the bin is not the
+	   record we are going to add (i.e., double free).  */
+	if (__builtin_expect (old == p, 0))
+	  {
+	    errstr = "double free or corruption (fasttop)";
+	    goto errout;
+	  }
+	p->fd = fd = old;
+      }
+    while ((old = catomic_compare_and_exchange_val_acq (fb, p, fd)) != fd);
+#else
     /* Another simple check: make sure the top of the bin is not the
        record we are going to add (i.e., double free).  */
     if (__builtin_expect (*fb == p, 0))
@@ -4659,11 +4795,9 @@ _int_free(mstate av, mchunkptr p)
 	goto errout;
       }
 
-    if (__builtin_expect (perturb_byte, 0))
-      free_perturb (chunk2mem(p), size - SIZE_SZ);
-
     p->fd = *fb;
     *fb = p;
+#endif
   }
 
   /*
@@ -4671,6 +4805,22 @@ _int_free(mstate av, mchunkptr p)
   */
 
   else if (!chunk_is_mmapped(p)) {
+#ifdef ATOMIC_FASTBINS
+    if (! have_lock) {
+# if THREAD_STATS
+      if(!mutex_trylock(&av->mutex))
+	++(av->stat_lock_direct);
+      else {
+	(void)mutex_lock(&av->mutex);
+	++(av->stat_lock_wait);
+      }
+# else
+      (void)mutex_lock(&av->mutex);
+# endif
+      locked = 1;
+    }
+#endif
+
     nextchunk = chunk_at_offset(p, size);
 
     /* Lightweight tests: check whether the block is already the
@@ -4794,6 +4944,12 @@ _int_free(mstate av, mchunkptr p)
       }
     }
 
+#ifdef ATOMIC_FASTBINS
+    if (! have_lock) {
+      assert (locked);
+      (void)mutex_unlock(&av->mutex);
+    }
+#endif
   }
   /*
     If the chunk was allocated via mmap, release via munmap(). Note
@@ -4869,15 +5025,21 @@ static void malloc_consolidate(av) mstate av;
        because, except for the main arena, all the others might have
        blocks in the high fast bins.  It's not worth it anyway, just
        search all bins all the time.  */
-    maxfb = &(av->fastbins[fastbin_index(get_max_fast ())]);
+    maxfb = &fastbin (av, fastbin_index(get_max_fast ()));
 #else
-    maxfb = &(av->fastbins[NFASTBINS - 1]);
+    maxfb = &fastbin (av, NFASTBINS - 1);
 #endif
-    fb = &(av->fastbins[0]);
+    fb = &fastbin (av, 0);
     do {
-      if ( (p = *fb) != 0) {
-        *fb = 0;
-
+#ifdef ATOMIC_FASTBINS
+      p = atomic_exchange_acq (fb, 0);
+#else
+      p = *fb;
+#endif
+      if (p != 0) {
+#ifndef ATOMIC_FASTBINS
+	*fb = 0;
+#endif
         do {
           check_inuse_chunk(av, p);
           nextp = p->fd;
@@ -5070,7 +5232,11 @@ _int_realloc(mstate av, mchunkptr oldp, INTERNAL_SIZE_T nb)
             }
           }
 
+#ifdef ATOMIC_FASTBINS
+          _int_free(av, oldp, 1);
+#else
           _int_free(av, oldp);
+#endif
           check_inuse_chunk(av, newp);
           return chunk2mem(newp);
         }
@@ -5094,7 +5260,11 @@ _int_realloc(mstate av, mchunkptr oldp, INTERNAL_SIZE_T nb)
 	       (av != &main_arena ? NON_MAIN_ARENA : 0));
       /* Mark remainder as inuse so free() won't complain */
       set_inuse_bit_at_offset(remainder, remainder_size);
+#ifdef ATOMIC_FASTBINS
+      _int_free(av, remainder, 1);
+#else
       _int_free(av, remainder);
+#endif
     }
 
     check_inuse_chunk(av, newp);
@@ -5153,7 +5323,11 @@ _int_realloc(mstate av, mchunkptr oldp, INTERNAL_SIZE_T nb)
       newmem = _int_malloc(av, nb - MALLOC_ALIGN_MASK);
       if (newmem != 0) {
         MALLOC_COPY(newmem, chunk2mem(oldp), oldsize - 2*SIZE_SZ);
+#ifdef ATOMIC_FASTBINS
+        _int_free(av, oldp, 1);
+#else
         _int_free(av, oldp);
+#endif
       }
     }
     return newmem;
@@ -5247,7 +5421,11 @@ _int_memalign(mstate av, size_t alignment, size_t bytes)
 	     (av != &main_arena ? NON_MAIN_ARENA : 0));
     set_inuse_bit_at_offset(newp, newsize);
     set_head_size(p, leadsize | (av != &main_arena ? NON_MAIN_ARENA : 0));
+#ifdef ATOMIC_FASTBINS
+    _int_free(av, p, 1);
+#else
     _int_free(av, p);
+#endif
     p = newp;
 
     assert (newsize >= nb &&
@@ -5263,7 +5441,11 @@ _int_memalign(mstate av, size_t alignment, size_t bytes)
       set_head(remainder, remainder_size | PREV_INUSE |
 	       (av != &main_arena ? NON_MAIN_ARENA : 0));
       set_head_size(p, nb);
+#ifdef ATOMIC_FASTBINS
+      _int_free(av, remainder, 1);
+#else
       _int_free(av, remainder);
+#endif
     }
   }
 
@@ -5650,7 +5832,7 @@ struct mallinfo mALLINFo(mstate av)
   fastavail = 0;
 
   for (i = 0; i < NFASTBINS; ++i) {
-    for (p = av->fastbins[i]; p != 0; p = p->fd) {
+    for (p = fastbin (av, i); p != 0; p = p->fd) {
       ++nfastblocks;
       fastavail += chunksize(p);
     }
@@ -5818,6 +6000,18 @@ int mALLOPt(param_number, value) int param_number; int value;
   case M_PERTURB:
     perturb_byte = value;
     break;
+
+#ifdef PER_THREAD
+  case M_ARENA_TEST:
+    if (value > 0)
+      mp_.arena_test = value;
+    break;
+
+  case M_ARENA_MAX:
+    if (value > 0)
+      mp_.arena_max = value;
+    break;
+#endif
   }
   (void)mutex_unlock(&av->mutex);
   return res;
diff --git a/malloc/malloc.h b/malloc/malloc.h
index b6d7a8afaf..2c0ee35c4e 100644
--- a/malloc/malloc.h
+++ b/malloc/malloc.h
@@ -1,5 +1,5 @@
 /* Prototypes and definition for malloc implementation.
-   Copyright (C) 1996, 1997, 1999, 2000, 2002-2004, 2005, 2007
+   Copyright (C) 1996, 1997, 1999, 2000, 2002-2004, 2005, 2007, 2009
    Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -127,6 +127,8 @@ extern struct mallinfo mallinfo __MALLOC_P ((void));
 #define M_MMAP_MAX          -4
 #define M_CHECK_ACTION      -5
 #define M_PERTURB	    -6
+#define M_ARENA_TEST	    -7
+#define M_ARENA_MAX	    -8
 
 /* General SVID/XPG interface to tunable parameters. */
 extern int mallopt __MALLOC_P ((int __param, int __val));
diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c
index a7e3fc7633..8769e9c966 100644
--- a/sysdeps/x86_64/cacheinfo.c
+++ b/sysdeps/x86_64/cacheinfo.c
@@ -23,6 +23,10 @@
 #include <stdlib.h>
 #include <unistd.h>
 
+#ifdef USE_MULTIARCH
+# include "multiarch/init-arch.h"
+#endif
+
 static const struct intel_02_cache_info
 {
   unsigned int idx;
@@ -443,19 +447,32 @@ init_cacheinfo (void)
   unsigned int ebx;
   unsigned int ecx;
   unsigned int edx;
-  int max_cpuid;
   int max_cpuid_ex;
   long int data = -1;
   long int shared = -1;
   unsigned int level;
   unsigned int threads = 0;
 
+#ifdef USE_MULTIARCH
+  if (__cpu_features.kind == arch_kind_unknown)
+    __init_cpu_features ();
+# define is_intel __cpu_features.kind == arch_kind_intel
+# define is_amd __cpu_features.kind == arch_kind_amd
+# define max_cpuid __cpu_features.max_cpuid
+#else
+  int max_cpuid;
   asm volatile ("cpuid"
 		: "=a" (max_cpuid), "=b" (ebx), "=c" (ecx), "=d" (edx)
 		: "0" (0));
-
   /* This spells out "GenuineIntel".  */
-  if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
+# define is_intel \
+  ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69
+  /* This spells out "AuthenticAMD".  */
+# define is_amd \
+  ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65
+#endif
+
+  if (is_intel)
     {
       data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid);
 
@@ -470,9 +487,16 @@ init_cacheinfo (void)
           shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
 	}
 
+#ifdef USE_MULTIARCH
+      eax = __cpu_features.cpuid[INTEL_CPUID_INDEX_1].eax;
+      ebx = __cpu_features.cpuid[INTEL_CPUID_INDEX_1].ebx;
+      ecx = __cpu_features.cpuid[INTEL_CPUID_INDEX_1].ecx;
+      edx = __cpu_features.cpuid[INTEL_CPUID_INDEX_1].edx;
+#else
       asm volatile ("cpuid"
 		    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
 		    : "0" (1));
+#endif
 
       /* Intel prefers SSSE3 instructions for memory/string routines
 	 if they are avaiable.  */
@@ -519,7 +543,7 @@ init_cacheinfo (void)
         shared /= threads;
     }
   /* This spells out "AuthenticAMD".  */
-  else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
+  else if (is_amd)
     {
       data   = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
       long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
new file mode 100644
index 0000000000..2a1e910e06
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -0,0 +1,3 @@
+ifeq ($(subdir),csu)
+aux += init-arch
+endif
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
new file mode 100644
index 0000000000..eb4365fe32
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -0,0 +1,65 @@
+/* Initialize CPU feature data.
+   This file is part of the GNU C Library.
+   Copyright (C) 2008 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@redhat.com>.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include "init-arch.h"
+
+
+struct cpu_features __cpu_features attribute_hidden;
+
+
+void
+__init_cpu_features (void)
+{
+  unsigned int ebx;
+  unsigned int ecx;
+  unsigned int edx;
+
+  asm volatile ("cpuid"
+		: "=a" (__cpu_features.max_cpuid), "=b" (ebx), "=c" (ecx),
+		  "=d" (edx)
+		: "0" (0));
+
+  /* This spells out "GenuineIntel".  */
+  if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
+    {
+      __cpu_features.kind = arch_kind_intel;
+
+      asm volatile ("cpuid"
+		    : "=a" (__cpu_features.cpuid[INTEL_CPUID_INDEX_1].eax),
+		      "=b" (__cpu_features.cpuid[INTEL_CPUID_INDEX_1].ebx),
+		      "=c" (__cpu_features.cpuid[INTEL_CPUID_INDEX_1].ecx),
+		      "=d" (__cpu_features.cpuid[INTEL_CPUID_INDEX_1].edx)
+		    : "0" (1));
+    }
+  /* This spells out "AuthenticAMD".  */
+  else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
+    {
+      __cpu_features.kind = arch_kind_amd;
+
+      asm volatile ("cpuid"
+		    : "=a" (__cpu_features.cpuid[AMD_CPUID_INDEX_1].eax),
+		      "=b" (__cpu_features.cpuid[AMD_CPUID_INDEX_1].ebx),
+		      "=c" (__cpu_features.cpuid[AMD_CPUID_INDEX_1].ecx),
+		      "=d" (__cpu_features.cpuid[AMD_CPUID_INDEX_1].edx)
+		    : "0" (1));
+    }
+  else
+    __cpu_features.kind = arch_kind_other;
+}
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
new file mode 100644
index 0000000000..86cd83dc4c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -0,0 +1,70 @@
+/* This file is part of the GNU C Library.
+   Copyright (C) 2008 Free Software Foundation, Inc.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sys/param.h>
+
+enum
+  {
+    INTEL_CPUID_INDEX_1 = 0,
+    /* Keep the following line at the end.  */
+    INTEL_CPUID_INDEX_MAX
+  };
+
+enum
+  {
+    AMD_CPUID_INDEX_1 = 0,
+    /* Keep the following line at the end.  */
+    AMD_CPUID_INDEX_MAX
+  };
+
+extern struct cpu_features
+{
+  enum
+    {
+      arch_kind_unknown = 0,
+      arch_kind_intel,
+      arch_kind_amd,
+      arch_kind_other
+    } kind;
+  int max_cpuid;
+  struct
+  {
+    unsigned int eax;
+    unsigned int ebx;
+    unsigned int ecx;
+    unsigned int edx;
+  } cpuid[MAX (INTEL_CPUID_INDEX_MAX, AMD_CPUID_INDEX_MAX)];
+} __cpu_features attribute_hidden;
+
+
+extern void __init_cpu_features (void) attribute_hidden;
+#define INIT_ARCH()\
+  do							\
+    if (__cpu_features.kind == arch_kind_unknown)	\
+      __init_cpu_features ();				\
+  while (0)
+
+/* Following are the feature tests used throughout libc.  */
+
+#define INTEL_HAS_POPCOUNT \
+  (__cpu_features.kind == arch_kind_intel				\
+   && (__cpu_features.cpuid[INTEL_CPUID_INDEX_1].ecx & (1 << 23)) != 0)
+
+#define AMD_HAS_POPCOUNT \
+  (__cpu_features.kind == arch_kind_amd				\
+   && (__cpu_features.cpuid[AMD_CPUID_INDEX_1].ecx & (1 << 23)) != 0)
diff --git a/sysdeps/x86_64/multiarch/sched_cpucount.c b/sysdeps/x86_64/multiarch/sched_cpucount.c
new file mode 100644
index 0000000000..dc20182df4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/sched_cpucount.c
@@ -0,0 +1,42 @@
+/* Count bits in CPU set.  x86-64 multi-arch version.
+   This file is part of the GNU C Library.
+   Copyright (C) 2008 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@redhat.com>.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifdef SHARED
+# include <sched.h>
+# include "init-arch.h"
+
+# define __sched_cpucount static generic_cpucount
+# include <posix/sched_cpucount.c>
+# undef __sched_cpucount
+
+# define POPCNT(l) \
+  ({ __cpu_mask r; \
+     asm ("popcntq %1, %0" : "=r" (r) : "0" (l));\
+     r; })
+# define __sched_cpucount static popcount_cpucount
+# include <posix/sched_cpucount.c>
+# undef __sched_cpucount
+
+libc_ifunc (__sched_cpucount,
+	    INTEL_HAS_POPCOUNT || AMD_HAS_POPCOUNT
+	    ? popcount_cpucount : generic_cpucount);
+#else
+# include_next <sched_cpucount.c>
+#endif