From d5c3fafc4307c9b7a4c7d5cb381fcdbfad340bcc Mon Sep 17 00:00:00 2001 From: DJ Delorie Date: Thu, 6 Jul 2017 13:37:30 -0400 Subject: Add per-thread cache to malloc * config.make.in: Enable experimental malloc option. * configure.ac: Likewise. * configure: Regenerate. * manual/install.texi: Document it. * INSTALL: Regenerate. * malloc/Makefile: Likewise. * malloc/malloc.c: Add per-thread cache (tcache). (tcache_put): New. (tcache_get): New. (tcache_thread_freeres): New. (tcache_init): New. (__libc_malloc): Use cached chunks if available. (__libc_free): Initialize tcache if needed. (__libc_realloc): Likewise. (__libc_calloc): Likewise. (_int_malloc): Prefill tcache when appropriate. (_int_free): Likewise. (do_set_tcache_max): New. (do_set_tcache_count): New. (do_set_tcache_unsorted_limit): New. * manual/probes.texi: Document new probes. * malloc/arena.c: Add new tcache tunables. * elf/dl-tunables.list: Likewise. * manual/tunables.texi: Document them. * NEWS: Mention the per-thread cache. --- malloc/Makefile | 5 + malloc/arena.c | 11 ++ malloc/malloc.c | 350 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 357 insertions(+), 9 deletions(-) (limited to 'malloc') diff --git a/malloc/Makefile b/malloc/Makefile index b50de7cd6c..3fa395b949 100644 --- a/malloc/Makefile +++ b/malloc/Makefile @@ -189,6 +189,11 @@ tst-malloc-usable-static-ENV = $(tst-malloc-usable-ENV) tst-malloc-usable-tunables-ENV = GLIBC_TUNABLES=glibc.malloc.check=3 tst-malloc-usable-static-tunables-ENV = $(tst-malloc-usable-tunables-ENV) +ifeq ($(experimental-malloc),yes) +CPPFLAGS-malloc.c += -DUSE_TCACHE=1 +else +CPPFLAGS-malloc.c += -DUSE_TCACHE=0 +endif # Uncomment this for test releases. For public releases it is too expensive. #CPPFLAGS-malloc.o += -DMALLOC_DEBUG=1 diff --git a/malloc/arena.c b/malloc/arena.c index 660d638c93..dc14fae152 100644 --- a/malloc/arena.c +++ b/malloc/arena.c @@ -236,6 +236,11 @@ TUNABLE_CALLBACK_FNDECL (set_perturb_byte, int32_t) TUNABLE_CALLBACK_FNDECL (set_trim_threshold, size_t) TUNABLE_CALLBACK_FNDECL (set_arena_max, size_t) TUNABLE_CALLBACK_FNDECL (set_arena_test, size_t) +#if USE_TCACHE +TUNABLE_CALLBACK_FNDECL (set_tcache_max, size_t) +TUNABLE_CALLBACK_FNDECL (set_tcache_count, size_t) +TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t) +#endif #else /* Initialization routine. */ #include @@ -322,6 +327,12 @@ ptmalloc_init (void) TUNABLE_GET (mmap_max, int32_t, TUNABLE_CALLBACK (set_mmaps_max)); TUNABLE_GET (arena_max, size_t, TUNABLE_CALLBACK (set_arena_max)); TUNABLE_GET (arena_test, size_t, TUNABLE_CALLBACK (set_arena_test)); +#if USE_TCACHE + TUNABLE_GET (tcache_max, size_t, TUNABLE_CALLBACK (set_tcache_max)); + TUNABLE_GET (tcache_count, size_t, TUNABLE_CALLBACK (set_tcache_count)); + TUNABLE_GET (tcache_unsorted_limit, size_t, + TUNABLE_CALLBACK (set_tcache_unsorted_limit)); +#endif __libc_lock_unlock (main_arena.mutex); #else const char *s = NULL; diff --git a/malloc/malloc.c b/malloc/malloc.c index aa45626093..2527e25047 100644 --- a/malloc/malloc.c +++ b/malloc/malloc.c @@ -238,6 +238,9 @@ /* For ALIGN_UP et. al. */ #include +/* For DIAG_PUSH/POP_NEEDS_COMMENT et al. */ +#include + #include /* @@ -296,6 +299,30 @@ __malloc_assert (const char *assertion, const char *file, unsigned int line, } #endif +#if USE_TCACHE +/* We want 64 entries. This is an arbitrary limit, which tunables can reduce. */ +# define TCACHE_MAX_BINS 64 +# define MAX_TCACHE_SIZE tidx2usize (TCACHE_MAX_BINS-1) + +/* Only used to pre-fill the tunables. */ +# define tidx2usize(idx) (((size_t) idx) * MALLOC_ALIGNMENT + MINSIZE - SIZE_SZ) + +/* When "x" is from chunksize(). */ +# define csize2tidx(x) (((x) - MINSIZE + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT) +/* When "x" is a user-provided size. */ +# define usize2tidx(x) csize2tidx (request2size (x)) + +/* With rounding and alignment, the bins are... + idx 0 bytes 0..24 (64-bit) or 0..12 (32-bit) + idx 1 bytes 25..40 or 13..20 + idx 2 bytes 41..56 or 21..28 + etc. */ + +/* This is another arbitrary limit, which tunables can change. Each + tcache bin will hold at most this number of chunks. */ +# define TCACHE_FILL_COUNT 7 +#endif + /* REALLOC_ZERO_BYTES_FREES should be set if a call to @@ -1712,6 +1739,17 @@ struct malloc_par /* First address handed out by MORECORE/sbrk. */ char *sbrk_base; + +#if USE_TCACHE + /* Maximum number of buckets to use. */ + size_t tcache_bins; + size_t tcache_max_bytes; + /* Maximum number of chunks in each bucket. */ + size_t tcache_count; + /* Maximum number of chunks to remove from the unsorted list, which + aren't used to prefill the cache. */ + size_t tcache_unsorted_limit; +#endif }; /* There are several instances of this struct ("arenas") in this @@ -1750,6 +1788,13 @@ static struct malloc_par mp_ = .trim_threshold = DEFAULT_TRIM_THRESHOLD, #define NARENAS_FROM_NCORES(n) ((n) * (sizeof (long) == 4 ? 2 : 8)) .arena_test = NARENAS_FROM_NCORES (1) +#if USE_TCACHE + , + .tcache_count = TCACHE_FILL_COUNT, + .tcache_bins = TCACHE_MAX_BINS, + .tcache_max_bytes = tidx2usize (TCACHE_MAX_BINS-1), + .tcache_unsorted_limit = 0 /* No limit. */ +#endif }; /* Maximum size of memory handled in fastbins. */ @@ -2875,6 +2920,124 @@ mremap_chunk (mchunkptr p, size_t new_size) /*------------------------ Public wrappers. --------------------------------*/ +#if USE_TCACHE + +/* We overlay this structure on the user-data portion of a chunk when + the chunk is stored in the per-thread cache. */ +typedef struct tcache_entry +{ + struct tcache_entry *next; +} tcache_entry; + +/* There is one of these for each thread, which contains the + per-thread cache (hence "tcache_perthread_struct"). Keeping + overall size low is mildly important. Note that COUNTS and ENTRIES + are redundant (we could have just counted the linked list each + time), this is for performance reasons. */ +typedef struct tcache_perthread_struct +{ + char counts[TCACHE_MAX_BINS]; + tcache_entry *entries[TCACHE_MAX_BINS]; +} tcache_perthread_struct; + +static __thread char tcache_shutting_down = 0; +static __thread tcache_perthread_struct *tcache = NULL; + +/* Caller must ensure that we know tc_idx is valid and there's room + for more chunks. */ +static void +tcache_put (mchunkptr chunk, size_t tc_idx) +{ + tcache_entry *e = (tcache_entry *) chunk2mem (chunk); + assert (tc_idx < TCACHE_MAX_BINS); + e->next = tcache->entries[tc_idx]; + tcache->entries[tc_idx] = e; + ++(tcache->counts[tc_idx]); +} + +/* Caller must ensure that we know tc_idx is valid and there's + available chunks to remove. */ +static void * +tcache_get (size_t tc_idx) +{ + tcache_entry *e = tcache->entries[tc_idx]; + assert (tc_idx < TCACHE_MAX_BINS); + assert (tcache->entries[tc_idx] > 0); + tcache->entries[tc_idx] = e->next; + --(tcache->counts[tc_idx]); + return (void *) e; +} + +static void __attribute__ ((section ("__libc_thread_freeres_fn"))) +tcache_thread_freeres (void) +{ + int i; + tcache_perthread_struct *tcache_tmp = tcache; + + if (!tcache) + return; + + tcache = NULL; + + for (i = 0; i < TCACHE_MAX_BINS; ++i) + { + while (tcache_tmp->entries[i]) + { + tcache_entry *e = tcache_tmp->entries[i]; + tcache_tmp->entries[i] = e->next; + __libc_free (e); + } + } + + __libc_free (tcache_tmp); + + tcache_shutting_down = 1; +} +text_set_element (__libc_thread_subfreeres, tcache_thread_freeres); + +static void +tcache_init(void) +{ + mstate ar_ptr; + void *victim = 0; + const size_t bytes = sizeof (tcache_perthread_struct); + + if (tcache_shutting_down) + return; + + arena_get (ar_ptr, bytes); + victim = _int_malloc (ar_ptr, bytes); + if (!victim && ar_ptr != NULL) + { + ar_ptr = arena_get_retry (ar_ptr, bytes); + victim = _int_malloc (ar_ptr, bytes); + } + + + if (ar_ptr != NULL) + __libc_lock_unlock (ar_ptr->mutex); + + /* In a low memory situation, we may not be able to allocate memory + - in which case, we just keep trying later. However, we + typically do this very early, so either there is sufficient + memory, or there isn't enough memory to do non-trivial + allocations anyway. */ + if (victim) + { + tcache = (tcache_perthread_struct *) victim; + memset (tcache, 0, sizeof (tcache_perthread_struct)); + } + +} + +#define MAYBE_INIT_TCACHE() \ + if (__glibc_unlikely (tcache == NULL)) \ + tcache_init(); + +#else +#define MAYBE_INIT_TCACHE() +#endif + void * __libc_malloc (size_t bytes) { @@ -2885,6 +3048,23 @@ __libc_malloc (size_t bytes) = atomic_forced_read (__malloc_hook); if (__builtin_expect (hook != NULL, 0)) return (*hook)(bytes, RETURN_ADDRESS (0)); +#if USE_TCACHE + /* int_free also calls request2size, be careful to not pad twice. */ + size_t tbytes = request2size (bytes); + size_t tc_idx = csize2tidx (tbytes); + + MAYBE_INIT_TCACHE (); + + DIAG_PUSH_NEEDS_COMMENT; + if (tc_idx < mp_.tcache_bins + /*&& tc_idx < TCACHE_MAX_BINS*/ /* to appease gcc */ + && tcache + && tcache->entries[tc_idx] != NULL) + { + return tcache_get (tc_idx); + } + DIAG_POP_NEEDS_COMMENT; +#endif arena_get (ar_ptr, bytes); @@ -2944,6 +3124,8 @@ __libc_free (void *mem) return; } + MAYBE_INIT_TCACHE (); + ar_ptr = arena_for_chunk (p); _int_free (ar_ptr, p, 0); } @@ -2981,7 +3163,10 @@ __libc_realloc (void *oldmem, size_t bytes) if (chunk_is_mmapped (oldp)) ar_ptr = NULL; else - ar_ptr = arena_for_chunk (oldp); + { + MAYBE_INIT_TCACHE (); + ar_ptr = arena_for_chunk (oldp); + } /* Little security check which won't hurt performance: the allocator never wrapps around at the end of the address space. Therefore @@ -3206,6 +3391,8 @@ __libc_calloc (size_t n, size_t elem_size) sz = bytes; + MAYBE_INIT_TCACHE (); + arena_get (av, sz); if (av) { @@ -3336,6 +3523,10 @@ _int_malloc (mstate av, size_t bytes) mchunkptr fwd; /* misc temp for linking */ mchunkptr bck; /* misc temp for linking */ +#if USE_TCACHE + size_t tcache_unsorted_count; /* count of unsorted chunks processed */ +#endif + const char *errstr = NULL; /* @@ -3365,19 +3556,22 @@ _int_malloc (mstate av, size_t bytes) can try it without checking, which saves some time on this fast path. */ +#define REMOVE_FB(fb, victim, pp) \ + do \ + { \ + victim = pp; \ + if (victim == NULL) \ + break; \ + } \ + while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \ + != victim); \ + if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ())) { idx = fastbin_index (nb); mfastbinptr *fb = &fastbin (av, idx); mchunkptr pp = *fb; - do - { - victim = pp; - if (victim == NULL) - break; - } - while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) - != victim); + REMOVE_FB (fb, victim, pp); if (victim != 0) { if (__builtin_expect (fastbin_index (chunksize (victim)) != idx, 0)) @@ -3388,6 +3582,26 @@ _int_malloc (mstate av, size_t bytes) return NULL; } check_remalloced_chunk (av, victim, nb); +#if USE_TCACHE + /* While we're here, if we see other chunks of the same size, + stash them in the tcache. */ + size_t tc_idx = csize2tidx (nb); + if (tcache && tc_idx < mp_.tcache_bins) + { + mchunkptr tc_victim; + + /* While bin not empty and tcache not full, copy chunks over. */ + while (tcache->counts[tc_idx] < mp_.tcache_count + && (pp = *fb) != NULL) + { + REMOVE_FB (fb, tc_victim, pp); + if (tc_victim != 0) + { + tcache_put (tc_victim, tc_idx); + } + } + } +#endif void *p = chunk2mem (victim); alloc_perturb (p, bytes); return p; @@ -3426,6 +3640,32 @@ _int_malloc (mstate av, size_t bytes) if (av != &main_arena) set_non_main_arena (victim); check_malloced_chunk (av, victim, nb); +#if USE_TCACHE + /* While we're here, if we see other chunks of the same size, + stash them in the tcache. */ + size_t tc_idx = csize2tidx (nb); + if (tcache && tc_idx < mp_.tcache_bins) + { + mchunkptr tc_victim; + + /* While bin not empty and tcache not full, copy chunks over. */ + while (tcache->counts[tc_idx] < mp_.tcache_count + && (tc_victim = last (bin)) != bin) + { + if (tc_victim != 0) + { + bck = tc_victim->bk; + set_inuse_bit_at_offset (tc_victim, nb); + if (av != &main_arena) + set_non_main_arena (tc_victim); + bin->bk = bck; + bck->fd = bin; + + tcache_put (tc_victim, tc_idx); + } + } + } +#endif void *p = chunk2mem (victim); alloc_perturb (p, bytes); return p; @@ -3464,6 +3704,16 @@ _int_malloc (mstate av, size_t bytes) otherwise need to expand memory to service a "small" request. */ +#if USE_TCACHE + INTERNAL_SIZE_T tcache_nb = 0; + size_t tc_idx = csize2tidx (nb); + if (tcache && tc_idx < mp_.tcache_bins) + tcache_nb = nb; + int return_cached = 0; + + tcache_unsorted_count = 0; +#endif + for (;; ) { int iters = 0; @@ -3524,10 +3774,26 @@ _int_malloc (mstate av, size_t bytes) set_inuse_bit_at_offset (victim, size); if (av != &main_arena) set_non_main_arena (victim); +#if USE_TCACHE + /* Fill cache first, return to user only if cache fills. + We may return one of these chunks later. */ + if (tcache_nb + && tcache->counts[tc_idx] < mp_.tcache_count) + { + tcache_put (victim, tc_idx); + return_cached = 1; + continue; + } + else + { +#endif check_malloced_chunk (av, victim, nb); void *p = chunk2mem (victim); alloc_perturb (p, bytes); return p; +#if USE_TCACHE + } +#endif } /* place chunk in bin */ @@ -3594,11 +3860,31 @@ _int_malloc (mstate av, size_t bytes) fwd->bk = victim; bck->fd = victim; +#if USE_TCACHE + /* If we've processed as many chunks as we're allowed while + filling the cache, return one of the cached ones. */ + ++tcache_unsorted_count; + if (return_cached + && mp_.tcache_unsorted_limit > 0 + && tcache_unsorted_count > mp_.tcache_unsorted_limit) + { + return tcache_get (tc_idx); + } +#endif + #define MAX_ITERS 10000 if (++iters >= MAX_ITERS) break; } +#if USE_TCACHE + /* If all the small chunks we found ended up cached, return one now. */ + if (return_cached) + { + return tcache_get (tc_idx); + } +#endif + /* If a large request, scan through the chunks of current bin in sorted order to find smallest that fits. Use the skip list for this. @@ -3884,6 +4170,20 @@ _int_free (mstate av, mchunkptr p, int have_lock) check_inuse_chunk(av, p); +#if USE_TCACHE + { + size_t tc_idx = csize2tidx (size); + + if (tcache + && tc_idx < mp_.tcache_bins + && tcache->counts[tc_idx] < mp_.tcache_count) + { + tcache_put (p, tc_idx); + return; + } + } +#endif + /* If eligible, place chunk on a fastbin so it can be found and used quickly in malloc. @@ -4845,6 +5145,38 @@ do_set_arena_max (size_t value) return 1; } +#if USE_TCACHE +static inline int +__always_inline +do_set_tcache_max (size_t value) +{ + if (value >= 0 && value <= MAX_TCACHE_SIZE) + { + LIBC_PROBE (memory_tunable_tcache_max_bytes, 2, value, mp_.tcache_max_bytes); + mp_.tcache_max_bytes = value; + mp_.tcache_bins = csize2tidx (request2size(value)) + 1; + } + return 1; +} + +static inline int +__always_inline +do_set_tcache_count (size_t value) +{ + LIBC_PROBE (memory_tunable_tcache_count, 2, value, mp_.tcache_count); + mp_.tcache_count = value; + return 1; +} + +static inline int +__always_inline +do_set_tcache_unsorted_limit (size_t value) +{ + LIBC_PROBE (memory_tunable_tcache_unsorted_limit, 2, value, mp_.tcache_unsorted_limit); + mp_.tcache_unsorted_limit = value; + return 1; +} +#endif int __libc_mallopt (int param_number, int value) -- cgit 1.4.1