diff options
author | Szabolcs Nagy <szabolcs.nagy@arm.com> | 2021-02-16 12:55:13 +0000 |
---|---|---|
committer | Szabolcs Nagy <szabolcs.nagy@arm.com> | 2021-04-13 08:43:40 +0100 |
commit | b116855de71098ef7dd2875dd3237f8f3ecc12c2 (patch) | |
tree | 6da94b9ca548a5042d743626fdbe1990fa5221a2 | |
parent | f8ea2b9982e39fd950d157f5dba31121ceb51df3 (diff) | |
download | glibc-nsz/bug19329-v2.tar.gz glibc-nsz/bug19329-v2.tar.xz glibc-nsz/bug19329-v2.zip |
RFC elf: Fix slow tls access after dlopen [BZ #19924] nsz/bug19329-v2
In short: __tls_get_addr checks the global generation counter, _dl_update_slotinfo updates up to the generation of the accessed module. If the global generation is newer than geneneration of the module then __tls_get_addr keeps hitting the slow path that updates the dtv. Possible approaches i can see: 1. update to global generation instead of module, 2. check the module generation in the fast path. This patch is 1.: it needs additional sync (load acquire) so the slotinfo list is up to date with the observed global generation. Approach 2. would require walking the slotinfo list at all times. I don't know how to make that fast with many modules. Note: in the x86_64 version of dl-tls.c the generation is only loaded once, since relaxed mo is not faster than acquire mo load. I have not benchmarked this yet.
-rw-r--r-- | elf/dl-close.c | 2 | ||||
-rw-r--r-- | elf/dl-open.c | 8 | ||||
-rw-r--r-- | elf/dl-reloc.c | 5 | ||||
-rw-r--r-- | elf/dl-tls.c | 28 | ||||
-rw-r--r-- | sysdeps/generic/ldsodefs.h | 3 | ||||
-rw-r--r-- | sysdeps/x86_64/dl-tls.c | 4 |
6 files changed, 23 insertions, 27 deletions
diff --git a/elf/dl-close.c b/elf/dl-close.c index 9f31532f41..45f8a7fe31 100644 --- a/elf/dl-close.c +++ b/elf/dl-close.c @@ -780,7 +780,7 @@ _dl_close_worker (struct link_map *map, bool force) if (__glibc_unlikely (newgen == 0)) _dl_fatal_printf ("TLS generation counter wrapped! Please report as described in "REPORT_BUGS_TO".\n"); /* Can be read concurrently. */ - atomic_store_relaxed (&GL(dl_tls_generation), newgen); + atomic_store_release (&GL(dl_tls_generation), newgen); if (tls_free_end == GL(dl_tls_static_used)) GL(dl_tls_static_used) = tls_free_start; diff --git a/elf/dl-open.c b/elf/dl-open.c index 661f26977e..5b9816e4e8 100644 --- a/elf/dl-open.c +++ b/elf/dl-open.c @@ -400,7 +400,7 @@ update_tls_slotinfo (struct link_map *new) _dl_fatal_printf (N_("\ TLS generation counter wrapped! Please report this.")); /* Can be read concurrently. */ - atomic_store_relaxed (&GL(dl_tls_generation), newgen); + atomic_store_release (&GL(dl_tls_generation), newgen); /* We need a second pass for static tls data, because _dl_update_slotinfo must not be run while calls to @@ -417,8 +417,8 @@ TLS generation counter wrapped! Please report this.")); now, but we can delay updating the DTV. */ imap->l_need_tls_init = 0; #ifdef SHARED - /* Update the slot information data for at least the - generation of the DSO we are allocating data for. */ + /* Update the slot information data for the current + generation. */ /* FIXME: This can terminate the process on memory allocation failure. It is not possible to raise @@ -426,7 +426,7 @@ TLS generation counter wrapped! Please report this.")); _dl_update_slotinfo would have to be split into two operations, similar to resize_scopes and update_scopes above. This is related to bug 16134. */ - _dl_update_slotinfo (imap->l_tls_modid); + _dl_update_slotinfo (imap->l_tls_modid, newgen); #endif GL(dl_init_static_tls) (imap); diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c index c2df26deea..427669d769 100644 --- a/elf/dl-reloc.c +++ b/elf/dl-reloc.c @@ -111,11 +111,10 @@ _dl_try_allocate_static_tls (struct link_map *map, bool optional) if (map->l_real->l_relocated) { #ifdef SHARED +// TODO: it is not clear why we need to update the DTV here, add comment if (__builtin_expect (THREAD_DTV()[0].counter != GL(dl_tls_generation), 0)) - /* Update the slot information data for at least the generation of - the DSO we are allocating data for. */ - (void) _dl_update_slotinfo (map->l_tls_modid); + (void) _dl_update_slotinfo (map->l_tls_modid, GL(dl_tls_generation)); #endif GL(dl_init_static_tls) (map); diff --git a/elf/dl-tls.c b/elf/dl-tls.c index b0257185e9..b51a4f3a19 100644 --- a/elf/dl-tls.c +++ b/elf/dl-tls.c @@ -701,7 +701,7 @@ allocate_and_init (struct link_map *map) struct link_map * -_dl_update_slotinfo (unsigned long int req_modid) +_dl_update_slotinfo (unsigned long int req_modid, size_t new_gen) { struct link_map *the_map = NULL; dtv_t *dtv = THREAD_DTV (); @@ -718,19 +718,12 @@ _dl_update_slotinfo (unsigned long int req_modid) code and therefore add to the slotinfo list. This is a problem since we must not pick up any information about incomplete work. The solution to this is to ignore all dtv slots which were - created after the one we are currently interested. We know that - dynamic loading for this module is completed and this is the last - load operation we know finished. */ - unsigned long int idx = req_modid; + created after the generation we are interested in. We know that + dynamic loading for this generation is completed and this is the + last load operation we know finished. */ struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list); - while (idx >= listp->len) - { - idx -= listp->len; - listp = listp->next; - } - - if (dtv[0].counter < listp->slotinfo[idx].gen) + if (dtv[0].counter < new_gen) { /* CONCURRENCY NOTES: @@ -751,7 +744,6 @@ _dl_update_slotinfo (unsigned long int req_modid) other entries are racy. However updating a non-relevant dtv entry does not affect correctness. For a relevant module m, max_modid >= modid of m. */ - size_t new_gen = listp->slotinfo[idx].gen; size_t total = 0; size_t max_modid = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx)); assert (max_modid >= req_modid); @@ -894,9 +886,9 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t *dtv, struct link_map *the_map) static struct link_map * __attribute_noinline__ -update_get_addr (GET_ADDR_ARGS) +update_get_addr (GET_ADDR_ARGS, size_t gen) { - struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE); + struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE, gen); dtv_t *dtv = THREAD_DTV (); void *p = dtv[GET_ADDR_MODULE].pointer.val; @@ -931,7 +923,11 @@ __tls_get_addr (GET_ADDR_ARGS) by user code, see CONCURRENCY NOTES in _dl_update_slotinfo. */ size_t gen = atomic_load_relaxed (&GL(dl_tls_generation)); if (__glibc_unlikely (dtv[0].counter != gen)) - return update_get_addr (GET_ADDR_PARAM); + { +// TODO: needs comment update if we rely on consistent generation with slotinfo + gen = atomic_load_acquire (&GL(dl_tls_generation)); + return update_get_addr (GET_ADDR_PARAM, gen); + } void *p = dtv[GET_ADDR_MODULE].pointer.val; diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h index ea3f7a69d0..614463f016 100644 --- a/sysdeps/generic/ldsodefs.h +++ b/sysdeps/generic/ldsodefs.h @@ -1224,7 +1224,8 @@ extern void _dl_add_to_slotinfo (struct link_map *l, bool do_add) /* Update slot information data for at least the generation of the module with the given index. */ -extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid) +extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid, + size_t gen) attribute_hidden; /* Look up the module's TLS block as for __tls_get_addr, diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c index 24ef560b71..4ded8dd6b9 100644 --- a/sysdeps/x86_64/dl-tls.c +++ b/sysdeps/x86_64/dl-tls.c @@ -40,9 +40,9 @@ __tls_get_addr_slow (GET_ADDR_ARGS) { dtv_t *dtv = THREAD_DTV (); - size_t gen = atomic_load_relaxed (&GL(dl_tls_generation)); + size_t gen = atomic_load_acquire (&GL(dl_tls_generation)); if (__glibc_unlikely (dtv[0].counter != gen)) - return update_get_addr (GET_ADDR_PARAM); + return update_get_addr (GET_ADDR_PARAM, gen); return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL); } |