From b116855de71098ef7dd2875dd3237f8f3ecc12c2 Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Tue, 16 Feb 2021 12:55:13 +0000 Subject: RFC elf: Fix slow tls access after dlopen [BZ #19924] In short: __tls_get_addr checks the global generation counter, _dl_update_slotinfo updates up to the generation of the accessed module. If the global generation is newer than the generation of the module then __tls_get_addr keeps hitting the slow path that updates the dtv. Possible approaches I can see: 1. update to global generation instead of module, 2. check the module generation in the fast path. This patch implements approach 1: it needs additional sync (load acquire) so the slotinfo list is up to date with the observed global generation. Approach 2. would require walking the slotinfo list at all times. I don't know how to make that fast with many modules. Note: in the x86_64 version of dl-tls.c the generation is only loaded once, since relaxed mo is not faster than acquire mo load. I have not benchmarked this yet. --- elf/dl-tls.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'elf/dl-tls.c') diff --git a/elf/dl-tls.c b/elf/dl-tls.c index b0257185e9..b51a4f3a19 100644 --- a/elf/dl-tls.c +++ b/elf/dl-tls.c @@ -701,7 +701,7 @@ allocate_and_init (struct link_map *map) struct link_map * -_dl_update_slotinfo (unsigned long int req_modid) +_dl_update_slotinfo (unsigned long int req_modid, size_t new_gen) { struct link_map *the_map = NULL; dtv_t *dtv = THREAD_DTV (); @@ -718,19 +718,12 @@ _dl_update_slotinfo (unsigned long int req_modid) code and therefore add to the slotinfo list. This is a problem since we must not pick up any information about incomplete work. The solution to this is to ignore all dtv slots which were - created after the one we are currently interested. We know that - dynamic loading for this module is completed and this is the last - load operation we know finished.
*/ - unsigned long int idx = req_modid; + created after the generation we are interested in. We know that + dynamic loading for this generation is completed and this is the + last load operation we know finished. */ struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list); - while (idx >= listp->len) - { - idx -= listp->len; - listp = listp->next; - } - - if (dtv[0].counter < listp->slotinfo[idx].gen) + if (dtv[0].counter < new_gen) { /* CONCURRENCY NOTES: @@ -751,7 +744,6 @@ _dl_update_slotinfo (unsigned long int req_modid) other entries are racy. However updating a non-relevant dtv entry does not affect correctness. For a relevant module m, max_modid >= modid of m. */ - size_t new_gen = listp->slotinfo[idx].gen; size_t total = 0; size_t max_modid = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx)); assert (max_modid >= req_modid); @@ -894,9 +886,9 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t *dtv, struct link_map *the_map) static struct link_map * __attribute_noinline__ -update_get_addr (GET_ADDR_ARGS) +update_get_addr (GET_ADDR_ARGS, size_t gen) { - struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE); + struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE, gen); dtv_t *dtv = THREAD_DTV (); void *p = dtv[GET_ADDR_MODULE].pointer.val; @@ -931,7 +923,11 @@ __tls_get_addr (GET_ADDR_ARGS) by user code, see CONCURRENCY NOTES in _dl_update_slotinfo. */ size_t gen = atomic_load_relaxed (&GL(dl_tls_generation)); if (__glibc_unlikely (dtv[0].counter != gen)) - return update_get_addr (GET_ADDR_PARAM); + { +// TODO: needs comment update if we rely on consistent generation with slotinfo + gen = atomic_load_acquire (&GL(dl_tls_generation)); + return update_get_addr (GET_ADDR_PARAM, gen); + } void *p = dtv[GET_ADDR_MODULE].pointer.val; -- cgit 1.4.1