track all live threads in an AS-safe, fully-consistent linked list
[musl] / ldso / dynlink.c
index ad49cac..ec921df 100644 (file)
@@ -20,6 +20,7 @@
 #include "pthread_impl.h"
 #include "libc.h"
 #include "dynlink.h"
+#include "malloc_impl.h"
 
 static void error(const char *, ...);
 
@@ -52,17 +53,17 @@ struct dso {
        Phdr *phdr;
        int phnum;
        size_t phentsize;
-       int refcnt;
        Sym *syms;
        Elf_Symndx *hashtab;
        uint32_t *ghashtab;
        int16_t *versym;
        char *strings;
+       struct dso *syms_next, *lazy_next;
+       size_t *lazy, lazy_cnt;
        unsigned char *map;
        size_t map_len;
        dev_t dev;
        ino_t ino;
-       signed char global;
        char relocated;
        char constructed;
        char kernel_mapped;
@@ -71,7 +72,7 @@ struct dso {
        struct tls_module tls;
        size_t tls_id;
        size_t relro_start, relro_end;
-       void **new_dtv;
+       uintptr_t *new_dtv;
        unsigned char *new_tls;
        volatile int new_dtv_idx, new_tls_idx;
        struct td_index *td_index;
@@ -95,13 +96,6 @@ struct symdef {
        struct dso *dso;
 };
 
-int __init_tp(void *);
-void __init_libc(char **, char *);
-void *__copy_tls(unsigned char *);
-
-__attribute__((__visibility__("hidden")))
-const char *__libc_get_version(void);
-
 static struct builtin_tls {
        char c;
        struct pthread pt;
@@ -113,7 +107,7 @@ static struct builtin_tls {
 static size_t *saved_addends, *apply_addends_to;
 
 static struct dso ldso;
-static struct dso *head, *tail, *fini_head;
+static struct dso *head, *tail, *fini_head, *syms_tail, *lazy_head;
 static char *env_path, *sys_path;
 static unsigned long long gencnt;
 static int runtime;
@@ -129,14 +123,15 @@ static size_t static_tls_cnt;
 static pthread_mutex_t init_fini_lock = { ._m_type = PTHREAD_MUTEX_RECURSIVE };
 static struct fdpic_loadmap *app_loadmap;
 static struct fdpic_dummy_loadmap app_dummy_loadmap;
+static struct dso *const nodeps_dummy;
 
 struct debug *_dl_debug_addr = &debug;
 
-__attribute__((__visibility__("hidden")))
-void (*const __init_array_start)(void)=0, (*const __fini_array_start)(void)=0;
+extern hidden int __malloc_replaced;
+
+hidden void (*const __init_array_start)(void)=0, (*const __fini_array_start)(void)=0;
 
-__attribute__((__visibility__("hidden")))
-extern void (*const __init_array_end)(void), (*const __fini_array_end)(void);
+extern hidden void (*const __init_array_end)(void), (*const __fini_array_end)(void);
 
 weak_alias(__init_array_start, __init_array_end);
 weak_alias(__fini_array_start, __fini_array_end);
@@ -157,10 +152,26 @@ static void *laddr(const struct dso *p, size_t v)
        for (j=0; v-p->loadmap->segs[j].p_vaddr >= p->loadmap->segs[j].p_memsz; j++);
        return (void *)(v - p->loadmap->segs[j].p_vaddr + p->loadmap->segs[j].addr);
 }
+static void *laddr_pg(const struct dso *p, size_t v) /* translate vaddr v of p to a runtime address, matching loadmap segments by page-rounded bounds */
+{
+       size_t j=0;
+       size_t pgsz = PAGE_SIZE;
+       if (!p->loadmap) return p->base + v; /* no loadmap: plain base-relative address */
+       for (j=0; ; j++) { /* no bound on j: the loop only ends when a segment matches */
+               size_t a = p->loadmap->segs[j].p_vaddr;
+               size_t b = a + p->loadmap->segs[j].p_memsz;
+               a &= -pgsz; /* round segment start down (assumes pgsz is a power of two) */
+               b += pgsz-1; /* together with the mask below: round segment end up */
+               b &= -pgsz;
+               if (v-a<b-a) break; /* unsigned compare: true iff a <= v < b */
+       }
+       return (void *)(v - p->loadmap->segs[j].p_vaddr + p->loadmap->segs[j].addr);
+}
 #define fpaddr(p, v) ((void (*)())&(struct funcdesc){ \
        laddr(p, v), (p)->got })
 #else
 #define laddr(p, v) (void *)((p)->base + (v))
+#define laddr_pg(p, v) laddr(p, v)
 #define fpaddr(p, v) ((void (*)())laddr(p, v))
 #endif
 
@@ -258,19 +269,12 @@ static Sym *gnu_lookup_filtered(uint32_t h1, uint32_t *hashtab, struct dso *dso,
 
 static struct symdef find_sym(struct dso *dso, const char *s, int need_def)
 {
-       uint32_t h = 0, gh, gho, *ght;
-       size_t ghm = 0;
+       uint32_t h = 0, gh = gnu_hash(s), gho = gh / (8*sizeof(size_t)), *ght;
+       size_t ghm = 1ul << gh % (8*sizeof(size_t));
        struct symdef def = {0};
-       for (; dso; dso=dso->next) {
+       for (; dso; dso=dso->syms_next) {
                Sym *sym;
-               if (!dso->global) continue;
                if ((ght = dso->ghashtab)) {
-                       if (!ghm) {
-                               gh = gnu_hash(s);
-                               int maskbits = 8 * sizeof ghm;
-                               gho = gh / maskbits;
-                               ghm = 1ul << gh % maskbits;
-                       }
                        sym = gnu_lookup_filtered(gh, ght, dso, s, gho, ghm);
                } else {
                        if (!h) h = sysv_hash(s);
@@ -286,18 +290,13 @@ static struct symdef find_sym(struct dso *dso, const char *s, int need_def)
                                continue;
                if (!(1<<(sym->st_info&0xf) & OK_TYPES)) continue;
                if (!(1<<(sym->st_info>>4) & OK_BINDS)) continue;
-
-               if (def.sym && sym->st_info>>4 == STB_WEAK) continue;
                def.sym = sym;
                def.dso = dso;
-               if (sym->st_info>>4 == STB_GLOBAL) break;
+               break;
        }
        return def;
 }
 
-__attribute__((__visibility__("hidden")))
-ptrdiff_t __tlsdesc_static(), __tlsdesc_dynamic();
-
 static void do_relocs(struct dso *dso, size_t *rel, size_t rel_size, size_t stride)
 {
        unsigned char *base = dso->base;
@@ -326,17 +325,40 @@ static void do_relocs(struct dso *dso, size_t *rel, size_t rel_size, size_t stri
                if (skip_relative && IS_RELATIVE(rel[1], dso->syms)) continue;
                type = R_TYPE(rel[1]);
                if (type == REL_NONE) continue;
-               sym_index = R_SYM(rel[1]);
                reloc_addr = laddr(dso, rel[0]);
+
+               if (stride > 2) {
+                       addend = rel[2];
+               } else if (type==REL_GOT || type==REL_PLT|| type==REL_COPY) {
+                       addend = 0;
+               } else if (reuse_addends) {
+                       /* Save original addend in stage 2 where the dso
+                        * chain consists of just ldso; otherwise read back
+                        * saved addend since the inline one was clobbered. */
+                       if (head==&ldso)
+                               saved_addends[save_slot] = *reloc_addr;
+                       addend = saved_addends[save_slot++];
+               } else {
+                       addend = *reloc_addr;
+               }
+
+               sym_index = R_SYM(rel[1]);
                if (sym_index) {
                        sym = syms + sym_index;
                        name = strings + sym->st_name;
-                       ctx = type==REL_COPY ? head->next : head;
+                       ctx = type==REL_COPY ? head->syms_next : head;
                        def = (sym->st_info&0xf) == STT_SECTION
                                ? (struct symdef){ .dso = dso, .sym = sym }
                                : find_sym(ctx, name, type==REL_PLT);
                        if (!def.sym && (sym->st_shndx != SHN_UNDEF
                            || sym->st_info>>4 != STB_WEAK)) {
+                               if (dso->lazy && (type==REL_PLT || type==REL_GOT)) {
+                                       dso->lazy[3*dso->lazy_cnt+0] = rel[0];
+                                       dso->lazy[3*dso->lazy_cnt+1] = rel[1];
+                                       dso->lazy[3*dso->lazy_cnt+2] = addend;
+                                       dso->lazy_cnt++;
+                                       continue;
+                               }
                                error("Error relocating %s: %s: symbol not found",
                                        dso->name, name);
                                if (runtime) longjmp(*rtld_fail, 1);
@@ -348,24 +370,17 @@ static void do_relocs(struct dso *dso, size_t *rel, size_t rel_size, size_t stri
                        def.dso = dso;
                }
 
-               if (stride > 2) {
-                       addend = rel[2];
-               } else if (type==REL_GOT || type==REL_PLT|| type==REL_COPY) {
-                       addend = 0;
-               } else if (reuse_addends) {
-                       /* Save original addend in stage 2 where the dso
-                        * chain consists of just ldso; otherwise read back
-                        * saved addend since the inline one was clobbered. */
-                       if (head==&ldso)
-                               saved_addends[save_slot] = *reloc_addr;
-                       addend = saved_addends[save_slot++];
-               } else {
-                       addend = *reloc_addr;
-               }
-
                sym_val = def.sym ? (size_t)laddr(def.dso, def.sym->st_value) : 0;
                tls_val = def.sym ? def.sym->st_value : 0;
 
+               if ((type == REL_TPOFF || type == REL_TPOFF_NEG)
+                   && runtime && def.dso->tls_id > static_tls_cnt) {
+                       error("Error relocating %s: %s: initial-exec TLS "
+                               "resolves to dynamic definition in %s",
+                               dso->name, name, def.dso->name);
+                       longjmp(*rtld_fail, 1);
+               }
+
                switch(type) {
                case REL_NONE:
                        break;
@@ -419,7 +434,7 @@ static void do_relocs(struct dso *dso, size_t *rel, size_t rel_size, size_t stri
 #endif
                case REL_TLSDESC:
                        if (stride<3) addend = reloc_addr[1];
-                       if (runtime && def.dso->tls_id >= static_tls_cnt) {
+                       if (runtime && def.dso->tls_id > static_tls_cnt) {
                                struct td_index *new = malloc(sizeof *new);
                                if (!new) {
                                        error(
@@ -430,7 +445,7 @@ static void do_relocs(struct dso *dso, size_t *rel, size_t rel_size, size_t stri
                                new->next = dso->td_index;
                                dso->td_index = new;
                                new->args[0] = def.dso->tls_id;
-                               new->args[1] = tls_val + addend;
+                               new->args[1] = tls_val + addend - DTP_OFFSET;
                                reloc_addr[0] = (size_t)__tlsdesc_dynamic;
                                reloc_addr[1] = (size_t)new;
                        } else {
@@ -443,6 +458,13 @@ static void do_relocs(struct dso *dso, size_t *rel, size_t rel_size, size_t stri
                                        + addend;
 #endif
                        }
+#ifdef TLSDESC_BACKWARDS
+                       /* Some archs (32-bit ARM at least) invert the order of
+                        * the descriptor members. Fix them up here. */
+                       size_t tmp = reloc_addr[0];
+                       reloc_addr[0] = reloc_addr[1];
+                       reloc_addr[1] = tmp;
+#endif
                        break;
                default:
                        error("Error relocating %s: unsupported relocation type %d",
@@ -453,26 +475,38 @@ static void do_relocs(struct dso *dso, size_t *rel, size_t rel_size, size_t stri
        }
 }
 
+static void redo_lazy_relocs() /* retry the saved relocations of every dso on the lazy_head list */
+{
+       struct dso *p = lazy_head, *next;
+       lazy_head = 0; /* detach the list; dsos with entries still pending are pushed back below */
+       for (; p; p=next) {
+               next = p->lazy_next; /* fetch first: p->lazy_next is overwritten below */
+               size_t size = p->lazy_cnt*3*sizeof(size_t); /* byte size of the table: 3 words per entry */
+               p->lazy_cnt = 0; /* do_relocs re-records entries it still cannot resolve */
+               do_relocs(p, p->lazy, size, 3);
+               if (p->lazy_cnt) {
+                       p->lazy_next = lazy_head;
+                       lazy_head = p;
+               } else { /* nothing left pending: release the table and unlink p */
+                       free(p->lazy);
+                       p->lazy = 0;
+                       p->lazy_next = 0;
+               }
+       }
+}
+
 /* A huge hack: to make up for the wastefulness of shared libraries
  * needing at least a page of dirty memory even if they have no global
  * data, we reclaim the gaps at the beginning and end of writable maps
- * and "donate" them to the heap by setting up minimal malloc
- * structures and then freeing them. */
+ * and "donate" them to the heap. */
 
 static void reclaim(struct dso *dso, size_t start, size_t end)
 {
-       size_t *a, *z;
        if (start >= dso->relro_start && start < dso->relro_end) start = dso->relro_end;
        if (end   >= dso->relro_start && end   < dso->relro_end) end = dso->relro_start;
-       start = start + 6*sizeof(size_t)-1 & -4*sizeof(size_t);
-       end = (end & -4*sizeof(size_t)) - 2*sizeof(size_t);
-       if (start>end || end-start < 4*sizeof(size_t)) return;
-       a = laddr(dso, start);
-       z = laddr(dso, end);
-       a[-2] = 1;
-       a[-1] = z[0] = end-start + 2*sizeof(size_t) | 1;
-       z[1] = 1;
-       free(a);
+       if (start >= end) return; /* empty or inverted range after the relro clamping: nothing to donate */
+       char *base = laddr_pg(dso, start); /* runtime address corresponding to start */
+       __malloc_donate(base, base+(end-start)); /* pass the end-start bytes beginning at base to __malloc_donate */
 }
 
 static void reclaim_gaps(struct dso *dso)
@@ -480,7 +514,6 @@ static void reclaim_gaps(struct dso *dso)
        Phdr *ph = dso->phdr;
        size_t phcnt = dso->phnum;
 
-       if (DL_FDPIC) return; // FIXME
        for (; phcnt--; ph=(void *)((char *)ph+dso->phentsize)) {
                if (ph->p_type!=PT_LOAD) continue;
                if ((ph->p_flags&(PF_R|PF_W))!=(PF_R|PF_W)) continue;
@@ -583,6 +616,12 @@ static void *map_library(int fd, struct dso *dso)
                } else if (ph->p_type == PT_GNU_RELRO) {
                        dso->relro_start = ph->p_vaddr & -PAGE_SIZE;
                        dso->relro_end = (ph->p_vaddr + ph->p_memsz) & -PAGE_SIZE;
+               } else if (ph->p_type == PT_GNU_STACK) {
+                       if (!runtime && ph->p_memsz > __default_stacksize) {
+                               __default_stacksize =
+                                       ph->p_memsz < DEFAULT_STACK_MAX ?
+                                       ph->p_memsz : DEFAULT_STACK_MAX;
+                       }
                }
                if (ph->p_type != PT_LOAD) continue;
                nsegs++;
@@ -675,18 +714,17 @@ static void *map_library(int fd, struct dso *dso)
                        dso->phnum = eh->e_phnum;
                        dso->phentsize = eh->e_phentsize;
                }
-               /* Reuse the existing mapping for the lowest-address LOAD */
-               if ((ph->p_vaddr & -PAGE_SIZE) == addr_min && !DL_NOMMU_SUPPORT)
-                       continue;
                this_min = ph->p_vaddr & -PAGE_SIZE;
                this_max = ph->p_vaddr+ph->p_memsz+PAGE_SIZE-1 & -PAGE_SIZE;
                off_start = ph->p_offset & -PAGE_SIZE;
                prot = (((ph->p_flags&PF_R) ? PROT_READ : 0) |
                        ((ph->p_flags&PF_W) ? PROT_WRITE: 0) |
                        ((ph->p_flags&PF_X) ? PROT_EXEC : 0));
-               if (mmap_fixed(base+this_min, this_max-this_min, prot, MAP_PRIVATE|MAP_FIXED, fd, off_start) == MAP_FAILED)
-                       goto error;
-               if (ph->p_memsz > ph->p_filesz) {
+               /* Reuse the existing mapping for the lowest-address LOAD */
+               if ((ph->p_vaddr & -PAGE_SIZE) != addr_min || DL_NOMMU_SUPPORT)
+                       if (mmap_fixed(base+this_min, this_max-this_min, prot, MAP_PRIVATE|MAP_FIXED, fd, off_start) == MAP_FAILED)
+                               goto error;
+               if (ph->p_memsz > ph->p_filesz && (ph->p_flags&PF_W)) {
                        size_t brk = (size_t)base+ph->p_vaddr+ph->p_filesz;
                        size_t pgbrk = brk+PAGE_SIZE-1 & -PAGE_SIZE;
                        memset((void *)brk, 0, pgbrk-brk & PAGE_SIZE-1);
@@ -705,7 +743,6 @@ done_mapping:
        dso->base = base;
        dso->dynv = laddr(dso, dyn);
        if (dso->tls.size) dso->tls.image = laddr(dso, tls_image);
-       if (!runtime) reclaim_gaps(dso);
        free(allocated_buf);
        return map;
 noexec:
@@ -788,7 +825,19 @@ static int fixup_rpath(struct dso *p, char *buf, size_t buf_size)
                origin = p->name;
        }
        t = strrchr(origin, '/');
-       l = t ? t-origin : 0;
+       if (t) {
+               l = t-origin;
+       } else {
+               /* Normally p->name will always be an absolute or relative
+                * pathname containing at least one '/' character, but in the
+                * case where ldso was invoked as a command to execute a
+                * program in the working directory, app.name may not. Fix. */
+               origin = ".";
+               l = 1;
+       }
+       /* Disallow non-absolute origins for suid/sgid/AT_SECURE. */
+       if (libc.secure && *origin != '/')
+               return 0;
        p->rpath = malloc(strlen(p->rpath_orig) + n*l + 1);
        if (!p->rpath) return -1;
 
@@ -934,7 +983,7 @@ static struct dso *load_library(const char *name, struct dso *needed_by)
                if (!ldso.prev) {
                        tail->next = &ldso;
                        ldso.prev = tail;
-                       tail = ldso.next ? ldso.next : &ldso;
+                       tail = &ldso;
                }
                return &ldso;
        }
@@ -945,7 +994,6 @@ static struct dso *load_library(const char *name, struct dso *needed_by)
                /* Search for the name to see if it's already loaded */
                for (p=head->next; p; p=p->next) {
                        if (p->shortname && !strcmp(p->shortname, name)) {
-                               p->refcnt++;
                                return p;
                        }
                }
@@ -1008,7 +1056,6 @@ static struct dso *load_library(const char *name, struct dso *needed_by)
                        if (!p->shortname && pathname != name)
                                p->shortname = strrchr(p->name, '/')+1;
                        close(fd);
-                       p->refcnt++;
                        return p;
                }
        }
@@ -1016,6 +1063,21 @@ static struct dso *load_library(const char *name, struct dso *needed_by)
        close(fd);
        if (!map) return 0;
 
+       /* Avoid the danger of getting two versions of libc mapped into the
+        * same process when an absolute pathname was used. The symbols
+        * checked are chosen to catch both musl and glibc, and to avoid
+        * false positives from interposition-hack libraries. */
+       decode_dyn(&temp_dso);
+       if (find_sym(&temp_dso, "__libc_start_main", 1).sym &&
+           find_sym(&temp_dso, "stdin", 1).sym) {
+               unmap_library(&temp_dso);
+               return load_library("libc.so", needed_by);
+       }
+       /* Past this point, if we haven't reached runtime yet, ldso has
+        * committed either to use the mapped library or to abort execution.
+        * Unmapping is not possible, so we can safely reclaim gaps. */
+       if (!runtime) reclaim_gaps(&temp_dso);
+
        /* Allocate storage for the new DSO. When there is TLS, this
         * storage must include a reservation for all pre-existing
         * threads to obtain copies of both the new TLS, and an
@@ -1035,10 +1097,8 @@ static struct dso *load_library(const char *name, struct dso *needed_by)
                return 0;
        }
        memcpy(p, &temp_dso, sizeof temp_dso);
-       decode_dyn(p);
        p->dev = st.st_dev;
        p->ino = st.st_ino;
-       p->refcnt = 1;
        p->needed_by = needed_by;
        p->name = p->buf;
        strcpy(p->name, pathname);
@@ -1099,6 +1159,7 @@ static void load_deps(struct dso *p)
                        }
                }
        }
+       if (!*deps) *deps = (struct dso **)&nodeps_dummy;
 }
 
 static void load_preload(char *s)
@@ -1115,9 +1176,24 @@ static void load_preload(char *s)
        }
 }
 
-static void make_global(struct dso *p)
+static void add_syms(struct dso *p) /* append p to the syms_next chain ending at syms_tail, unless p already looks linked */
 {
-       for (; p; p=p->next) p->global = 1;
+       if (!p->syms_next && syms_tail != p) { /* no successor and not the current tail: treated as not yet on the chain */
+               syms_tail->syms_next = p;
+               syms_tail = p;
+       }
+}
+
+static void revert_syms(struct dso *old_tail) /* truncate the syms_next chain so that it ends at old_tail again */
+{
+       struct dso *p, *next;
+       /* Chop off the tail of the list of dsos that participate in
+        * the global symbol table, reverting them to RTLD_LOCAL. */
+       for (p=old_tail; p; p=next) {
+               next = p->syms_next; /* read the link before clearing it */
+               p->syms_next = 0;
+       }
+       syms_tail = old_tail;
 }
 
 static void do_mips_relocs(struct dso *p, size_t *got)
@@ -1175,6 +1251,12 @@ static void kernel_mapped_dso(struct dso *p)
                } else if (ph->p_type == PT_GNU_RELRO) {
                        p->relro_start = ph->p_vaddr & -PAGE_SIZE;
                        p->relro_end = (ph->p_vaddr + ph->p_memsz) & -PAGE_SIZE;
+               } else if (ph->p_type == PT_GNU_STACK) {
+                       if (!runtime && ph->p_memsz > __default_stacksize) {
+                               __default_stacksize =
+                                       ph->p_memsz < DEFAULT_STACK_MAX ?
+                                       ph->p_memsz : DEFAULT_STACK_MAX;
+                       }
                }
                if (ph->p_type != PT_LOAD) continue;
                if (ph->p_vaddr < min_addr)
@@ -1256,17 +1338,16 @@ void __init_tls(size_t *auxv)
 {
 }
 
-__attribute__((__visibility__("hidden")))
-void *__tls_get_new(size_t *v)
+hidden void *__tls_get_new(tls_mod_off_t *v)
 {
        pthread_t self = __pthread_self();
 
        /* Block signals to make accessing new TLS async-signal-safe */
        sigset_t set;
        __block_all_sigs(&set);
-       if (v[0]<=(size_t)self->dtv[0]) {
+       if (v[0] <= self->dtv[0]) {
                __restore_sigs(&set);
-               return (char *)self->dtv[v[0]]+v[1]+DTP_OFFSET;
+               return (void *)(self->dtv[v[0]] + v[1]);
        }
 
        /* This is safe without any locks held because, if the caller
@@ -1276,15 +1357,12 @@ void *__tls_get_new(size_t *v)
        struct dso *p;
        for (p=head; p->tls_id != v[0]; p=p->next);
 
-       /* Get new DTV space from new DSO if needed */
-       if (v[0] > (size_t)self->dtv[0]) {
-               void **newdtv = p->new_dtv +
-                       (v[0]+1)*a_fetch_add(&p->new_dtv_idx,1);
-               memcpy(newdtv, self->dtv,
-                       ((size_t)self->dtv[0]+1) * sizeof(void *));
-               newdtv[0] = (void *)v[0];
-               self->dtv = self->dtv_copy = newdtv;
-       }
+       /* Get new DTV space from new DSO */
+       uintptr_t *newdtv = p->new_dtv +
+               (v[0]+1)*a_fetch_add(&p->new_dtv_idx,1);
+       memcpy(newdtv, self->dtv, (self->dtv[0]+1) * sizeof(uintptr_t));
+       newdtv[0] = v[0];
+       self->dtv = self->dtv_copy = newdtv;
 
        /* Get new TLS memory from all new DSOs up to the requested one */
        unsigned char *mem;
@@ -1294,7 +1372,7 @@ void *__tls_get_new(size_t *v)
                        * a_fetch_add(&p->new_tls_idx,1);
                mem += ((uintptr_t)p->tls.image - (uintptr_t)mem)
                        & (p->tls.align-1);
-               self->dtv[p->tls_id] = mem;
+               self->dtv[p->tls_id] = (uintptr_t)mem + DTP_OFFSET;
                memcpy(mem, p->tls.image, p->tls.len);
                if (p->tls_id == v[0]) break;
        }
@@ -1325,15 +1403,15 @@ static void update_tls_size()
  * linker itself, but some of the relocations performed may need to be
  * replaced later due to copy relocations in the main program. */
 
-__attribute__((__visibility__("hidden")))
-void __dls2(unsigned char *base, size_t *sp)
+hidden void __dls2(unsigned char *base, size_t *sp)
 {
        if (DL_FDPIC) {
                void *p1 = (void *)sp[-2];
                void *p2 = (void *)sp[-1];
                if (!p1) {
                        size_t *auxv, aux[AUX_CNT];
-                       for (auxv=sp+1+*sp+1; *auxv; auxv++); auxv++;
+                       for (auxv=sp+1+*sp+1; *auxv; auxv++);
+                       auxv++;
                        decode_vec(auxv, aux, AUX_CNT);
                        if (aux[AT_BASE]) ldso.base = (void *)aux[AT_BASE];
                        else ldso.base = (void *)(aux[AT_PHDR] & -4096);
@@ -1346,7 +1424,6 @@ void __dls2(unsigned char *base, size_t *sp)
        }
        Ehdr *ehdr = (void *)ldso.base;
        ldso.name = ldso.shortname = "libc.so";
-       ldso.global = 1;
        ldso.phnum = ehdr->e_phnum;
        ldso.phdr = laddr(&ldso, ehdr->e_phoff);
        ldso.phentsize = ehdr->e_phentsize;
@@ -1376,9 +1453,31 @@ void __dls2(unsigned char *base, size_t *sp)
 
        ldso.relocated = 0;
 
-       /* Call dynamic linker stage-3, __dls3, looking it up
+       /* Call dynamic linker stage-2b, __dls2b, looking it up
         * symbolically as a barrier against moving the address
         * load across the above relocation processing. */
+       struct symdef dls2b_def = find_sym(&ldso, "__dls2b", 0);
+       if (DL_FDPIC) ((stage3_func)&ldso.funcdescs[dls2b_def.sym-ldso.syms])(sp);
+       else ((stage3_func)laddr(&ldso, dls2b_def.sym->st_value))(sp);
+}
+
+/* Stage 2b sets up a valid thread pointer, which requires relocations
+ * completed in stage 2, and on which stage 3 is permitted to depend.
+ * This is done as a separate stage, with symbolic lookup as a barrier,
+ * so that loads of the thread pointer and &errno can be pure/const and
+ * thereby hoistable. */
+
+_Noreturn void __dls2b(size_t *sp)
+{
+       /* Setup early thread pointer in builtin_tls for ldso/libc itself to
+        * use during dynamic linking. If possible it will also serve as the
+        * thread pointer at runtime. */
+       libc.tls_size = sizeof builtin_tls;
+       libc.tls_align = tls_align;
+       if (__init_tp(__copy_tls((void *)builtin_tls)) < 0) {
+               a_crash();
+       }
+
        struct symdef dls3_def = find_sym(&ldso, "__dls3", 0);
        if (DL_FDPIC) ((stage3_func)&ldso.funcdescs[dls3_def.sym-ldso.syms])(sp);
        else ((stage3_func)laddr(&ldso, dls3_def.sym->st_value))(sp);
@@ -1395,6 +1494,7 @@ _Noreturn void __dls3(size_t *sp)
        size_t aux[AUX_CNT], *auxv;
        size_t i;
        char *env_preload=0;
+       char *replace_argv0=0;
        size_t vdso_base;
        int argc = *sp;
        char **argv = (void *)(sp+1);
@@ -1412,15 +1512,6 @@ _Noreturn void __dls3(size_t *sp)
        libc.secure = ((aux[0]&0x7800)!=0x7800 || aux[AT_UID]!=aux[AT_EUID]
                || aux[AT_GID]!=aux[AT_EGID] || aux[AT_SECURE]);
 
-       /* Setup early thread pointer in builtin_tls for ldso/libc itself to
-        * use during dynamic linking. If possible it will also serve as the
-        * thread pointer at runtime. */
-       libc.tls_size = sizeof builtin_tls;
-       libc.tls_align = tls_align;
-       if (__init_tp(__copy_tls((void *)builtin_tls)) < 0) {
-               a_crash();
-       }
-
        /* Only trust user/env if kernel says we're not suid/sgid */
        if (!libc.secure) {
                env_path = getenv("LD_LIBRARY_PATH");
@@ -1479,6 +1570,10 @@ _Noreturn void __dls3(size_t *sp)
                                if (opt[7]=='=') env_preload = opt+8;
                                else if (opt[7]) *argv = 0;
                                else if (*argv) env_preload = *argv++;
+                       } else if (!memcmp(opt, "argv0", 5)) {
+                               if (opt[5]=='=') replace_argv0 = opt+6;
+                               else if (opt[5]) *argv = 0;
+                               else if (*argv) replace_argv0 = *argv++;
                        } else {
                                argv[0] = 0;
                        }
@@ -1489,7 +1584,7 @@ _Noreturn void __dls3(size_t *sp)
                                "Version %s\n"
                                "Dynamic Program Loader\n"
                                "Usage: %s [options] [--] pathname%s\n",
-                               __libc_get_version(), ldname,
+                               __libc_version, ldname,
                                ldd_mode ? "" : " [args]");
                        _exit(1);
                }
@@ -1498,13 +1593,11 @@ _Noreturn void __dls3(size_t *sp)
                        dprintf(2, "%s: cannot load %s: %s\n", ldname, argv[0], strerror(errno));
                        _exit(1);
                }
-               runtime = 1;
                Ehdr *ehdr = (void *)map_library(fd, &app);
                if (!ehdr) {
                        dprintf(2, "%s: %s: Not a valid dynamic program\n", ldname, argv[0]);
                        _exit(1);
                }
-               runtime = 0;
                close(fd);
                ldso.name = ldname;
                app.name = argv[0];
@@ -1523,8 +1616,9 @@ _Noreturn void __dls3(size_t *sp)
                libc.tls_head = tls_tail = &app.tls;
                app.tls_id = tls_cnt = 1;
 #ifdef TLS_ABOVE_TP
-               app.tls.offset = 0;
-               tls_offset = app.tls.size
+               app.tls.offset = GAP_ABOVE_TP;
+               app.tls.offset += -GAP_ABOVE_TP & (app.tls.align-1);
+               tls_offset = app.tls.offset + app.tls.size
                        + ( -((uintptr_t)app.tls.image + app.tls.size)
                        & (app.tls.align-1) );
 #else
@@ -1534,7 +1628,6 @@ _Noreturn void __dls3(size_t *sp)
 #endif
                tls_align = MAXP2(tls_align, app.tls.align);
        }
-       app.global = 1;
        decode_dyn(&app);
        if (DL_FDPIC) {
                makefuncdescs(&app);
@@ -1549,8 +1642,22 @@ _Noreturn void __dls3(size_t *sp)
                argv[-3] = (void *)app.loadmap;
        }
 
-       /* Attach to vdso, if provided by the kernel */
-       if (search_vec(auxv, &vdso_base, AT_SYSINFO_EHDR)) {
+       /* Initial dso chain consists only of the app. */
+       head = tail = syms_tail = &app;
+
+       /* Donate unused parts of app and library mapping to malloc */
+       reclaim_gaps(&app);
+       reclaim_gaps(&ldso);
+
+       /* Load preload/needed libraries, add symbols to global namespace. */
+       if (env_preload) load_preload(env_preload);
+       load_deps(&app);
+       for (struct dso *p=head; p; p=p->next)
+               add_syms(p);
+
+       /* Attach to vdso, if provided by the kernel, last so that it does
+        * not become part of the global namespace.  */
+       if (search_vec(auxv, &vdso_base, AT_SYSINFO_EHDR) && vdso_base) {
                Ehdr *ehdr = (void *)vdso_base;
                Phdr *phdr = vdso.phdr = (void *)(vdso_base + ehdr->e_phoff);
                vdso.phnum = ehdr->e_phnum;
@@ -1563,26 +1670,13 @@ _Noreturn void __dls3(size_t *sp)
                }
                vdso.name = "";
                vdso.shortname = "linux-gate.so.1";
-               vdso.global = 1;
                vdso.relocated = 1;
                decode_dyn(&vdso);
-               vdso.prev = &ldso;
-               ldso.next = &vdso;
+               vdso.prev = tail;
+               tail->next = &vdso;
+               tail = &vdso;
        }
 
-       /* Initial dso chain consists only of the app. */
-       head = tail = &app;
-
-       /* Donate unused parts of app and library mapping to malloc */
-       reclaim_gaps(&app);
-       reclaim_gaps(&ldso);
-
-       /* Load preload/needed libraries, add their symbols to the global
-        * namespace, and perform all remaining relocations. */
-       if (env_preload) load_preload(env_preload);
-       load_deps(&app);
-       make_global(&app);
-
        for (i=0; app.dynv[i]; i+=2) {
                if (!DT_DEBUG_INDIRECT && app.dynv[i]==DT_DEBUG)
                        app.dynv[i+1] = (size_t)&debug;
@@ -1623,6 +1717,12 @@ _Noreturn void __dls3(size_t *sp)
        if (ldso_fail) _exit(127);
        if (ldd_mode) _exit(0);
 
+       /* Determine if malloc was interposed by a replacement implementation
+        * so that calloc and the memalign family can harden against the
+        * possibility of incomplete replacement. */
+       if (find_sym(head, "malloc", 1).dso != &ldso)
+               __malloc_replaced = 1;
+
        /* Switch to runtime mode: any further failures in the dynamic
         * linker are a reportable failure rather than a fatal startup
         * error. */
@@ -1635,15 +1735,39 @@ _Noreturn void __dls3(size_t *sp)
        debug.state = 0;
        _dl_debug_state();
 
+       if (replace_argv0) argv[0] = replace_argv0;
+
        errno = 0;
 
        CRTJMP((void *)aux[AT_ENTRY], argv-1);
        for(;;);
 }
 
+static void prepare_lazy(struct dso *p) /* Reserve p's lazy-relocation table and queue p on the lazy_head list, unless p asks for immediate binding. */
+{
+       size_t dyn[DYN_CNT], n, flags1=0; /* flags1 starts at 0; presumably left untouched when DT_FLAGS_1 is absent -- confirm in search_vec */
+       decode_vec(p->dynv, dyn, DYN_CNT); /* presumably indexes p->dynv entries by tag into dyn[] -- confirm in decode_vec */
+       search_vec(p->dynv, &flags1, DT_FLAGS_1);
+       if (dyn[DT_BIND_NOW] || (dyn[DT_FLAGS] & DF_BIND_NOW) || (flags1 & DF_1_NOW)) /* any bind-now marker set: p gets no lazy table and is not queued */
+               return;
+       n = dyn[DT_RELSZ]/2 + dyn[DT_RELASZ]/3 + dyn[DT_PLTRELSZ]/2 + 1; /* record count to reserve; NOTE(review): looks like a deliberate over-estimate derived from the REL/RELA/PLT table sizes -- confirm */
+       if (NEED_MIPS_GOT_RELOCS) {
+               size_t j=0; search_vec(p->dynv, &j, DT_MIPS_GOTSYM);
+               size_t i=0; search_vec(p->dynv, &i, DT_MIPS_SYMTABNO);
+               n += i-j; /* also reserve the DT_MIPS_SYMTABNO - DT_MIPS_GOTSYM span */
+       }
+       p->lazy = calloc(n, 3*sizeof(size_t)); /* zero-filled: n records of three size_t each */
+       if (!p->lazy) {
+               error("Error preparing lazy relocation for %s: %m", p->name);
+               longjmp(*rtld_fail, 1); /* non-local exit through the rtld_fail jump buffer; control does not come back here */
+       }
+       p->lazy_next = lazy_head; /* push p onto the front of the lazy list */
+       lazy_head = p;
+}
+
 void *dlopen(const char *file, int mode)
 {
-       struct dso *volatile p, *orig_tail, *next;
+       struct dso *volatile p, *orig_tail, *orig_syms_tail, *orig_lazy_head, *next;
        struct tls_module *orig_tls_tail;
        size_t orig_tls_cnt, orig_tls_offset, orig_tls_align;
        size_t i;
@@ -1661,15 +1785,15 @@ void *dlopen(const char *file, int mode)
        orig_tls_cnt = tls_cnt;
        orig_tls_offset = tls_offset;
        orig_tls_align = tls_align;
+       orig_lazy_head = lazy_head;
+       orig_syms_tail = syms_tail;
        orig_tail = tail;
        noload = mode & RTLD_NOLOAD;
 
        rtld_fail = &jb;
        if (setjmp(*rtld_fail)) {
                /* Clean up anything new that was (partially) loaded */
-               if (p && p->deps) for (i=0; p->deps[i]; i++)
-                       if (p->deps[i]->global < 0)
-                               p->deps[i]->global = 0;
+               revert_syms(orig_syms_tail);
                for (p=orig_tail->next; p; p=next) {
                        next = p->next;
                        while (p->td_index) {
@@ -1680,15 +1804,18 @@ void *dlopen(const char *file, int mode)
                        free(p->funcdescs);
                        if (p->rpath != p->rpath_orig)
                                free(p->rpath);
-                       free(p->deps);
+                       if (p->deps != &nodeps_dummy)
+                               free(p->deps);
                        unmap_library(p);
                        free(p);
                }
                if (!orig_tls_tail) libc.tls_head = 0;
                tls_tail = orig_tls_tail;
+               if (tls_tail) tls_tail->next = 0;
                tls_cnt = orig_tls_cnt;
                tls_offset = orig_tls_offset;
                tls_align = orig_tls_align;
+               lazy_head = orig_lazy_head;
                tail = orig_tail;
                tail->next = 0;
                p = 0;
@@ -1704,24 +1831,37 @@ void *dlopen(const char *file, int mode)
        }
 
        /* First load handling */
-       if (!p->deps) {
+       int first_load = !p->deps;
+       if (first_load) {
                load_deps(p);
-               if (p->deps) for (i=0; p->deps[i]; i++)
-                       if (!p->deps[i]->global)
-                               p->deps[i]->global = -1;
-               if (!p->global) p->global = -1;
+               if (!p->relocated && (mode & RTLD_LAZY)) {
+                       prepare_lazy(p);
+                       for (i=0; p->deps[i]; i++)
+                               if (!p->deps[i]->relocated)
+                                       prepare_lazy(p->deps[i]);
+               }
+       }
+       if (first_load || (mode & RTLD_GLOBAL)) {
+               /* Make new symbols global, at least temporarily, so we can do
+                * relocations. If not RTLD_GLOBAL, this is reverted below. */
+               add_syms(p);
+               for (i=0; p->deps[i]; i++)
+                       add_syms(p->deps[i]);
+       }
+       if (first_load) {
                reloc_all(p);
-               if (p->deps) for (i=0; p->deps[i]; i++)
-                       if (p->deps[i]->global < 0)
-                               p->deps[i]->global = 0;
-               if (p->global < 0) p->global = 0;
        }
 
-       if (mode & RTLD_GLOBAL) {
-               if (p->deps) for (i=0; p->deps[i]; i++)
-                       p->deps[i]->global = 1;
-               p->global = 1;
-       }
+       /* If RTLD_GLOBAL was not specified, undo any new additions
+        * to the global symbol table. This is a nop if the library was
+        * previously loaded and already global. */
+       if (!(mode & RTLD_GLOBAL))
+               revert_syms(orig_syms_tail);
+
+       /* Processing of deferred lazy relocations must not happen until
+        * the new libraries are committed; otherwise we could end up with
+        * relocations resolved to symbol definitions that get removed. */
+       redo_lazy_relocs();
 
        update_tls_size();
        _dl_debug_state();
@@ -1735,8 +1875,7 @@ end:
        return p;
 }
 
-__attribute__((__visibility__("hidden")))
-int __dl_invalid_handle(void *h)
+hidden int __dl_invalid_handle(void *h)
 {
        struct dso *p;
        for (p=head; p; p=p->next) if (h==p) return 0;
@@ -1761,15 +1900,22 @@ static void *addr2dso(size_t a)
                                        return p;
                        }
                } else {
+                       Phdr *ph = p->phdr;
+                       size_t phcnt = p->phnum;
+                       size_t entsz = p->phentsize;
+                       size_t base = (size_t)p->base;
+                       for (; phcnt--; ph=(void *)((char *)ph+entsz)) {
+                               if (ph->p_type != PT_LOAD) continue;
+                               if (a-base-ph->p_vaddr < ph->p_memsz)
+                                       return p;
+                       }
                        if (a-(size_t)p->map < p->map_len)
-                               return p;
+                               return 0;
                }
        }
        return 0;
 }
 
-void *__tls_get_addr(size_t *);
-
 static void *do_dlsym(struct dso *p, const char *s, void *ra)
 {
        size_t i;
@@ -1786,7 +1932,7 @@ static void *do_dlsym(struct dso *p, const char *s, void *ra)
                struct symdef def = find_sym(p, s, 0);
                if (!def.sym) goto failed;
                if ((def.sym->st_info&0xf) == STT_TLS)
-                       return __tls_get_addr((size_t []){def.dso->tls_id, def.sym->st_value});
+                       return __tls_get_addr((tls_mod_off_t []){def.dso->tls_id, def.sym->st_value-DTP_OFFSET});
                if (DL_FDPIC && (def.sym->st_info&0xf) == STT_FUNC)
                        return def.dso->funcdescs + (def.sym - def.dso->syms);
                return laddr(def.dso, def.sym->st_value);
@@ -1801,12 +1947,12 @@ static void *do_dlsym(struct dso *p, const char *s, void *ra)
                sym = sysv_lookup(s, h, p);
        }
        if (sym && (sym->st_info&0xf) == STT_TLS)
-               return __tls_get_addr((size_t []){p->tls_id, sym->st_value});
+               return __tls_get_addr((tls_mod_off_t []){p->tls_id, sym->st_value-DTP_OFFSET});
        if (DL_FDPIC && sym && sym->st_shndx && (sym->st_info&0xf) == STT_FUNC)
                return p->funcdescs + (sym - p->syms);
        if (sym && sym->st_value && (1<<(sym->st_info&0xf) & OK_TYPES))
                return laddr(p, sym->st_value);
-       if (p->deps) for (i=0; p->deps[i]; i++) {
+       for (i=0; p->deps[i]; i++) {
                if ((ght = p->deps[i]->ghashtab)) {
                        if (!gh) gh = gnu_hash(s);
                        sym = gnu_lookup(gh, ght, p->deps[i], s);
@@ -1815,7 +1961,7 @@ static void *do_dlsym(struct dso *p, const char *s, void *ra)
                        sym = sysv_lookup(s, h, p->deps[i]);
                }
                if (sym && (sym->st_info&0xf) == STT_TLS)
-                       return __tls_get_addr((size_t []){p->deps[i]->tls_id, sym->st_value});
+                       return __tls_get_addr((tls_mod_off_t []){p->deps[i]->tls_id, sym->st_value-DTP_OFFSET});
                if (DL_FDPIC && sym && sym->st_shndx && (sym->st_info&0xf) == STT_FUNC)
                        return p->deps[i]->funcdescs + (sym - p->deps[i]->syms);
                if (sym && sym->st_value && (1<<(sym->st_info&0xf) & OK_TYPES))
@@ -1826,16 +1972,18 @@ failed:
        return 0;
 }
 
-int dladdr(const void *addr, Dl_info *info)
+int dladdr(const void *addr_arg, Dl_info *info)
 {
+       size_t addr = (size_t)addr_arg;
        struct dso *p;
        Sym *sym, *bestsym;
        uint32_t nsym;
        char *strings;
-       void *best = 0;
+       size_t best = 0;
+       size_t besterr = -1;
 
        pthread_rwlock_rdlock(&lock);
-       p = addr2dso((size_t)addr);
+       p = addr2dso(addr);
        pthread_rwlock_unlock(&lock);
 
        if (!p) return 0;
@@ -1845,11 +1993,12 @@ int dladdr(const void *addr, Dl_info *info)
        nsym = count_syms(p);
 
        if (DL_FDPIC) {
-               size_t idx = ((size_t)addr-(size_t)p->funcdescs)
+               size_t idx = (addr-(size_t)p->funcdescs)
                        / sizeof(*p->funcdescs);
                if (idx < nsym && (sym[idx].st_info&0xf) == STT_FUNC) {
-                       best = p->funcdescs + idx;
+                       best = (size_t)(p->funcdescs + idx);
                        bestsym = sym + idx;
+                       besterr = 0;
                }
        }
 
@@ -1857,31 +2006,40 @@ int dladdr(const void *addr, Dl_info *info)
                if (sym->st_value
                 && (1<<(sym->st_info&0xf) & OK_TYPES)
                 && (1<<(sym->st_info>>4) & OK_BINDS)) {
-                       void *symaddr = laddr(p, sym->st_value);
-                       if (symaddr > addr || symaddr < best)
+                       size_t symaddr = (size_t)laddr(p, sym->st_value);
+                       if (symaddr > addr || symaddr <= best)
                                continue;
                        best = symaddr;
                        bestsym = sym;
+                       besterr = addr - symaddr;
                        if (addr == symaddr)
                                break;
                }
        }
 
-       if (!best) return 0;
-
-       if (DL_FDPIC && (bestsym->st_info&0xf) == STT_FUNC)
-               best = p->funcdescs + (bestsym - p->syms);
+       if (bestsym && besterr > bestsym->st_size-1) {
+               best = 0;
+               bestsym = 0;
+       }
 
        info->dli_fname = p->name;
-       info->dli_fbase = p->base;
+       info->dli_fbase = p->map;
+
+       if (!best) {
+               info->dli_sname = 0;
+               info->dli_saddr = 0;
+               return 1;
+       }
+
+       if (DL_FDPIC && (bestsym->st_info&0xf) == STT_FUNC)
+               best = (size_t)(p->funcdescs + (bestsym - p->syms));
        info->dli_sname = strings + bestsym->st_name;
-       info->dli_saddr = best;
+       info->dli_saddr = (void *)best;
 
        return 1;
 }
 
-__attribute__((__visibility__("hidden")))
-void *__dlsym(void *restrict p, const char *restrict s, void *restrict ra)
+hidden void *__dlsym(void *restrict p, const char *restrict s, void *restrict ra)
 {
        void *res;
        pthread_rwlock_rdlock(&lock);
@@ -1916,9 +2074,6 @@ int dl_iterate_phdr(int(*callback)(struct dl_phdr_info *info, size_t size, void
        return ret;
 }
 
-__attribute__((__visibility__("hidden")))
-void __dl_vseterr(const char *, va_list);
-
 static void error(const char *fmt, ...)
 {
        va_list ap;