Re: [RFC PATCH v4 2/6] module: preserve Elf information for livepatch modules

From: Rusty Russell
Date: Tue Feb 09 2016 - 19:29:40 EST


Petr Mladek <pmladek@xxxxxxxx> writes:
> On Tue 2016-02-09 11:33:07, Jiri Kosina wrote:
>> On Tue, 9 Feb 2016, Petr Mladek wrote:
>>
>> > > +#ifdef CONFIG_KALLSYMS
>> > > + /* Make symtab and strtab available prior to module init call */
>> > > + mod->num_symtab = mod->core_num_syms;
>> > > + mod->symtab = mod->core_symtab;
>> > > + mod->strtab = mod->core_strtab;
>> > > +#endif
>> >
>> > This should be done with module_mutex. Otherwise, it looks racy at least
>> > against module_kallsyms_on_each_symbol().
>>
>> Hmm, if this is the case, the comment describing the locking rules for
>> module_mutex should be updated.
>>
>> > BTW: I wonder why even the original code is not racy for example against
>> > module_get_kallsym. It is called without the mutex. This code sets the
>> > number of entries before the pointer to the entries.
>> >
>> > Note that the module is in the list even in the UNFORMED state.
>>
>> module_kallsyms_on_each_symbol() gives up in such case though.
>
> The problem is that we set the three values above in the COMMING
> state. They were even set in the LIVE state before this patch.

True. Indeed, I have a fix for exactly this queued in my fixes tree,
and am about to send Linus a pull req.

Below is the fix I went with, for your reference (part of a series, but
you get the idea).

Thanks!
Rusty.

commit 8244062ef1e54502ef55f54cced659913f244c3e
Author: Rusty Russell <rusty@xxxxxxxxxxxxxxx>
Date: Wed Feb 3 16:55:26 2016 +1030

modules: fix longstanding /proc/kallsyms vs module insertion race.

For CONFIG_KALLSYMS, we keep two symbol tables and two string tables.
There's one full copy, marked SHF_ALLOC and laid out at the end of the
module's init section. There's also a cut-down version that only
contains core symbols and strings, and lives in the module's core
section.

After module init (and before we free the module memory), we switch
the mod->symtab, mod->num_symtab and mod->strtab to point to the core
versions. We do this under the module_mutex.

However, kallsyms doesn't take the module_mutex: it uses
preempt_disable() and rcu tricks to walk through the modules, because
it's used in the oops path. It's also used in /proc/kallsyms.
There's nothing atomic about the change of these variables, so we can
get the old (larger!) num_symtab and the new symtab pointer; in fact
this is what I saw when trying to reproduce.

By grouping these variables together, we can use a
carefully-dereferenced pointer to ensure we always get one or the
other (the free of the module init section is already done in an RCU
callback, so that's safe). We allocate the init one at the end of the
module init section, and keep the core one inside the struct module
itself (it could also have been allocated at the end of the module
core, but that's probably overkill).

Reported-by: Weilong Chen <chenweilong@xxxxxxxxxx>
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=111541
Cc: stable@xxxxxxxxxx
Signed-off-by: Rusty Russell <rusty@xxxxxxxxxxxxxxx>

diff --git a/include/linux/module.h b/include/linux/module.h
index 4560d8f1545d..2bb0c3085706 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -324,6 +324,12 @@ struct module_layout {
#define __module_layout_align
#endif

+struct mod_kallsyms {
+ Elf_Sym *symtab;
+ unsigned int num_symtab;
+ char *strtab;
+};
+
struct module {
enum module_state state;

@@ -405,15 +411,10 @@ struct module {
#endif

#ifdef CONFIG_KALLSYMS
- /*
- * We keep the symbol and string tables for kallsyms.
- * The core_* fields below are temporary, loader-only (they
- * could really be discarded after module init).
- */
- Elf_Sym *symtab, *core_symtab;
- unsigned int num_symtab, core_num_syms;
- char *strtab, *core_strtab;
-
+ /* Protected by RCU and/or module_mutex: use rcu_dereference() */
+ struct mod_kallsyms *kallsyms;
+ struct mod_kallsyms core_kallsyms;
+
/* Section attributes */
struct module_sect_attrs *sect_attrs;

diff --git a/kernel/module.c b/kernel/module.c
index 1e79d8157712..9537da37ce87 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -303,6 +303,9 @@ struct load_info {
struct _ddebug *debug;
unsigned int num_debug;
bool sig_ok;
+#ifdef CONFIG_KALLSYMS
+ unsigned long mod_kallsyms_init_off;
+#endif
struct {
unsigned int sym, str, mod, vers, info, pcpu;
} index;
@@ -2480,10 +2483,21 @@ static void layout_symtab(struct module *mod, struct load_info *info)
strsect->sh_flags |= SHF_ALLOC;
strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect,
info->index.str) | INIT_OFFSET_MASK;
- mod->init_layout.size = debug_align(mod->init_layout.size);
pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
+
+ /* We'll tack temporary mod_kallsyms on the end. */
+ mod->init_layout.size = ALIGN(mod->init_layout.size,
+ __alignof__(struct mod_kallsyms));
+ info->mod_kallsyms_init_off = mod->init_layout.size;
+ mod->init_layout.size += sizeof(struct mod_kallsyms);
+ mod->init_layout.size = debug_align(mod->init_layout.size);
}

+/*
+ * We use the full symtab and strtab which layout_symtab arranged to
+ * be appended to the init section. Later we switch to the cut-down
+ * core-only ones.
+ */
static void add_kallsyms(struct module *mod, const struct load_info *info)
{
unsigned int i, ndst;
@@ -2492,29 +2506,34 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
char *s;
Elf_Shdr *symsec = &info->sechdrs[info->index.sym];

- mod->symtab = (void *)symsec->sh_addr;
- mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+ /* Set up to point into init section. */
+ mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off;
+
+ mod->kallsyms->symtab = (void *)symsec->sh_addr;
+ mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
/* Make sure we get permanent strtab: don't use info->strtab. */
- mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
+ mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;

/* Set types up while we still have access to sections. */
- for (i = 0; i < mod->num_symtab; i++)
- mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
-
- mod->core_symtab = dst = mod->core_layout.base + info->symoffs;
- mod->core_strtab = s = mod->core_layout.base + info->stroffs;
- src = mod->symtab;
- for (ndst = i = 0; i < mod->num_symtab; i++) {
+ for (i = 0; i < mod->kallsyms->num_symtab; i++)
+ mod->kallsyms->symtab[i].st_info
+ = elf_type(&mod->kallsyms->symtab[i], info);
+
+ /* Now populate the cut down core kallsyms for after init. */
+ mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
+ mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
+ src = mod->kallsyms->symtab;
+ for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
if (i == 0 ||
is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
info->index.pcpu)) {
dst[ndst] = src[i];
- dst[ndst++].st_name = s - mod->core_strtab;
- s += strlcpy(s, &mod->strtab[src[i].st_name],
+ dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
+ s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name],
KSYM_NAME_LEN) + 1;
}
}
- mod->core_num_syms = ndst;
+ mod->core_kallsyms.num_symtab = ndst;
}
#else
static inline void layout_symtab(struct module *mod, struct load_info *info)
@@ -3263,9 +3282,8 @@ static noinline int do_init_module(struct module *mod)
module_put(mod);
trim_init_extable(mod);
#ifdef CONFIG_KALLSYMS
- mod->num_symtab = mod->core_num_syms;
- mod->symtab = mod->core_symtab;
- mod->strtab = mod->core_strtab;
+ /* Switch to core kallsyms now init is done: kallsyms may be walking! */
+ rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
#endif
mod_tree_remove_init(mod);
disable_ro_nx(&mod->init_layout);
@@ -3627,9 +3645,9 @@ static inline int is_arm_mapping_symbol(const char *str)
&& (str[2] == '\0' || str[2] == '.');
}

-static const char *symname(struct module *mod, unsigned int symnum)
+static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum)
{
- return mod->strtab + mod->symtab[symnum].st_name;
+ return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
}

static const char *get_ksymbol(struct module *mod,
@@ -3639,6 +3657,7 @@ static const char *get_ksymbol(struct module *mod,
{
unsigned int i, best = 0;
unsigned long nextval;
+ struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);

/* At worse, next value is at end of module */
if (within_module_init(addr, mod))
@@ -3648,32 +3667,32 @@ static const char *get_ksymbol(struct module *mod,

/* Scan for closest preceding symbol, and next symbol. (ELF
starts real symbols at 1). */
- for (i = 1; i < mod->num_symtab; i++) {
- if (mod->symtab[i].st_shndx == SHN_UNDEF)
+ for (i = 1; i < kallsyms->num_symtab; i++) {
+ if (kallsyms->symtab[i].st_shndx == SHN_UNDEF)
continue;

/* We ignore unnamed symbols: they're uninformative
* and inserted at a whim. */
- if (*symname(mod, i) == '\0'
- || is_arm_mapping_symbol(symname(mod, i)))
+ if (*symname(kallsyms, i) == '\0'
+ || is_arm_mapping_symbol(symname(kallsyms, i)))
continue;

- if (mod->symtab[i].st_value <= addr
- && mod->symtab[i].st_value > mod->symtab[best].st_value)
+ if (kallsyms->symtab[i].st_value <= addr
+ && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value)
best = i;
- if (mod->symtab[i].st_value > addr
- && mod->symtab[i].st_value < nextval)
- nextval = mod->symtab[i].st_value;
+ if (kallsyms->symtab[i].st_value > addr
+ && kallsyms->symtab[i].st_value < nextval)
+ nextval = kallsyms->symtab[i].st_value;
}

if (!best)
return NULL;

if (size)
- *size = nextval - mod->symtab[best].st_value;
+ *size = nextval - kallsyms->symtab[best].st_value;
if (offset)
- *offset = addr - mod->symtab[best].st_value;
- return symname(mod, best);
+ *offset = addr - kallsyms->symtab[best].st_value;
+ return symname(kallsyms, best);
}

/* For kallsyms to ask for address resolution. NULL means not found. Careful
@@ -3763,18 +3782,21 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,

preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ struct mod_kallsyms *kallsyms;
+
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (symnum < mod->num_symtab) {
- *value = mod->symtab[symnum].st_value;
- *type = mod->symtab[symnum].st_info;
- strlcpy(name, symname(mod, symnum), KSYM_NAME_LEN);
+ kallsyms = rcu_dereference_sched(mod->kallsyms);
+ if (symnum < kallsyms->num_symtab) {
+ *value = kallsyms->symtab[symnum].st_value;
+ *type = kallsyms->symtab[symnum].st_info;
+ strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN);
strlcpy(module_name, mod->name, MODULE_NAME_LEN);
*exported = is_exported(name, *value, mod);
preempt_enable();
return 0;
}
- symnum -= mod->num_symtab;
+ symnum -= kallsyms->num_symtab;
}
preempt_enable();
return -ERANGE;
@@ -3783,11 +3805,12 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
static unsigned long mod_find_symname(struct module *mod, const char *name)
{
unsigned int i;
+ struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);

- for (i = 0; i < mod->num_symtab; i++)
- if (strcmp(name, symname(mod, i)) == 0 &&
- mod->symtab[i].st_info != 'U')
- return mod->symtab[i].st_value;
+ for (i = 0; i < kallsyms->num_symtab; i++)
+ if (strcmp(name, symname(kallsyms, i)) == 0 &&
+ kallsyms->symtab[i].st_info != 'U')
+ return kallsyms->symtab[i].st_value;
return 0;
}

@@ -3826,11 +3849,14 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
module_assert_mutex();

list_for_each_entry(mod, &modules, list) {
+ /* We hold module_mutex: no need for rcu_dereference_sched */
+ struct mod_kallsyms *kallsyms = mod->kallsyms;
+
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- for (i = 0; i < mod->num_symtab; i++) {
- ret = fn(data, symname(mod, i),
- mod, mod->symtab[i].st_value);
+ for (i = 0; i < kallsyms->num_symtab; i++) {
+ ret = fn(data, symname(kallsyms, i),
+ mod, kallsyms->symtab[i].st_value);
if (ret != 0)
return ret;
}