[PATCH] module, fix percpu reserved memory exhaustion

From: Prarit Bhargava
Date: Wed Jan 09 2013 - 21:41:13 EST


Rusty,

There is likely some subtlety of moving the module mutex that I'm unaware of.
What I can say is that this patch seems to resolve the problem for me, or at
least through 100+ reboots I have not seen the problem (I'm still testing
as I write this).

I'm more than willing to hear an alternative approach, or test an alternative
patch.

Thanks,

P.

----8<----

In recent Fedora releases (F17 & F18) some users have reported seeing
messages similar to

[ 15.478121] Pid: 727, comm: systemd-udevd Tainted: GF 3.8.0-rc2+ #1
[ 15.478121] Call Trace:
[ 15.478131] [<ffffffff81153001>] pcpu_alloc+0xa01/0xa60
[ 15.478137] [<ffffffff8163d8b0>] ? printk+0x61/0x63
[ 15.478140] [<ffffffff811532f3>] __alloc_reserved_percpu+0x13/0x20
[ 15.478145] [<ffffffff810c42e2>] load_module+0x1dc2/0x20b0
[ 15.478150] [<ffffffff8164b21e>] ? do_page_fault+0xe/0x10
[ 15.478152] [<ffffffff81647858>] ? page_fault+0x28/0x30
[ 15.478155] [<ffffffff810c46a7>] sys_init_module+0xd7/0x120
[ 15.478159] [<ffffffff8164f859>] system_call_fastpath+0x16/0x1b
[ 15.478160] kvm: Could not allocate 304 bytes percpu data
[ 15.478174] PERCPU: allocation failed, size=304 align=32, alloc from reserved chunk failed

during system boot. In some cases, users have also reported seeing this
message along with failed loads of other modules.

As the message indicates, the reserved chunk of percpu memory (where
modules allocate their memory) is exhausted. A debug printk inserted in
the code shows

[ 15.478533] PRARIT size = 304 > chunk->contig_hint = 208

i.e., the reserved chunk of percpu memory has only 208 bytes of available space.

What is happening is systemd is loading an instance of the kvm module for
each cpu found (see commit e9bda3b). When the module load occurs the kernel
currently allocates the modules percpu data area prior to checking to see
if the module is already loaded or is in the process of being loaded. If
the module is already loaded, or finishes loading, the module loading code
releases the current instance's percpu data.

The problem is that these module loads race and it is possible that all of
the percpu reserved area is consumed by repeated loads of the same module
which results in the failure of other drivers to load.

This patch moves the module percpu allocation after the check for an
existing instance of the module.

Signed-off-by: Prarit Bhargava <prarit@xxxxxxxxxx>
Cc: Rusty Russell <rusty@xxxxxxxxxxxxxxx>
Cc: Mike Galbraith <efault@xxxxxx>
---
kernel/module.c | 124 ++++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 85 insertions(+), 39 deletions(-)

diff --git a/kernel/module.c b/kernel/module.c
index 250092c..e7e9b57 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1929,6 +1929,27 @@ static int verify_export_symbols(struct module *mod)
return 0;
}

+static void simplify_percpu_symbols(struct module *mod,
+ const struct load_info *info)
+{
+ Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+ Elf_Sym *sym = (void *)symsec->sh_addr;
+ unsigned long secbase;
+ unsigned int i;
+
+ /*
+ * No need for error checking in this function because
+ * simplify_symbols has already been called.
+ */
+ for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
+ /* Divert to percpu allocation if a percpu var. */
+ if (sym[i].st_shndx == info->index.pcpu) {
+ secbase = (unsigned long)mod_percpu(mod);
+ sym[i].st_value += secbase;
+ }
+ }
+}
+
/* Change all symbols so that st_value encodes the pointer directly. */
static int simplify_symbols(struct module *mod, const struct load_info *info)
{
@@ -1976,12 +1997,11 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
break;

default:
- /* Divert to percpu allocation if a percpu var. */
- if (sym[i].st_shndx == info->index.pcpu)
- secbase = (unsigned long)mod_percpu(mod);
- else
+ /* percpu diverts handled in simplify_percpu_symbols */
+ if (sym[i].st_shndx != info->index.pcpu) {
secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
- sym[i].st_value += secbase;
+ sym[i].st_value += secbase;
+ }
break;
}
}
@@ -2899,11 +2919,29 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
return 0;
}

+static int allocate_percpu(struct module *mod, struct load_info *info)
+{
+ Elf_Shdr *pcpusec;
+ int err;
+
+ pcpusec = &info->sechdrs[info->index.pcpu];
+ if (pcpusec->sh_size) {
+ /* We have a special allocation for this section. */
+ pr_debug("module %s attempting to percpu with size %d\n",
+ mod->name, pcpusec->sh_size);
+ err = percpu_modalloc(mod,
+ pcpusec->sh_size, pcpusec->sh_addralign);
+ if (err)
+ return -ENOMEM;
+ pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC;
+ }
+ return 0;
+}
+
static struct module *layout_and_allocate(struct load_info *info, int flags)
{
/* Module within temporary copy. */
struct module *mod;
- Elf_Shdr *pcpusec;
int err;

mod = setup_load_info(info, flags);
@@ -2920,16 +2958,6 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
if (err < 0)
goto out;

- pcpusec = &info->sechdrs[info->index.pcpu];
- if (pcpusec->sh_size) {
- /* We have a special allocation for this section. */
- err = percpu_modalloc(mod,
- pcpusec->sh_size, pcpusec->sh_addralign);
- if (err)
- goto out;
- pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC;
- }
-
/* Determine total sizes, and put offsets in sh_entsize. For now
this is done generically; there doesn't appear to be any
special cases for the architectures. */
@@ -2955,7 +2983,6 @@ out:
/* mod is no longer valid after this! */
static void module_deallocate(struct module *mod, struct load_info *info)
{
- percpu_modfree(mod);
module_free(mod, mod->module_init);
module_free(mod, mod->module_core);
}
@@ -3135,18 +3162,54 @@ static int load_module(struct load_info *info, const char __user *uargs,
/* Set up MODINFO_ATTR fields */
setup_modinfo(mod, info);

- /* Fix up syms, so that st_value is a pointer to location. */
+ /* Fix up non-percpu syms, so that st_value is a pointer to location. */
err = simplify_symbols(mod, info);
if (err < 0)
goto free_modinfo;

+ /*
+ * This mutex protects against concurrent writers of the module
+ * RCU list. The code takes it pretty early because there are
+ * several things that must be completed after checking for an
+ * existing module, and before adding it to the RCU list.
+ */
+ mutex_lock(&module_mutex);
+
+ /* Mark state as coming so strong_try_module_get() ignores us. */
+ mod->state = MODULE_STATE_COMING;
+
+ /* is another instance of this module already loading or loaded? */
+again:
+ if ((old = find_module(mod->name)) != NULL) {
+ if (old->state == MODULE_STATE_COMING) {
+ /* Wait in case it fails to load. */
+ mutex_unlock(&module_mutex);
+ err = wait_event_interruptible(module_wq,
+ finished_loading(mod->name));
+ if (err)
+ goto free_modinfo;
+ mutex_lock(&module_mutex);
+ goto again;
+ }
+ err = -EEXIST;
+ goto unlock;
+ }
+
+ /* Allocate this module's reserved percpu area. */
+ err = allocate_percpu(mod, info);
+ if (err)
+ goto free_modinfo;
+
+ /* Fix up percpu syms, so that st_value is a pointer to location. */
+ simplify_percpu_symbols(mod, info);
+
err = apply_relocations(mod, info);
if (err < 0)
- goto free_modinfo;
+ goto free_percpu;

err = post_relocation(mod, info);
if (err < 0)
- goto free_modinfo;
+ goto free_percpu;

flush_module_icache(mod);

@@ -3157,31 +3220,12 @@ static int load_module(struct load_info *info, const char __user *uargs,
goto free_arch_cleanup;
}

- /* Mark state as coming so strong_try_module_get() ignores us. */
- mod->state = MODULE_STATE_COMING;
-
/* Now sew it into the lists so we can get lockdep and oops
* info during argument parsing. No one should access us, since
* strong_try_module_get() will fail.
* lockdep/oops can run asynchronous, so use the RCU list insertion
* function to insert in a way safe to concurrent readers.
- * The mutex protects against concurrent writers.
*/
-again:
- mutex_lock(&module_mutex);
- if ((old = find_module(mod->name)) != NULL) {
- if (old->state == MODULE_STATE_COMING) {
- /* Wait in case it fails to load. */
- mutex_unlock(&module_mutex);
- err = wait_event_interruptible(module_wq,
- finished_loading(mod->name));
- if (err)
- goto free_arch_cleanup;
- goto again;
- }
- err = -EEXIST;
- goto unlock;
- }

/* This has to be done once we're sure module name is unique. */
dynamic_debug_setup(info->debug, info->num_debug);
@@ -3228,6 +3272,8 @@ again:
kfree(mod->args);
free_arch_cleanup:
module_arch_cleanup(mod);
+ free_percpu:
+ percpu_modfree(mod);
free_modinfo:
free_modinfo(mod);
free_unload:
--
1.7.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/