Re: [PATCH V1] x86, espfix: postpone the initialization of espfix stack for AP

From: Zhu Guihua
Date: Wed Jun 17 2015 - 01:54:56 EST


Any feedback about this?

On 06/04/2015 05:45 PM, Gu Zheng wrote:
The following lockdep warning occurs when running with the latest kernel:
[ 3.178000] ------------[ cut here ]------------
[ 3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
[ 3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
[ 3.199000] Modules linked in:

[ 3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
[ 3.221000] 0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
[ 3.230000] 0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
[ 3.238000] ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
[ 3.246000] Call Trace:
[ 3.249000] [<ffffffff81773f0a>] dump_stack+0x4c/0x65
[ 3.255000] [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
[ 3.261000] [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
[ 3.268000] [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
[ 3.274000] [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
[ 3.281000] [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
[ 3.288000] [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
[ 3.295000] [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
[ 3.301000] [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
[ 3.308000] [<ffffffff811c869e>] __get_free_pages+0xe/0x50
[ 3.314000] [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
[ 3.320000] [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
[ 3.327000] ---[ end trace 1b3327d9d6a1d62c ]---

We allocate pages with GFP_KERNEL in init_espfix_ap(), which is called
before local irqs are enabled. The lockdep sub-system treats this as
allocating memory with GFP_FS while local irqs are disabled, and so
triggers the warning shown above.

We could allocate the pages on the boot CPU side and hand them over to
the secondary CPU, but that seems a bit wasteful if some of the cpus are
offline. As there is no need for these pages (the espfix stack) until we
try to run user code, we postpone the initialization of the espfix stack
and let the boot up routine init the espfix stack for the target cpu after
it has booted, which avoids the noise.

Signed-off-by: Gu Zheng <guz.fnst@xxxxxxxxxxxxxx>
---
v1:
Alloc the page on the node the target CPU is on.
RFC:
Let the boot up routine init the espfix stack for the target cpu after it
booted.
---
---
arch/x86/include/asm/espfix.h | 2 +-
arch/x86/kernel/espfix_64.c | 28 ++++++++++++++++------------
arch/x86/kernel/smpboot.c | 14 +++++++-------
3 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
index 99efebb..ca3ce9a 100644
--- a/arch/x86/include/asm/espfix.h
+++ b/arch/x86/include/asm/espfix.h
@@ -9,7 +9,7 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
extern void init_espfix_bsp(void);
-extern void init_espfix_ap(void);
+extern void init_espfix_ap(int cpu);
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index f5d0730..e397583 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -131,25 +131,24 @@ void __init init_espfix_bsp(void)
init_espfix_random();
/* The rest is the same as for any other processor */
- init_espfix_ap();
+ init_espfix_ap(0);
}
-void init_espfix_ap(void)
+void init_espfix_ap(int cpu)
{
- unsigned int cpu, page;
+ unsigned int page;
unsigned long addr;
pud_t pud, *pud_p;
pmd_t pmd, *pmd_p;
pte_t pte, *pte_p;
- int n;
+ int n, node;
void *stack_page;
pteval_t ptemask;
/* We only have to do this once... */
- if (likely(this_cpu_read(espfix_stack)))
+ if (likely(per_cpu(espfix_stack, cpu)))
return; /* Already initialized */
- cpu = smp_processor_id();
addr = espfix_base_addr(cpu);
page = cpu/ESPFIX_STACKS_PER_PAGE;
@@ -165,12 +164,15 @@ void init_espfix_ap(void)
if (stack_page)
goto unlock_done;
+ node = cpu_to_node(cpu);
ptemask = __supported_pte_mask;
pud_p = &espfix_pud_page[pud_index(addr)];
pud = *pud_p;
if (!pud_present(pud)) {
- pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
+ struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
+
+ pmd_p = (pmd_t *)page_address(page);
pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PUD_CLONES; n++)
@@ -180,7 +182,9 @@ void init_espfix_ap(void)
pmd_p = pmd_offset(&pud, addr);
pmd = *pmd_p;
if (!pmd_present(pmd)) {
- pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
+ struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
+
+ pte_p = (pte_t *)page_address(page);
pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PMD_CLONES; n++)
@@ -188,7 +192,7 @@ void init_espfix_ap(void)
}
pte_p = pte_offset_kernel(&pmd, addr);
- stack_page = (void *)__get_free_page(GFP_KERNEL);
+ stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0));
pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
for (n = 0; n < ESPFIX_PTE_CLONES; n++)
set_pte(&pte_p[n*PTE_STRIDE], pte);
@@ -199,7 +203,7 @@ void init_espfix_ap(void)
unlock_done:
mutex_unlock(&espfix_init_mutex);
done:
- this_cpu_write(espfix_stack, addr);
- this_cpu_write(espfix_waddr, (unsigned long)stack_page
- + (addr & ~PAGE_MASK));
+ per_cpu(espfix_stack, cpu) = addr;
+ per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page
+ + (addr & ~PAGE_MASK);
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 50e547e..e9fdd0e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -240,13 +240,6 @@ static void notrace start_secondary(void *unused)
check_tsc_sync_target();
/*
- * Enable the espfix hack for this CPU
- */
-#ifdef CONFIG_X86_ESPFIX64
- init_espfix_ap();
-#endif
-
- /*
* We need to hold vector_lock so there the set of online cpus
* does not change while we are assigning vectors to cpus. Holding
* this lock ensures we don't half assign or remove an irq from a cpu.
@@ -901,6 +894,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
}
}
+ /*
+ * Enable the espfix hack for this CPU
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ init_espfix_ap(cpu);
+#endif
+
/* mark "stuck" area as not stuck */
*trampoline_status = 0;

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/