Re: [PATCHv11 18/19] x86/acpi: Add support for CPU offlining for ACPI MADT wakeup method

From: Kirill A. Shutemov
Date: Fri Jun 07 2024 - 11:15:11 EST


On Mon, Jun 03, 2024 at 10:39:30AM +0200, Borislav Petkov wrote:
> > +/*
> > + * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
> > + * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches
> > + * to the identity mapping and the function has be present at the same spot in
> > + * the virtual address space before and after switching page tables.
> > + */
> > +static int __init init_transition_pgtable(pgd_t *pgd)
>
> This looks like a generic helper which should be in set_memory.c. And
> looking at that file, there's populate_pgd() which does pretty much the
> same thing, if I squint real hard.
>
> Let's tone down the duplication.

Okay, there is a function called kernel_map_pages_in_pgd() in set_memory.c
that does what we need here.

I tried to use it, but encountered a few issues:

- The code in set_memory.c allocates memory using the buddy allocator,
which is not yet ready. We can work around this limitation by delaying
the initialization of offlining until later, using a separate
early_initcall();

- I noticed a complaint that the allocation is being done from an atomic
context: a spinlock called cpa_lock is taken when populate_pgd()
allocates memory.

I am not sure why this was not noticed before. kernel_map_pages_in_pgd()
has only been used in EFI mapping initialization so far, so maybe it is
somehow special, I don't know.

I was able to address this issue by switching cpa_lock to a mutex.
However, this solution will only work if the callers for set_memory
interfaces are not called from an atomic context. I need to verify if
this is the case.

- The function __flush_tlb_all() in kernel_(un)map_pages_in_pgd() must be
called with preemption disabled. Once again, I am unsure why this has
not caused issues in the EFI case.

- I discovered a bug in kernel_ident_mapping_free() when it is used on a
machine with 5-level paging. I will submit a proper patch to fix this
issue.

The fixup is below.

Any comments?

diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
index 6cfe762be28b..fbbfe78f7f27 100644
--- a/arch/x86/kernel/acpi/madt_wakeup.c
+++ b/arch/x86/kernel/acpi/madt_wakeup.c
@@ -59,82 +59,55 @@ static void acpi_mp_cpu_die(unsigned int cpu)
pr_err("Failed to hand over CPU %d to BIOS\n", cpu);
}

+static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake)
+{
+ cpu_hotplug_disable_offlining();
+
+ /*
+ * ACPI MADT doesn't allow to offline a CPU after it was onlined. This
+ * limits kexec: the second kernel won't be able to use more than one CPU.
+ *
+ * To prevent a kexec kernel from onlining secondary CPUs invalidate the
+ * mailbox address in the ACPI MADT wakeup structure which prevents a
+ * kexec kernel to use it.
+ *
+ * This is safe as the booting kernel has the mailbox address cached
+ * already and acpi_wakeup_cpu() uses the cached value to bring up the
+ * secondary CPUs.
+ *
+ * Note: This is a Linux specific convention and not covered by the
+ * ACPI specification.
+ */
+ mp_wake->mailbox_address = 0;
+}
+
/* The argument is required to match type of x86_mapping_info::alloc_pgt_page */
static void __init *alloc_pgt_page(void *dummy)
{
- return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+ return (void *)get_zeroed_page(GFP_KERNEL);
}

static void __init free_pgt_page(void *pgt, void *dummy)
{
- return memblock_free(pgt, PAGE_SIZE);
+ return free_page((unsigned long)pgt);
}

-/*
- * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
- * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches
- * to the identity mapping and the function has be present at the same spot in
- * the virtual address space before and after switching page tables.
- */
-static int __init init_transition_pgtable(pgd_t *pgd)
-{
- pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
- unsigned long vaddr, paddr;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- vaddr = (unsigned long)asm_acpi_mp_play_dead;
- pgd += pgd_index(vaddr);
- if (!pgd_present(*pgd)) {
- p4d = (p4d_t *)alloc_pgt_page(NULL);
- if (!p4d)
- return -ENOMEM;
- set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
- }
- p4d = p4d_offset(pgd, vaddr);
- if (!p4d_present(*p4d)) {
- pud = (pud_t *)alloc_pgt_page(NULL);
- if (!pud)
- return -ENOMEM;
- set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
- }
- pud = pud_offset(p4d, vaddr);
- if (!pud_present(*pud)) {
- pmd = (pmd_t *)alloc_pgt_page(NULL);
- if (!pmd)
- return -ENOMEM;
- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
- }
- pmd = pmd_offset(pud, vaddr);
- if (!pmd_present(*pmd)) {
- pte = (pte_t *)alloc_pgt_page(NULL);
- if (!pte)
- return -ENOMEM;
- set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
- }
- pte = pte_offset_kernel(pmd, vaddr);
-
- paddr = __pa(vaddr);
- set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
-
- return 0;
-}
-
-static int __init acpi_mp_setup_reset(u64 reset_vector)
+static int __init acpi_mp_setup_reset(union acpi_subtable_headers *header,
+ const unsigned long end)
{
+ struct acpi_madt_multiproc_wakeup *mp_wake;
struct x86_mapping_info info = {
.alloc_pgt_page = alloc_pgt_page,
.free_pgt_page = free_pgt_page,
.page_flag = __PAGE_KERNEL_LARGE_EXEC,
- .kernpg_flag = _KERNPG_TABLE_NOENC,
+ .kernpg_flag = _KERNPG_TABLE,
};
+ unsigned long vaddr, pfn;
pgd_t *pgd;

pgd = alloc_pgt_page(NULL);
if (!pgd)
- return -ENOMEM;
+ goto err;

for (int i = 0; i < nr_pfn_mapped; i++) {
unsigned long mstart, mend;
@@ -143,30 +116,45 @@ static int __init acpi_mp_setup_reset(u64 reset_vector)
mend = pfn_mapped[i].end << PAGE_SHIFT;
if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
kernel_ident_mapping_free(&info, pgd);
- return -ENOMEM;
+ goto err;
}
}

if (kernel_ident_mapping_init(&info, pgd,
- PAGE_ALIGN_DOWN(reset_vector),
- PAGE_ALIGN(reset_vector + 1))) {
+ PAGE_ALIGN_DOWN(acpi_mp_reset_vector_paddr),
+ PAGE_ALIGN(acpi_mp_reset_vector_paddr + 1))) {
kernel_ident_mapping_free(&info, pgd);
- return -ENOMEM;
+ goto err;
}

- if (init_transition_pgtable(pgd)) {
+ /*
+ * Make sure asm_acpi_mp_play_dead() is present in the identity mapping
+ * at the same place as in the kernel page tables.
+ *
+ * asm_acpi_mp_play_dead() switches to the identity mapping and the
+ * function has be present at the same spot in the virtual address space
+ * before and after switching page tables.
+ */
+ vaddr = (unsigned long)asm_acpi_mp_play_dead;
+ pfn = __pa(vaddr) >> PAGE_SHIFT;
+ if (kernel_map_pages_in_pgd(pgd, pfn, vaddr, 1, _KERNPG_TABLE)) {
kernel_ident_mapping_free(&info, pgd);
- return -ENOMEM;
+ goto err;
}

smp_ops.play_dead = acpi_mp_play_dead;
smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu;
smp_ops.cpu_die = acpi_mp_cpu_die;

- acpi_mp_reset_vector_paddr = reset_vector;
acpi_mp_pgd = __pa(pgd);

return 0;
+err:
+ pr_warn("Failed to setup MADT reset vector\n");
+ mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
+ acpi_mp_disable_offlining(mp_wake);
+ return -ENOMEM;
+
}

static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
@@ -226,28 +214,6 @@ static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
return 0;
}

-static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake)
-{
- cpu_hotplug_disable_offlining();
-
- /*
- * ACPI MADT doesn't allow to offline a CPU after it was onlined. This
- * limits kexec: the second kernel won't be able to use more than one CPU.
- *
- * To prevent a kexec kernel from onlining secondary CPUs invalidate the
- * mailbox address in the ACPI MADT wakeup structure which prevents a
- * kexec kernel to use it.
- *
- * This is safe as the booting kernel has the mailbox address cached
- * already and acpi_wakeup_cpu() uses the cached value to bring up the
- * secondary CPUs.
- *
- * Note: This is a Linux specific convention and not covered by the
- * ACPI specification.
- */
- mp_wake->mailbox_address = 0;
-}
-
int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
const unsigned long end)
{
@@ -274,10 +240,7 @@ int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,

if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 &&
mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) {
- if (acpi_mp_setup_reset(mp_wake->reset_vector)) {
- pr_warn("Failed to setup MADT reset vector\n");
- acpi_mp_disable_offlining(mp_wake);
- }
+ acpi_mp_reset_vector_paddr = mp_wake->reset_vector;
} else {
/*
* CPU offlining requires version 1 of the ACPI MADT wakeup
@@ -290,3 +253,13 @@ int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,

return 0;
}
+
+static int __init acpi_mp_offline_init(void)
+{
+ if (!acpi_mp_reset_vector_paddr)
+ return 0;
+
+ return acpi_table_parse_madt(ACPI_MADT_TYPE_MULTIPROC_WAKEUP,
+ acpi_mp_setup_reset, 1);
+}
+early_initcall(acpi_mp_offline_init);
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index 3996af7b4abf..c45127265f2f 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -60,7 +60,7 @@ static void free_p4d(struct x86_mapping_info *info, pgd_t *pgd)
}

if (pgtable_l5_enabled())
- info->free_pgt_page(pgd, info->context);
+ info->free_pgt_page(p4d, info->context);
}

void kernel_ident_mapping_free(struct x86_mapping_info *info, pgd_t *pgd)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 443a97e515c0..72715674f492 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -69,7 +69,7 @@ static const int cpa_warn_level = CPA_PROTECT;
* entries change the page attribute in parallel to some other cpu
* splitting a large page entry along with changing the attribute.
*/
-static DEFINE_SPINLOCK(cpa_lock);
+static DEFINE_MUTEX(cpa_lock);

#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
@@ -1186,10 +1186,10 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
struct page *base;

if (!debug_pagealloc_enabled())
- spin_unlock(&cpa_lock);
+ mutex_unlock(&cpa_lock);
base = alloc_pages(GFP_KERNEL, 0);
if (!debug_pagealloc_enabled())
- spin_lock(&cpa_lock);
+ mutex_lock(&cpa_lock);
if (!base)
return -ENOMEM;

@@ -1804,10 +1804,10 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary)
cpa->numpages = 1;

if (!debug_pagealloc_enabled())
- spin_lock(&cpa_lock);
+ mutex_lock(&cpa_lock);
ret = __change_page_attr(cpa, primary);
if (!debug_pagealloc_enabled())
- spin_unlock(&cpa_lock);
+ mutex_unlock(&cpa_lock);
if (ret)
goto out;

@@ -2516,7 +2516,9 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);

retval = __change_page_attr_set_clr(&cpa, 1);
+ preempt_disable();
__flush_tlb_all();
+ preempt_enable();

out:
return retval;
@@ -2551,7 +2553,9 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");

retval = __change_page_attr_set_clr(&cpa, 1);
+ preempt_disable();
__flush_tlb_all();
+ preempt_enable();

return retval;
}
--
Kiryl Shutsemau / Kirill A. Shutemov