Re: [RFC PATCH v2 14/32] x86: mm: Provide support to use memblock when splitting large pages

From: Paolo Bonzini
Date: Thu Mar 16 2017 - 09:16:09 EST

On 10/03/2017 23:41, Brijesh Singh wrote:
>> Maybe there's a reason this fires:
>>
>> WARNING: modpost: Found 2 section mismatch(es).
>> To see full details build your kernel with:
>> 'make CONFIG_DEBUG_SECTION_MISMATCH=y'
>>
>> WARNING: vmlinux.o(.text+0x48edc): Section mismatch in reference from
>> the function __change_page_attr() to the function
>> .init.text:memblock_alloc()
>> The function __change_page_attr() references
>> the function __init memblock_alloc().
>> This is often because __change_page_attr lacks a __init
>> annotation or the annotation of memblock_alloc is wrong.
>>
>> WARNING: vmlinux.o(.text+0x491d1): Section mismatch in reference from
>> the function __change_page_attr() to the function
>> .meminit.text:memblock_free()
>> The function __change_page_attr() references
>> the function __meminit memblock_free().
>> This is often because __change_page_attr lacks a __meminit
>> annotation or the annotation of memblock_free is wrong.
>>
>> But maybe Paolo might have an even better idea...
>
> I am sure he will have better idea :)

Not sure if it's better or worse, but an alternative idea is to turn
__change_page_attr and __change_page_attr_set_clr inside out, so that:
1) the alloc_pages/__free_page calls happen in __change_page_attr_set_clr;
2) __change_page_attr_set_clr does not become more complex overall.

Then you can introduce __early_change_page_attr_set_clr and/or
early_kernel_map_pages_in_pgd, for use in your next patches. These would
use the memblock allocator instead of alloc_pages/__free_page, along the
lines of the sketch below.
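
For illustration only (this is not part of the attached patch, and the
helper's name is made up), the early side could then boil down to something
like this, assuming the split path is reworked as in the patch:

/*
 * Hypothetical __init helper that feeds __split_large_page() from
 * memblock instead of alloc_page().  Because the caller is __init, the
 * references to memblock_alloc()/memblock_free() no longer trigger the
 * section mismatch warnings quoted above.
 */
static int __init __early_split_large_page(struct cpa_data *cpa, pte_t *kpte,
					   unsigned long address)
{
	phys_addr_t pa = memblock_alloc(PAGE_SIZE, PAGE_SIZE);

	if (!pa)
		return -ENOMEM;

	/* a nonzero return means another CPU already split the page */
	if (__split_large_page(cpa, kpte, address, pfn_to_page(PHYS_PFN(pa))))
		memblock_free(pa, PAGE_SIZE);

	return 0;
}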

The attached patch is compile-tested only and almost certainly has a
thinko or two in it. But it even shaves a few lines off the code, so the
idea might have some merit.

Paolo
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 28d42130243c..953c8e697562 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -490,11 +490,12 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
}

static int
-try_preserve_large_page(pte_t *kpte, unsigned long address,
+try_preserve_large_page(pte_t **p_kpte, unsigned long address,
struct cpa_data *cpa)
{
unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn;
- pte_t new_pte, old_pte, *tmp;
+ pte_t *kpte = *p_kpte;
+ pte_t new_pte, old_pte;
pgprot_t old_prot, new_prot, req_prot;
int i, do_split = 1;
enum pg_level level;
@@ -507,8 +508,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* Check for races, another CPU might have split this page
* up already:
*/
- tmp = _lookup_address_cpa(cpa, address, &level);
- if (tmp != kpte)
+ *p_kpte = _lookup_address_cpa(cpa, address, &level);
+ if (*p_kpte != kpte)
goto out_unlock;

switch (level) {
@@ -634,17 +635,18 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
unsigned int i, level;
pte_t *tmp;
pgprot_t ref_prot;
+ int retry = 1;

+ if (!debug_pagealloc_enabled())
+ spin_lock(&cpa_lock);
spin_lock(&pgd_lock);
/*
* Check for races, another CPU might have split this page
* up for us already:
*/
tmp = _lookup_address_cpa(cpa, address, &level);
- if (tmp != kpte) {
- spin_unlock(&pgd_lock);
- return 1;
- }
+ if (tmp != kpte)
+ goto out;

paravirt_alloc_pte(&init_mm, page_to_pfn(base));

@@ -671,10 +673,11 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
break;

default:
- spin_unlock(&pgd_lock);
- return 1;
+ goto out;
}

+ retry = 0;
+
/*
* Set the GLOBAL flags only if the PRESENT flag is set
* otherwise pmd/pte_present will return true even on a non
@@ -718,28 +721,34 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
* going on.
*/
__flush_tlb_all();
- spin_unlock(&pgd_lock);

- return 0;
-}
-
-static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
- unsigned long address)
-{
- struct page *base;
+out:
+ spin_unlock(&pgd_lock);

+ /*
+ * Do a global flush tlb after splitting the large page
+ * and before we do the actual change page attribute in the PTE.
+ *
+ * With out this, we violate the TLB application note, that says
+ * "The TLBs may contain both ordinary and large-page
+ * translations for a 4-KByte range of linear addresses. This
+ * may occur if software modifies the paging structures so that
+ * the page size used for the address range changes. If the two
+ * translations differ with respect to page frame or attributes
+ * (e.g., permissions), processor behavior is undefined and may
+ * be implementation-specific."
+ *
+ * We do this global tlb flush inside the cpa_lock, so that we
+ * don't allow any other cpu, with stale tlb entries change the
+ * page attribute in parallel, that also falls into the
+ * just split large page entry.
+ */
+ if (!retry)
+ flush_tlb_all();
if (!debug_pagealloc_enabled())
spin_unlock(&cpa_lock);
- base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
- if (!debug_pagealloc_enabled())
- spin_lock(&cpa_lock);
- if (!base)
- return -ENOMEM;
-
- if (__split_large_page(cpa, kpte, address, base))
- __free_page(base);

- return 0;
+ return retry;
}

static bool try_to_free_pte_page(pte_t *pte)
@@ -1166,30 +1175,26 @@ static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
}
}

-static int __change_page_attr(struct cpa_data *cpa, int primary)
+static int __change_page_attr(struct cpa_data *cpa, pte_t **p_kpte, unsigned long address,
+ int primary)
{
- unsigned long address;
- int do_split, err;
unsigned int level;
pte_t *kpte, old_pte;
+ int err = 0;

- if (cpa->flags & CPA_PAGES_ARRAY) {
- struct page *page = cpa->pages[cpa->curpage];
- if (unlikely(PageHighMem(page)))
- return 0;
- address = (unsigned long)page_address(page);
- } else if (cpa->flags & CPA_ARRAY)
- address = cpa->vaddr[cpa->curpage];
- else
- address = *cpa->vaddr;
-repeat:
- kpte = _lookup_address_cpa(cpa, address, &level);
- if (!kpte)
- return __cpa_process_fault(cpa, address, primary);
+ if (!debug_pagealloc_enabled())
+ spin_lock(&cpa_lock);
+ *p_kpte = kpte = _lookup_address_cpa(cpa, address, &level);
+ if (!kpte) {
+ err = __cpa_process_fault(cpa, address, primary);
+ goto out;
+ }

old_pte = *kpte;
- if (pte_none(old_pte))
- return __cpa_process_fault(cpa, address, primary);
+ if (pte_none(old_pte)) {
+ err = __cpa_process_fault(cpa, address, primary);
+ goto out;
+ }

if (level == PG_LEVEL_4K) {
pte_t new_pte;
@@ -1228,59 +1233,27 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
cpa->flags |= CPA_FLUSHTLB;
}
cpa->numpages = 1;
- return 0;
+ goto out;
}

/*
* Check, whether we can keep the large page intact
* and just change the pte:
*/
- do_split = try_preserve_large_page(kpte, address, cpa);
- /*
- * When the range fits into the existing large page,
- * return. cp->numpages and cpa->tlbflush have been updated in
- * try_large_page:
- */
- if (do_split <= 0)
- return do_split;
-
- /*
- * We have to split the large page:
- */
- err = split_large_page(cpa, kpte, address);
- if (!err) {
- /*
- * Do a global flush tlb after splitting the large page
- * and before we do the actual change page attribute in the PTE.
- *
- * With out this, we violate the TLB application note, that says
- * "The TLBs may contain both ordinary and large-page
- * translations for a 4-KByte range of linear addresses. This
- * may occur if software modifies the paging structures so that
- * the page size used for the address range changes. If the two
- * translations differ with respect to page frame or attributes
- * (e.g., permissions), processor behavior is undefined and may
- * be implementation-specific."
- *
- * We do this global tlb flush inside the cpa_lock, so that we
- * don't allow any other cpu, with stale tlb entries change the
- * page attribute in parallel, that also falls into the
- * just split large page entry.
- */
- flush_tlb_all();
- goto repeat;
- }
+ err = try_preserve_large_page(p_kpte, address, cpa);

+out:
+ if (!debug_pagealloc_enabled())
+ spin_unlock(&cpa_lock);
return err;
}

static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);

-static int cpa_process_alias(struct cpa_data *cpa)
+static int cpa_process_alias(struct cpa_data *cpa, unsigned long vaddr)
{
struct cpa_data alias_cpa;
unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
- unsigned long vaddr;
int ret;

if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
@@ -1290,16 +1263,6 @@ static int cpa_process_alias(struct cpa_data *cpa)
* No need to redo, when the primary call touched the direct
* mapping already:
*/
- if (cpa->flags & CPA_PAGES_ARRAY) {
- struct page *page = cpa->pages[cpa->curpage];
- if (unlikely(PageHighMem(page)))
- return 0;
- vaddr = (unsigned long)page_address(page);
- } else if (cpa->flags & CPA_ARRAY)
- vaddr = cpa->vaddr[cpa->curpage];
- else
- vaddr = *cpa->vaddr;
-
if (!(within(vaddr, PAGE_OFFSET,
PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {

@@ -1338,33 +1301,64 @@ static int cpa_process_alias(struct cpa_data *cpa)
return 0;
}

+static unsigned long cpa_address(struct cpa_data *cpa, unsigned long numpages)
+{
+ /*
+ * Store the remaining nr of pages for the large page
+ * preservation check.
+ */
+ /* for array changes, we can't use large page */
+ cpa->numpages = 1;
+ if (cpa->flags & CPA_PAGES_ARRAY) {
+ struct page *page = cpa->pages[cpa->curpage];
+ if (unlikely(PageHighMem(page)))
+ return -EINVAL;
+ return (unsigned long)page_address(page);
+ } else if (cpa->flags & CPA_ARRAY) {
+ return cpa->vaddr[cpa->curpage];
+ } else {
+ cpa->numpages = numpages;
+ return *cpa->vaddr;
+ }
+}
+
+static void cpa_advance(struct cpa_data *cpa)
+{
+ if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
+ cpa->curpage++;
+ else
+ *cpa->vaddr += cpa->numpages * PAGE_SIZE;
+}
+
static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
{
unsigned long numpages = cpa->numpages;
+ unsigned long vaddr;
+ struct page *base;
+ pte_t *kpte;
int ret;

while (numpages) {
- /*
- * Store the remaining nr of pages for the large page
- * preservation check.
- */
- cpa->numpages = numpages;
- /* for array changes, we can't use large page */
- if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
- cpa->numpages = 1;
-
- if (!debug_pagealloc_enabled())
- spin_lock(&cpa_lock);
- ret = __change_page_attr(cpa, checkalias);
- if (!debug_pagealloc_enabled())
- spin_unlock(&cpa_lock);
- if (ret)
- return ret;
-
- if (checkalias) {
- ret = cpa_process_alias(cpa);
- if (ret)
+ vaddr = cpa_address(cpa, numpages);
+ if (!IS_ERR_VALUE(vaddr)) {
+repeat:
+ ret = __change_page_attr(cpa, &kpte, vaddr, checkalias);
+ if (ret < 0)
return ret;
+ if (ret) {
+ base = alloc_page(GFP_KERNEL|__GFP_NOTRACK);
+ if (!base)
+ return -ENOMEM;
+ if (__split_large_page(cpa, kpte, vaddr, base))
+ __free_page(base);
+ goto repeat;
+ }
+
+ if (checkalias) {
+ ret = cpa_process_alias(cpa, vaddr);
+ if (ret < 0)
+ return ret;
+ }
}

/*
@@ -1374,11 +1368,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
*/
BUG_ON(cpa->numpages > numpages || !cpa->numpages);
numpages -= cpa->numpages;
- if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
- cpa->curpage++;
- else
- *cpa->vaddr += cpa->numpages * PAGE_SIZE;
-
+ cpa_advance(cpa);
}
return 0;
}