[PATCH 1/5] Paravirt page alloc.patch

From: Zachary Amsden
Date: Tue Dec 12 2006 - 17:35:11 EST


The VMI backend uses explicit page type notification to track shadow
page tables. The allocation of page table roots is especially tricky.
We want to clone the root for non-PAE mode while it is protected under
the pgd lock.

Signed-off-by: Zachary Amsden <zach@xxxxxxxxxx>

===================================================================
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -545,6 +545,12 @@ struct paravirt_ops paravirt_ops = {
.flush_tlb_kernel = native_flush_tlb_global,
.flush_tlb_single = native_flush_tlb_single,

+ .alloc_pt = (void *)native_nop,
+ .alloc_pd = (void *)native_nop,
+ .alloc_pd_clone = (void *)native_nop,
+ .release_pt = (void *)native_nop,
+ .release_pd = (void *)native_nop,
+
.set_pte = native_set_pte,
.set_pte_at = native_set_pte_at,
.set_pmd = native_set_pmd,
===================================================================
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -62,6 +62,7 @@ static pmd_t * __init one_md_table_init(

#ifdef CONFIG_X86_PAE
pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+ paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
if (pmd_table != pmd_offset(pud, 0))
@@ -82,6 +83,7 @@ static pte_t * __init one_page_table_ini
{
if (pmd_none(*pmd)) {
pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+ paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
if (page_table != pte_offset_kernel(pmd, 0))
BUG();
@@ -347,6 +349,8 @@ static void __init pagetable_init (void)
/* Init entries of the first-level page table to the zero page */
for (i = 0; i < PTRS_PER_PGD; i++)
set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
+#else
+ paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
#endif

/* Enable PSE if available */
===================================================================
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -60,6 +60,7 @@ static struct page *split_large_page(uns
address = __pa(address);
addr = address & LARGE_PAGE_MASK;
pbase = (pte_t *)page_address(base);
+ paravirt_alloc_pt(page_to_pfn(base));
for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
addr == address ? prot : ref_prot));
@@ -166,6 +167,7 @@ __change_page_attr(struct page *page, pg
if (!PageReserved(kpte_page)) {
if (cpu_has_pse && (page_private(kpte_page) == 0)) {
ClearPagePrivate(kpte_page);
+ paravirt_release_pt(page_to_pfn(kpte_page));
list_add(&kpte_page->lru, &df_list);
revert_page(kpte_page, address);
}
===================================================================
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -245,8 +245,14 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
swapper_pg_dir + USER_PTRS_PER_PGD,
KERNEL_PGD_PTRS);
+
if (PTRS_PER_PMD > 1)
return;
+
+ /* must happen under lock */
+ paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
+ __pa(swapper_pg_dir) >> PAGE_SHIFT,
+ USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD);

pgd_list_add(pgd);
spin_unlock_irqrestore(&pgd_lock, flags);
@@ -257,6 +263,7 @@ void pgd_dtor(void *pgd, kmem_cache_t *c
{
unsigned long flags; /* can be called from interrupt context */

+ paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
spin_lock_irqsave(&pgd_lock, flags);
pgd_list_del(pgd);
spin_unlock_irqrestore(&pgd_lock, flags);
@@ -274,13 +281,18 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
if (!pmd)
goto out_oom;
+ paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
}
return pgd;

out_oom:
- for (i--; i >= 0; i--)
- kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+ for (i--; i >= 0; i--) {
+ pgd_t pgdent = pgd[i];
+ void* pmd = (void *)__va(pgd_val(pgdent)-1);
+ paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
+ kmem_cache_free(pmd_cache, pmd);
+ }
kmem_cache_free(pgd_cache, pgd);
return NULL;
}
@@ -291,8 +303,12 @@ void pgd_free(pgd_t *pgd)

/* in the PAE case user pgd entries are overwritten before usage */
if (PTRS_PER_PMD > 1)
- for (i = 0; i < USER_PTRS_PER_PGD; ++i)
- kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+ for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+ pgd_t pgdent = pgd[i];
+ void* pmd = (void *)__va(pgd_val(pgdent)-1);
+ paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
+ kmem_cache_free(pmd_cache, pmd);
+ }
/* in the non-PAE case, free_pgtables() clears user pgd entries */
kmem_cache_free(pgd_cache, pgd);
}
===================================================================
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -126,6 +126,12 @@ struct paravirt_ops
void (fastcall *flush_tlb_user)(void);
void (fastcall *flush_tlb_kernel)(void);
void (fastcall *flush_tlb_single)(u32 addr);
+
+ void (fastcall *alloc_pt)(u32 pfn);
+ void (fastcall *alloc_pd)(u32 pfn);
+ void (fastcall *alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
+ void (fastcall *release_pt)(u32 pfn);
+ void (fastcall *release_pd)(u32 pfn);

void (fastcall *set_pte)(pte_t *ptep, pte_t pteval);
void (fastcall *set_pte_at)(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval);
@@ -325,6 +331,14 @@ static __inline unsigned long apic_read(
#define __flush_tlb_global() paravirt_ops.flush_tlb_kernel()
#define __flush_tlb_single(addr) paravirt_ops.flush_tlb_single(addr)

+#define paravirt_alloc_pt(pfn) paravirt_ops.alloc_pt(pfn)
+#define paravirt_release_pt(pfn) paravirt_ops.release_pt(pfn)
+
+#define paravirt_alloc_pd(pfn) paravirt_ops.alloc_pd(pfn)
+#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) \
+ paravirt_ops.alloc_pd_clone(pfn, clonepfn, start, count)
+#define paravirt_release_pd(pfn) paravirt_ops.release_pd(pfn)
+
static inline void set_pte(pte_t *ptep, pte_t pteval)
{
paravirt_ops.set_pte(ptep, pteval);
===================================================================
--- a/include/asm-i386/pgalloc.h
+++ b/include/asm-i386/pgalloc.h
@@ -5,13 +5,31 @@
#include <linux/threads.h>
#include <linux/mm.h> /* for struct page */

-#define pmd_populate_kernel(mm, pmd, pte) \
- set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define paravirt_alloc_pt(pfn) do { } while (0)
+#define paravirt_alloc_pd(pfn) do { } while (0)
+#define paravirt_alloc_pd(pfn) do { } while (0)
+#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
+#define paravirt_release_pt(pfn) do { } while (0)
+#define paravirt_release_pd(pfn) do { } while (0)
+#endif
+
+#define pmd_populate_kernel(mm, pmd, pte) \
+do { \
+ paravirt_alloc_pt(__pa(pte) >> PAGE_SHIFT); \
+ set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \
+} while (0)

#define pmd_populate(mm, pmd, pte) \
+do { \
+ paravirt_alloc_pt(page_to_pfn(pte)); \
set_pmd(pmd, __pmd(_PAGE_TABLE + \
((unsigned long long)page_to_pfn(pte) << \
- (unsigned long long) PAGE_SHIFT)))
+ (unsigned long long) PAGE_SHIFT))); \
+} while (0)
+
/*
* Allocate and free page tables.
*/
@@ -32,7 +50,11 @@ static inline void pte_free(struct page
}


-#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
+#define __pte_free_tlb(tlb,pte) \
+do { \
+ paravirt_release_pt(page_to_pfn(pte)); \
+ tlb_remove_page((tlb),(pte)); \
+} while (0)

#ifdef CONFIG_X86_PAE
/*
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/