[tip:x86/mm] x86, mm: Hold mm->page_table_lock while doing vmalloc_sync
From: tip-bot for Jeremy Fitzhardinge
Date:  Tue Oct 19 2010 - 18:17:55 EST
Commit-ID:  617d34d9e5d8326ec8f188c616aa06ac59d083fe
Gitweb:     http://git.kernel.org/tip/617d34d9e5d8326ec8f188c616aa06ac59d083fe
Author:     Jeremy Fitzhardinge <jeremy.fitzhardinge@xxxxxxxxxx>
AuthorDate: Tue, 21 Sep 2010 12:01:51 -0700
Committer:  H. Peter Anvin <hpa@xxxxxxxxxxxxxxx>
CommitDate: Tue, 19 Oct 2010 13:57:08 -0700
x86, mm: Hold mm->page_table_lock while doing vmalloc_sync
Take mm->page_table_lock while syncing the vmalloc region.  This prevents
a race with the Xen pagetable pin/unpin code, which expects that the
page_table_lock is already held.  If this race occurs, then Xen can see
an inconsistent page type (a page can either be read/write or a pagetable
page, and pin/unpin converts it between them), which will cause either
the pin or the set_p[gm]d to fail; either will crash the kernel.
vmalloc_sync_all() should be called rarely, so this extra use of
page_table_lock should not interfere with its normal users.
The mm pointer is stashed in the pgd page's index field, as that won't
be otherwise used for pgds.
Reported-by: Ian Campbell <ian.cambell@xxxxxxxxxxxxx>
Originally-by: Jan Beulich <jbeulich@xxxxxxxxxx>
LKML-Reference: <4CB88A4C.1080305@xxxxxxxx>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@xxxxxxxxxx>
Signed-off-by: H. Peter Anvin <hpa@xxxxxxxxxxxxxxx>
---
 arch/x86/include/asm/pgtable.h |    2 ++
 arch/x86/mm/fault.c            |   11 ++++++++++-
 arch/x86/mm/init_64.c          |    7 +++++++
 arch/x86/mm/pgtable.c          |   20 +++++++++++++++++---
 4 files changed, 36 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 2d0a33b..ada823a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,6 +28,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
 extern spinlock_t pgd_lock;
 extern struct list_head pgd_list;
 
+extern struct mm_struct *pgd_page_get_mm(struct page *page);
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else  /* !CONFIG_PARAVIRT */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index caec229..6c27c39 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -229,7 +229,16 @@ void vmalloc_sync_all(void)
 
 		spin_lock_irqsave(&pgd_lock, flags);
 		list_for_each_entry(page, &pgd_list, lru) {
-			if (!vmalloc_sync_one(page_address(page), address))
+			spinlock_t *pgt_lock;
+			int ret;
+
+			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+			spin_lock(pgt_lock);
+			ret = vmalloc_sync_one(page_address(page), address);
+			spin_unlock(pgt_lock);
+
+			if (!ret)
 				break;
 		}
 		spin_unlock_irqrestore(&pgd_lock, flags);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1ad7c0f..4d323fb 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -116,12 +116,19 @@ void sync_global_pgds(unsigned long start, unsigned long end)
 		spin_lock_irqsave(&pgd_lock, flags);
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
+			spinlock_t *pgt_lock;
+
 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+			spin_lock(pgt_lock);
+
 			if (pgd_none(*pgd))
 				set_pgd(pgd, *pgd_ref);
 			else
 				BUG_ON(pgd_page_vaddr(*pgd)
 				       != pgd_page_vaddr(*pgd_ref));
+
+			spin_unlock(pgt_lock);
 		}
 		spin_unlock_irqrestore(&pgd_lock, flags);
 	}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5c4ee42..c70e57d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd)
 #define UNSHARED_PTRS_PER_PGD				\
 	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
 
-static void pgd_ctor(pgd_t *pgd)
+
+static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
+{
+	BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
+	virt_to_page(pgd)->index = (pgoff_t)mm;
+}
+
+struct mm_struct *pgd_page_get_mm(struct page *page)
+{
+	return (struct mm_struct *)page->index;
+}
+
+static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
 {
 	/* If the pgd points to a shared pagetable level (either the
 	   ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -105,8 +117,10 @@ static void pgd_ctor(pgd_t *pgd)
 	}
 
 	/* list required to sync kernel mapping updates */
-	if (!SHARED_KERNEL_PMD)
+	if (!SHARED_KERNEL_PMD) {
+		pgd_set_mm(pgd, mm);
 		pgd_list_add(pgd);
+	}
 }
 
 static void pgd_dtor(pgd_t *pgd)
@@ -272,7 +286,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	 */
 	spin_lock_irqsave(&pgd_lock, flags);
 
-	pgd_ctor(pgd);
+	pgd_ctor(mm, pgd);
 	pgd_prepopulate_pmd(mm, pgd, pmds);
 
 	spin_unlock_irqrestore(&pgd_lock, flags);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/