[PATCH 3.13.y-ckt 083/143] mm/hugetlb: take page table lock in follow_huge_pmd()
From: Kamal Mostafa
Date: Tue Mar 31 2015 - 16:06:22 EST
3.13.11-ckt18 -stable review patch. If anyone has any objections, please let me know.
------------------
From: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
commit e66f17ff71772b209eed39de35aaa99ba819c93d upstream.
We have a race condition between move_pages() and freeing hugepages, where
move_pages() calls follow_page(FOLL_GET) for hugepages internally and
tries to get its refcount without preventing concurrent freeing. This
race crashes the kernel, so this patch fixes it by moving FOLL_GET code
for hugepages into follow_huge_pmd() with taking the page table lock.
This patch intentionally removes page==NULL check after pte_page.
This is justified because pte_page() never returns NULL for any
architectures or configurations.
This patch changes the behavior of follow_huge_pmd() for tail pages and
then tail pages can be pinned/returned. So the caller must be changed to
properly handle the returned tail pages.
We could have a choice to add the similar locking to
follow_huge_(addr|pud) for consistency, but it's not necessary because
currently these functions don't support FOLL_GET flag, so let's leave it
for future development.
Here is the reproducer:
$ cat movepages.c
#include <stdio.h>
#include <stdlib.h>
#include <numaif.h>
#define ADDR_INPUT 0x700000000000UL
#define HPS 0x200000
#define PS 0x1000
int main(int argc, char *argv[]) {
int i;
int nr_hp = strtol(argv[1], NULL, 0);
int nr_p = nr_hp * HPS / PS;
int ret;
void **addrs;
int *status;
int *nodes;
pid_t pid;
pid = strtol(argv[2], NULL, 0);
addrs = malloc(sizeof(char *) * nr_p + 1);
status = malloc(sizeof(char *) * nr_p + 1);
nodes = malloc(sizeof(char *) * nr_p + 1);
while (1) {
for (i = 0; i < nr_p; i++) {
addrs[i] = (void *)ADDR_INPUT + i * PS;
nodes[i] = 1;
status[i] = 0;
}
ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
MPOL_MF_MOVE_ALL);
if (ret == -1)
err("move_pages");
for (i = 0; i < nr_p; i++) {
addrs[i] = (void *)ADDR_INPUT + i * PS;
nodes[i] = 0;
status[i] = 0;
}
ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
MPOL_MF_MOVE_ALL);
if (ret == -1)
err("move_pages");
}
return 0;
}
$ cat hugepage.c
#include <stdio.h>
#include <sys/mman.h>
#include <string.h>
#define ADDR_INPUT 0x700000000000UL
#define HPS 0x200000
int main(int argc, char *argv[]) {
int nr_hp = strtol(argv[1], NULL, 0);
char *p;
while (1) {
p = mmap((void *)ADDR_INPUT, nr_hp * HPS, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
if (p != (void *)ADDR_INPUT) {
perror("mmap");
break;
}
memset(p, 0, nr_hp * HPS);
munmap(p, nr_hp * HPS);
}
}
$ sysctl vm.nr_hugepages=40
$ ./hugepage 10 &
$ ./movepages 10 $(pgrep -f hugepage)
Fixes: e632a938d914 ("mm: migrate: add hugepage migration code to move_pages()")
Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
Reported-by: Hugh Dickins <hughd@xxxxxxxxxx>
Cc: James Hogan <james.hogan@xxxxxxxxxx>
Cc: David Rientjes <rientjes@xxxxxxxxxx>
Cc: Mel Gorman <mel@xxxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
Cc: Luiz Capitulino <lcapitulino@xxxxxxxxxx>
Cc: Nishanth Aravamudan <nacc@xxxxxxxxxxxxxxxxxx>
Cc: Lee Schermerhorn <lee.schermerhorn@xxxxxx>
Cc: Steve Capper <steve.capper@xxxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
[ kamal: backport to 3.13-stable: applied mm/gup.c change to mm/memory.c ]
Signed-off-by: Kamal Mostafa <kamal@xxxxxxxxxxxxx>
---
include/linux/hugetlb.h | 8 ++++----
include/linux/swapops.h | 4 ++++
mm/hugetlb.c | 48 ++++++++++++++++++++++++++++++++++--------------
mm/memory.c | 25 ++++++++-----------------
mm/migrate.c | 5 +++--
5 files changed, 53 insertions(+), 37 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5572709..723c75c 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -89,9 +89,9 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
int write);
struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int write);
+ pmd_t *pmd, int flags);
struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int write);
+ pud_t *pud, int flags);
int pmd_huge(pmd_t pmd);
int pud_huge(pud_t pmd);
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
@@ -128,8 +128,8 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
static inline void hugetlb_show_meminfo(void)
{
}
-#define follow_huge_pmd(mm, addr, pmd, write) NULL
-#define follow_huge_pud(mm, addr, pud, write) NULL
+#define follow_huge_pmd(mm, addr, pmd, flags) NULL
+#define follow_huge_pud(mm, addr, pud, flags) NULL
#define prepare_hugepage_range(file, addr, len) (-EINVAL)
#define pmd_huge(x) 0
#define pud_huge(x) 0
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index c0f7526..d7f3b3f 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -137,6 +137,8 @@ static inline void make_migration_entry_read(swp_entry_t *entry)
*entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry));
}
+extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
+ spinlock_t *ptl);
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address);
extern void migration_entry_wait_huge(struct vm_area_struct *vma,
@@ -150,6 +152,8 @@ static inline int is_migration_entry(swp_entry_t swp)
}
#define migration_entry_to_page(swp) NULL
static inline void make_migration_entry_read(swp_entry_t *entryp) { }
+static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
+ spinlock_t *ptl) { }
static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address) { }
static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2a9e991..f6f1972 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3437,28 +3437,48 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int write)
+ pmd_t *pmd, int flags)
{
- struct page *page;
-
- if (!pmd_present(*pmd))
- return NULL;
- page = pte_page(*(pte_t *)pmd);
- if (page)
- page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
+ struct page *page = NULL;
+ spinlock_t *ptl;
+retry:
+ ptl = pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
+ /*
+ * make sure that the address range covered by this pmd is not
+ * unmapped from other threads.
+ */
+ if (!pmd_huge(*pmd))
+ goto out;
+ if (pmd_present(*pmd)) {
+ page = pte_page(*(pte_t *)pmd) +
+ ((address & ~PMD_MASK) >> PAGE_SHIFT);
+ if (flags & FOLL_GET)
+ get_page(page);
+ } else {
+ if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) {
+ spin_unlock(ptl);
+ __migration_entry_wait(mm, (pte_t *)pmd, ptl);
+ goto retry;
+ }
+ /*
+ * hwpoisoned entry is treated as no_page_table in
+ * follow_page_mask().
+ */
+ }
+out:
+ spin_unlock(ptl);
return page;
}
struct page *
follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int write)
+ pud_t *pud, int flags)
{
- struct page *page;
+ if (flags & FOLL_GET)
+ return NULL;
- page = pte_page(*(pte_t *)pud);
- if (page)
- page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
- return page;
+ return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
}
#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
diff --git a/mm/memory.c b/mm/memory.c
index b7785d4..7040326 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1591,10 +1591,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
if (pud_none(*pud))
return no_page_table(vma, flags);
if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
- if (flags & FOLL_GET)
- return NULL;
- page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
- return page;
+ page = follow_huge_pud(mm, address, pud, flags);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
}
if (unlikely(pud_bad(*pud)))
return no_page_table(vma, flags);
@@ -1603,19 +1603,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
if (pmd_none(*pmd))
return no_page_table(vma, flags);
if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
- page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
- if (flags & FOLL_GET) {
- /*
- * Refcount on tail pages are not well-defined and
- * shouldn't be taken. The caller should handle a NULL
- * return when trying to follow tail pages.
- */
- if (PageHead(page))
- get_page(page);
- else
- page = NULL;
- }
- return page;
+ page = follow_huge_pmd(mm, address, pmd, flags);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
}
if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
return no_page_table(vma, flags);
diff --git a/mm/migrate.c b/mm/migrate.c
index 62047f8..af1bf18 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -210,7 +210,7 @@ static void remove_migration_ptes(struct page *old, struct page *new)
* get to the page and wait until migration is finished.
* When we return from this function the fault will be retried.
*/
-static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
+void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
spinlock_t *ptl)
{
pte_t pte;
@@ -1232,7 +1232,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
goto put_and_set;
if (PageHuge(page)) {
- isolate_huge_page(page, &pagelist);
+ if (PageHead(page))
+ isolate_huge_page(page, &pagelist);
goto put_and_set;
}
--
1.9.1
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/