Re: Race in new page migration code?

From: Nick Piggin
Date: Sun Jan 15 2006 - 01:57:14 EST


On Sat, Jan 14, 2006 at 10:58:25AM -0800, Christoph Lameter wrote:
> On Sat, 14 Jan 2006, Nick Piggin wrote:
>
> > > We take that reference count on the page:
> > Yes, after you have dropped all your claims to pin this page
> > (ie. pte lock). You really can't take a refcount on a page that
>
> Oh. Now I see. I screwed that up by a fix I added.... We cannot drop the
> ptl here. So back to the way it was before. Remove the draining from
> isolate_lru_page and do it before scanning for pages so that we do not
> have to drop the ptl.
>

OK, if you prefer doing it with the pte lock held, how's this patch?

--
Migration code currently does not take a reference to target page
properly, so between unlocking the pte and trying to take a new
reference to the page with isolate_lru_page, anything could happen to
it.

Fix this by holding the pte lock until we get a chance to elevate the
refcount.

Other small cleanups while we're here.

Signed-off-by: Nick Piggin <npiggin@xxxxxxx>

Index: linux-2.6/mm/mempolicy.c
===================================================================
--- linux-2.6.orig/mm/mempolicy.c
+++ linux-2.6/mm/mempolicy.c
@@ -216,11 +216,8 @@ static int check_pte_range(struct vm_are

if (flags & MPOL_MF_STATS)
gather_stats(page, private);
- else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
- spin_unlock(ptl);
+ else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
migrate_page_add(vma, page, private, flags);
- spin_lock(ptl);
- }
else
break;
} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -309,6 +306,10 @@ check_range(struct mm_struct *mm, unsign
int err;
struct vm_area_struct *first, *vma, *prev;

+ /* Clear the LRU lists so pages can be isolated */
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ lru_add_drain_all();
+
first = find_vma(mm, start);
if (!first)
return ERR_PTR(-EFAULT);
@@ -555,15 +556,8 @@ static void migrate_page_add(struct vm_a
if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
mapping_writably_mapped(page->mapping) ||
single_mm_mapping(vma->vm_mm, page->mapping)) {
- int rc = isolate_lru_page(page);
-
- if (rc == 1)
+ if (isolate_lru_page(page))
list_add(&page->lru, pagelist);
- /*
- * If the isolate attempt was not successful then we just
- * encountered an unswappable page. Something must be wrong.
- */
- WARN_ON(rc == 0);
}
}

Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c
+++ linux-2.6/mm/vmscan.c
@@ -586,7 +586,7 @@ static inline void move_to_lru(struct pa
}

/*
- * Add isolated pages on the list back to the LRU
+ * Add isolated pages on the list back to the LRU.
*
* returns the number of pages put back.
*/
@@ -760,46 +760,33 @@ next:
return nr_failed + retry;
}

-static void lru_add_drain_per_cpu(void *dummy)
-{
- lru_add_drain();
-}
-
/*
* Isolate one page from the LRU lists and put it on the
- * indicated list. Do necessary cache draining if the
- * page is not on the LRU lists yet.
+ * indicated list with elevated refcount.
*
* Result:
* 0 = page not on LRU list
* 1 = page removed from LRU list and added to the specified list.
- * -ENOENT = page is being freed elsewhere.
*/
int isolate_lru_page(struct page *page)
{
- int rc = 0;
- struct zone *zone = page_zone(page);
+ int ret = 0;

-redo:
- spin_lock_irq(&zone->lru_lock);
- rc = __isolate_lru_page(page);
- if (rc == 1) {
- if (PageActive(page))
- del_page_from_active_list(zone, page);
- else
- del_page_from_inactive_list(zone, page);
- }
- spin_unlock_irq(&zone->lru_lock);
- if (rc == 0) {
- /*
- * Maybe this page is still waiting for a cpu to drain it
- * from one of the lru lists?
- */
- rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
- if (rc == 0 && PageLRU(page))
- goto redo;
+ if (PageLRU(page)) {
+ struct zone *zone = page_zone(page);
+ spin_lock_irq(&zone->lru_lock);
+ if (TestClearPageLRU(page)) {
+ ret = 1;
+ get_page(page);
+ if (PageActive(page))
+ del_page_from_active_list(zone, page);
+ else
+ del_page_from_inactive_list(zone, page);
+ }
+ spin_unlock_irq(&zone->lru_lock);
}
- return rc;
+
+ return ret;
}
#endif

@@ -831,18 +818,20 @@ static int isolate_lru_pages(int nr_to_s
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);

- switch (__isolate_lru_page(page)) {
- case 1:
- /* Succeeded to isolate page */
- list_move(&page->lru, dst);
- nr_taken++;
- break;
- case -ENOENT:
- /* Not possible to isolate */
- list_move(&page->lru, src);
- break;
- default:
+ if (!TestClearPageLRU(page))
BUG();
+ list_del(&page->lru);
+ if (get_page_testone(page)) {
+ /*
+ * It is being freed elsewhere
+ */
+ __put_page(page);
+ SetPageLRU(page);
+ list_add(&page->lru, src);
+ continue;
+ } else {
+ list_add(&page->lru, dst);
+ nr_taken++;
}
}

Index: linux-2.6/include/linux/mm_inline.h
===================================================================
--- linux-2.6.orig/include/linux/mm_inline.h
+++ linux-2.6/include/linux/mm_inline.h
@@ -39,24 +39,3 @@ del_page_from_lru(struct zone *zone, str
}
}

-/*
- * Isolate one page from the LRU lists.
- *
- * - zone->lru_lock must be held
- */
-static inline int __isolate_lru_page(struct page *page)
-{
- if (unlikely(!TestClearPageLRU(page)))
- return 0;
-
- if (get_page_testone(page)) {
- /*
- * It is being freed elsewhere
- */
- __put_page(page);
- SetPageLRU(page);
- return -ENOENT;
- }
-
- return 1;
-}
Index: linux-2.6/include/linux/swap.h
===================================================================
--- linux-2.6.orig/include/linux/swap.h
+++ linux-2.6/include/linux/swap.h
@@ -167,6 +167,7 @@ extern void FASTCALL(lru_cache_add_activ
extern void FASTCALL(activate_page(struct page *));
extern void FASTCALL(mark_page_accessed(struct page *));
extern void lru_add_drain(void);
+extern int lru_add_drain_all(void);
extern int rotate_reclaimable_page(struct page *page);
extern void swap_setup(void);

Index: linux-2.6/mm/swap.c
===================================================================
--- linux-2.6.orig/mm/swap.c
+++ linux-2.6/mm/swap.c
@@ -174,6 +174,24 @@ void lru_add_drain(void)
put_cpu();
}

+static void lru_add_drain_per_cpu(void *dummy)
+{
+ lru_add_drain();
+}
+
+/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all(void)
+{
+#ifdef CONFIG_SMP
+ return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
+#else
+ lru_add_drain();
+ return 0;
+#endif
+}
+
/*
* This path almost never happens for VM activity - pages are normally
* freed via pagevecs. But it gets used by networking.
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -94,6 +94,7 @@ generic_file_direct_IO(int rw, struct ki
* ->private_lock (try_to_unmap_one)
* ->tree_lock (try_to_unmap_one)
* ->zone.lru_lock (follow_page->mark_page_accessed)
+ * ->zone.lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty)
* ->inode_lock (page_remove_rmap->set_page_dirty)
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c
+++ linux-2.6/mm/rmap.c
@@ -33,7 +33,7 @@
* mapping->i_mmap_lock
* anon_vma->lock
* mm->page_table_lock or pte_lock
- * zone->lru_lock (in mark_page_accessed)
+ * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/