[PATCH] per-page pte_chain locking

From: William Lee Irwin III (wli@holomorphy.com)
Date: Thu Apr 18 2002 - 17:09:44 EST


This is basically the same patch as before, with a correction to some
code in page_add_rmap() under an #ifdef DEBUG_RMAP (a bug I had
introduced myself).

riel, please apply.

Cheers,
Bill
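
For context, the scheme the patch below introduces is a per-page bit
spinlock (PG_chainlock) guarding page->pte_chain, nesting inside the
pagemap_lru_lock and the page lock and outside the mm->page_table_lock
trylock. A minimal sketch of how a caller such as page_launder() is
expected to use it (this only paraphrases the hunks below; it is not an
additional part of the patch):

	/*
	 * Lock ordering, per the updated comment in mm/rmap.c:
	 *   pagemap_lru_lock
	 *     page lock (trylock in the launder path)
	 *       pte_chain_lock(page)      <- new per-page bit spinlock
	 *         mm->page_table_lock     <- trylock only, in try_to_unmap_one()
	 */
	spin_lock(&pagemap_lru_lock);
	if (!TryLockPage(page)) {
		pte_chain_lock(page);
		if (page->pte_chain)
			try_to_unmap(page);	/* may trylock mm->page_table_lock */
		pte_chain_unlock(page);
		UnlockPage(page);
	}
	spin_unlock(&pagemap_lru_lock);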

# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
# ChangeSet 1.404 -> 1.406
# mm/rmap.c 1.3 -> 1.5
# include/linux/mm.h 1.39 -> 1.40
# mm/vmscan.c 1.61 -> 1.62
# mm/filemap.c 1.66 -> 1.67
# mm/swap.c 1.18 -> 1.19
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 02/04/17 wli@elm3b52.eng.beaverton.ibm.com 1.405
# mm.h:
# Add pte_chain's lock bit and locking routines to the list of page flag accessor declarations.
# vmscan.c:
# Protect accesses to page->pte_chain with the pte_chain_lock
# rmap.c:
# Convert reverse-mapping helper routines to using per-page pte_chain locks, with comment updates.
# swap.c:
# Protect against concurrent modification of page->pte_chain in drop_page()
# filemap.c:
# truncate_complete_page() requires eliminating all references arising from ->pte_chain; if it is non-NULL, BUG()
# --------------------------------------------
# 02/04/18 wli@holomorphy.com 1.406
# rmap.c:
# Repair the DEBUG_RMAP code in page_add_rmap() so it doesn't deadlock.
# --------------------------------------------
#
diff -Nru a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h Thu Apr 18 08:17:26 2002
+++ b/include/linux/mm.h Thu Apr 18 08:17:26 2002
@@ -299,6 +299,7 @@
 #define PG_arch_1 13
 #define PG_reserved 14
 #define PG_launder 15 /* written out by VM pressure.. */
+#define PG_chainlock 16 /* lock bit for ->pte_chain */
 
 /* Make it prettier to test the above... */
 #define UnlockPage(page) unlock_page(page)
@@ -315,6 +316,20 @@
 #define SetPageChecked(page) set_bit(PG_checked, &(page)->flags)
 #define PageLaunder(page) test_bit(PG_launder, &(page)->flags)
 #define SetPageLaunder(page) set_bit(PG_launder, &(page)->flags)
+
+/*
+ * inlines for acquisition and release of PG_chainlock
+ */
+static inline void pte_chain_lock(struct page *page)
+{
+ while (test_and_set_bit(PG_chainlock, &page->flags))
+ cpu_relax();
+}
+
+static inline void pte_chain_unlock(struct page *page)
+{
+ clear_bit(PG_chainlock, &page->flags);
+}
 
 /*
  * The zone field is never updated after free_area_init_core()
diff -Nru a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c Thu Apr 18 08:17:26 2002
+++ b/mm/filemap.c Thu Apr 18 08:17:26 2002
@@ -233,8 +233,14 @@
 
 static void truncate_complete_page(struct page *page)
 {
- /* Leave it on the LRU if it gets converted into anonymous buffers */
- if (!page->pte_chain && (!page->buffers || do_flushpage(page, 0)))
+ /*
+ * Leave it on the LRU if it gets converted into anonymous buffers
+ * It is furthermore assumed that no process has mapped this page.
+ */
+ if (page->pte_chain)
+ BUG();
+
+ if (!page->buffers || do_flushpage(page, 0))
                 lru_cache_del(page);
 
         /*
diff -Nru a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c Thu Apr 18 08:17:26 2002
+++ b/mm/rmap.c Thu Apr 18 08:17:26 2002
@@ -13,9 +13,9 @@
 
 /*
  * Locking:
- * - the page->pte_chain is protected by the pagemap_lru_lock,
- * we probably want to change this to a per-page lock in the
- * future
+ * - the page->pte_chain is protected by the PG_chainlock bit,
+ * which nests within the pagemap_lru_lock, then the
+ * mm->page_table_lock, and then the page lock.
  * - because swapout locking is opposite to the locking order
  * in the page fault path, the swapout path uses trylocks
  * on the mm->page_table_lock
@@ -59,7 +59,7 @@
  *
  * Quick test_and_clear_referenced for all mappings to a page,
  * returns the number of processes which referenced the page.
- * Caller needs to hold the pagemap_lru_lock.
+ * Caller needs to hold the pte_chain_lock.
  */
 int FASTCALL(page_referenced(struct page *));
 int page_referenced(struct page * page)
@@ -91,14 +91,7 @@
 void page_add_rmap(struct page * page, pte_t * ptep)
 {
         struct pte_chain * pte_chain;
- zone_t *zone;
-
- if (!VALID_PAGE(page) || PageReserved(page))
- return;
 
- zone = page_zone(page);
-
- spin_lock(&pagemap_lru_lock);
 #ifdef DEBUG_RMAP
         if (!page || !ptep)
                 BUG();
@@ -106,6 +99,10 @@
                 BUG();
         if (!ptep_to_mm(ptep));
                 BUG();
+ if (!VALID_PAGE(page) || PageReserved(page))
+ return;
+
+ pte_chain_lock(page);
         {
                 struct pte_chain * pc;
                 for (pc = page->pte_chain; pc; pc = pc->next) {
@@ -113,15 +110,22 @@
                                 BUG();
                 }
         }
+ pte_chain_unlock(page);
 #endif
- pte_chain = pte_chain_alloc(zone);
+
+ if (!VALID_PAGE(page) || PageReserved(page))
+ return;
+
+ pte_chain = pte_chain_alloc(page_zone(page));
+
+ pte_chain_lock(page);
 
         /* Hook up the pte_chain to the page. */
         pte_chain->ptep = ptep;
         pte_chain->next = page->pte_chain;
         page->pte_chain = pte_chain;
 
- spin_unlock(&pagemap_lru_lock);
+ pte_chain_unlock(page);
 }
 
 /**
@@ -147,7 +151,7 @@
 
         zone = page_zone(page);
 
- spin_lock(&pagemap_lru_lock);
+ pte_chain_lock(page);
         for (pc = page->pte_chain; pc; prev_pc = pc, pc = pc->next) {
                 if (pc->ptep == ptep) {
                         pte_chain_free(pc, prev_pc, page, zone);
@@ -165,7 +169,7 @@
 #endif
 
 out:
- spin_unlock(&pagemap_lru_lock);
+ pte_chain_unlock(page);
         return;
                         
 }
@@ -181,7 +185,8 @@
  * Locking:
  * pagemap_lru_lock page_launder()
  * page lock page_launder(), trylock
- * mm->page_table_lock try_to_unmap_one(), trylock
+ * pte_chain_lock page_launder()
+ * mm->page_table_lock try_to_unmap_one(), trylock
  */
 int FASTCALL(try_to_unmap_one(struct page *, pte_t *));
 int try_to_unmap_one(struct page * page, pte_t * ptep)
@@ -300,7 +305,7 @@
  * is over its RSS (resident set size) limit. For shared pages
  * we penalise it only if all processes using it are over their
  * rss limits.
- * The caller needs to hold the pagemap_lru_lock.
+ * The caller needs to hold the page's pte_chain_lock.
  */
 int FASTCALL(page_over_rsslimit(struct page *));
 int page_over_rsslimit(struct page * page)
@@ -364,6 +369,7 @@
  * This function unlinks pte_chain from the singly linked list it
  * may be on and adds the pte_chain to the free list. May also be
  * called for new pte_chain structures which aren't on any list yet.
+ * Caller needs to hold the pte_chain_lock if the page is non-NULL.
  */
 static inline void pte_chain_free(struct pte_chain * pte_chain,
                 struct pte_chain * prev_pte_chain, struct page * page,
@@ -385,6 +391,7 @@
  *
  * Returns a pointer to a fresh pte_chain structure. Allocates new
  * pte_chain structures as required.
+ * Caller needs to hold the page's pte_chain_lock.
  */
 static inline struct pte_chain * pte_chain_alloc(zone_t * zone)
 {
diff -Nru a/mm/swap.c b/mm/swap.c
--- a/mm/swap.c Thu Apr 18 08:17:26 2002
+++ b/mm/swap.c Thu Apr 18 08:17:26 2002
@@ -104,6 +104,7 @@
         }
 
         /* Make sure the page really is reclaimable. */
+ pte_chain_lock(page);
         if (!page->mapping || PageDirty(page) || page->pte_chain ||
                         page->buffers || page_count(page) > 1)
                 deactivate_page_nolock(page);
@@ -119,6 +120,7 @@
                         add_page_to_inactive_clean_list(page);
                 }
         }
+ pte_chain_unlock(page);
 }
 
 /*
diff -Nru a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c Thu Apr 18 08:17:26 2002
+++ b/mm/vmscan.c Thu Apr 18 08:17:26 2002
@@ -47,6 +47,7 @@
         page->age -= min(PAGE_AGE_DECL, (int)page->age);
 }
 
+/* Must be called with page's pte_chain_lock held. */
 static inline int page_mapping_inuse(struct page * page)
 {
         struct address_space * mapping = page->mapping;
@@ -108,14 +109,21 @@
                 }
 
                 /* Page cannot be reclaimed ? Move to inactive_dirty list. */
+ pte_chain_lock(page);
                 if (unlikely(page->pte_chain || page->buffers ||
                                 PageReferenced(page) || PageDirty(page) ||
                                 page_count(page) > 1 || TryLockPage(page))) {
                         del_page_from_inactive_clean_list(page);
                         add_page_to_inactive_dirty_list(page);
+ pte_chain_unlock(page);
                         continue;
                 }
 
+ /*
+ * From here until reaching either the bottom of the loop
+ * or found_page: the pte_chain_lock is held.
+ */
+
                 /* OK, remove the page from the caches. */
                 if (PageSwapCache(page)) {
                         entry.val = page->index;
@@ -132,6 +140,7 @@
                 printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
                 list_del(page_lru);
                 zone->inactive_clean_pages--;
+ pte_chain_unlock(page);
                 UnlockPage(page);
         }
         spin_unlock(&pagecache_lock);
@@ -141,6 +150,7 @@
 
 found_page:
         del_page_from_inactive_clean_list(page);
+ pte_chain_unlock(page);
         spin_unlock(&pagecache_lock);
         spin_unlock(&pagemap_lru_lock);
         if (entry.val)
@@ -243,13 +253,16 @@
                  * The page is in active use or really unfreeable. Move to
                  * the active list and adjust the page age if needed.
                  */
+ pte_chain_lock(page);
                 if (page_referenced(page) && page_mapping_inuse(page) &&
                                 !page_over_rsslimit(page)) {
                         del_page_from_inactive_dirty_list(page);
                         add_page_to_active_list(page);
                         page->age = max((int)page->age, PAGE_AGE_START);
+ pte_chain_unlock(page);
                         continue;
                 }
+ pte_chain_unlock(page);
 
                 /*
                  * Page is being freed, don't worry about it.
@@ -270,8 +283,10 @@
                  *
                  * XXX: implement swap clustering ?
                  */
+ pte_chain_lock(page);
                 if (page->pte_chain && !page->mapping && !page->buffers) {
                         page_cache_get(page);
+ pte_chain_unlock(page);
                         spin_unlock(&pagemap_lru_lock);
                         if (!add_to_swap(page)) {
                                 activate_page(page);
@@ -282,24 +297,28 @@
                         }
                         page_cache_release(page);
                         spin_lock(&pagemap_lru_lock);
- }
+ } else
+ pte_chain_unlock(page);
 
                 /*
                  * The page is mapped into the page tables of one or more
                  * processes. Try to unmap it here.
                  */
+ pte_chain_lock(page);
                 if (page->pte_chain) {
                         switch (try_to_unmap(page)) {
                                 case SWAP_ERROR:
                                 case SWAP_FAIL:
                                         goto page_active;
                                 case SWAP_AGAIN:
+ pte_chain_unlock(page);
                                         UnlockPage(page);
                                         continue;
                                 case SWAP_SUCCESS:
                                         ; /* try to free the page below */
                         }
                 }
+ pte_chain_unlock(page);
 
                 if (PageDirty(page) && page->mapping) {
                         /*
@@ -386,10 +405,12 @@
                  * This test is not safe from races, but only the one
                  * in reclaim_page() needs to be.
                  */
+ pte_chain_lock(page);
                 if (page->mapping && !PageDirty(page) && !page->pte_chain &&
                                 page_count(page) == 1) {
                         del_page_from_inactive_dirty_list(page);
                         add_page_to_inactive_clean_list(page);
+ pte_chain_unlock(page);
                         UnlockPage(page);
                         cleaned_pages++;
                 } else {
@@ -401,6 +422,7 @@
 page_active:
                         del_page_from_inactive_dirty_list(page);
                         add_page_to_active_list(page);
+ pte_chain_unlock(page);
                         UnlockPage(page);
                 }
         }
@@ -471,8 +493,12 @@
                  * If the object the page is in is not in use we don't
                  * bother with page aging. If the page is touched again
                  * while on the inactive_clean list it'll be reactivated.
+ * From here until the end of the current iteration
+ * the pte_chain_lock is held.
                  */
+ pte_chain_lock(page);
                 if (!page_mapping_inuse(page)) {
+ pte_chain_unlock(page);
                         drop_page(page);
                         continue;
                 }
@@ -496,9 +522,12 @@
                         list_add(page_lru, &zone->active_list);
                 } else {
                         deactivate_page_nolock(page);
- if (++nr_deactivated > target)
- break;
+ if (++nr_deactivated > target) {
+ pte_chain_unlock(page);
+ goto done;
+ }
                 }
+ pte_chain_unlock(page);
 
                 /* Low latency reschedule point */
                 if (current->need_resched) {
@@ -507,6 +536,8 @@
                         spin_lock(&pagemap_lru_lock);
                 }
         }
+
+done:
         spin_unlock(&pagemap_lru_lock);
 
         return nr_deactivated;