[PATCH 1/4] swap: Split up try_to_unuse()

From: Richard Purdie
Date: Mon Jan 08 2007 - 08:49:38 EST


Split the per-entry core of try_to_unuse() out into a new function,
try_to_unuse_entry(), which removes all references to a single swap entry.

These patches are based on a patch by Nick Piggin plus some of my own
patches/bugfixes, as discussed on LKML.

Signed-off-by: Richard Purdie <rpurdie@xxxxxxxxxxxxxx>

---
mm/swapfile.c | 321 ++++++++++++++++++++++++++++++----------------------------
1 file changed, 170 insertions(+), 151 deletions(-)
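
For reviewers, here is a rough sketch (not part of the patch) of the shape
the main loop in try_to_unuse() takes after the split; si, i, type, entry,
swap_map, page and start_mm are the existing locals of try_to_unuse(), and
the elided lines are context that this patch does not change:

	while ((i = find_next_to_unuse(si, i)) != 0) {
		...
		/* get (or read in) the swap cache page for this entry */
		swap_map = &si->swap_map[i];
		entry = swp_entry(type, i);
		page = read_swap_cache_async(entry, NULL, 0);
		...
		/* all per-entry work now lives in try_to_unuse_entry() */
		retval = try_to_unuse_entry(entry, swap_map, page, &start_mm);
		if (retval)
			break;
		...
	}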

Index: git/mm/swapfile.c
===================================================================
--- git.orig/mm/swapfile.c 2007-01-07 21:39:29.000000000 +0000
+++ git/mm/swapfile.c 2007-01-08 11:39:27.000000000 +0000
@@ -638,6 +638,174 @@ static int unuse_mm(struct mm_struct *mm
return 0;
}

+static int try_to_unuse_entry(swp_entry_t entry, unsigned short *swap_map,
+ struct page *page, struct mm_struct **start_mm_p)
+{
+ struct mm_struct *start_mm;
+ unsigned short swcount;
+ int retval = 0;
+ int shmem;
+
+ if (start_mm_p)
+ start_mm = *start_mm_p;
+ else {
+ start_mm = &init_mm;
+ atomic_inc(&init_mm.mm_users);
+ }
+
+ /*
+ * Don't hold on to start_mm if it looks like exiting.
+ */
+ if (atomic_read(&start_mm->mm_users) == 1) {
+ mmput(start_mm);
+ start_mm = &init_mm;
+ atomic_inc(&init_mm.mm_users);
+ }
+
+ /*
+ * Wait for and lock page. When do_swap_page races with try_to_unuse,
+ * do_swap_page can handle the fault much faster than try_to_unuse can
+ * locate the entry. This apparently redundant "wait_on_page_locked"
+ * lets try_to_unuse defer to do_swap_page in such a case - in some
+ * tests, do_swap_page and try_to_unuse repeatedly compete.
+ */
+
+ wait_on_page_locked(page);
+ wait_on_page_writeback(page);
+ lock_page(page);
+ wait_on_page_writeback(page);
+retry:
+
+ /*
+ * Remove all references to entry.
+ * Whenever we reach init_mm, there's no address space to search, but
+ * use it as a reminder to search shmem.
+ */
+ shmem = 0;
+ swcount = *swap_map;
+ if (swcount > 1) {
+ if (start_mm == &init_mm)
+ shmem = shmem_unuse(entry, page);
+ else
+ retval = unuse_mm(start_mm, entry, page);
+ }
+ if (*swap_map > 1) {
+ int set_start_mm = (*swap_map >= swcount);
+ struct list_head *p = &start_mm->mmlist;
+ struct mm_struct *new_start_mm = start_mm;
+ struct mm_struct *prev_mm = start_mm;
+ struct mm_struct *mm;
+
+ atomic_inc(&new_start_mm->mm_users);
+ atomic_inc(&prev_mm->mm_users);
+ spin_lock(&mmlist_lock);
+ while (*swap_map > 1 && !retval &&
+ (p = p->next) != &start_mm->mmlist) {
+ mm = list_entry(p, struct mm_struct, mmlist);
+ if (!atomic_inc_not_zero(&mm->mm_users))
+ continue;
+ spin_unlock(&mmlist_lock);
+ mmput(prev_mm);
+ prev_mm = mm;
+
+ cond_resched();
+
+ swcount = *swap_map;
+ if (swcount <= 1)
+ ;
+ else if (mm == &init_mm) {
+ set_start_mm = 1;
+ shmem = shmem_unuse(entry, page);
+ } else
+ retval = unuse_mm(mm, entry, page);
+ if (set_start_mm && *swap_map < swcount) {
+ mmput(new_start_mm);
+ atomic_inc(&mm->mm_users);
+ new_start_mm = mm;
+ set_start_mm = 0;
+ }
+ spin_lock(&mmlist_lock);
+ }
+ spin_unlock(&mmlist_lock);
+ mmput(prev_mm);
+ mmput(start_mm);
+ start_mm = new_start_mm;
+ }
+ if (retval)
+ goto unuse_err;
+
+ /*
+ * How could swap count reach 0x7fff when the maximum pid is 0x7fff,
+ * and there's no way to repeat a swap page within an mm (except in
+ * shmem, where it's the shared object which takes the reference
+ * count)? We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
+ *
+ * If that's wrong, then we should worry more about exit_mmap() and
+ * do_munmap() cases described above: we might be resetting
+ * SWAP_MAP_MAX too early here. We know "Undead"s can happen, they're
+ * okay, so don't report them; but do report if we reset SWAP_MAP_MAX.
+ */
+ if (*swap_map == SWAP_MAP_MAX) {
+ spin_lock(&swap_lock);
+ *swap_map = 1;
+ spin_unlock(&swap_lock);
+ if (printk_ratelimit())
+ printk(KERN_WARNING
+ "try_to_unuse_entry: cleared swap entry overflow\n");
+ }
+
+ /*
+ * If a reference remains (rare), we would like to leave the page in
+ * the swap cache; but try_to_unmap could then re-duplicate the entry
+ * once we drop page lock, so we might loop indefinitely; also, that
+ * page could not be swapped out to other storage meanwhile. So:
+ * delete from cache even if there's another reference, after ensuring
+ * that the data has been saved to disk - since if the reference
+ * remains (rarer), it will be read from disk into another page.
+ * Splitting into two pages would be incorrect if swap supported
+ * "shared private" pages, but they are handled by tmpfs files.
+ *
+ * Note shmem_unuse already deleted a swappage from the swap cache,
+ * unless the move to filepage failed: in which case it left swappage
+ * in cache, lowered its swap count to pass quickly through the loops
+ * above, and now we must reincrement count to try again later.
+ */
+ if (PageSwapCache(page)) {
+ if ((*swap_map > 1) && PageDirty(page)) {
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ };
+
+ swap_writepage(page, &wbc);
+ lock_page(page);
+ wait_on_page_writeback(page);
+ goto retry;
+ }
+
+ if (shmem)
+ swap_duplicate(entry);
+ else
+ delete_from_swap_cache(page);
+ }
+
+ /*
+ * So we could skip searching mms once swap count went to 1, we did not
+ * mark any present ptes as dirty: must mark page dirty so shrink_list
+ * will preserve it.
+ */
+ SetPageDirty(page);
+unuse_err:
+ unlock_page(page);
+ page_cache_release(page);
+
+ if (start_mm_p)
+ *start_mm_p = start_mm;
+ else
+ mmput(start_mm);
+
+ return retval;
+}
+
/*
* Scan swap_map from current position to next entry still in use.
* Recycle to start on reaching the end, returning 0 when empty.
@@ -686,13 +854,10 @@ static int try_to_unuse(unsigned int typ
struct swap_info_struct * si = &swap_info[type];
struct mm_struct *start_mm;
unsigned short *swap_map;
- unsigned short swcount;
struct page *page;
swp_entry_t entry;
unsigned int i = 0;
int retval = 0;
- int reset_overflow = 0;
- int shmem;

/*
* When searching mms for an entry, a good strategy is to
@@ -744,152 +909,10 @@ static int try_to_unuse(unsigned int typ
break;
}

- /*
- * Don't hold on to start_mm if it looks like exiting.
- */
- if (atomic_read(&start_mm->mm_users) == 1) {
- mmput(start_mm);
- start_mm = &init_mm;
- atomic_inc(&init_mm.mm_users);
- }
-
- /*
- * Wait for and lock page. When do_swap_page races with
- * try_to_unuse, do_swap_page can handle the fault much
- * faster than try_to_unuse can locate the entry. This
- * apparently redundant "wait_on_page_locked" lets try_to_unuse
- * defer to do_swap_page in such a case - in some tests,
- * do_swap_page and try_to_unuse repeatedly compete.
- */
- wait_on_page_locked(page);
- wait_on_page_writeback(page);
- lock_page(page);
- wait_on_page_writeback(page);
-
- /*
- * Remove all references to entry.
- * Whenever we reach init_mm, there's no address space
- * to search, but use it as a reminder to search shmem.
- */
- shmem = 0;
- swcount = *swap_map;
- if (swcount > 1) {
- if (start_mm == &init_mm)
- shmem = shmem_unuse(entry, page);
- else
- retval = unuse_mm(start_mm, entry, page);
- }
- if (*swap_map > 1) {
- int set_start_mm = (*swap_map >= swcount);
- struct list_head *p = &start_mm->mmlist;
- struct mm_struct *new_start_mm = start_mm;
- struct mm_struct *prev_mm = start_mm;
- struct mm_struct *mm;
-
- atomic_inc(&new_start_mm->mm_users);
- atomic_inc(&prev_mm->mm_users);
- spin_lock(&mmlist_lock);
- while (*swap_map > 1 && !retval &&
- (p = p->next) != &start_mm->mmlist) {
- mm = list_entry(p, struct mm_struct, mmlist);
- if (!atomic_inc_not_zero(&mm->mm_users))
- continue;
- spin_unlock(&mmlist_lock);
- mmput(prev_mm);
- prev_mm = mm;
-
- cond_resched();
+ retval = try_to_unuse_entry(entry, swap_map, page, &start_mm);

- swcount = *swap_map;
- if (swcount <= 1)
- ;
- else if (mm == &init_mm) {
- set_start_mm = 1;
- shmem = shmem_unuse(entry, page);
- } else
- retval = unuse_mm(mm, entry, page);
- if (set_start_mm && *swap_map < swcount) {
- mmput(new_start_mm);
- atomic_inc(&mm->mm_users);
- new_start_mm = mm;
- set_start_mm = 0;
- }
- spin_lock(&mmlist_lock);
- }
- spin_unlock(&mmlist_lock);
- mmput(prev_mm);
- mmput(start_mm);
- start_mm = new_start_mm;
- }
- if (retval) {
- unlock_page(page);
- page_cache_release(page);
+ if (retval)
break;
- }
-
- /*
- * How could swap count reach 0x7fff when the maximum
- * pid is 0x7fff, and there's no way to repeat a swap
- * page within an mm (except in shmem, where it's the
- * shared object which takes the reference count)?
- * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
- *
- * If that's wrong, then we should worry more about
- * exit_mmap() and do_munmap() cases described above:
- * we might be resetting SWAP_MAP_MAX too early here.
- * We know "Undead"s can happen, they're okay, so don't
- * report them; but do report if we reset SWAP_MAP_MAX.
- */
- if (*swap_map == SWAP_MAP_MAX) {
- spin_lock(&swap_lock);
- *swap_map = 1;
- spin_unlock(&swap_lock);
- reset_overflow = 1;
- }
-
- /*
- * If a reference remains (rare), we would like to leave
- * the page in the swap cache; but try_to_unmap could
- * then re-duplicate the entry once we drop page lock,
- * so we might loop indefinitely; also, that page could
- * not be swapped out to other storage meanwhile. So:
- * delete from cache even if there's another reference,
- * after ensuring that the data has been saved to disk -
- * since if the reference remains (rarer), it will be
- * read from disk into another page. Splitting into two
- * pages would be incorrect if swap supported "shared
- * private" pages, but they are handled by tmpfs files.
- *
- * Note shmem_unuse already deleted a swappage from
- * the swap cache, unless the move to filepage failed:
- * in which case it left swappage in cache, lowered its
- * swap count to pass quickly through the loops above,
- * and now we must reincrement count to try again later.
- */
- if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- };
-
- swap_writepage(page, &wbc);
- lock_page(page);
- wait_on_page_writeback(page);
- }
- if (PageSwapCache(page)) {
- if (shmem)
- swap_duplicate(entry);
- else
- delete_from_swap_cache(page);
- }
-
- /*
- * So we could skip searching mms once swap count went
- * to 1, we did not mark any present ptes as dirty: must
- * mark page dirty so shrink_list will preserve it.
- */
- SetPageDirty(page);
- unlock_page(page);
- page_cache_release(page);

/*
* Make sure that we aren't completely killing
@@ -899,10 +922,6 @@ static int try_to_unuse(unsigned int typ
}

mmput(start_mm);
- if (reset_overflow) {
- printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
- swap_overflow = 0;
- }
return retval;
}


