[RFC][PATCH] swsusp: support creating bigger images (rev. 2)

From: Rafael J. Wysocki
Date: Thu Apr 27 2006 - 15:54:17 EST


Hi,

On Tuesday 25 April 2006 17:39, Rafael J. Wysocki wrote:
}-- snip --{
>
> Thanks a lot for the comments. I'll do my best to follow your suggestions
> in the next version of the patch.

The corrected version of the patch is appended.

A short summary of changes:
- the code that refers to the mm internals moved to mm/rmap.c and mm/vmscan.c
- added some comments (if you think more are needed or the existing ones should
be clarified, please let me know)
- the PageLRU flags are set/reset under spinlocks

Could you please have a look at it?

Greetings,
Rafael

---
include/linux/rmap.h | 8 +
include/linux/swap.h | 4
kernel/power/power.h | 10 +-
kernel/power/snapshot.c | 225 +++++++++++++++++++++++++++++++++---------------
kernel/power/swsusp.c | 15 +--
kernel/power/user.c | 4
mm/rmap.c | 40 ++++++++
mm/vmscan.c | 63 +++++++++++++
8 files changed, 293 insertions(+), 76 deletions(-)

Index: linux-2.6.17-rc2-mm1/mm/rmap.c
===================================================================
--- linux-2.6.17-rc2-mm1.orig/mm/rmap.c
+++ linux-2.6.17-rc2-mm1/mm/rmap.c
@@ -851,3 +851,43 @@ int try_to_unmap(struct page *page, int
return ret;
}

+#ifdef CONFIG_SOFTWARE_SUSPEND
+static int page_mapped_by_task(struct page *page, struct task_struct *task)
+{
+ struct vm_area_struct *vma;
+ int ret = 0;
+
+ spin_lock(&task->mm->page_table_lock);
+
+ for (vma = task->mm->mmap; vma; vma = vma->vm_next)
+ if (page_address_in_vma(page, vma) != -EFAULT) {
+ ret = 1;
+ break;
+ }
+
+ spin_unlock(&task->mm->page_table_lock);
+
+ return ret;
+}
+
+/**
+ * page_to_copy - determine if a page can be included in the
+ * suspend image without copying (returns false if that's the case).
+ *
+ * It is safe to include the page in the suspend image without
+ * copying if (a) it's on the LRU and (b) it's mapped by a frozen task
+ * (all tasks except for the current task should be frozen when it's
+ * called). Otherwise the page should be copied for this purpose
+ * (compound pages are avoided for simplicity).
+ */
+
+int page_to_copy(struct page *page)
+{
+ if (!PageLRU(page) || PageCompound(page))
+ return 1;
+ if (page_mapped(page))
+ return page_mapped_by_task(page, current);
+
+ return 1;
+}
+#endif /* CONFIG_SOFTWARE_SUSPEND */
Index: linux-2.6.17-rc2-mm1/include/linux/rmap.h
===================================================================
--- linux-2.6.17-rc2-mm1.orig/include/linux/rmap.h
+++ linux-2.6.17-rc2-mm1/include/linux/rmap.h
@@ -104,6 +104,14 @@ pte_t *page_check_address(struct page *,
*/
unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);

+#ifdef CONFIG_SOFTWARE_SUSPEND
+/*
+ * Used to determine if the page can be included in the suspend image without
+ * copying
+ */
+int page_to_copy(struct page *);
+#endif
+
#else /* !CONFIG_MMU */

#define anon_vma_init() do {} while (0)
Index: linux-2.6.17-rc2-mm1/kernel/power/snapshot.c
===================================================================
--- linux-2.6.17-rc2-mm1.orig/kernel/power/snapshot.c
+++ linux-2.6.17-rc2-mm1/kernel/power/snapshot.c
@@ -13,6 +13,7 @@
#include <linux/version.h>
#include <linux/module.h>
#include <linux/mm.h>
+#include <linux/rmap.h>
#include <linux/suspend.h>
#include <linux/smp_lock.h>
#include <linux/delay.h>
@@ -261,6 +262,14 @@ int restore_special_mem(void)
return ret;
}

+/* Represents a stacked allocated page to be used in the future */
+struct res_page {
+ struct res_page *next;
+ char padding[PAGE_SIZE - sizeof(void *)];
+};
+
+static struct res_page *page_list;
+
static int pfn_is_nosave(unsigned long pfn)
{
unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
@@ -269,7 +278,7 @@ static int pfn_is_nosave(unsigned long p
}

/**
- * saveable - Determine whether a page should be cloned or not.
+ * saveable - Determine whether a page should be saved or not.
* @pfn: The page
*
* We save a page if it's Reserved, and not in the range of pages
@@ -277,9 +286,8 @@ static int pfn_is_nosave(unsigned long p
* isn't part of a free chunk of pages.
*/

-static int saveable(struct zone *zone, unsigned long *zone_pfn)
+static int saveable(unsigned long pfn)
{
- unsigned long pfn = *zone_pfn + zone->zone_start_pfn;
struct page *page;

if (!pfn_valid(pfn))
@@ -296,46 +304,103 @@ static int saveable(struct zone *zone, u
return 1;
}

-unsigned int count_data_pages(void)
+/**
+ * count_data_pages - returns the number of pages that need to be copied
+ * in order to be included in the suspend snapshot image. If @total is
+ * not null, the total number of pages that should be included in the
+ * snapshot image is stored in the location pointed to by it.
+ */
+
+unsigned int count_data_pages(unsigned int *total)
{
struct zone *zone;
unsigned long zone_pfn;
- unsigned int n = 0;
+ unsigned int n, m;

+ n = m = 0;
for_each_zone (zone) {
if (is_highmem(zone))
continue;
mark_free_pages(zone);
- for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
- n += saveable(zone, &zone_pfn);
+ for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+ unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+
+ if (saveable(pfn)) {
+ n++;
+ if (!page_to_copy(pfn_to_page(pfn)))
+ m++;
+ }
+ }
}
- return n;
+ if (total)
+ *total = n;
+
+ return n - m;
}

+/**
+ * copy_data_pages - populates the page backup list @pblist with
+ * the addresses of the pages that should be included in the
+ * suspend snapshot image. The pages that cannot be included in the
+ * image without copying are copied into empty page frames allocated
+ * earlier and available from the list page_list (the addresses of
+ * these page frames are also stored in the page backup list).
+ *
+ * This function is only called from the critical section, ie. when
+ * processes are frozen, there's only one CPU online and local IRQs
+ * are disabled on it.
+ */
+
static void copy_data_pages(struct pbe *pblist)
{
struct zone *zone;
unsigned long zone_pfn;
struct pbe *pbe, *p;
+ struct res_page *ptr;

pbe = pblist;
for_each_zone (zone) {
if (is_highmem(zone))
continue;
+
mark_free_pages(zone);
- /* This is necessary for swsusp_free() */
+ /* This is necessary for free_image() */
for_each_pb_page (p, pblist)
SetPageNosaveFree(virt_to_page(p));
+
for_each_pbe (p, pblist)
- SetPageNosaveFree(virt_to_page(p->address));
+ if (p->address && p->orig_address != p->address)
+ SetPageNosaveFree(virt_to_page(p->address));
+
+ ptr = page_list;
+ while (ptr) {
+ SetPageNosaveFree(virt_to_page(ptr));
+ ptr = ptr->next;
+ }
+
for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
- if (saveable(zone, &zone_pfn)) {
+ unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+
+ if (saveable(pfn)) {
struct page *page;
- page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
+
BUG_ON(!pbe);
+ page = pfn_to_page(pfn);
pbe->orig_address = (unsigned long)page_address(page);
- /* copy_page is not usable for copying task structs. */
- memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE);
+ if (page_to_copy(page)) {
+ BUG_ON(!page_list);
+ pbe->address = (unsigned long)page_list;
+ page_list = page_list->next;
+ /*
+ * copy_page is not usable for copying
+ * task structs.
+ */
+ memcpy((void *)pbe->address,
+ (void *)pbe->orig_address,
+ PAGE_SIZE);
+ } else {
+ pbe->address = pbe->orig_address;
+ }
pbe = pbe->next;
}
}
@@ -419,7 +484,7 @@ static inline void *alloc_image_page(gfp
res = (void *)get_zeroed_page(gfp_mask);
if (safe_needed)
while (res && PageNosaveFree(virt_to_page(res))) {
- /* The page is unsafe, mark it for swsusp_free() */
+ /* The page is unsafe, mark it for free_image() */
SetPageNosave(virt_to_page(res));
unsafe_pages++;
res = (void *)get_zeroed_page(gfp_mask);
@@ -474,11 +539,32 @@ static struct pbe *alloc_pagedir(unsigne
}

/**
+ * protect_data_pages - move data pages that need to be protected from
+ * being reclaimed (ie. those included in the suspend image without
+ * copying) out of their respective LRU lists. This is done after the
+ * image has been created so the LRU lists only have to be restored if
+ * the suspend fails.
+ *
+ * This function is only called from the critical section, ie. when
+ * processes are frozen, there's only one CPU online and local IRQs
+ * are disabled on it.
+ */
+
+static void protect_data_pages(struct pbe *pblist)
+{
+ struct pbe *p;
+
+ for_each_pbe (p, pblist)
+ if (p->address == p->orig_address)
+ move_out_of_lru(virt_to_page(p->address));
+}
+
+/**
* Free pages we allocated for suspend. Suspend pages are alocated
* before atomic copy, so we need to free them after resume.
*/

-void swsusp_free(void)
+void free_image(void)
{
struct zone *zone;
unsigned long zone_pfn;
@@ -501,6 +587,11 @@ void swsusp_free(void)
buffer = NULL;
}

+void swsusp_free(void)
+{
+ free_image();
+ restore_active_inactive_lists();
+}

/**
* enough_free_mem - Make sure we enough free memory to snapshot.
@@ -509,32 +600,33 @@ void swsusp_free(void)
* free pages.
*/

-static int enough_free_mem(unsigned int nr_pages)
+static int enough_free_mem(unsigned int nr_pages, unsigned int copy_pages)
{
struct zone *zone;
- unsigned int n = 0;
+ long n = 0;
+
+ pr_debug("swsusp: pages needed: %u + %lu + %lu\n",
+ copy_pages,
+ (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
+ EXTRA_PAGES);

for_each_zone (zone)
- if (!is_highmem(zone))
+ if (!is_highmem(zone) && populated_zone(zone)) {
n += zone->free_pages;
- pr_debug("swsusp: available memory: %u pages\n", n);
- return n > (nr_pages + PAGES_FOR_IO +
- (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
-}
-
-static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
-{
- struct pbe *p;
+ /*
+ * We're going to use atomic allocations, so we
+ * shouldn't count the lowmem reserves in the lower
+ * zones as available to us
+ */
+ n -= zone->lowmem_reserve[ZONE_NORMAL];
+ }

- for_each_pbe (p, pblist) {
- p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed);
- if (!p->address)
- return -ENOMEM;
- }
- return 0;
+ pr_debug("swsusp: available memory: %ld pages\n", n);
+ return n > (long)(copy_pages + EXTRA_PAGES +
+ (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
}

-static struct pbe *swsusp_alloc(unsigned int nr_pages)
+static struct pbe *swsusp_alloc(unsigned int nr_pages, unsigned int copy_pages)
{
struct pbe *pblist;

@@ -543,10 +635,19 @@ static struct pbe *swsusp_alloc(unsigned
return NULL;
}

- if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) {
- printk(KERN_ERR "suspend: Allocating image pages failed.\n");
- swsusp_free();
- return NULL;
+ page_list = NULL;
+ while (copy_pages--) {
+ struct res_page *ptr;
+
+ ptr = alloc_image_page(GFP_ATOMIC | __GFP_COLD, 0);
+ if (!ptr) {
+ printk(KERN_ERR
+ "suspend: Allocating image pages failed.\n");
+ free_image();
+ return NULL;
+ }
+ ptr->next = page_list;
+ page_list = ptr;
}

return pblist;
@@ -554,25 +655,21 @@ static struct pbe *swsusp_alloc(unsigned

asmlinkage int swsusp_save(void)
{
- unsigned int nr_pages;
+ unsigned int nr_pages, copy_pages;

pr_debug("swsusp: critical section: \n");

drain_local_pages();
- nr_pages = count_data_pages();
- printk("swsusp: Need to copy %u pages\n", nr_pages);
-
- pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n",
- nr_pages,
- (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
- PAGES_FOR_IO, nr_free_pages());
+ copy_pages = count_data_pages(&nr_pages);
+ printk("swsusp: Need to save %u pages\n", nr_pages);
+ printk("swsusp: %u pages to copy\n", copy_pages);

- if (!enough_free_mem(nr_pages)) {
+ if (!enough_free_mem(nr_pages, copy_pages)) {
printk(KERN_ERR "swsusp: Not enough free memory\n");
return -ENOMEM;
}

- pagedir_nosave = swsusp_alloc(nr_pages);
+ pagedir_nosave = swsusp_alloc(nr_pages, copy_pages);
if (!pagedir_nosave)
return -ENOMEM;

@@ -582,6 +679,9 @@ asmlinkage int swsusp_save(void)
drain_local_pages();
copy_data_pages(pagedir_nosave);

+ /* Make sure the pages that we have not copied won't be reclaimed */
+ protect_data_pages(pagedir_nosave);
+
/*
* End of critical section. From now on, we can write to memory,
* but we should not touch disk. This specially means we must _not_
@@ -591,7 +691,7 @@ asmlinkage int swsusp_save(void)
nr_copy_pages = nr_pages;
nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT;

- printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
+ printk("swsusp: critical section/: done (%d pages)\n", nr_pages);
return 0;
}

@@ -654,7 +754,7 @@ int snapshot_read_next(struct snapshot_h
if (handle->page > nr_meta_pages + nr_copy_pages)
return 0;
if (!buffer) {
- /* This makes the buffer be freed by swsusp_free() */
+ /* This makes the buffer be freed by free_image() */
buffer = alloc_image_page(GFP_ATOMIC, 0);
if (!buffer)
return -ENOMEM;
@@ -810,13 +910,6 @@ static inline struct pbe *unpack_orig_ad
* of "safe" which will be used later
*/

-struct safe_page {
- struct safe_page *next;
- char padding[PAGE_SIZE - sizeof(void *)];
-};
-
-static struct safe_page *safe_pages;
-
static int prepare_image(struct snapshot_handle *handle)
{
int error = 0;
@@ -833,21 +926,21 @@ static int prepare_image(struct snapshot
if (!pblist)
error = -ENOMEM;
}
- safe_pages = NULL;
+ page_list = NULL;
if (!error && nr_pages > unsafe_pages) {
nr_pages -= unsafe_pages;
while (nr_pages--) {
- struct safe_page *ptr;
+ struct res_page *ptr;

- ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC);
+ ptr = (struct res_page *)get_zeroed_page(GFP_ATOMIC);
if (!ptr) {
error = -ENOMEM;
break;
}
if (!PageNosaveFree(virt_to_page(ptr))) {
/* The page is "safe", add it to the list */
- ptr->next = safe_pages;
- safe_pages = ptr;
+ ptr->next = page_list;
+ page_list = ptr;
}
/* Mark the page as allocated */
SetPageNosave(virt_to_page(ptr));
@@ -858,7 +951,7 @@ static int prepare_image(struct snapshot
pagedir_nosave = pblist;
} else {
handle->pbe = NULL;
- swsusp_free();
+ free_image();
}
return error;
}
@@ -882,8 +975,8 @@ static void *get_buffer(struct snapshot_
* The "original" page frame has not been allocated and we have to
* use a "safe" page frame to store the read page
*/
- pbe->address = (unsigned long)safe_pages;
- safe_pages = safe_pages->next;
+ pbe->address = (unsigned long)page_list;
+ page_list = page_list->next;
if (last)
last->next = pbe;
handle->last_pbe = pbe;
@@ -919,7 +1012,7 @@ int snapshot_write_next(struct snapshot_
if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
return 0;
if (!buffer) {
- /* This makes the buffer be freed by swsusp_free() */
+ /* This makes the buffer be freed by free_image() */
buffer = alloc_image_page(GFP_ATOMIC, 0);
if (!buffer)
return -ENOMEM;
Index: linux-2.6.17-rc2-mm1/kernel/power/power.h
===================================================================
--- linux-2.6.17-rc2-mm1.orig/kernel/power/power.h
+++ linux-2.6.17-rc2-mm1/kernel/power/power.h
@@ -48,7 +48,14 @@ extern dev_t swsusp_resume_device;
extern asmlinkage int swsusp_arch_suspend(void);
extern asmlinkage int swsusp_arch_resume(void);

-extern unsigned int count_data_pages(void);
+#if (defined(CONFIG_BLK_DEV_INITRD) && defined(CONFIG_BLK_DEV_RAM))
+#define EXTRA_PAGES (PAGES_FOR_IO + \
+ (2 * CONFIG_BLK_DEV_RAM_SIZE * 1024) / PAGE_SIZE)
+#else
+#define EXTRA_PAGES PAGES_FOR_IO
+#endif
+
+extern unsigned int count_data_pages(unsigned int *total);

struct snapshot_handle {
loff_t offset;
@@ -111,6 +118,7 @@ extern int restore_special_mem(void);

extern int swsusp_check(void);
extern int swsusp_shrink_memory(void);
+extern void free_image(void);
extern void swsusp_free(void);
extern int swsusp_suspend(void);
extern int swsusp_resume(void);
Index: linux-2.6.17-rc2-mm1/kernel/power/swsusp.c
===================================================================
--- linux-2.6.17-rc2-mm1.orig/kernel/power/swsusp.c
+++ linux-2.6.17-rc2-mm1/kernel/power/swsusp.c
@@ -177,28 +177,29 @@ int swsusp_shrink_memory(void)
long size, tmp;
struct zone *zone;
unsigned long pages = 0;
- unsigned int i = 0;
+ unsigned int to_save, i = 0;
char *p = "-\\|/";

printk("Shrinking memory... ");
do {
size = 2 * count_special_pages();
- size += size / 50 + count_data_pages();
- size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
- PAGES_FOR_IO;
+ size += size / 50 + count_data_pages(&to_save);
+ size += (to_save + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
+ EXTRA_PAGES;
tmp = size;
for_each_zone (zone)
if (!is_highmem(zone) && populated_zone(zone)) {
tmp -= zone->free_pages;
tmp += zone->lowmem_reserve[ZONE_NORMAL];
}
+ size = to_save * (PAGE_SIZE + sizeof(long));
if (tmp > 0) {
tmp = __shrink_memory(tmp);
if (!tmp)
return -ENOMEM;
pages += tmp;
- } else if (size > image_size / PAGE_SIZE) {
- tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
+ } else if (size > image_size) {
+ tmp = __shrink_memory(size - image_size);
pages += tmp;
}
printk("\b%c", p[i++%4]);
@@ -261,7 +262,7 @@ int swsusp_resume(void)
* very tight, so we have to free it as soon as we can to avoid
* subsequent failures
*/
- swsusp_free();
+ free_image();
restore_processor_state();
restore_special_mem();
touch_softlockup_watchdog();
Index: linux-2.6.17-rc2-mm1/kernel/power/user.c
===================================================================
--- linux-2.6.17-rc2-mm1.orig/kernel/power/user.c
+++ linux-2.6.17-rc2-mm1/kernel/power/user.c
@@ -153,6 +153,10 @@ static int snapshot_ioctl(struct inode *
case SNAPSHOT_UNFREEZE:
if (!data->frozen)
break;
+ if (data->ready && in_suspend) {
+ error = -EPERM;
+ break;
+ }
down(&pm_sem);
thaw_processes();
enable_nonboot_cpus();
Index: linux-2.6.17-rc2-mm1/mm/vmscan.c
===================================================================
--- linux-2.6.17-rc2-mm1.orig/mm/vmscan.c
+++ linux-2.6.17-rc2-mm1/mm/vmscan.c
@@ -1437,7 +1437,68 @@ out:

return ret;
}
-#endif
+
+static LIST_HEAD(saved_active_pages);
+static LIST_HEAD(saved_inactive_pages);
+
+/**
+ * move_out_of_lru - software suspend includes some pages in the
+ * suspend snapshot image without copying and these pages should be
+ * procected from being reclaimed, which can be done by (temporarily)
+ * moving them out of their respective LRU lists
+ *
+ * It is to be called with local IRQs disabled.
+ */
+
+void move_out_of_lru(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+
+ spin_lock(&zone->lru_lock);
+ if (PageActive(page)) {
+ del_page_from_active_list(zone, page);
+ list_add(&page->lru, &saved_active_pages);
+ } else {
+ del_page_from_inactive_list(zone, page);
+ list_add(&page->lru, &saved_inactive_pages);
+ }
+ ClearPageLRU(page);
+ spin_unlock(&zone->lru_lock);
+}
+
+
+/**
+ * restore_active_inactive_lists - used by the software suspend to move
+ * the pages taken out of the LRU by take_page_out_of_lru() back to
+ * their respective active/inactive lists (if the suspend fails)
+ */
+
+void restore_active_inactive_lists(void)
+{
+ struct page *page;
+ struct zone *zone;
+
+ while(!list_empty(&saved_active_pages)) {
+ page = lru_to_page(&saved_active_pages);
+ zone = page_zone(page);
+ list_del(&page->lru);
+ spin_lock_irq(&zone->lru_lock);
+ SetPageLRU(page);
+ add_page_to_active_list(zone, page);
+ spin_unlock_irq(&zone->lru_lock);
+ }
+
+ while(!list_empty(&saved_inactive_pages)) {
+ page = lru_to_page(&saved_inactive_pages);
+ zone = page_zone(page);
+ list_del(&page->lru);
+ spin_lock_irq(&zone->lru_lock);
+ SetPageLRU(page);
+ add_page_to_inactive_list(zone, page);
+ spin_unlock_irq(&zone->lru_lock);
+ }
+}
+#endif /* CONFIG_PM */

#ifdef CONFIG_HOTPLUG_CPU
/* It's optimal to keep kswapds on the same CPUs as their memory, but
Index: linux-2.6.17-rc2-mm1/include/linux/swap.h
===================================================================
--- linux-2.6.17-rc2-mm1.orig/include/linux/swap.h
+++ linux-2.6.17-rc2-mm1/include/linux/swap.h
@@ -184,7 +184,9 @@ extern void swap_setup(void);

/* linux/mm/vmscan.c */
extern unsigned long try_to_free_pages(struct zone **, gfp_t);
-extern unsigned long shrink_all_memory(unsigned long nr_pages);
+extern unsigned long shrink_all_memory(unsigned long);
+extern void move_out_of_lru(struct page *page);
+extern void restore_active_inactive_lists(void);
extern int vm_swappiness;
extern int remove_mapping(struct address_space *mapping, struct page *page);

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/