1/3 swsusp: use non-contiguous memory on resume

From: Pavel Machek
Date: Thu Mar 10 2005 - 00:16:04 EST


Hi!

The following patch is designed to fix a problem in the current
implementation of swsusp in mainline kernels. Namely, swsusp uses
an array of page backup entries (aka pagedir) to store pointers to memory
pages that must be saved during suspend and restored during resume.

Unfortunately, the pagedir has to be located in a contiguous chunk of memory
and it sometimes turns out that an 8-order or even 9-order allocation is needed
for this purpose. It sometimes is impossible to get such an allocation and
swsusp may fail during either suspend or resume due to the lack of memory,
although theoretically there is enough free memory for it to succeed.

Moreover, swsusp is more likely to fail for this reason during resume, which
means that it may fail during resume after a successful suspend
(this actually has happened for some people, including me :-)) and this,
potentially, may lead to the loss of data.

The problem is fixed by replacing the pagedir with a linklist so that
high-order memory allocations are avoided (the patches make swsusp use only
0-order allocations). Unfortunately this means that it's necessary to change
assembly routines used to restore the image after it's been loaded from
swap so that they walk the list instead of walking the array.

This patch makes swsusp allocate only individual pages during resume.
it contains the necessary changes to the assembly routines etc. for i386
and x86-64.

Please apply,
Pavel

Signed-off-by: Rafael J. Wysocki <rjw@xxxxxxx>
Signed-off-by: Pavel Machek <pavel@xxxxxxx>


diff -Nru linux-2.6.11-a/arch/i386/kernel/asm-offsets.c linux-2.6.11-b/arch/i386/kernel/asm-offsets.c
--- linux-2.6.11-a/arch/i386/kernel/asm-offsets.c 2005-03-02 08:38:00.000000000 +0100
+++ linux-2.6.11-b/arch/i386/kernel/asm-offsets.c 2005-03-04 20:14:01.000000000 +0100
@@ -7,6 +7,7 @@
#include <linux/sched.h>
#include <linux/signal.h>
#include <linux/personality.h>
+#include <linux/suspend.h>
#include <asm/ucontext.h>
#include "sigframe.h"
#include <asm/fixmap.h>
@@ -56,6 +57,11 @@

OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
+ BLANK();
+
+ OFFSET(pbe_address, pbe, address);
+ OFFSET(pbe_orig_address, pbe, orig_address);
+ OFFSET(pbe_next, pbe, next);

/* Offset from the sysenter stack to tss.esp0 */
DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) -
diff -Nru linux-2.6.11-a/arch/i386/power/swsusp.S linux-2.6.11-b/arch/i386/power/swsusp.S
--- linux-2.6.11-a/arch/i386/power/swsusp.S 2005-03-02 08:38:37.000000000 +0100
+++ linux-2.6.11-b/arch/i386/power/swsusp.S 2005-03-04 20:14:01.000000000 +0100
@@ -12,6 +12,7 @@
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/page.h>
+#include <asm/asm_offsets.h>

.text

@@ -28,28 +29,28 @@
ret

ENTRY(swsusp_arch_resume)
- movl $swsusp_pg_dir-__PAGE_OFFSET,%ecx
- movl %ecx,%cr3
+ movl $swsusp_pg_dir-__PAGE_OFFSET, %ecx
+ movl %ecx, %cr3

- movl pagedir_nosave, %ebx
- xorl %eax, %eax
- xorl %edx, %edx
+ movl pagedir_nosave, %edx
.p2align 4,,7

copy_loop:
- movl 4(%ebx,%edx),%edi
- movl (%ebx,%edx),%esi
+ testl %edx, %edx
+ jz done
+
+ movl pbe_address(%edx), %esi
+ movl pbe_orig_address(%edx), %edi

movl $1024, %ecx
rep
movsl

- incl %eax
- addl $16, %edx
- cmpl nr_copy_pages,%eax
- jb copy_loop
+ movl pbe_next(%edx), %edx
+ jmp copy_loop
.p2align 4,,7

+done:
movl saved_context_esp, %esp
movl saved_context_ebp, %ebp
movl saved_context_ebx, %ebx
diff -Nru linux-2.6.11-a/arch/x86_64/kernel/asm-offsets.c linux-2.6.11-b/arch/x86_64/kernel/asm-offsets.c
--- linux-2.6.11-a/arch/x86_64/kernel/asm-offsets.c 2005-03-02 08:38:10.000000000 +0100
+++ linux-2.6.11-b/arch/x86_64/kernel/asm-offsets.c 2005-03-04 20:14:01.000000000 +0100
@@ -62,8 +62,8 @@
offsetof (struct rt_sigframe32, uc.uc_mcontext));
BLANK();
#endif
- DEFINE(SIZEOF_PBE, sizeof(struct pbe));
DEFINE(pbe_address, offsetof(struct pbe, address));
DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
+ DEFINE(pbe_next, offsetof(struct pbe, next));
return 0;
}
diff -Nru linux-2.6.11-a/arch/x86_64/kernel/suspend_asm.S linux-2.6.11-b/arch/x86_64/kernel/suspend_asm.S
--- linux-2.6.11-a/arch/x86_64/kernel/suspend_asm.S 2005-03-02 08:38:26.000000000 +0100
+++ linux-2.6.11-b/arch/x86_64/kernel/suspend_asm.S 2005-03-04 20:14:01.000000000 +0100
@@ -54,16 +54,10 @@
movq %rax, %cr4; # turn PGE back on

movq pagedir_nosave(%rip), %rdx
- /* compute the limit */
- movl nr_copy_pages(%rip), %eax
- testl %eax, %eax
- jz done
- movq %rdx,%r8
- movl $SIZEOF_PBE,%r9d
- mul %r9 # with rax, clobbers rdx
- movq %r8, %rdx
- addq %r8, %rax
loop:
+ testq %rdx, %rdx
+ jz done
+
/* get addresses from the pbe and copy the page */
movq pbe_address(%rdx), %rsi
movq pbe_orig_address(%rdx), %rdi
@@ -72,9 +66,8 @@
movsq

/* progress to the next pbe */
- addq $SIZEOF_PBE, %rdx
- cmpq %rax, %rdx
- jb loop
+ movq pbe_next(%rdx), %rdx
+ jmp loop
done:
movl $24, %eax
movl %eax, %ds
diff -Nru linux-2.6.11-a/kernel/power/swsusp.c linux-2.6.11-b/kernel/power/swsusp.c
--- linux-2.6.11-a/kernel/power/swsusp.c 2005-03-04 20:05:49.000000000 +0100
+++ linux-2.6.11-b/kernel/power/swsusp.c 2005-03-04 20:14:01.000000000 +0100
@@ -621,6 +621,46 @@
}

/**
+ * fill_pb_page - Create a list of PBEs on a given memory page
+ */
+
+static inline void fill_pb_page(struct pbe *pbpage)
+{
+ struct pbe *p;
+
+ p = pbpage;
+ pbpage += PB_PAGE_SKIP;
+ do
+ p->next = p + 1;
+ while (++p < pbpage);
+}
+
+/**
+ * create_pbe_list - Create a list of PBEs on top of a given chain
+ * of memory pages allocated with alloc_pagedir()
+ */
+
+static void create_pbe_list(struct pbe *pblist, unsigned nr_pages)
+{
+ struct pbe *pbpage, *p;
+ unsigned num = PBES_PER_PAGE;
+
+ for_each_pb_page (pbpage, pblist) {
+ if (num >= nr_pages)
+ break;
+
+ fill_pb_page(pbpage);
+ num += PBES_PER_PAGE;
+ }
+ if (pbpage) {
+ for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++)
+ p->next = p + 1;
+ p->next = NULL;
+ }
+ pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
+}
+
+/**
* alloc_pagedir - Allocate the page directory.
*
* First, determine exactly how many pages we need and
@@ -636,7 +676,7 @@
static struct pbe * alloc_pagedir(unsigned nr_pages)
{
unsigned num;
- struct pbe *pblist, *pbe, *p;
+ struct pbe *pblist, *pbe;

if (!nr_pages)
return NULL;
@@ -645,21 +685,13 @@
pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
pbe = pbe->next, num += PBES_PER_PAGE) {
- p = pbe;
pbe += PB_PAGE_SKIP;
- do
- p->next = p + 1;
- while (p++ < pbe);
pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
}
- if (pbe) {
- for (num -= PBES_PER_PAGE - 1, p = pbe; num < nr_pages; p++, num++)
- p->next = p + 1;
- } else { /* get_zeroed_page() failed */
+ if (!pbe) { /* get_zeroed_page() failed */
free_pagedir(pblist);
pblist = NULL;
}
- pr_debug("alloc_pagedir(): allocated %d PBEs\n", num);
return pblist;
}

@@ -766,6 +798,7 @@
printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
return -ENOMEM;
}
+ create_pbe_list(pagedir_save, nr_copy_pages);
pagedir_nosave = pagedir_save;
if ((error = alloc_image_pages())) {
printk(KERN_ERR "suspend: Allocating image pages failed.\n");
@@ -917,44 +950,104 @@
return 0;
}

-/*
- * We check here that pagedir & pages it points to won't collide with pages
- * where we're going to restore from the loaded pages later
+/**
+ * On resume, for storing the PBE list and the image,
+ * we can only use memory pages that do not conflict with the pages
+ * which had been used before suspend.
+ *
+ * We don't know which pages are usable until we allocate them.
+ *
+ * Allocated but unusable (ie eaten) memory pages are linked together
+ * to create a list, so that we can free them easily
+ *
+ * We could have used a type other than (void *)
+ * for this purpose, but ...
*/
-static int __init check_pagedir(void)
+static void **eaten_memory = NULL;
+
+static inline void eat_page(void *page)
{
- int i;
+ void **c;

- for(i=0; i < nr_copy_pages; i++) {
- unsigned long addr;
+ c = eaten_memory;
+ eaten_memory = page;
+ *eaten_memory = c;
+}

- do {
- addr = get_zeroed_page(GFP_ATOMIC);
- if(!addr)
- return -ENOMEM;
- } while (does_collide_order(addr, 0));
+static unsigned long __init get_usable_page(unsigned gfp_mask)
+{
+ unsigned long m;

- (pagedir_nosave+i)->address = addr;
+ m = get_zeroed_page(gfp_mask);
+ while (does_collide_order(m, 0)) {
+ eat_page((void *)m);
+ m = get_zeroed_page(gfp_mask);
+ if (!m)
+ break;
}
- return 0;
+ return m;
}

-static int __init swsusp_pagedir_relocate(void)
+static void __init free_eaten_memory(void)
{
- /*
- * We have to avoid recursion (not to overflow kernel stack),
- * and that's why code looks pretty cryptic
+ unsigned long m;
+ void **c;
+ int i = 0;
+
+ c = eaten_memory;
+ while (c) {
+ m = (unsigned long)c;
+ c = *c;
+ free_page(m);
+ i++;
+ }
+ eaten_memory = NULL;
+ pr_debug("swsusp: %d unused pages freed\n", i);
+}
+
+/**
+ * check_pagedir - We ensure here that pages that the PBEs point to
+ * won't collide with pages where we're going to restore from the loaded
+ * pages later
+ */
+
+static int __init check_pagedir(struct pbe *pblist)
+{
+ struct pbe *p;
+
+ /* This is necessary, so that we can free allocated pages
+ * in case of failure
*/
- suspend_pagedir_t *old_pagedir = pagedir_nosave;
- void **eaten_memory = NULL;
- void **c = eaten_memory, *m, *f;
- int ret = 0;
+ for_each_pbe (p, pblist)
+ p->address = 0UL;
+
+ for_each_pbe (p, pblist) {
+ p->address = get_usable_page(GFP_ATOMIC);
+ if (!p->address)
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+/**
+ * swsusp_pagedir_relocate - It is possible, that some memory pages
+ * occupied by the list of PBEs collide with pages where we're going to
+ * restore from the loaded pages later. We relocate them here.
+ */
+
+static struct pbe * __init swsusp_pagedir_relocate(struct pbe *pblist)
+{
struct zone *zone;
- int i;
- struct pbe *p;
unsigned long zone_pfn;
+ struct pbe *pbpage, *tail, *p;
+ void *m;
+ int rel = 0, error = 0;
+
+ if (!pblist) /* a sanity check */
+ return NULL;

- printk("Relocating pagedir ");
+ pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n",
+ swsusp_info.pagedir_pages);

/* Set page flags */

@@ -964,45 +1057,52 @@
zone->zone_start_pfn));
}

- /* Clear orig address */
+ /* Clear orig addresses */

- for(i = 0, p = pagedir_nosave; i < nr_copy_pages; i++, p++) {
+ for_each_pbe (p, pblist)
ClearPageNosaveFree(virt_to_page(p->orig_address));
- }

- if (!does_collide_order((unsigned long)old_pagedir, pagedir_order)) {
- printk("not necessary\n");
- return check_pagedir();
- }
+ tail = pblist + PB_PAGE_SKIP;

- while ((m = (void *) __get_free_pages(GFP_ATOMIC, pagedir_order)) != NULL) {
- if (!does_collide_order((unsigned long)m, pagedir_order))
- break;
- eaten_memory = m;
- printk( "." );
- *eaten_memory = c;
- c = eaten_memory;
- }
+ /* Relocate colliding pages */

- if (!m) {
- printk("out of memory\n");
- ret = -ENOMEM;
- } else {
- pagedir_nosave =
- memcpy(m, old_pagedir, PAGE_SIZE << pagedir_order);
+ for_each_pb_page (pbpage, pblist) {
+ if (does_collide_order((unsigned long)pbpage, 0)) {
+ m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD);
+ if (!m) {
+ error = -ENOMEM;
+ break;
+ }
+ memcpy(m, (void *)pbpage, PAGE_SIZE);
+ if (pbpage == pblist)
+ pblist = (struct pbe *)m;
+ else
+ tail->next = (struct pbe *)m;
+
+ eat_page((void *)pbpage);
+ pbpage = (struct pbe *)m;
+
+ /* We have to link the PBEs again */
+
+ for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++)
+ if (p->next) /* needed to save the end */
+ p->next = p + 1;
+
+ rel++;
+ }
+ tail = pbpage + PB_PAGE_SKIP;
}

- c = eaten_memory;
- while (c) {
- printk(":");
- f = c;
- c = *c;
- free_pages((unsigned long)f, pagedir_order);
+ if (error) {
+ printk("\nswsusp: Out of memory\n\n");
+ free_pagedir(pblist);
+ free_eaten_memory();
+ pblist = NULL;
}
- if (ret)
- return ret;
- printk("|\n");
- return check_pagedir();
+ else
+ printk("swsusp: Relocated %d pages\n", rel);
+
+ return pblist;
}

/**
@@ -1147,76 +1247,117 @@
}

/**
- * swsusp_read_data - Read image pages from swap.
+ * data_read - Read image pages from swap.
*
* You do not need to check for overlaps, check_pagedir()
* already did that.
*/

-static int __init data_read(void)
+static int __init data_read(struct pbe *pblist)
{
struct pbe * p;
- int error;
- int i;
- int mod = nr_copy_pages / 100;
+ int error = 0;
+ int i = 0;
+ int mod = swsusp_info.image_pages / 100;

if (!mod)
mod = 1;

- if ((error = swsusp_pagedir_relocate()))
- return error;
+ printk("swsusp: Reading image data (%lu pages): ",
+ swsusp_info.image_pages);
+
+ for_each_pbe (p, pblist) {
+ if (!(i % mod))
+ printk("\b\b\b\b%3d%%", i / mod);

- printk( "Reading image data (%d pages): ", nr_copy_pages );
- for(i = 0, p = pagedir_nosave; i < nr_copy_pages && !error; i++, p++) {
- if (!(i%mod))
- printk( "\b\b\b\b%3d%%", i / mod );
error = bio_read_page(swp_offset(p->swap_address),
(void *)p->address);
+ if (error)
+ return error;
+
+ i++;
}
- printk(" %d done.\n",i);
+ printk("\b\b\b\bdone\n");
return error;
-
}

extern dev_t __init name_to_dev_t(const char *line);

-static int __init read_pagedir(void)
+/**
+ * read_pagedir - Read page backup list pages from swap
+ */
+
+static int __init read_pagedir(struct pbe *pblist)
{
- unsigned long addr;
- int i, n = swsusp_info.pagedir_pages;
- int error = 0;
+ struct pbe *pbpage, *p;
+ unsigned i = 0;
+ int error;

- addr = __get_free_pages(GFP_ATOMIC, pagedir_order);
- if (!addr)
- return -ENOMEM;
- pagedir_nosave = (struct pbe *)addr;
+ if (!pblist)
+ return -EFAULT;
+
+ printk("swsusp: Reading pagedir (%lu pages)\n",
+ swsusp_info.pagedir_pages);

- pr_debug("swsusp: Reading pagedir (%d Pages)\n",n);
+ for_each_pb_page (pbpage, pblist) {
+ unsigned long offset = swp_offset(swsusp_info.pagedir[i++]);

- for (i = 0; i < n && !error; i++, addr += PAGE_SIZE) {
- unsigned long offset = swp_offset(swsusp_info.pagedir[i]);
- if (offset)
- error = bio_read_page(offset, (void *)addr);
- else
- error = -EFAULT;
+ error = -EFAULT;
+ if (offset) {
+ p = (pbpage + PB_PAGE_SKIP)->next;
+ error = bio_read_page(offset, (void *)pbpage);
+ (pbpage + PB_PAGE_SKIP)->next = p;
+ }
+ if (error)
+ break;
}
+
if (error)
- free_pages((unsigned long)pagedir_nosave, pagedir_order);
+ free_page((unsigned long)pblist);
+
+ BUG_ON(i != swsusp_info.pagedir_pages);
+
return error;
}

+
static int __init read_suspend_image(void)
{
int error = 0;
+ struct pbe *p;

if ((error = check_sig()))
return error;
+
if ((error = check_header()))
return error;
- if ((error = read_pagedir()))
+
+ if (!(p = alloc_pagedir(nr_copy_pages)))
+ return -ENOMEM;
+
+ if ((error = read_pagedir(p)))
return error;
- if ((error = data_read()))
- free_pages((unsigned long)pagedir_nosave, pagedir_order);
+
+ create_pbe_list(p, nr_copy_pages);
+
+ if (!(pagedir_nosave = swsusp_pagedir_relocate(p)))
+ return -ENOMEM;
+
+ /* Allocate memory for the image and read the data from swap */
+
+ error = check_pagedir(pagedir_nosave);
+ free_eaten_memory();
+ if (!error)
+ error = data_read(pagedir_nosave);
+
+ if (error) { /* We fail cleanly */
+ for_each_pbe (p, pagedir_nosave)
+ if (p->address) {
+ free_page(p->address);
+ p->address = 0UL;
+ }
+ free_pagedir(pagedir_nosave);
+ }
return error;
}


--
- Would you tell me, please, which way I ought to go from here?
- That depends a good deal on where you want to get to.
-- Lewis Carroll "Alice's Adventures in Wonderland"

----- End forwarded message -----

--
People were complaining that M$ turns users into beta-testers...
...jr ghea gurz vagb qrirybcref, naq gurl frrz gb yvxr vg gung jnl!
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/