Re: patch cow-swapin [was Re: Very bad swap bug -- 2.0, 2.1 at least] (fwd)

Andrea Arcangeli (andrea@e-mind.com)
Thu, 24 Sep 1998 12:08:46 +0200 (CEST)


I repost this email in the list because it seems to been lost.

Andrea[s] Arcangeli

---------- Forwarded message ----------
Date: Wed, 23 Sep 1998 13:28:53 +0200 (CEST)
From: Andrea Arcangeli <andrea@e-mind.com>
To: Simon Kirby <sim@netnation.com>
Cc: Linux Kernel <linux-kernel@vger.rutgers.edu>,
Linus Torvalds <torvalds@transmeta.com>,
Alan Cox <alan@lxorguk.ukuu.org.uk>
Subject: Re: patch cow-swapin [was Re: Very bad swap bug -- 2.0, 2.1 at least]

On Wed, 23 Sep 1998, Andrea Arcangeli wrote:

>I' ll do a 2.0.x backport soon.

I include in this email my patch against 2.0.36-pre9.

If somebody need this patch and is going to apply it, please leave in
background this simple proggy all the time to do the testing:

#include <stdio.h>

#define BUFSIZE 10000000

volatile int buf[BUFSIZE];

main()
{
volatile int i;
for (i=0; i<BUFSIZE; i++)
buf[i] = i;
for(;;)
{
sleep(60);
if(!fork())
{
printf("now\n");
for (i=0; i<BUFSIZE; i++)
buf[i] = i;
break;
}
else
wait();
}
}

You can change the bufsize and the sleep time of course but if you don' t
leave this proggy in the background you won' t test my code at all
because my code never run in normal conditions.

The patch continue to works fine here (UP x86) on 2.0 and 2.1 ;-).

diff -urN /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/CREDITS linux/CREDITS
--- /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/CREDITS Wed Sep 23 12:46:51 1998
+++ linux/CREDITS Wed Sep 23 13:13:50 1998
@@ -42,6 +42,7 @@
D: Fixed a 2.0.33 mm bug that corrupts memory in linux/mm/vmalloc.c
D: Author of lil (Linux Interrupt Latency benchmark)
D: Fixed the shm swap deallocation at swapoff time
+D: Developed linux/mm/swapin_parent.c (avoids continuous swapin of cow pages)
S: Via Ciaclini 26
S: Imola 40026
S: Italy
diff -urN /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/include/linux/swap.h linux/include/linux/swap.h
--- /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/include/linux/swap.h Thu Jun 18 23:48:21 1998
+++ linux/include/linux/swap.h Wed Sep 23 12:52:20 1998
@@ -65,6 +65,9 @@
extern void swap_in(struct task_struct *, struct vm_area_struct *,
pte_t *, unsigned long, int);

+/* linux/mm/swapin_parent.c */
+extern void swapin_parent(struct task_struct *, unsigned long,
+ pte_t *, unsigned long, unsigned int);

/* linux/mm/swap_state.c */
extern void show_swap_cache_info(void);
diff -urN /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/mm/Makefile linux/mm/Makefile
--- /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/mm/Makefile Fri Mar 22 11:56:56 1996
+++ linux/mm/Makefile Wed Sep 23 12:54:18 1998
@@ -9,7 +9,7 @@

O_TARGET := mm.o
O_OBJS := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
- kmalloc.o vmalloc.o \
+ kmalloc.o vmalloc.o swapin_parent.o \
swap.o vmscan.o page_io.o page_alloc.o swap_state.o swapfile.o

include $(TOPDIR)/Rules.make
diff -urN /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/mm/memory.c linux/mm/memory.c
--- /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/mm/memory.c Wed Sep 11 16:57:19 1996
+++ linux/mm/memory.c Wed Sep 23 12:51:40 1998
@@ -843,6 +843,8 @@

if (!vma->vm_ops || !vma->vm_ops->swapin) {
swap_in(tsk, vma, page_table, pte_val(entry), write_access);
+ swapin_parent(tsk, address, page_table, pte_val(entry),
+ write_access);
flush_page_to_ram(pte_page(*page_table));
return;
}
diff -urN /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/mm/swapin_parent.c linux/mm/swapin_parent.c
--- /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/mm/swapin_parent.c Thu Jan 1 01:00:00 1970
+++ linux/mm/swapin_parent.c Wed Sep 23 13:09:57 1998
@@ -0,0 +1,182 @@
+/*
+ * swapin_parent: join mem between swapped in childs and a swapped out parents
+ * Copyright (C) 1998 Andrea Arcangeli
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * You can reach Andrea Arcangeli at <andrea@e-mind.com>.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/swap.h>
+
+#include <asm/pgtable.h>
+
+static __inline__ unsigned long duplicate(unsigned long old_page)
+{
+ /*
+ * I don' t like to avoid swapins swapping out some other thing
+ * more recently used.
+ */
+ unsigned long new_page = __get_free_page(GFP_ATOMIC);
+ if (new_page)
+ memcpy((void *) new_page, (void *) old_page, PAGE_SIZE);
+ return new_page;
+}
+
+#define pte_mkcow(new_page, vma) \
+ pte_mkwrite(pte_mkdirty(mk_pte((new_page), (vma)->vm_page_prot)))
+
+static void do_swapin_parent(pte_t *new_pte, pte_t *pte,
+ unsigned long entry, struct vm_area_struct *vma,
+ unsigned int write, struct task_struct *parent)
+{
+ struct page *page;
+ unsigned long map_nr;
+
+ map_nr = MAP_NR(pte_page(*new_pte));
+ page = &mem_map[map_nr];
+
+ if (PageReserved(mem_map+map_nr))
+ {
+ printk(KERN_ERR "do_swapin_parent: "
+ "swapped in page was reserved!\n");
+ return;
+ }
+
+ if (write)
+ {
+ unsigned long new_page;
+ if (!pte_write(*new_pte))
+ {
+ printk(KERN_WARNING "do_swapin_parent: swapin after "
+ "writefault marked the page not writable\n");
+ return;
+ }
+ new_page = duplicate(pte_page(*new_pte));
+ if (!new_page)
+ return;
+ set_pte(pte, pte_mkcow(new_page, vma));
+ } else {
+ if (pte_write(*new_pte))
+ {
+ printk(KERN_WARNING "do_swapin_parent: swapin after "
+ "readfault marked the page writable\n");
+ return;
+ }
+ set_pte(pte, *new_pte);
+ atomic_inc(&page->count);
+ }
+
+ ++vma->vm_mm->rss;
+ ++parent->maj_flt;
+ swap_free(entry);
+}
+
+
+static void try_to_swapin_parent(struct task_struct *parent,
+ unsigned long address,
+ pte_t *new_pte, unsigned long entry,
+ unsigned int write)
+{
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte;
+ struct vm_area_struct *vma;
+
+ vma = find_vma(parent->mm, address);
+ if (!vma)
+ {
+ printk(KERN_ERR "try_to_swapin_parent: NULL vma!\n");
+ return;
+ }
+
+ pgd = pgd_offset(vma->vm_mm, address);
+ if (pgd_none(*pgd))
+ return;
+ if (pgd_bad(*pgd)) {
+ printk(KERN_ERR "try_to_swapin_parent: bad pgd (%08lx)\n",
+ pgd_val(*pgd));
+ pgd_clear(pgd);
+ return;
+ }
+
+ pmd = pmd_offset(pgd, address);
+ if (pmd_none(*pmd))
+ return;
+ if (pmd_bad(*pmd))
+ {
+ printk(KERN_ERR "try_to_swapin_parent: bad pmd (%08lx)\n",
+ pmd_val(*pmd));
+ pmd_clear(pmd);
+ return;
+ }
+
+ pte = pte_offset(pmd, address);
+
+ if (pte_val(*pte) != entry)
+ return;
+
+ do_swapin_parent(new_pte, pte, entry, vma, write, parent);
+}
+
+void swapin_parent(struct task_struct *child, unsigned long address,
+ pte_t *new_pte, unsigned long entry, unsigned int write)
+{
+ struct task_struct *parent;
+
+ if (child->did_exec)
+ return;
+
+ /*
+ * A bit of PARANOID.
+ */
+ if (pte_val(*new_pte) == entry)
+ {
+ printk(KERN_WARNING "swapin_parent: child not yet swapped "
+ "in\n");
+ return;
+ }
+ if (pte_val(*new_pte) == pte_val(BAD_PAGE))
+ {
+ printk(KERN_WARNING "swapin_parent: swapped in page is BAD\n");
+ return;
+ }
+ if (pte_none(*new_pte))
+ {
+ printk(KERN_ERR "swapin_parent: child page table NULL!\n");
+ return;
+ }
+ if (!pte_present(*new_pte))
+ {
+ printk(KERN_ERR "swapin_parent: child wrong swap entry!\n");
+ return;
+ }
+
+ parent = child->p_pptr;
+ if (!parent)
+ {
+ printk(KERN_ERR "swapin_parent: parent NULL!\n");
+ goto out;
+ }
+#ifdef __SMP__
+ if (parent->processor != NO_PROC_ID)
+ goto out;
+#endif
+ try_to_swapin_parent(parent, address, new_pte, entry, write);
+ out:
+}

Andrea[s] Arcangeli

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/