Re: flush_tlb_page in unuse_pte

Bill Hawes (whawes@star.net)
Wed, 27 Aug 1997 14:49:43 -0400


Ingo Molnar wrote:
> in this new scheme, it's perfectly legal and desirable to, say, have 10
> clones trying to allocate 10 pages for the very same VM address. The
> first one will succeed, and these 10 allocations will add enough pressure
> to the mem subsystem so that it goes faster ... speed of completion
> depends on the number of waiting threads. Automagic object-priorities,
> whee.

OK, check this out ... Ingo's suggestion implemented for swapoff. It
seems _much_ cleaner than mucking around with the mmap semaphore. I
just read in each swap page and then chase down the process using it.

Granted, swapoff is easier than handling page faults, but the idea is
the same.
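
Roughly, the new try_to_unuse() loop works like the sketch below (just
an outline of the flow, using the patch's own helpers; the allocation
failure handling and tasklist locking are omitted here -- see the
patch for the real thing):

	while ((entry = find_swap_entry(type)) != 0) {
		if (!page)
			page = __get_free_page(GFP_KERNEL);
		read_swap_page(entry, (char *) page);	/* read the data first ... */
		for_each_task(p) {		/* ... then fix up whoever maps it */
			if (unuse_process(p->mm, entry, page)) {
				page = 0;	/* page consumed, get a fresh one */
				break;
			}
		}
	}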

Regards,
Bill

[Attachment: swapoff_51-patch]

--- mm/swapfile.c.old	Sat Jul 19 08:17:17 1997
+++ mm/swapfile.c	Wed Aug 27 14:30:56 1997
@@ -169,7 +169,7 @@
  * from the beginning for this process..
  */
 static inline int unuse_pte(struct vm_area_struct * vma, unsigned long address,
-	pte_t *dir, unsigned int type, unsigned long page)
+	pte_t *dir, unsigned long entry, unsigned long page)
 {
 	pte_t pte = *dir;
 
@@ -178,24 +178,24 @@
 	if (pte_present(pte)) {
 		struct page *pg;
 		unsigned long page_nr = MAP_NR(pte_page(pte));
+		unsigned long pg_swap_entry;
+
 		if (page_nr >= max_mapnr)
 			return 0;
 		pg = mem_map + page_nr;
-		if (!in_swap_cache(pg))
+		if (!(pg_swap_entry = in_swap_cache(pg)))
 			return 0;
-		if (SWP_TYPE(in_swap_cache(pg)) != type)
+		if (SWP_TYPE(pg_swap_entry) != SWP_TYPE(entry))
 			return 0;
 		delete_from_swap_cache(pg);
 		set_pte(dir, pte_mkdirty(pte));
-		return 0;
-	}
-	if (SWP_TYPE(pte_val(pte)) != type)
-		return 0;
-	read_swap_page(pte_val(pte), (char *) page);
-	if (pte_val(*dir) != pte_val(pte)) {
+		if (pg_swap_entry != entry)
+			return 0;
 		free_page(page);
 		return 1;
 	}
+	if (pte_val(pte) != entry)
+		return 0;
 	set_pte(dir, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
 	flush_tlb_page(vma, address);
 	++vma->vm_mm->rss;
@@ -205,7 +205,7 @@
 
 static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
 	unsigned long address, unsigned long size, unsigned long offset,
-	unsigned int type, unsigned long page)
+	unsigned long entry, unsigned long page)
 {
 	pte_t * pte;
 	unsigned long end;
@@ -224,7 +224,8 @@
 	if (end > PMD_SIZE)
 		end = PMD_SIZE;
 	do {
-		if (unuse_pte(vma, offset+address-vma->vm_start, pte, type, page))
+		if (unuse_pte(vma, offset+address-vma->vm_start, pte, entry,
+				page))
 			return 1;
 		address += PAGE_SIZE;
 		pte++;
@@ -234,7 +235,7 @@
 
 static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
 	unsigned long address, unsigned long size,
-	unsigned int type, unsigned long page)
+	unsigned long entry, unsigned long page)
 {
 	pmd_t * pmd;
 	unsigned long offset, end;
@@ -253,7 +254,8 @@
 	if (end > PGDIR_SIZE)
 		end = PGDIR_SIZE;
 	do {
-		if (unuse_pmd(vma, pmd, address, end - address, offset, type, page))
+		if (unuse_pmd(vma, pmd, address, end - address, offset, entry,
+				page))
 			return 1;
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
@@ -262,11 +264,12 @@
 }
 
 static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
-	unsigned long start, unsigned long end,
-	unsigned int type, unsigned long page)
+	unsigned long entry, unsigned long page)
 {
+	unsigned long start = vma->vm_start, end = vma->vm_end;
+
 	while (start < end) {
-		if (unuse_pgd(vma, pgdir, start, end - start, type, page))
+		if (unuse_pgd(vma, pgdir, start, end - start, entry, page))
 			return 1;
 		start = (start + PGDIR_SIZE) & PGDIR_MASK;
 		pgdir++;
@@ -274,7 +277,8 @@
 	return 0;
 }
 
-static int unuse_process(struct mm_struct * mm, unsigned int type, unsigned long page)
+static int unuse_process(struct mm_struct * mm, unsigned long entry,
+	unsigned long page)
 {
 	struct vm_area_struct* vma;
 
@@ -283,43 +287,70 @@
 	 */
 	if (!mm || mm == &init_mm)
 		return 0;
-	vma = mm->mmap;
-	while (vma) {
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		pgd_t * pgd = pgd_offset(mm, vma->vm_start);
-		if (unuse_vma(vma, pgd, vma->vm_start, vma->vm_end, type, page))
+		if (unuse_vma(vma, pgd, entry, page))
 			return 1;
-		vma = vma->vm_next;
+	}
+	return 0;
+}
+
+static unsigned long find_swap_entry(int type)
+{
+	struct swap_info_struct * p = &swap_info[type];
+	int i;
+
+	for (i = 1 ; i < p->max ; i++) {
+		if (p->swap_map[i] > 0 && p->swap_map[i] != 0x80)
+			return SWP_ENTRY(type, i);
 	}
 	return 0;
 }
 
 /*
- * To avoid races, we repeat for each process after having
- * swapped something in. That gets rid of a few pesky races,
- * and "swapoff" isn't exactly timing critical.
+ * We completely avoid races by reading each swap page in advance,
+ * and then search for the process using it. All the necessary
+ * page table adjustments can then be made atomically.
  */
 static int try_to_unuse(unsigned int type)
 {
-	unsigned long page = get_free_page(GFP_KERNEL);
+	unsigned long page = 0;
 	struct task_struct *p;
+	unsigned long entry;
 
-	if (!page)
-		return -ENOMEM;
-again:
-	read_lock(&tasklist_lock);
-	for_each_task(p) {
-		read_unlock(&tasklist_lock);
-		if(unuse_process(p->mm, type, page)) {
-			page = get_free_page(GFP_KERNEL);
-			if(!page)
+	/*
+	 * Find all swap entries in use ...
+	 */
+	while ((entry = find_swap_entry(type)) != 0) {
+		if (!page) {
+			page = __get_free_page(GFP_KERNEL);
+			if (!page)
 				return -ENOMEM;
-			goto again;
 		}
+
+		/*
+		 * Read in the page, and then free the swap page.
+		 */
+		read_swap_page(entry, (char *) page);
+
 		read_lock(&tasklist_lock);
+		for_each_task(p) {
+			if (unuse_process(p->mm, entry, page)) {
+				page = 0;
+				goto unlock;
+			}
+		}
+ unlock:
+		read_unlock(&tasklist_lock);
+		if (page) {
+			printk("try_to_unuse: didn't find entry %8lx\n",
+				entry);
+			swap_free(entry);
+		}
 	}
-	read_unlock(&tasklist_lock);
 
-	free_page(page);
+	if (page)
+		free_page(page);
 	return 0;
 }

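A note on the type -> entry change, for anyone following along: a swap
entry packs both the swap device and the page offset within it, so
comparing full entries instead of just the type is what lets each pass
target exactly one swap page. The encoding is arch-specific; on i386
it's roughly this (quoting asm/pgtable.h from memory, so treat as
illustrative):

	#define SWP_TYPE(entry)		(((entry) >> 1) & 0x7f)	/* which swap area */
	#define SWP_OFFSET(entry)	((entry) >> 8)		/* page index within it */
	#define SWP_ENTRY(type,offset)	(((type) << 1) | ((offset) << 8))

The 0x80 test in find_swap_entry() skips map slots marked bad
(SWAP_MAP_BAD, if I remember the constant right), so only live entries
get handed to the unuse loop.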