updated patch for memory defragmentation

Bill Hawes (whawes@star.net)
Sun, 26 Jul 1998 05:42:59 -0400


The attached patch is an updated memory defragger for test and comment.
It can now replace mapped pages as well as page and buffer cache pages,
which appears to make it much more effective. Thanks to all who pointed
out problems along the way.

The patch is stable here, but I'd still classify it as experimental.
I've turned off most of the debugging messages, and the remaining one
indicates that it managed to build a 16K or 32K page group. (Good news
if your machine needs the memory!) Test reports welcome.
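
The 16K and 32K groups come out of the scan in find_candidate_pages():
look for an aligned group of 2^order page frames in which every frame
but one is already free, then free or relocate that last frame. Below
is a small stand-alone sketch of that scan, again purely illustrative;
the bitmap contents are invented, and the kernel version works on a
real bit array, limits itself to DEFRAG_PAGES candidates, and also
checks that the missing page looks replaceable.

/*
 * Toy version of the find_candidate_pages() scan: walk a free-page
 * map and report aligned 2^order groups missing exactly one page.
 */
#include <stdio.h>

#define NPAGES 32

/* free_map[i] != 0 means page frame i is free (invented pattern) */
static const unsigned char free_map[NPAGES] = {
	1,1,1,0, 1,1,1,1, 0,0,1,1, 1,0,1,1,
	1,1,1,1, 1,1,0,1, 1,1,1,1, 1,1,1,0,
};

int main(void)
{
	int order, group, i, nbits, missing;

	for (order = 1; order <= 3; order++) {
		nbits = 1 << order;
		for (group = 0; group < NPAGES; group += nbits) {
			missing = -1;
			for (i = 0; i < nbits; i++) {
				if (free_map[group + i])
					continue;
				if (missing >= 0) {	/* two or more in use */
					missing = -1;
					break;
				}
				missing = group + i;
			}
			if (missing >= 0)
				printf("order %d group at %d needs page %d\n",
				       order, group, missing);
		}
	}
	return 0;
}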

Regards,
Bill
[Attached file: mm_defrag111-patch]

--- linux-2.1.111/include/linux/mm.h.old Sat Jul 25 10:22:12 1998
+++ linux-2.1.111/include/linux/mm.h Sat Jul 25 11:07:37 1998
@@ -251,20 +251,24 @@
return page;
}

-/* memory.c & swap.c*/
+/* page_alloc.c */

/*
* Decide if we should try to do some swapout..
*/
extern int free_memory_available(void);

+extern int need_defrag(void);
+extern void anneal_mem(void);
+
#define free_page(addr) free_pages((addr),0)
extern void FASTCALL(free_pages(unsigned long addr, unsigned long order));
extern void FASTCALL(__free_page(struct page *));
-
extern void show_free_areas(void);
-extern unsigned long put_dirty_page(struct task_struct * tsk,unsigned long page,
- unsigned long address);
+
+/* memory.c & swap.c*/
+extern unsigned long put_dirty_page(struct task_struct *, unsigned long,
+ unsigned long);

extern void free_page_tables(struct mm_struct * mm);
extern void clear_page_tables(struct task_struct * tsk);
--- linux-2.1.111/mm/page_alloc.c.old Tue Jul 21 14:41:49 1998
+++ linux-2.1.111/mm/page_alloc.c Sat Jul 25 11:07:37 1998
@@ -400,3 +400,569 @@
set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
return;
}
+
+/* ==================== Memory Defragmentation ========================
+ *
+ * Overview of Operation
+ * The defragmentation code works by searching for page groups needing
+ * just one more free page to make a higher-order group. It builds a list
+ * of such pages and then attempts to release or replace the pages.
+ *
+ * need_defrag() -- check whether defragmentation is necessary.
+ * Compares the counts of the memory free lists with a goal
+ * vector for higher-order pages, and requests defragmentation
+ * if any of the goals haven't been met.
+ * anneal_mem() -- the top-level entry point.
+ * Creates the page bitmap and iterates through the memory list
+ * orders performing defragmentation.
+ * build_free_map() -- fills in the page bitmap.
+ * Examines the memory free lists and sets a bit in the page
+ * bitmap for each free page.
+ * find_candidate_pages() -- searches the page bitmap for page groups.
+ * replace_cache() -- attempts to free buffer or page cache pages.
+ * replace_mmap() -- attempts to replace mapped pages with a free page.
+ * Calls the generalized mm/vma/pgd/pmd walking routines to check
+ * each pte against the list of mapped pages to be replaced.
+ * do_defrag() -- examine a pte and replace the page if possible.
+ */
+
+#define DEFRAG_MAX_ORDER 3
+#define DEFRAG_PAGES 32
+
+#define DEFRAG_DEBUG
+/* #define DEFRAG_VERBOSE */
+
+/* These are the goals for the defragmentation code */
+int free_mem_goal[NR_MEM_LISTS] = {0, 16, 4, 2, 0, };
+static int free_mem_curr[NR_MEM_LISTS]; /* what we have */
+static int free_page_goal = 64; /* sum of goal vector pages */
+
+/*
+ * Data structures used for the defragging operation.
+ */
+struct defrag_struct {
+ int num; /* page count */
+ unsigned char *map;
+ int *list; /* pages to look for */
+};
+
+struct pte_op {
+ int (*func)(struct pte_op *, struct vm_area_struct *,
+ unsigned long, pte_t *);
+ void *datap;
+ unsigned int vm_exempt;
+};
+
+int walk_mm(struct mm_struct *, struct pte_op *);
+
+/*
+ * Check whether we need to run the memory defragger.
+ * We only care about the order 1, 2, and 3 lists, but
+ * allow a credit to trickle down from the higher orders.
+ */
+int need_defrag(void)
+{
+ int order = NR_MEM_LISTS, credit = 0, defrag = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&page_alloc_lock, flags);
+ while (order--) {
+ struct free_area_struct *list = free_area + order;
+ struct page *next = list->next;
+
+ /*
+ * Count the nodes on the memory list, but
+ * stop as soon as we have enough memory.
+ */
+ free_mem_curr[order] = 0;
+ while (next != memory_head(list)) {
+ credit += (1 << order);
+ /* stop early if we have enough memory */
+ if (!defrag && credit >= free_page_goal)
+ goto out;
+ free_mem_curr[order]++;
+ next = next->next;
+ }
+ /*
+ * Check whether we've met the goal.
+ */
+ if (credit >= (free_mem_goal[order] << order)) {
+ credit -= (free_mem_goal[order] << order);
+ } else {
+ credit = 0;
+ defrag = 1;
+ }
+ }
+out:
+ spin_unlock_irqrestore(&page_alloc_lock, flags);
+ return defrag;
+}
+
+/*
+ * Builds a bitmap of the free pages in the memory lists
+ * up to and including the specified order.
+ */
+static void build_free_map(unsigned char *map, int size, int max_order)
+{
+ unsigned long flags;
+ int order;
+
+ /* Clear the bitmap */
+ memset((void *) map, 0, size);
+
+ spin_lock_irqsave(&page_alloc_lock, flags);
+
+ for (order = 0; order <= max_order; order++) {
+ struct free_area_struct *area = free_area + order;
+ struct page *next = memory_head(area), *node;
+ unsigned long index, map_nr;
+
+ while ((node = next->next) != memory_head(area)) {
+ next = node;
+ map_nr = node->map_nr;
+ index = map_nr + (1 << order) - 1;
+ while (index >= map_nr) {
+ set_bit(index, map);
+ index--;
+ }
+ }
+ }
+ spin_unlock_irqrestore(&page_alloc_lock, flags);
+}
+
+/*
+ * The defragmentation pte_op function. Checks whether a
+ * pte maps one of the pages that would complete a larger
+ * group, and if so replaces it with a new page.
+ */
+static int do_defrag(struct pte_op *opptr, struct vm_area_struct * vma,
+ unsigned long addr, pte_t *dir)
+{
+ struct defrag_struct *dfp = (struct defrag_struct *) opptr->datap;
+ pte_t pte, new_pte;
+ struct page *page_map;
+ unsigned long old_page, new_page;
+ int result = 0, map_nr, npages, i;
+
+ pte = *dir;
+ if (pte_none(pte))
+ goto out;
+ if (!pte_present(pte))
+ goto out;
+
+ old_page = pte_page(pte);
+ map_nr = MAP_NR(old_page);
+
+ /*
+ * Check whether it's one of the pages we want.
+ */
+ npages = dfp->num;
+ for (i = 0; i < npages; i++) {
+ if (map_nr != dfp->list[i])
+ continue;
+#ifdef DEFRAG_VERBOSE
+printk("do_defrag: found page %d, count=%d\n", map_nr, dfp->num);
+#endif
+ /* sanity check */
+ page_map = mem_map + map_nr;
+ if (atomic_read(&page_map->count) != 1) {
+printk(KERN_WARNING "do_defrag: count changed??\n");
+ continue;
+ }
+
+ /*
+ * Note! We do an atomic allocation here, but at
+ * low priority. Since having adequate memory was
+ * a precondition to starting the defragmentation,
+ * consuming a few pages won't cause any problems.
+ */
+ new_page = __get_free_page(__GFP_LOW);
+ /* stop now if an allocation fails */
+ if (!new_page) {
+ result = 1;
+ break;
+ }
+
+ /* Create the new pte value */
+ new_pte = mk_pte(new_page, vma->vm_page_prot);
+ if (pte_write(pte))
+ new_pte = pte_mkwrite(new_pte);
+ if (pte_dirty(pte))
+ new_pte = pte_mkdirty(new_pte);
+ /*
+ * Unmap the page before doing the copy,
+ * in case another CPU is writing to it.
+ */
+ flush_cache_page(vma, addr);
+ pte_clear(dir);
+ flush_tlb_page(vma, addr);
+
+ /* copy the data */
+ copy_page(new_page, old_page);
+ flush_page_to_ram(new_page);
+ flush_page_to_ram(old_page);
+
+#ifdef DEFRAG_DEBUG
+printk("do_defrag: changing pte, old=%08lx, new=%08lx\n",
+pte_val(pte), pte_val(new_pte));
+#endif
+ /* install the new pte */
+ set_pte(dir, new_pte);
+
+ /* Success! Update the bitmap ... */
+ set_bit(page_map->map_nr, dfp->map);
+ __free_page(page_map);
+
+ page_map = mem_map + MAP_NR(new_page);
+ clear_bit(page_map->map_nr, dfp->map);
+ new_page = 0;
+
+ /* Update the list */
+ dfp->list[i] = dfp->list[npages-1];
+ if (!--dfp->num)
+ result = 1;
+ break;
+ }
+out:
+ return result;
+}
+
+/*
+ * Look for candidate pages to anneal the specified order.
+ */
+static int find_candidate_pages(unsigned char *map, int size, int order,
+ int *list)
+{
+ int nbits = (1 << order);
+ int num = 0, i, offset, index, map_nr, quota;
+
+ /*
+ * Set a quota for mapped pages, as we need
+ * to free single pages to replace these.
+ */
+ quota = free_mem_curr[0] << 1;
+ if (order < 2)
+ quota = 0;
+
+ for (i = 0; i < size; i++, map++) {
+ if (!*map)
+ continue;
+ for (offset = 0; offset < 8; offset += nbits) {
+ /*
+ * Look for page groups needing only one
+ * page to be coalesced.
+ */
+ map_nr = 0;
+ for (index = 0; index < nbits; index++) {
+ if (test_bit(offset + index, map))
+ continue;
+ /* More than one page missing? */
+ if (map_nr)
+ goto next;
+ /* Save the index of the missing page */
+ map_nr = (i << 3) + offset + index;
+#ifdef DEFRAG_VERBOSE
+printk(KERN_DEBUG "find_candidate_pages: mask=%x, bit=%d, map_nr=%d\n",
+*map, offset + index, map_nr);
+#endif
+ }
+ /*
+ * Check whether we found a good candidate.
+ */
+ if (map_nr) {
+ struct page *page = mem_map + map_nr;
+
+ /* This is all we handle so far */
+ if (atomic_read(&page->count) > 1)
+ goto next;
+ /* Make a trivial test for buffers in use */
+ if (page->buffers) {
+ if (page->buffers->b_count)
+ goto next;
+ }
+ /* No more than <quota> mapped pages */
+ else if (!page->inode) {
+ if (!quota)
+ goto next;
+ quota--;
+ }
+
+ list[num] = map_nr;
+ num++;
+ if (num >= DEFRAG_PAGES)
+ goto out;
+ }
+ next: ;
+ }
+ }
+out:
+#ifdef DEFRAG_VERBOSE
+printk(KERN_DEBUG "find_candidate_pages: order %d, found %d\n", order, num);
+#endif
+ return num;
+}
+
+/*
+ * Attempt to free page or buffer cache pages in the list,
+ * and update the list for the next pass.
+ */
+static int replace_cache(unsigned char *bitmap, int *list, int *num)
+{
+ int npage = *num;
+ int result = 0, nmap = 0, i;
+
+ for (i = 0; i < npage; i++) {
+ int map_nr = list[i];
+ struct page *page = mem_map + map_nr;
+
+ /*
+ * Check whether it's a page we can handle ...
+ */
+ if (PageLocked(page))
+ continue;
+ if (atomic_read(&page->count) != 1)
+ continue;
+
+ /* The easiest case of all ... */
+ if (page->inode) {
+#ifdef DEFRAG_VERBOSE
+printk(KERN_DEBUG "replace_page: page %p in page cache\n", page);
+#endif
+ if (PageSwapCache(page))
+ delete_from_swap_cache(page);
+ else
+ remove_inode_page(page);
+ goto update_map;
+ }
+ /* Not too hard if it works ... */
+ if (page->buffers) {
+ struct buffer_head *bh = page->buffers;
+ if (try_to_free_buffer(bh, &bh, 6)) {
+#ifdef DEFRAG_VERBOSE
+printk(KERN_DEBUG "replace_page: page %p in buffer cache\n", page);
+#endif
+ goto update_map;
+ }
+ continue;
+ }
+ /* Possibly a mapped page ... keep it for the next pass */
+ list[nmap] = map_nr;
+ nmap++;
+ continue;
+
+ /*
+ * We freed the page, so update our bitmap.
+ */
+ update_map:
+ set_bit(page->map_nr, bitmap);
+ result = 1;
+ }
+ /* update the list count */
+ *num = nmap;
+
+ return result;
+}
+
+/*
+ * Look for pages mapped by processes and attempt to replace them.
+ */
+static int replace_mmap(unsigned char *bitmap, int *list, int *num)
+{
+ int count = *num;
+ struct task_struct *tsk;
+ struct defrag_struct defrag = {count, bitmap, list};
+ struct pte_op defrag_op;
+
+ /*
+ * Fill in the pte_op for the defragging operation.
+ */
+ defrag_op.func = do_defrag;
+ defrag_op.datap = (void *) &defrag;
+ defrag_op.vm_exempt = (VM_SHM | VM_LOCKED);
+
+ read_lock(&tasklist_lock);
+ for_each_task(tsk) {
+ if (walk_mm(tsk->mm, &defrag_op))
+ break;
+ }
+ read_unlock(&tasklist_lock);
+
+ /* update the list count */
+ *num = defrag.num;
+
+ return (defrag.num < count);
+}
+
+/*
+ * Top-level entry point for memory defragmentation.
+ * Attempts to repopulate the higher order memory lists
+ * by searching for page groups missing only a single
+ * page.
+ */
+void anneal_mem(void)
+{
+ int map_size = (num_physpages >> 3);
+ unsigned char *bitmap;
+ int order, num, progress, tries = 3;
+ int list[DEFRAG_PAGES];
+
+#ifdef DEFRAG_VERBOSE
+printk(KERN_INFO "anneal_mem: before defragging: %d %d %d\n",
+free_mem_curr[1], free_mem_curr[2], free_mem_curr[3]);
+#endif
+
+ /*
+ * Allocate memory for the page bitmap,
+ * using vmalloc() if memory > 128M.
+ */
+ if (map_size <= PAGE_SIZE)
+ bitmap = (unsigned char *) __get_free_page(GFP_KERNEL);
+ else
+ bitmap = (unsigned char *) vmalloc(map_size);
+ if (!bitmap)
+ return;
+ /*
+ * Build the bitmap for the lists to be defragged.
+ * Only need to do this once ... map is updated as
+ * pages are freed.
+ */
+ build_free_map(bitmap, map_size, DEFRAG_MAX_ORDER);
+
+ while (tries--) {
+ progress = 0;
+ /* Iterate over the orders to be defragged. */
+ for (order = 1; order <= DEFRAG_MAX_ORDER; order++) {
+ /*
+ * Select the pages to build page groups of this order.
+ */
+ num = find_candidate_pages(bitmap, map_size, order,
+ list);
+ if (!num)
+ continue;
+ /*
+ * Try to free or replace the candidate pages.
+ */
+ progress |= replace_cache(bitmap, list, &num);
+ if (!num)
+ continue;
+ progress |= replace_mmap(bitmap, list, &num);
+ }
+ /* Stop now if we didn't make progress. */
+ if (!progress)
+ break;
+ if (!need_defrag())
+ break;
+ }
+#ifdef DEFRAG_VERBOSE
+printk(KERN_INFO "anneal_mem: after defragging: %d %d %d\n",
+free_mem_curr[1], free_mem_curr[2], free_mem_curr[3]);
+#endif
+
+ /* Free the bitmap */
+ if (map_size <= PAGE_SIZE)
+ free_page((unsigned long) bitmap);
+ else
+ vfree((void *) bitmap);
+}
+
+/*
+ * Generalized mm/vma/pgd/pmd walk functions.
+ */
+int walk_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long addr,
+ unsigned long size, unsigned long offset, struct pte_op *opptr)
+{
+ pte_t * pte;
+ unsigned long end;
+
+ if (pmd_none(*dir))
+ goto out;
+ if (pmd_bad(*dir))
+ goto out_bad;
+
+ pte = pte_offset(dir, addr);
+ offset += addr & PMD_MASK;
+ addr &= ~PMD_MASK;
+ end = addr + size;
+ if (end > PMD_SIZE)
+ end = PMD_SIZE;
+ do {
+ if (opptr->func(opptr, vma, offset + addr - vma->vm_start, pte))
+ return 1;
+ addr += PAGE_SIZE;
+ pte++;
+ } while (addr < end);
+out:
+ return 0;
+
+out_bad:
+ printk(KERN_ERR "walk_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
+ pmd_clear(dir);
+ return 0;
+}
+
+int walk_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address,
+ unsigned long size, struct pte_op *opptr)
+{
+ pmd_t * pmd;
+ unsigned long offset, end;
+
+ if (pgd_none(*dir))
+ goto out;
+ if (pgd_bad(*dir))
+ goto out_bad;
+ pmd = pmd_offset(dir, address);
+ offset = address & PGDIR_MASK;
+ address &= ~PGDIR_MASK;
+ end = address + size;
+ if (end > PGDIR_SIZE)
+ end = PGDIR_SIZE;
+ do {
+ if (walk_pmd(vma, pmd, address, end - address, offset, opptr))
+ return 1;
+ address = (address + PMD_SIZE) & PMD_MASK;
+ pmd++;
+ } while (address < end);
+out:
+ return 0;
+
+out_bad:
+ printk(KERN_ERR "walk_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
+ pgd_clear(dir);
+ return 0;
+}
+
+int walk_vma(struct vm_area_struct * vma, pgd_t *pgdir, struct pte_op *opptr)
+{
+ unsigned long start = vma->vm_start, end = vma->vm_end;
+
+ /* check whether to exempt this vma */
+ if (vma->vm_flags & opptr->vm_exempt)
+ goto out;
+
+ while (start < end) {
+ if (walk_pgd(vma, pgdir, start, end - start, opptr))
+ return 1;
+ start = (start + PGDIR_SIZE) & PGDIR_MASK;
+ pgdir++;
+ }
+out:
+ return 0;
+}
+
+int walk_mm(struct mm_struct * mm, struct pte_op *opptr)
+{
+ struct vm_area_struct* vma;
+
+ if (!mm || mm == &init_mm)
+ goto out;
+ /*
+ * Go through the mmap vma list.
+ */
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ pgd_t * pgd = pgd_offset(mm, vma->vm_start);
+ if (walk_vma(vma, pgd, opptr))
+ return 1;
+ }
+out:
+ return 0;
+}
--- linux-2.1.111/mm/vmscan.c.old Tue Jul 21 14:41:49 1998
+++ linux-2.1.111/mm/vmscan.c Sat Jul 25 15:40:14 1998
@@ -573,6 +593,11 @@
if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
run_task_queue(&tq_disk);
-
}
+ /*
+ * Check whether we need to defragment memory,
+ * but don't try if free memory is still low.
+ */
+ if (free_memory_available() > 0 && need_defrag())
+ anneal_mem();
}
/* As if we could ever get here - maybe we want to make this killable */
