[patch 1/5] page_alloc commentry

From: Mel (mel@csn.ul.ie)
Date: Tue Aug 06 2002 - 14:33:00 EST

Next message: Mel: "[patch 2/5] NUMA document update"
Previous message: Richard B. Johnson: "Re: pppd[32497]: Unsupported protocol 'CallBack Control Protocol (CBCP)' (0xc029) received"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

This is a commentry patch documenting more how the buddy allocator does
it's work. No code is changed. Please apply

Mel Gorman
MSc Student, University of Limerick
http://www.csn.ul.ie/~mel

--- linux-2.4.19/mm/page_alloc.c Sat Aug 3 01:39:46 2002
+++ linux-2.4.19-mel/mm/page_alloc.c Tue Aug 6 15:41:33 2002
@@ -25,11 +25,24 @@
int nr_swap_pages;
int nr_active_pages;
int nr_inactive_pages;
+
+/*
+ * The two LRU list. These of primary interest to the page replacement
+ * algorithm. Pages that are referenced often will remain in the active_list.
+ * Pages are moved to the inactive_list by refill_inactive called by kswapd.
+ * Once in the inactive list, the page is a candidate to be swapped out
+ */
LIST_HEAD(inactive_list);
LIST_HEAD(active_list);
pg_data_t *pgdat_list;

-/* Used to look up the address of the struct zone encoded in page->zone */
+/*
+ * zone_table is the replacement for page->zone . It's a simple way of
+ * quickly looking up what zone a page belongs so. During init, the upper
+ * most 8 bits of page->flags will be used to store what zone we are in.
+ * See free_area_init_core . The macro page_zone returns the zone a page
+ * belongs to. See linux/include/mm.h
+ */
zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
EXPORT_SYMBOL(zone_table);

@@ -69,8 +82,23 @@
  * triggers coalescing into a block of larger size.
  *
  * -- wli
+ *
+ * There is a brief explanation of how a buddy algorithm works at
+ * http://www.memorymanagement.org/articles/alloc.html . A better idea
+ * is to read the explanation from a book like UNIX Internals by
+ * Uresh Vahalia
+ *
  */

+/**
+ *
+ * __free_pages_ok - Returns pages to the buddy allocator
+ * @page: The first page of the block to be freed
+ * @order: 2^order number of pages are freed
+ *
+ * This function returns the pages allocated by __alloc_pages and tries to
+ * merge buddies if possible. Do not call directly, use free_pages()
+ **/
static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));
static void __free_pages_ok (struct page *page, unsigned int order)
{
@@ -97,13 +125,29 @@
BUG();
page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));

+ /*
+ * If it is balance_classzone that is doing the freeing, the pages
+ * are to be kept for the process doing all the work freeing up pages
+ */
         if (current->flags & PF_FREE_PAGES)
                 goto local_freelist;
  back_local_freelist:

zone = page_zone(page);

+ /*
+ * Multiple uses for mask which defies a common name
+ * -mask == number of pages that will be freed
+ * page_idx & ~mask == Is page aligned to an order size?
+ * Also used later to calculate the address of a buddy
+ */
         mask = (~0UL) << order;
+
+ /*
+ * zone_mem_map = first page in the current zone
+ * page_idx = page offset within zone_mem_map
+ * index = bit offset within map representing the buddies
+ */
         base = zone->zone_mem_map;
         page_idx = page - base;
         if (page_idx & ~mask)
@@ -116,9 +160,16 @@

zone->free_pages -= mask;

+ /* Irregardless of order, this is 0 when MAX_ORDER is reached */
while (mask + (1 << (MAX_ORDER-1))) {
struct page *buddy1, *buddy2;

+ /*
+ * FIXME: This could only be true if order was originally set
+ * to be a value greater than MAX_ORDER. The
+ * sanity check should be at the beginning of the
+ * function.
+ */
                 if (area >= zone->free_area + MAX_ORDER)
                         BUG();
                 if (!__test_and_change_bit(index, area->map))
@@ -150,11 +201,23 @@
         return;

  local_freelist:
+ /* If the process has already freed pages for itself, don't give it
+ * more */
         if (current->nr_local_pages)
                 goto back_local_freelist;
+
+ /*
+ * An interrupt doesn't have a current process to store pages on.
+ *
+ * QUERY: is this not a dead check, an interrupt could only get here if
+ * alloc_pages took the slow path through balance_classzones. If an
+ * interrupt got there, aren't we already dead?
+ */
         if (in_interrupt())
                 goto back_local_freelist;

+ /* Add the page onto the local list, update the page information
+ * and return */
         list_add(&page->list, &current->local_pages);
         page->index = order;
         current->nr_local_pages++;
@@ -163,19 +226,49 @@
#define MARK_USED(index, order, area) \
         __change_bit((index) >> (1+(order)), (area)->map)

+/**
+ *
+ * expand - Break up higher order pages until the right size block is available
+ * @zone: The zone to free pages from
+ * @page: The first page of the first order to split
+ * @index: The page address index inside zone_mem_map
+ * @low: The order block of pages required
+ * @high: The order of the block of pages that are about to be split
+ * @area: The array of free areas for this zone
+ *
+ * This function will break up higher orders of pages necessary and update the
+ * bitmaps as it goes along. If it splits, the lower half will be put onto
+ * the free list and the high half will be either allocated or split
+ * further. This function is called from rmqueue() and not directly
+ *
+ * Note that index here is a page number offset within this zone. In other
+ * parts of the code, index means a bit offset within map.
+ **/
static inline struct page * expand (zone_t *zone, struct page *page,
unsigned long index, int low, int high, free_area_t * area)
{
unsigned long size = 1 << high;

+ /*
+ * If it turned out there was a free block at the right order to begin
+ * with, no splitting will take place
+ */
         while (high > low) {
                 if (BAD_RANGE(zone,page))
                         BUG();
+ /* Prepare to move to next area and size */
                 area--;
                 high--;
                 size >>= 1;
+
+ /*
+ * Add the page to the free list for the "lower" area. The
+ * high half will be split more if necessary
+ */
                 list_add(&(page)->list, &(area)->free_list);
                 MARK_USED(index, high, area);
+
+ /* Move to next page index and addres */
                 index += size;
                 page += size;
         }
@@ -184,6 +277,17 @@
         return page;
}

+/**
+ *
+ * rmqueue - Allocate page blocks of 2^order size via the buddy algorithm
+ * @zone: The zone to allocate from
+ * @order: The 2^order sized block to allocate
+ *
+ * This function is responsible for finding out what order of pages we
+ * have to go to to satisfy the request. For example if there is no
+ * page blocks free to satisfy order=0 (1 page), then see if there is
+ * a free block of order=1 that can be spilt into two order=0 pages
+ **/
static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
static struct page * rmqueue(zone_t *zone, unsigned int order)
{
@@ -198,18 +302,26 @@
head = &area->free_list;
curr = head->next;

+ /*
+ * If there is a free block of pages at the current order,
+ * split it up until we get the required order block of pages
+ * and allocate them
+ */
if (curr != head) {
unsigned int index;

+ /* Get the page and then remove from the freelist */
                         page = list_entry(curr, struct page, list);
                         if (BAD_RANGE(zone,page))
                                 BUG();
                         list_del(curr);
+
+ /* Toggle the bit representing these buddies */
                         index = page - zone->zone_mem_map;
                         if (curr_order != MAX_ORDER-1)
                                 MARK_USED(index, curr_order, area);
- zone->free_pages -= 1UL << order;

+ zone->free_pages -= 1UL << order;
page = expand(zone, page, index, order, curr_order, area);
spin_unlock_irqrestore(&zone->lock, flags);

@@ -220,10 +332,17 @@
                                 BUG();
                         if (PageActive(page))
                                 BUG();
+
                         return page;
                 }
+
+ /*
+ * There isn't pages ready at this order so examine a block of
+ * higher orders
+ */
                 curr_order++;
                 area++;
+
         } while (curr_order < MAX_ORDER);
         spin_unlock_irqrestore(&zone->lock, flags);

@@ -231,13 +350,40 @@
}

#ifndef CONFIG_DISCONTIGMEM
+/**
+ *
+ * _alloc_pages - Find zone to allocate from and call __alloc_pages
+ * @gfp_mask - Flags that determine the behaviour of the allocator
+ * @order - 2^order number of pages will be allocated in a block
+ *
+ * This is called by alloc_pages. It's only task is to identify the
+ * preferred set of zones to allocate from.
+ **/
struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
{
+ /*
+ * Clear the high bits to see if the allocation is from ZONE_DMA,
+ * ZONE_NORMAL or ZONE_HIGHMEM
+ */
return __alloc_pages(gfp_mask, order,
contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
}
#endif

+/**
+ *
+ * balance_classzone - Free page frames from a zone in a synchronous fashion
+ * @classzone: Zone to free from
+ * @gfp_mask: Flags which determine the behaviour of the allocator
+ * @order: It's a block of 2^order pages the caller is looking for
+ * @freed: Returns the number of total number of pages freed
+ *
+ * In a nutshell, this function does some of the work of kswapd in a synchronous
+ * fashion when there simply is too little memory to be waiting around. The
+ * caller will use this when it needs the memory and is willing to block on
+ * waiting for it.
+ *
+ **/
static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
{
@@ -250,6 +396,10 @@
BUG();

current->allocation_order = order;
+
+ /* These flags are set so that __free_pages_ok knows to return the
+ * pages directly to the process
+ */
current->flags |= PF_MEMALLOC | PF_FREE_PAGES;

__freed = try_to_free_pages(classzone, gfp_mask, order);
@@ -257,6 +407,7 @@
current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);

         if (current->nr_local_pages) {
+ /* If pages were freed */
                 struct list_head * entry, * local_pages;
                 struct page * tmp;
                 int nr_pages;
@@ -264,12 +415,21 @@
                 local_pages = &current->local_pages;

                 if (likely(__freed)) {
- /* pick from the last inserted so we're lifo */
+ /* pick from the last inserted so we're LIFO */
                         entry = local_pages->next;
                         do {
                                 tmp = list_entry(entry, struct page, list);
                                 if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
+ /*
+ * This is a block of pages that is of
+ * the correct size so remember it
+ */
                                         list_del(entry);
+
+ /*
+ *QUERY: if order > 0, wouldn't the
+ * nr_local_pages drop by more than 1?
+ */
                                         current->nr_local_pages--;
                                         set_page_count(tmp, 1);
                                         page = tmp;
@@ -295,7 +455,10 @@
                 }

                 nr_pages = current->nr_local_pages;
- /* free in reverse order so that the global order will be lifo */
+ /* free in reverse order so that the global order will be lifo
+ * The pages freed here are ones not of the order we are
+ * interested in for the moment
+ */
                 while ((entry = local_pages->prev) != local_pages) {
                         list_del(entry);
                         tmp = list_entry(entry, struct page, list);
@@ -310,8 +473,27 @@
         return page;
}

-/*
+/**
+ *
+ * __alloc_pages - Allocate pages in a block of size 2^order
+ * @gfp_mask: Flags for this allocation that determine behaviour of allocator
+ * @order: 2^order number of pages to allocate
+ * @zonelist: A list of zones to allocate from starting with the preferred one
+ *
  * This is the 'heart' of the zoned buddy allocator:
+ * There is a few paths the this will take to try and allocate the pages.
+ * is takes depends on what pages are available and what flags on gfp_mask
+ * are set. For instance, if the allocation is for an interrupt handler,
+ * __alloc_pages won't do anything that would block. Each block or attempt
+ * made gets progressively slower as the function executes.
+ *
+ * zonelist is the set of zones which determines the order of fallback if
+ * one zone is full. An allocation may be for ZONE_HIGHMEM, but if there
+ * is none available, ZONE_NORMAL may be used or possibly ZONE_DMA. see
+ * build_zonelist() .
+ *
+ * This function should not be called directly. Use either alloc_pages() or
+ * __get_free_pages()
  */
struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
{
@@ -320,11 +502,22 @@
         struct page * page;
         int freed;

+ /*
+ * zone is the preferred allocation zone. zone++ is a fallback one
+ * classzone is the first zone of the list. It's a "special"
+ * zone which keeps track of whether the whole needs to be
+ * balanced
+ * class_idx is which zone index within this pg_data_t zone is
+ */
         zone = zonelist->zones;
         classzone = *zone;
         if (classzone == NULL)
                 return NULL;
         min = 1UL << order;
+
+ /* Cycle through the zones and their fallbacks. Allocate from the zone
+ * if the allocation can be made without the low watermark been hit
+ */
         for (;;) {
                 zone_t *z = *(zone++);
                 if (!z)
@@ -338,11 +531,21 @@
                 }
         }

+ /* The watermarks.pages_low has been reached. Mark the zone set
+ * as needing balancing and wake up kswapd which will start work
+ * freeing pages in this classzone
+ */
+
         classzone->need_balance = 1;
         mb();
         if (waitqueue_active(&kswapd_wait))
                 wake_up_interruptible(&kswapd_wait);

+ /* Start again moving through the zones. This time it will allow the
+ * zone to reach watermarks.min number of free pages. It is hoped that
+ * kswapd will bring the number of pages above the watermarks again
+ * later
+ */
         zone = zonelist->zones;
         min = 1UL << order;
         for (;;) {
@@ -352,9 +555,15 @@
                         break;

                 local_min = z->pages_min;
+
+ /* If the caller can't wait, allow the zone to be pushed into
+ * a tighter memory position */
                 if (!(gfp_mask & __GFP_WAIT))
                         local_min >>= 2;
+
                 min += local_min;
+
+ /* If we are safe to allocate this, allocate the page */
                 if (z->free_pages > min) {
                         page = rmqueue(z, order);
                         if (page)
@@ -365,6 +574,13 @@
         /* here we're in the low on memory slow path */

rebalance:
+ /*
+ * PF_MEMALLOC if set if the calling process wants to be treated as a
+ * memory allocator, kswapd for example. This process is high priority
+ * and should be served if at all possible. in_interrupt() means we
+ * can't sleep no matter what. This next block will allocate the
+ * memory no matter what watermark is hit if possible
+ */
         if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
                 zone = zonelist->zones;
                 for (;;) {
@@ -383,10 +599,15 @@
         if (!(gfp_mask & __GFP_WAIT))
                 return NULL;

+ /* Do the work of kswapd in a synchronous fashion */
         page = balance_classzone(classzone, gfp_mask, order, &freed);
         if (page)
                 return page;

+ /*
+ * pages were freed by balance_classzone but not of the
+ * proper type. Cycle through in case a higher order was freed
+ */
         zone = zonelist->zones;
         min = 1UL << order;
         for (;;) {
@@ -413,8 +634,15 @@
         goto rebalance;
}

-/*
- * Common helper functions.
+/**
+ *
+ * __get_free_pages - Get a 2^order block of free pages
+ * @gfp_mask: Flags which determine the allocator behaviour
+ * @order: A block sized 2^order will be allocated
+ *
+ * This is the highest level function available for allocating a block of
+ * pages to the caller. Ultimately __alloc_pages() is called to use the
+ * buddy algorithm to retrieve a block of pages
  */
unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
{
@@ -423,9 +651,15 @@
         page = alloc_pages(gfp_mask, order);
         if (!page)
                 return 0;
+
         return (unsigned long) page_address(page);
}

+/**
+ *
+ * get_zerod_page - Allocates one page from the buddy allocator and zeros it
+ * @gfp_mask: Flags which determine the allocator behaviour
+ */
unsigned long get_zeroed_page(unsigned int gfp_mask)
{
struct page * page;
@@ -439,20 +673,43 @@
return 0;
}

+/**
+ *
+ * __free_pages - Sanity check before asking the buddy allocator to take pages
+ * @page: The first page of the block to free
+ * @order: Indicates the block size. size = 2^order
+ */
void __free_pages(struct page *page, unsigned int order)
{
+ /* QUERY: __free_pages_ok() does a load of sanity checks at the
+ * beginning of the function, would it not make more sense
+ * to lump them all together and have one function call?
+ */
if (!PageReserved(page) && put_page_testzero(page))
__free_pages_ok(page, order);
}

+/**
+ *
+ * free_pages - Free pages allocated by the buddy allocator
+ * addr: The address of the pages to free
+ * order: The block size to free
+ *
+ * This is the highest level function available for freeing pages allocated
+ * by the buddy allocator
+ */
void free_pages(unsigned long addr, unsigned int order)
{
if (addr != 0)
__free_pages(virt_to_page(addr), order);
}

-/*
- * Total amount of free (allocatable) RAM:
+/**
+ *
+ * nr_free_pages - Returns number of free pages in all zones
+ *
+ * This function walks through all zones and sums the free page frames in each
+ * of them.
*/
unsigned int nr_free_pages (void)
{
@@ -469,8 +726,12 @@
return sum;
}

-/*
- * Amount of free RAM allocatable as buffer memory:
+/**
+ *
+ * nr_free_buffer_pages - Amount of free RAM allocatable as buffer memory
+ *
+ * This steps through all the zones that are suitable for normal use and
+ * returns back the totals of "size-pages_high".
*/
unsigned int nr_free_buffer_pages (void)
{
@@ -496,6 +757,10 @@
}

#if CONFIG_HIGHMEM
+/**
+ *
+ * nr_free_highpages - Returns the number of free page frames in high memory
+ */
unsigned int nr_free_highpages (void)
{
pg_data_t *pgdat = pgdat_list;
@@ -509,6 +774,10 @@
}
#endif

+/*
+ * This macro will yield the total amount of RAM in kB
+ * addressed by x number of pages.
+ */
#define K(x) ((x) << (PAGE_SHIFT-10))

/*
@@ -526,6 +795,8 @@
K(nr_free_pages()),
K(nr_free_highpages()));

+ /* Step through all zones in all pgdats and print out the pertinent
+ * information about them */
         while (tmpdat) {
                 zone_t *zone;
                 for (zone = tmpdat->node_zones;
@@ -546,6 +817,10 @@
                nr_inactive_pages,
                nr_free_pages());

+ /* This steps through all the zones a second time and checks how
+ * many blocks of each 2^order block of pages. This helps determine
+ * how fragmented memory is
+ */
         for (type = 0; type < MAX_NR_ZONES; type++) {
                 struct list_head *head, *curr;
                 zone_t *zone = pgdat->node_zones + type;
@@ -582,7 +857,10 @@
}

/*
- * Builds allocation fallback zone lists.
+ * Builds allocation fallback zone lists. This determines what order zones
+ * should be used to take pages from if an allocation fails. For example,
+ * an allocation for HIGHMEM will fall to NORMAL if pages are not available
+ * and in turn fall to DMA.
*/
static inline void build_zonelists(pg_data_t *pgdat)
{

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: Mel: "[patch 2/5] NUMA document update"
Previous message: Richard B. Johnson: "Re: pppd[32497]: Unsupported protocol 'CallBack Control Protocol (CBCP)' (0xc029) received"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

This archive was generated by hypermail 2b29 : Wed Aug 07 2002 - 22:00:32 EST