Re: Purpose of the mm/slab.c changes

From: arjan@fenrus.demon.nl
Date: Sun Sep 09 2001 - 12:27:57 EST


In article <Pine.LNX.4.33.0109090952380.14365-100000@penguin.transmeta.com> you wrote:

> Case (b) implies that the page allocator might also find per-CPU LIFO
> queues in front of the actual allocator useful. That might certainly be
> worthwhile looking into - although it would also increase the risk of
> fragmentation.

Hi,

Ingo Molnar has already written such a patch; it is attached below, after
a short sketch of the idea. It yields a 1.5% improvement on a 4-way
machine running Oracle, and more for other applications (the Oracle test
is also I/O bound).

Greetings,
   Arjan van de Ven
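
The idea in a nutshell: each zone grows a small per-CPU LIFO stack of
order-0 pages that is consulted before the buddy lists, both when freeing
and when allocating, so hot pages stay with the CPU that last touched
them. A minimal user-space sketch of that structure (hypothetical names,
a single pool, no IRQ masking or zone locking):

#include <stdio.h>
#include <stdlib.h>

#define NCPUS          4
#define MAX_POOL_PAGES 64

struct page {
	struct page *next;
};

/* one LIFO stack of free order-0 pages per CPU */
struct cpu_pool {
	int nr_pages;
	struct page *head;
};

static struct cpu_pool pools[NCPUS];

/* stand-ins for the buddy allocator slow path */
static struct page *buddy_alloc(void)
{
	return malloc(sizeof(struct page));
}

static void buddy_free(struct page *page)
{
	free(page);
}

static struct page *alloc_page(int cpu)
{
	struct cpu_pool *pool = &pools[cpu];
	struct page *page;

	if (pool->nr_pages) {		/* fast path: pop the hottest page */
		page = pool->head;
		pool->head = page->next;
		pool->nr_pages--;
		return page;
	}
	return buddy_alloc();		/* miss: fall back to the buddy lists */
}

static void free_page(struct page *page, int cpu)
{
	struct cpu_pool *pool = &pools[cpu];

	if (pool->nr_pages < MAX_POOL_PAGES) {	/* fast path: push, stay cache-hot */
		page->next = pool->head;
		pool->head = page;
		pool->nr_pages++;
		return;
	}
	buddy_free(page);		/* pool full: really free it */
}

int main(void)
{
	struct page *page = alloc_page(0);

	free_page(page, 0);
	printf("cpu 0 pool now holds %d page(s)\n", pools[0].nr_pages);
	return 0;
}

The real patch additionally refuses to cache pages while there is a free
memory shortage, and masks interrupts around the fast paths instead of
taking the zone spinlock.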

diff -urN linux.org/include/linux/mmzone.h linux/include/linux/mmzone.h
--- linux.org/include/linux/mmzone.h Wed Aug 15 17:21:11 2001
+++ linux/include/linux/mmzone.h Tue Sep 4 08:12:37 2001
@@ -7,12 +7,13 @@
 #include <linux/config.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
+#include <linux/smp.h>
 
 /*
  * Free memory management - zoned buddy allocator.
  */
 
-#define MAX_ORDER 10
+#define MAX_ORDER 11
 
 typedef struct free_area_struct {
         struct list_head free_list;
@@ -21,6 +22,14 @@
 
 struct pglist_data;
 
+#define MAX_PER_CPU_PAGES 64
+
+typedef struct per_cpu_pages_s {
+	int nr_pages;
+	int max_nr_pages;
+	struct list_head head;
+} __attribute__((aligned(L1_CACHE_BYTES))) per_cpu_t;
+
 /*
  * On machines where it is needed (eg PCs) we divide physical memory
  * into multiple physical zones. On a PC we have 3 zones:
@@ -33,6 +42,7 @@
         /*
          * Commonly accessed fields:
          */
+	per_cpu_t cpu_pages [NR_CPUS];
         spinlock_t lock;
         unsigned long free_pages;
         unsigned long inactive_clean_pages;
diff -urN linux.org/mm/page_alloc.c linux/mm/page_alloc.c
--- linux.org/mm/page_alloc.c Thu Aug 16 12:43:02 2001
+++ linux/mm/page_alloc.c Tue Sep 4 08:12:37 2001
@@ -7,6 +7,7 @@
  * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
  * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
  * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
+ * Per-CPU page pool, Ingo Molnar, Red Hat, 2001
  */
 
 #include <linux/config.h>
@@ -67,8 +68,16 @@
         unsigned long index, page_idx, mask, flags;
         free_area_t *area;
         struct page *base;
+	per_cpu_t *per_cpu;
         zone_t *zone;
 
+	/*
+	 * This late check is safe because reserved pages do not
+	 * have a valid page->count. This trick avoids overhead
+	 * in __free_pages().
+	 */
+	if (PageReserved(page))
+		return;
         if (page->buffers)
                 BUG();
         if (page->mapping)
@@ -102,7 +111,18 @@
 
         area = zone->free_area + order;
 
-	spin_lock_irqsave(&zone->lock, flags);
+	per_cpu = zone->cpu_pages + smp_processor_id();
+
+	__save_flags(flags);
+	__cli();
+	if (!order && (per_cpu->nr_pages < per_cpu->max_nr_pages) && (!free_shortage())) {
+		list_add(&page->list, &per_cpu->head);
+		per_cpu->nr_pages++;
+		__restore_flags(flags);
+		return;
+	}
+
+	spin_lock(&zone->lock);
 
         zone->free_pages -= mask;
 
@@ -169,16 +189,35 @@
         return page;
 }
 
+
 static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order));
 static struct page * rmqueue(zone_t *zone, unsigned long order)
 {
+	per_cpu_t *per_cpu = zone->cpu_pages + smp_processor_id();
         free_area_t * area = zone->free_area + order;
         unsigned long curr_order = order;
         struct list_head *head, *curr;
         unsigned long flags;
         struct page *page;
+	int threshold=0;
 
-	spin_lock_irqsave(&zone->lock, flags);
+	if (!(current->flags & PF_MEMALLOC)) threshold = per_cpu->max_nr_pages/8;
+	__save_flags(flags);
+	__cli();
+
+	if (!order && (per_cpu->nr_pages>threshold)) {
+		if (list_empty(&per_cpu->head))
+			BUG();
+		page = list_entry(per_cpu->head.next, struct page, list);
+		list_del(&page->list);
+		per_cpu->nr_pages--;
+		__restore_flags(flags);
+
+		set_page_count(page, 1);
+		return page;
+	}
+
+	spin_lock(&zone->lock);
         do {
                 head = &area->free_list;
                 curr = memlist_next(head);
@@ -534,7 +573,7 @@
 
 void __free_pages(struct page *page, unsigned long order)
 {
-	if (!PageReserved(page) && put_page_testzero(page))
+	if (put_page_testzero(page))
                 __free_pages_ok(page, order);
 }
 
@@ -796,6 +835,7 @@
 
         offset = lmem_map - mem_map;
         for (j = 0; j < MAX_NR_ZONES; j++) {
+		int k;
                 zone_t *zone = pgdat->node_zones + j;
                 unsigned long mask;
                 unsigned long size, realsize;
@@ -807,6 +847,18 @@
                 printk("zone(%lu): %lu pages.\n", j, size);
                 zone->size = size;
                 zone->name = zone_names[j];
+
+		for (k = 0; k < NR_CPUS; k++) {
+			per_cpu_t *per_cpu = zone->cpu_pages + k;
+
+			INIT_LIST_HEAD(&per_cpu->head);
+			per_cpu->max_nr_pages = realsize / smp_num_cpus / 128;
+			if (per_cpu->max_nr_pages > MAX_PER_CPU_PAGES)
+				per_cpu->max_nr_pages = MAX_PER_CPU_PAGES;
+			else
+			if (!per_cpu->max_nr_pages)
+				per_cpu->max_nr_pages = 1;
+		}
                 zone->lock = SPIN_LOCK_UNLOCKED;
                 zone->zone_pgdat = pgdat;
                 zone->free_pages = 0;
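
A detail worth spelling out: in rmqueue() the pool is only tapped while
nr_pages is above a threshold, which is max_nr_pages/8 for ordinary
allocations and 0 for PF_MEMALLOC ones. With max_nr_pages at its 64-page
cap, a normal allocation stops using the pool once it drops to 8 pages,
leaving those for allocations made while the kernel is itself trying to
free memory.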
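
The boot-time sizing works out as max_nr_pages = realsize / smp_num_cpus
/ 128, clamped to the range [1, MAX_PER_CPU_PAGES]. Assuming 4 KB pages,
a 512 MB zone on a 4-way box gives 131072 / 4 / 128 = 256, clamped down
to 64 pages (256 KB) per CPU, while a 16 MB DMA zone gives 4096 / 4 / 128
= 8 pages per CPU.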