Re: [RFC] Event counters [3/3]: Convert NUMA counters to eventcounters

From: Christoph Lameter
Date: Wed Dec 21 2005 - 17:24:22 EST


On Tue, 20 Dec 2005, Christoph Lameter wrote:

> no longer be available (maybe one could improvise one with the stats of
> the processors on the node?)

Here is such an alternate patch. It creates a per cpu vmstat in
/sys/devices/system/cpu/cpu<nr>/vmstat (omitting the zoned counters which
are per zone and therefore node specific).
This allows one to observe VM-related events occurring on a specific
processor.

It also preserves /sys/devices/system/node/node<nr>/numastat, which will
contain the summed-up event counters for all the processors on the node.

This will preserve the existing numa_miss and other_node fields and add
lots of other information that may be useful.

There are global numa_miss and other_node fields available in /proc/vmstat
with both of these patches. These fields may be helpful for quickly
checking whether allocation requests are being properly satisfied.

Index: linux-2.6.15-rc5-mm3/include/linux/mmzone.h
===================================================================
--- linux-2.6.15-rc5-mm3.orig/include/linux/mmzone.h 2005-12-20 13:15:44.000000000 -0800
+++ linux-2.6.15-rc5-mm3/include/linux/mmzone.h 2005-12-21 13:28:13.000000000 -0800
@@ -79,14 +79,6 @@ struct per_cpu_pageset {
s8 vm_stat_diff[NR_STAT_ITEMS];
#endif

-#ifdef CONFIG_NUMA
- unsigned long numa_hit; /* allocated in intended node */
- unsigned long numa_miss; /* allocated in non intended node */
- unsigned long numa_foreign; /* was intended here, hit elsewhere */
- unsigned long interleave_hit; /* interleaver prefered this zone */
- unsigned long local_node; /* allocation from local node */
- unsigned long other_node; /* allocation from other node */
-#endif
} ____cacheline_aligned_in_smp;

#ifdef CONFIG_NUMA
Index: linux-2.6.15-rc5-mm3/include/linux/page-flags.h
===================================================================
--- linux-2.6.15-rc5-mm3.orig/include/linux/page-flags.h 2005-12-20 14:55:00.000000000 -0800
+++ linux-2.6.15-rc5-mm3/include/linux/page-flags.h 2005-12-21 14:03:13.000000000 -0800
@@ -103,6 +103,9 @@ enum event_item { PGPGIN, PGPGOUT, PSWPI
FOR_ALL_ZONES(PGSCAN_DIRECT),
PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL,
PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+#ifdef CONFIG_NUMA
+ NUMA_MISS, NUMA_OFF_NODE,
+#endif
NR_EVENT_ITEMS
};

@@ -131,6 +134,8 @@ static inline void count_events(enum eve
__get_cpu_var(event_states).event[item] += delta;
}

+extern char *vmstat_text[];
+
#else
/* Disable counters */
#define get_cpu_events(e) 0L
Index: linux-2.6.15-rc5-mm3/mm/mempolicy.c
===================================================================
--- linux-2.6.15-rc5-mm3.orig/mm/mempolicy.c 2005-12-16 11:44:09.000000000 -0800
+++ linux-2.6.15-rc5-mm3/mm/mempolicy.c 2005-12-21 13:28:13.000000000 -0800
@@ -1072,15 +1072,9 @@ static struct page *alloc_page_interleav
unsigned nid)
{
struct zonelist *zl;
- struct page *page;

zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
- page = __alloc_pages(gfp, order, zl);
- if (page && page_zone(page) == zl->zones[0]) {
- zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
- put_cpu();
- }
- return page;
+ return __alloc_pages(gfp, order, zl);
}

/**
Index: linux-2.6.15-rc5-mm3/mm/page_alloc.c
===================================================================
--- linux-2.6.15-rc5-mm3.orig/mm/page_alloc.c 2005-12-20 15:46:51.000000000 -0800
+++ linux-2.6.15-rc5-mm3/mm/page_alloc.c 2005-12-21 13:40:34.000000000 -0800
@@ -989,24 +989,16 @@ void mark_free_pages(struct zone *zone)

#endif /* CONFIG_PM */

-static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
+static void zone_statistics(struct zonelist *zonelist, struct zone *z, int pages)
{
#ifdef CONFIG_NUMA
pg_data_t *pg = z->zone_pgdat;
- pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
- struct per_cpu_pageset *p;

- p = zone_pcp(z, cpu);
- if (pg == orig) {
- p->numa_hit++;
- } else {
- p->numa_miss++;
- zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
- }
- if (pg == NODE_DATA(numa_node_id()))
- p->local_node++;
- else
- p->other_node++;
+ if (pg != zonelist->zones[0]->zone_pgdat)
+ count_events(NUMA_MISS, pages);
+
+ if (pg != NODE_DATA(numa_node_id()))
+ count_events(NUMA_OFF_NODE, pages);
#endif
}

@@ -1098,7 +1090,7 @@ again:
}

count_zone_events(PGALLOC, zone, 1 << order);
- zone_statistics(zonelist, zone, cpu);
+ zone_statistics(zonelist, zone, 1 << order);
local_irq_restore(flags);
put_cpu();

@@ -2586,21 +2578,6 @@ static int zoneinfo_show(struct seq_file
pageset->pcp[j].high,
pageset->pcp[j].batch);
}
-#ifdef CONFIG_NUMA
- seq_printf(m,
- "\n numa_hit: %lu"
- "\n numa_miss: %lu"
- "\n numa_foreign: %lu"
- "\n interleave_hit: %lu"
- "\n local_node: %lu"
- "\n other_node: %lu",
- pageset->numa_hit,
- pageset->numa_miss,
- pageset->numa_foreign,
- pageset->interleave_hit,
- pageset->local_node,
- pageset->other_node);
-#endif
}
seq_printf(m,
"\n all_unreclaimable: %u"
@@ -2625,7 +2602,7 @@ struct seq_operations zoneinfo_op = {
.show = zoneinfo_show,
};

-static char *vmstat_text[] = {
+char *vmstat_text[] = {
/* Zoned VM counters */
"nr_mapped",
"nr_pagecache",
@@ -2681,7 +2658,11 @@ static char *vmstat_text[] = {
"pageoutrun",
"allocstall",

- "pgrotated"
+ "pgrotated",
+#ifdef CONFIG_NUMA
+ "numa_miss",
+ "other_node"
+#endif
#endif
};

Index: linux-2.6.15-rc5-mm3/drivers/base/node.c
===================================================================
--- linux-2.6.15-rc5-mm3.orig/drivers/base/node.c 2005-12-20 13:15:45.000000000 -0800
+++ linux-2.6.15-rc5-mm3/drivers/base/node.c 2005-12-21 14:01:38.000000000 -0800
@@ -93,41 +93,16 @@ static SYSDEV_ATTR(meminfo, S_IRUGO, nod

static ssize_t node_read_numastat(struct sys_device * dev, char * buf)
{
- unsigned long numa_hit, numa_miss, interleave_hit, numa_foreign;
- unsigned long local_node, other_node;
- int i, cpu;
- pg_data_t *pg = NODE_DATA(dev->id);
- numa_hit = 0;
- numa_miss = 0;
- interleave_hit = 0;
- numa_foreign = 0;
- local_node = 0;
- other_node = 0;
- for (i = 0; i < MAX_NR_ZONES; i++) {
- struct zone *z = &pg->node_zones[i];
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- struct per_cpu_pageset *ps = zone_pcp(z,cpu);
- numa_hit += ps->numa_hit;
- numa_miss += ps->numa_miss;
- numa_foreign += ps->numa_foreign;
- interleave_hit += ps->interleave_hit;
- local_node += ps->local_node;
- other_node += ps->other_node;
- }
- }
- return sprintf(buf,
- "numa_hit %lu\n"
- "numa_miss %lu\n"
- "numa_foreign %lu\n"
- "interleave_hit %lu\n"
- "local_node %lu\n"
- "other_node %lu\n",
- numa_hit,
- numa_miss,
- numa_foreign,
- interleave_hit,
- local_node,
- other_node);
+ unsigned long v[NR_EVENT_ITEMS];
+ int i;
+ char *p = buf;
+
+ sum_events(v, &node_to_cpumask(dev->id));
+
+ for (i = 0; i < NR_EVENT_ITEMS; i++)
+ p += sprintf(p, "%s %ld\n", vmstat_text[NR_STAT_ITEMS + i],
+ v[i]);
+ return p - buf;
}
static SYSDEV_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);

Index: linux-2.6.15-rc5-mm3/drivers/base/cpu.c
===================================================================
--- linux-2.6.15-rc5-mm3.orig/drivers/base/cpu.c 2005-12-16 11:44:06.000000000 -0800
+++ linux-2.6.15-rc5-mm3/drivers/base/cpu.c 2005-12-21 13:59:35.000000000 -0800
@@ -8,6 +8,7 @@
#include <linux/cpu.h>
#include <linux/topology.h>
#include <linux/device.h>
+#include <linux/page-flags.h>

#include "base.h"

@@ -83,6 +84,26 @@ static inline void register_cpu_control(
}
#endif /* CONFIG_HOTPLUG_CPU */

+static ssize_t cpu_read_vmstat(struct sys_device * dev, char * buf)
+{
+ int i;
+ char *p = buf;
+ cpumask_t mask;
+ struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+ unsigned long v[NR_EVENT_ITEMS];
+
+ cpus_clear(mask);
+ cpu_set(cpu->sysdev.id, mask);
+ sum_events(v, &mask);
+
+ for (i = 0; i < NR_EVENT_ITEMS; i++)
+ p += sprintf(p, "%s %ld\n",
+ vmstat_text[NR_STAT_ITEMS + i], v[i]);
+ return p - buf;
+}
+static SYSDEV_ATTR(vmstat, S_IRUGO, cpu_read_vmstat, NULL);
+
+
#ifdef CONFIG_KEXEC
#include <linux/kexec.h>

@@ -138,6 +159,7 @@ int __devinit register_cpu(struct cpu *c
if (!error)
error = sysdev_create_file(&cpu->sysdev, &attr_crash_notes);
#endif
+ sysdev_create_file(&cpu->sysdev, &attr_vmstat);
return error;
}

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/