[v2] mm: make allocation counters per-order

From: Roman Gushchin
Date: Mon Jul 03 2017 - 14:02:49 EST


High-order allocations are obviously more costly, and it's very useful
to know how many of them happen, if there are any issues
(or suspicions) with memory fragmentation.

This commit changes existing per-zone allocation counters to be
per-zone per-order. These counters are displayed using a new
procfs interface (similar to /proc/buddyinfo):

$ cat /proc/allocinfo
DMA 0 0 0 0 0 \
0 0 0 0 0 0
DMA32 3 0 1 0 0 \
0 0 0 0 0 0
Normal 4997056 23594 10902 23686 931 \
23 122 786 17 1 0
Movable 0 0 0 0 0 \
0 0 0 0 0 0
Device 0 0 0 0 0 \
0 0 0 0 0 0

The existing vmstat interface remains untouched*, and still shows
the total number of single page allocations, so high-order allocations
are represented as a corresponding number of order-0 allocations.

$ cat /proc/vmstat | grep alloc
pgalloc_dma 0
pgalloc_dma32 7
pgalloc_normal 5461660
pgalloc_movable 0
pgalloc_device 0

* I've added device zone for consistency with other zones,
and to avoid messy exclusion of this zone in the code.

v2:
The functionality can be enabled/disabled by the PER_ORDER_ALLOC_COUNTERS
config option.

Signed-off-by: Roman Gushchin <guro@xxxxxx>
Suggested-by: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Debabrata Banerjee <dbavatar@xxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxx>
Cc: Vladimir Davydov <vdavydov.dev@xxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: kernel-team@xxxxxx
Cc: linux-mm@xxxxxxxxx
Cc: linux-kernel@xxxxxxxxxxxxxxx
---
arch/s390/appldata/appldata_mem.c | 16 +++++
include/linux/mmzone.h | 2 +
include/linux/vm_event_item.h | 27 ++++++--
include/linux/vmstat.h | 20 ++++++
init/Kconfig | 9 +++
mm/page_alloc.c | 11 +++-
mm/vmstat.c | 128 +++++++++++++++++++++++++++++++++++---
7 files changed, 199 insertions(+), 14 deletions(-)

diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c
index 598df57..79679d3 100644
--- a/arch/s390/appldata/appldata_mem.c
+++ b/arch/s390/appldata/appldata_mem.c
@@ -66,6 +66,21 @@ struct appldata_mem_data {

} __packed;

+#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
+/* Fold per-order allocation events (order >= 1) into *pgalloc as page counts. */
+static inline void sum_pgalloc_events(u64 *pgalloc, unsigned long *ev)
+{
+ int order;
+
+ for (order = 1; order < MAX_ORDER; ++order) {
+ *pgalloc += ev[PGALLOC_NORMAL + order * MAX_NR_ZONES] << order;
+ *pgalloc += ev[PGALLOC_DMA + order * MAX_NR_ZONES] << order;
+ }
+}
+#else
+static inline void sum_pgalloc_events(u64 *pgalloc, unsigned long *ev)
+{
+}
+#endif

/*
* appldata_get_mem_data()
@@ -92,6 +107,7 @@ static void appldata_get_mem_data(void *data)
mem_data->pswpout = ev[PSWPOUT];
mem_data->pgalloc = ev[PGALLOC_NORMAL];
mem_data->pgalloc += ev[PGALLOC_DMA];
+ sum_pgalloc_events(&mem_data->pgalloc, ev);
mem_data->pgfault = ev[PGFAULT];
mem_data->pgmajfault = ev[PGMAJFAULT];

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fc14b8b..406dfc4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -66,6 +66,8 @@ enum migratetype {
/* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
extern char * const migratetype_names[MIGRATE_TYPES];

+extern const char *zone_name(int idx);
+
#ifdef CONFIG_CMA
# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
# define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 37e8d31..da94618 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -19,12 +19,31 @@
#define HIGHMEM_ZONE(xx)
#endif

-#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE
+#ifdef CONFIG_ZONE_DEVICE
+#define DEVICE_ZONE(xx) xx##_DEVICE,
+#else
+#define DEVICE_ZONE(xx)
+#endif
+
+#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE, DEVICE_ZONE(xx)
+
+#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
+#define PGALLOC_EVENTS_SIZE (MAX_NR_ZONES * MAX_ORDER)
+#define PGALLOC_EVENTS_CUT_SIZE (MAX_NR_ZONES * (MAX_ORDER - 1))
+#define PGALLOC_FIRST_ZONE (PGALLOC_NORMAL - ZONE_NORMAL)
+#else
+#define PGALLOC_EVENTS_SIZE MAX_NR_ZONES
+#define PGALLOC_EVENTS_CUT_SIZE 0
+#define PGALLOC_FIRST_ZONE (PGALLOC_NORMAL - ZONE_NORMAL)
+#endif

enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
- FOR_ALL_ZONES(PGALLOC),
- FOR_ALL_ZONES(ALLOCSTALL),
- FOR_ALL_ZONES(PGSCAN_SKIP),
+ FOR_ALL_ZONES(PGALLOC)
+#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
+ __PGALLOC_LAST = PGALLOC_FIRST_ZONE + PGALLOC_EVENTS_SIZE - 1,
+#endif
+ FOR_ALL_ZONES(ALLOCSTALL)
+ FOR_ALL_ZONES(PGSCAN_SKIP)
PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE,
PGFAULT, PGMAJFAULT,
PGLAZYFREED,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index b3d85f3..bca96fc 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -103,6 +103,26 @@ static inline void vm_events_fold_cpu(int cpu)
#define __count_zid_vm_events(item, zid, delta) \
__count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)

+#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
+static inline void __count_alloc_event(enum zone_type zid, unsigned int order)
+{
+ enum vm_event_item item;
+
+ if (unlikely(order >= MAX_ORDER)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ item = PGALLOC_FIRST_ZONE + order * MAX_NR_ZONES + zid;
+ __count_vm_events(item, 1);
+}
+#else
+static inline void __count_alloc_event(enum zone_type zid, unsigned int order)
+{
+ __count_zid_vm_events(PGALLOC, zid, 1 << order);
+}
+#endif
+
/*
* Zone and node-based page accounting with per cpu differentials.
*/
diff --git a/init/Kconfig b/init/Kconfig
index 8514b25..164d6f0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1477,6 +1477,15 @@ config VM_EVENT_COUNTERS
on EXPERT systems. /proc/vmstat will only show page counts
if VM event counters are disabled.

+config PER_ORDER_ALLOC_COUNTERS
+ bool "Per-order memory allocation counters"
+ depends on VM_EVENT_COUNTERS && PROC_FS
+ help
+ This option enables splitting per-zone allocation counters
+ into per-zone per-order counters.
+ Per-order counters are exported using the /proc/allocinfo
+ interface, and /proc/vmstat shows accumulated values.
+
config SLUB_DEBUG
default y
bool "Enable SLUB debugging support" if EXPERT
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 80e4adb..e74b327 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -233,6 +233,13 @@ static char * const zone_names[MAX_NR_ZONES] = {
#endif
};

+/* Return the printable name for a zone index, or NULL if out of range. */
+const char *zone_name(int zid)
+{
+ if (zid >= 0 && zid < MAX_NR_ZONES)
+ return zone_names[zid];
+ return NULL;
+}
+
char * const migratetype_names[MIGRATE_TYPES] = {
"Unmovable",
"Movable",
@@ -2779,7 +2786,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
list = &pcp->lists[migratetype];
page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
if (page) {
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+ __count_alloc_event(page_zonenum(page), order);
zone_statistics(preferred_zone, zone);
}
local_irq_restore(flags);
@@ -2827,7 +2834,7 @@ struct page *rmqueue(struct zone *preferred_zone,
__mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page));

- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+ __count_alloc_event(page_zonenum(page), order);
zone_statistics(preferred_zone, zone);
local_irq_restore(flags);

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9a4441b..1d468ed 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -27,6 +27,7 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
+#include <linux/mmzone.h>

#include "internal.h"

@@ -34,18 +35,18 @@
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

-static void sum_vm_events(unsigned long *ret)
+static void sum_vm_events(unsigned long *ret, int off, size_t nr_events)
{
int cpu;
int i;

- memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
+ memset(ret, 0, nr_events * sizeof(unsigned long));

for_each_online_cpu(cpu) {
struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

- for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- ret[i] += this->event[i];
+ for (i = 0; i < nr_events; i++)
+ ret[i] += this->event[off + i];
}
}

@@ -57,7 +58,7 @@ static void sum_vm_events(unsigned long *ret)
void all_vm_events(unsigned long *ret)
{
get_online_cpus();
- sum_vm_events(ret);
+ sum_vm_events(ret, 0, NR_VM_EVENT_ITEMS);
put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);
@@ -915,8 +916,15 @@ int fragmentation_index(struct zone *zone, unsigned int order)
#define TEXT_FOR_HIGHMEM(xx)
#endif

+#ifdef CONFIG_ZONE_DEVICE
+#define TEXT_FOR_DEVICE(xx) xx "_device",
+#else
+#define TEXT_FOR_DEVICE(xx)
+#endif
+
#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
- TEXT_FOR_HIGHMEM(xx) xx "_movable",
+ TEXT_FOR_HIGHMEM(xx) xx "_movable", \
+ TEXT_FOR_DEVICE(xx)

const char * const vmstat_text[] = {
/* enum zone_stat_item countes */
@@ -1480,12 +1488,92 @@ enum writeback_stat_item {
NR_VM_WRITEBACK_STAT_ITEMS,
};

+#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
+static void sum_alloc_events(unsigned long *v)
+{
+ int zid, order, index;
+
+ for (zid = 0; zid < MAX_NR_ZONES; ++zid) {
+ for (order = 1; order < MAX_ORDER; order++) {
+ index = PGALLOC_FIRST_ZONE + zid;
+ v[index] += v[index + order * MAX_NR_ZONES] << order;
+ }
+ }
+}
+
+static int allocinfo_show(struct seq_file *m, void *arg)
+{
+ unsigned long allocs[PGALLOC_EVENTS_SIZE];
+ unsigned int order;
+ int zid;
+
+ if (arg != SEQ_START_TOKEN)
+ return 0;
+
+ get_online_cpus();
+ sum_vm_events(allocs, PGALLOC_FIRST_ZONE, PGALLOC_EVENTS_SIZE);
+ put_online_cpus();
+
+ for (zid = 0; zid < MAX_NR_ZONES; ++zid) {
+ seq_printf(m, "%8s ", zone_name(zid));
+
+ for (order = 0; order < MAX_ORDER; order++)
+ seq_printf(m, "%10lu ",
+ allocs[zid + order * MAX_NR_ZONES]);
+
+ seq_putc(m, '\n');
+ }
+
+ return 0;
+}
+
+static void *allocinfo_start(struct seq_file *m, loff_t *pos)
+{
+ if (*pos)
+ return NULL;
+ return SEQ_START_TOKEN;
+}
+
+static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+ ++*pos;
+ return NULL;
+}
+
+static void allocinfo_stop(struct seq_file *m, void *arg)
+{
+}
+
+static const struct seq_operations allocinfo_op = {
+ .start = allocinfo_start,
+ .next = allocinfo_next,
+ .stop = allocinfo_stop,
+ .show = allocinfo_show,
+};
+
+static int allocinfo_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &allocinfo_op);
+}
+
+static const struct file_operations allocinfo_file_operations = {
+ .open = allocinfo_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+#else
+static void sum_alloc_events(unsigned long *v)
+{
+}
+#endif
+
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
unsigned long *v;
int i, stat_items_size;

- if (*pos >= ARRAY_SIZE(vmstat_text))
+ if (*pos >= ARRAY_SIZE(vmstat_text) + PGALLOC_EVENTS_CUT_SIZE)
return NULL;
stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
@@ -1513,6 +1601,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)

#ifdef CONFIG_VM_EVENT_COUNTERS
all_vm_events(v);
+ sum_alloc_events(v);
v[PGPGIN] /= 2; /* sectors -> kbytes */
v[PGPGOUT] /= 2;
#endif
@@ -1521,8 +1610,16 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
+ int alloc_event_start = NR_VM_ZONE_STAT_ITEMS +
+ NR_VM_NODE_STAT_ITEMS +
+ NR_VM_WRITEBACK_STAT_ITEMS +
+ PGALLOC_FIRST_ZONE;
+
(*pos)++;
- if (*pos >= ARRAY_SIZE(vmstat_text))
+ if (*pos == alloc_event_start + MAX_NR_ZONES)
+ *(pos) += PGALLOC_EVENTS_CUT_SIZE;
+
+ if (*pos >= ARRAY_SIZE(vmstat_text) + PGALLOC_EVENTS_CUT_SIZE)
return NULL;
return (unsigned long *)m->private + *pos;
}
@@ -1531,6 +1628,18 @@ static int vmstat_show(struct seq_file *m, void *arg)
{
unsigned long *l = arg;
unsigned long off = l - (unsigned long *)m->private;
+ int alloc_event_start = NR_VM_ZONE_STAT_ITEMS +
+ NR_VM_NODE_STAT_ITEMS +
+ NR_VM_WRITEBACK_STAT_ITEMS +
+ PGALLOC_FIRST_ZONE;
+
+ if (off >= alloc_event_start + PGALLOC_EVENTS_SIZE)
+ off -= PGALLOC_EVENTS_CUT_SIZE;
+
+ if (unlikely(off >= ARRAY_SIZE(vmstat_text))) {
+ WARN_ON_ONCE(1);
+ return 0;
+ }

seq_puts(m, vmstat_text[off]);
seq_put_decimal_ull(m, " ", *l);
@@ -1790,6 +1899,9 @@ void __init init_mm_internals(void)
#endif
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations);
+#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
+ proc_create("allocinfo", 0444, NULL, &allocinfo_file_operations);
+#endif
proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations);
proc_create("vmstat", 0444, NULL, &vmstat_file_operations);
proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations);
--
2.7.4