[PATCH v5 07/12] mm/vmstat: switch counter modification to cmpxchg

From: Marcelo Tosatti
Date: Mon Mar 13 2023 - 12:29:31 EST


In preparation for switching vmstat shepherd to flush
per-CPU counters remotely, switch the __{mod,inc,dec} functions that
modify the counters to use cmpxchg.

To facilitate reviewing, functions are ordered in the text file, as:

__{mod,inc,dec}_{zone,node}_page_state
#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
{mod,inc,dec}_{zone,node}_page_state
#else
{mod,inc,dec}_{zone,node}_page_state
#endif

This patch defines the __ versions for the
CONFIG_HAVE_CMPXCHG_LOCAL case to be their non-"__" counterparts:

#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
{mod,inc,dec}_{zone,node}_page_state
__{mod,inc,dec}_{zone,node}_page_state = {mod,inc,dec}_{zone,node}_page_state
#else
{mod,inc,dec}_{zone,node}_page_state
__{mod,inc,dec}_{zone,node}_page_state
#endif

To test the performance difference, a page allocator microbenchmark:
https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/mm/bench/page_bench01.c
with loops=1000000 was used, on Intel Core i7-11850H @ 2.50GHz.

For the single_page_alloc_free test, which does

/** Loop to measure **/
for (i = 0; i < rec->loops; i++) {
my_page = alloc_page(gfp_mask);
if (unlikely(my_page == NULL))
return 0;
__free_page(my_page);
}

Unit is cycles.

Vanilla Patched Diff
115.25 117 1.4%

Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx>

Index: linux-vmstat-remote/mm/vmstat.c
===================================================================
--- linux-vmstat-remote.orig/mm/vmstat.c
+++ linux-vmstat-remote/mm/vmstat.c
@@ -334,6 +334,188 @@ void set_pgdat_percpu_threshold(pg_data_
}
}

+#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
+/*
+ * If we have cmpxchg_local support then we do not need to incur the overhead
+ * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
+ *
+ * mod_state() modifies the zone counter state through atomic per cpu
+ * operations.
+ *
+ * Overstep mode specifies how overstep should handled:
+ * 0 No overstepping
+ * 1 Overstepping half of threshold
+ * -1 Overstepping minus half of threshold
+ */
+static inline void mod_zone_state(struct zone *zone, enum zone_stat_item item,
+ long delta, int overstep_mode)
+{
+ struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ long o, n, t, z;
+
+ do {
+ z = 0; /* overflow to zone counters */
+
+ /*
+ * The fetching of the stat_threshold is racy. We may apply
+ * a counter threshold to the wrong the cpu if we get
+ * rescheduled while executing here. However, the next
+ * counter update will apply the threshold again and
+ * therefore bring the counter under the threshold again.
+ *
+ * Most of the time the thresholds are the same anyways
+ * for all cpus in a zone.
+ */
+ t = this_cpu_read(pcp->stat_threshold);
+
+ o = this_cpu_read(*p);
+ n = delta + o;
+
+ if (abs(n) > t) {
+ int os = overstep_mode * (t >> 1);
+
+ /* Overflow must be added to zone counters */
+ z = n + os;
+ n = -os;
+ }
+ } while (this_cpu_cmpxchg(*p, o, n) != o);
+
+ if (z)
+ zone_page_state_add(z, zone, item);
+}
+
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+ long delta)
+{
+ mod_zone_state(zone, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
+void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+ long delta)
+{
+ mod_zone_state(zone, item, delta, 0);
+}
+EXPORT_SYMBOL(__mod_zone_page_state);
+
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ mod_zone_state(page_zone(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_zone_page_state);
+
+void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ mod_zone_state(page_zone(page), item, 1, 1);
+}
+EXPORT_SYMBOL(__inc_zone_page_state);
+
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ mod_zone_state(page_zone(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_zone_page_state);
+
+void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ mod_zone_state(page_zone(page), item, -1, -1);
+}
+EXPORT_SYMBOL(__dec_zone_page_state);
+
+static inline void mod_node_state(struct pglist_data *pgdat,
+ enum node_stat_item item,
+ int delta, int overstep_mode)
+{
+ struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+ s8 __percpu *p = pcp->vm_node_stat_diff + item;
+ long o, n, t, z;
+
+ if (vmstat_item_in_bytes(item)) {
+ /*
+ * Only cgroups use subpage accounting right now; at
+ * the global level, these items still change in
+ * multiples of whole pages. Store them as pages
+ * internally to keep the per-cpu counters compact.
+ */
+ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
+ delta >>= PAGE_SHIFT;
+ }
+
+ do {
+ z = 0; /* overflow to node counters */
+
+ /*
+ * The fetching of the stat_threshold is racy. We may apply
+ * a counter threshold to the wrong the cpu if we get
+ * rescheduled while executing here. However, the next
+ * counter update will apply the threshold again and
+ * therefore bring the counter under the threshold again.
+ *
+ * Most of the time the thresholds are the same anyways
+ * for all cpus in a node.
+ */
+ t = this_cpu_read(pcp->stat_threshold);
+
+ o = this_cpu_read(*p);
+ n = delta + o;
+
+ if (abs(n) > t) {
+ int os = overstep_mode * (t >> 1);
+
+ /* Overflow must be added to node counters */
+ z = n + os;
+ n = -os;
+ }
+ } while (this_cpu_cmpxchg(*p, o, n) != o);
+
+ if (z)
+ node_page_state_add(z, pgdat, item);
+}
+
+void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+ long delta)
+{
+ mod_node_state(pgdat, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_node_page_state);
+
+void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+ long delta)
+{
+ mod_node_state(pgdat, item, delta, 0);
+}
+EXPORT_SYMBOL(__mod_node_page_state);
+
+void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+ mod_node_state(pgdat, item, 1, 1);
+}
+
+void inc_node_page_state(struct page *page, enum node_stat_item item)
+{
+ mod_node_state(page_pgdat(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_node_page_state);
+
+void __inc_node_page_state(struct page *page, enum node_stat_item item)
+{
+ mod_node_state(page_pgdat(page), item, 1, 1);
+}
+EXPORT_SYMBOL(__inc_node_page_state);
+
+void dec_node_page_state(struct page *page, enum node_stat_item item)
+{
+ mod_node_state(page_pgdat(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_node_page_state);
+
+void __dec_node_page_state(struct page *page, enum node_stat_item item)
+{
+ mod_node_state(page_pgdat(page), item, -1, -1);
+}
+EXPORT_SYMBOL(__dec_node_page_state);
+#else
/*
* For use when we know that interrupts are disabled,
* or when we know that preemption is disabled and that
@@ -541,149 +723,6 @@ void __dec_node_page_state(struct page *
}
EXPORT_SYMBOL(__dec_node_page_state);

-#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
-/*
- * If we have cmpxchg_local support then we do not need to incur the overhead
- * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
- *
- * mod_state() modifies the zone counter state through atomic per cpu
- * operations.
- *
- * Overstep mode specifies how overstep should handled:
- * 0 No overstepping
- * 1 Overstepping half of threshold
- * -1 Overstepping minus half of threshold
-*/
-static inline void mod_zone_state(struct zone *zone,
- enum zone_stat_item item, long delta, int overstep_mode)
-{
- struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
- s8 __percpu *p = pcp->vm_stat_diff + item;
- long o, n, t, z;
-
- do {
- z = 0; /* overflow to zone counters */
-
- /*
- * The fetching of the stat_threshold is racy. We may apply
- * a counter threshold to the wrong the cpu if we get
- * rescheduled while executing here. However, the next
- * counter update will apply the threshold again and
- * therefore bring the counter under the threshold again.
- *
- * Most of the time the thresholds are the same anyways
- * for all cpus in a zone.
- */
- t = this_cpu_read(pcp->stat_threshold);
-
- o = this_cpu_read(*p);
- n = delta + o;
-
- if (abs(n) > t) {
- int os = overstep_mode * (t >> 1) ;
-
- /* Overflow must be added to zone counters */
- z = n + os;
- n = -os;
- }
- } while (this_cpu_cmpxchg(*p, o, n) != o);
-
- if (z)
- zone_page_state_add(z, zone, item);
-}
-
-void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
- long delta)
-{
- mod_zone_state(zone, item, delta, 0);
-}
-EXPORT_SYMBOL(mod_zone_page_state);
-
-void inc_zone_page_state(struct page *page, enum zone_stat_item item)
-{
- mod_zone_state(page_zone(page), item, 1, 1);
-}
-EXPORT_SYMBOL(inc_zone_page_state);
-
-void dec_zone_page_state(struct page *page, enum zone_stat_item item)
-{
- mod_zone_state(page_zone(page), item, -1, -1);
-}
-EXPORT_SYMBOL(dec_zone_page_state);
-
-static inline void mod_node_state(struct pglist_data *pgdat,
- enum node_stat_item item, int delta, int overstep_mode)
-{
- struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
- s8 __percpu *p = pcp->vm_node_stat_diff + item;
- long o, n, t, z;
-
- if (vmstat_item_in_bytes(item)) {
- /*
- * Only cgroups use subpage accounting right now; at
- * the global level, these items still change in
- * multiples of whole pages. Store them as pages
- * internally to keep the per-cpu counters compact.
- */
- VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
- delta >>= PAGE_SHIFT;
- }
-
- do {
- z = 0; /* overflow to node counters */
-
- /*
- * The fetching of the stat_threshold is racy. We may apply
- * a counter threshold to the wrong the cpu if we get
- * rescheduled while executing here. However, the next
- * counter update will apply the threshold again and
- * therefore bring the counter under the threshold again.
- *
- * Most of the time the thresholds are the same anyways
- * for all cpus in a node.
- */
- t = this_cpu_read(pcp->stat_threshold);
-
- o = this_cpu_read(*p);
- n = delta + o;
-
- if (abs(n) > t) {
- int os = overstep_mode * (t >> 1) ;
-
- /* Overflow must be added to node counters */
- z = n + os;
- n = -os;
- }
- } while (this_cpu_cmpxchg(*p, o, n) != o);
-
- if (z)
- node_page_state_add(z, pgdat, item);
-}
-
-void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
- long delta)
-{
- mod_node_state(pgdat, item, delta, 0);
-}
-EXPORT_SYMBOL(mod_node_page_state);
-
-void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
-{
- mod_node_state(pgdat, item, 1, 1);
-}
-
-void inc_node_page_state(struct page *page, enum node_stat_item item)
-{
- mod_node_state(page_pgdat(page), item, 1, 1);
-}
-EXPORT_SYMBOL(inc_node_page_state);
-
-void dec_node_page_state(struct page *page, enum node_stat_item item)
-{
- mod_node_state(page_pgdat(page), item, -1, -1);
-}
-EXPORT_SYMBOL(dec_node_page_state);
-#else
/*
* Use interrupt disable to serialize counter updates
*/
Index: linux-vmstat-remote/mm/page_alloc.c
===================================================================
--- linux-vmstat-remote.orig/mm/page_alloc.c
+++ linux-vmstat-remote/mm/page_alloc.c
@@ -8628,9 +8628,6 @@ static int page_alloc_cpu_dead(unsigned
/*
* Zero the differential counters of the dead processor
* so that the vm statistics are consistent.
- *
- * This is only okay since the processor is dead and cannot
- * race with what we are doing.
*/
cpu_vm_stats_fold(cpu);