[RFC PATCH] cgroup: introduce usage expansion for memcg

From: zhaoyang.huang
Date: Wed Mar 23 2022 - 05:34:24 EST


From: Zhaoyang Huang <zhaoyang.huang@xxxxxxxxxx>

Some memcgs want their memory usage to be kept for a certain period of time
and to be released once that period has expired. So we introduce a method that
expands the usage when calculating the memcg's protection.

Signed-off-by: Zhaoyang Huang <zhaoyang.huang@xxxxxxxxxx>
---
include/linux/memcontrol.h | 64 ++++++++++++++++++++++++++++++++++++++++++++++
mm/memcontrol.c | 7 +++++
mm/vmscan.c | 4 +++
3 files changed, 75 insertions(+)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0c5c403..3c7a2e4 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -21,6 +21,8 @@
#include <linux/vmstat.h>
#include <linux/writeback.h>
#include <linux/page-flags.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/clock.h>

struct mem_cgroup;
struct obj_cgroup;
@@ -28,6 +30,11 @@
struct mm_struct;
struct kmem_cache;

+#define MEMCG_INTERVAL (2*HZ+1) /* 2 sec interval in jiffies, plus one tick */
+#define EXP_10s 1677 /* 2048/exp(2s/10s), fixed-point with 2048 == 1.0 */
+#define EXP_60s 1981 /* 2048/exp(2s/60s) */
+#define EXP_300s 2034 /* 2048/exp(2s/300s) */
+
/* Cgroup-specific page state, on top of universal node page state */
enum memcg_stat_item {
MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
@@ -340,6 +347,12 @@ struct mem_cgroup {
struct deferred_split deferred_split_queue;
#endif

+ u64 avg_next_update;
+ u64 avg_last_update;
+ u64 prot_period;
+ struct page_counter memory_latest;
+ bool allow_expand;
+
struct mem_cgroup_per_node *nodeinfo[];
};

@@ -608,6 +621,57 @@ static inline bool mem_cgroup_disabled(void)
return !cgroup_subsys_enabled(memory_cgrp_subsys);
}

+/*
+ * Expand @group's usage linearly over time for the protection calculation.
+ *
+ * Returns 0 when usage is 0, when no deadline was armed yet, or before
+ * avg_next_update (sched_clock() ns); otherwise the usage, grown by
+ * usage * delta_time / 2^35ns (~34s) unless it rose since memory_latest.
+ * NOTE(review): the caller in this patch assigns the result straight to
+ * its usage, so a 0 return appears to disable protection - confirm intent.
+ */
+static inline unsigned long calc_expanded_usage(struct mem_cgroup *group)
+{
+	u64 now, decay_factor;
+	u64 usage_expanded;
+	s64 growth, usage, last_usage;
+	u64 delta_time;
+
+	usage = page_counter_read(&group->memory);
+	last_usage = page_counter_read(&group->memory_latest);
+	growth = usage - last_usage;
+	usage_expanded = (unsigned long)usage;
+	now = sched_clock();
+
+	if (!usage || !group->avg_next_update) {
+		group->avg_next_update = now + group->prot_period;
+		return 0;
+	}
+
+	/*
+	 * Timestamps are 64-bit nanoseconds; compare at full width instead of
+	 * truncating to unsigned long, which wraps every ~4.3s on 32-bit.
+	 */
+	if (time_before64(now, group->avg_next_update))
+		return 0;
+
+	/* Expand only while usage is stable or shrinking, never while growing. */
+	if (growth <= 0) {
+		delta_time = group->avg_last_update ? now - group->avg_last_update : 0;
+		/*
+		 * Fixed point with 2048 as "1": delta_time >> 24 equals
+		 * 1024 * delta_time / 2^34ns (~17s), so dividing by 2048 below
+		 * adds usage * delta_time / 2^35ns (~34s).
+		 */
+		decay_factor = delta_time >> (34 - 10);
+		usage_expanded += usage * decay_factor / 2048;
+		group->avg_last_update = now;
+		group->avg_next_update = now + jiffies_to_nsecs(2*HZ);
+	}
+	atomic_long_set(&group->memory_latest.usage, usage);
+	return usage_expanded;
+}
+
static inline void mem_cgroup_protection(struct mem_cgroup *root,
struct mem_cgroup *memcg,
unsigned long *min,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 508bcea..0e7b5b0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6616,6 +6616,7 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
{
unsigned long usage, parent_usage;
struct mem_cgroup *parent;
+ unsigned long growth;

if (mem_cgroup_disabled())
return;
@@ -6637,6 +6638,12 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
if (!usage)
return;

+ /*
+ * expand the usage by the time if it is allowed
+ */
+ if (memcg->allow_expand)
+ usage = calc_expanded_usage(memcg);
+
parent = parent_mem_cgroup(memcg);
/* No parent means a non-hierarchical mode on v1 memcg */
if (!parent)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ef4a6dc..ea56b5d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3101,8 +3101,12 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
* Hard protection.
* If there is no reclaimable memory, OOM.
*/
+ atomic_long_set(&memcg->memory_latest.usage,
+ page_counter_read(&memcg->memory));
continue;
} else if (mem_cgroup_below_low(memcg)) {
+ atomic_long_set(&memcg->memory_latest.usage,
+ page_counter_read(&memcg->memory));
/*
* Soft protection.
* Respect the protection only as long as
--
1.9.1