[RFC PATCH v2 1/4] sched, cpuset: Introduce infrastructure of group balancer

From: Tianchen Ding
Date: Tue Mar 08 2022 - 04:26:51 EST


Introduce the CONFIG option and core data structures for the group balancer.
Add a few interfaces on cpuset and debugfs to read&write params.

In detail:
cgroup:
cpuset.gb.period_us:
To control work period.
Write a positive number to enable group balancer
for all descendants EXCEPT itself.
Write 0 to disable it.
MAX 1000000. (1s)
Can only write after partition info set.

cpuset.gb.partition:
Partition info used by group balancer.
Tasks in child cgroups (EXCLUDING this cgroup's own tasks) will try to
settle at one of the given partitions when gb is enabled.

/sys/kernel/debug/sched/group_balancer/:
settle_period_ms:
Default settle period for each group.

settle_period_max_ms:
Max settle period for each group.

global_settle_period_ms:
Global settle period for all groups.
For any single group, a settle operation must satisfy
both the group's own period and the global period.

Signed-off-by: Tianchen Ding <dtcccc@xxxxxxxxxxxxxxxxx>
---
include/linux/sched/gb.h | 29 +++++++++++
init/Kconfig | 12 +++++
kernel/cgroup/cpuset.c | 109 +++++++++++++++++++++++++++++++++++++++
kernel/sched/Makefile | 1 +
kernel/sched/debug.c | 10 +++-
kernel/sched/gb.c | 14 +++++
kernel/sched/sched.h | 4 ++
7 files changed, 178 insertions(+), 1 deletion(-)
create mode 100644 include/linux/sched/gb.h
create mode 100644 kernel/sched/gb.c

diff --git a/include/linux/sched/gb.h b/include/linux/sched/gb.h
new file mode 100644
index 000000000000..63c14289a748
--- /dev/null
+++ b/include/linux/sched/gb.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Group Balancer header
+ *
+ * Copyright (C) 2021 Alibaba, Inc., Michael Wang <yun.wang@xxxxxxxxxxxxxxxxx>
+ */
+#ifndef _LINUX_SCHED_GB_H
+#define _LINUX_SCHED_GB_H
+
+#include <linux/sched.h>
+
+struct gb_info {
+ /* ---for ancestor as controller--- */
+ /* Period(ns) for task work in task tick, 0 means disabled. */
+ u64 gb_period;
+
+ /* ---for descendants that are scheduled by group balancer--- */
+ /* Period for settling to specific partition. */
+ unsigned long settle_period;
+
+ /* Stamp to record next settling. */
+ unsigned long settle_next;
+};
+
+#ifdef CONFIG_GROUP_BALANCER
+extern unsigned int sysctl_gb_settle_period;
+#endif
+
+#endif
diff --git a/init/Kconfig b/init/Kconfig
index 8b79eda8e1d4..8f05e5e7e299 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1043,6 +1043,18 @@ config RT_GROUP_SCHED
realtime bandwidth for them.
See Documentation/scheduler/sched-rt-group.rst for more information.

+config GROUP_BALANCER
+ bool "Group balancer support for SCHED_OTHER"
+ depends on (CGROUP_SCHED && SMP)
+ default CGROUP_SCHED
+ help
+ This feature allows you to do load balancing in group mode. In other
+ words, it balances groups of tasks among groups of CPUs.
+
+ This can reduce the conflict between task groups and gain benefit
+ from hot cache and affinity domain, usually for the cases when there
+ are multiple apps sharing the same CPUs.
+
endif #CGROUP_SCHED

config UCLAMP_TASK_GROUP
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index ef88cc366bb8..0349f3f64e3d 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -65,6 +65,7 @@
#include <linux/mutex.h>
#include <linux/cgroup.h>
#include <linux/wait.h>
+#include <linux/sched/gb.h>

DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
@@ -170,6 +171,14 @@ struct cpuset {

/* Handle for cpuset.cpus.partition */
struct cgroup_file partition_file;
+
+#ifdef CONFIG_GROUP_BALANCER
+ /* Partition info and configs of group balancer */
+ struct gb_info gi;
+
+ /* Point to the controller (in ancestor cgroup) who has partition info */
+ struct gb_info *control_gi;
+#endif
};

/*
@@ -215,6 +224,24 @@ static inline struct cpuset *parent_cs(struct cpuset *cs)
return css_cs(cs->css.parent);
}

+#ifdef CONFIG_GROUP_BALANCER
+static inline struct gb_info *cs_gi(struct cpuset *cs)
+{
+ return &cs->gi;
+}
+static struct gb_info *css_gi(struct cgroup_subsys_state *css, bool get_controller)
+{
+ struct cpuset *cs = css_cs(css);
+
+ return get_controller ? cs->control_gi : &cs->gi;
+}
+#else
+static inline struct gb_info *cs_gi(struct cpuset *cs)
+{
+ return NULL;
+}
+#endif /* CONFIG_GROUP_BALANCER */
+
/* bits in struct cpuset flags field */
typedef enum {
CS_ONLINE,
@@ -2365,6 +2392,7 @@ typedef enum {
FILE_MEMORY_PRESSURE,
FILE_SPREAD_PAGE,
FILE_SPREAD_SLAB,
+ FILE_GB_CPULIST,
} cpuset_filetype_t;

static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -2510,6 +2538,69 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
return retval ?: nbytes;
}

+#ifdef CONFIG_GROUP_BALANCER
+static inline void init_gb(struct cpuset *cs)
+{
+ struct gb_info *gi = cs_gi(cs);
+
+ gi->settle_period = msecs_to_jiffies(sysctl_gb_settle_period);
+ gi->settle_next = jiffies + gi->settle_period;
+}
+
+static u64 gb_period_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct gb_info *gi = css_gi(css, false);
+
+ return gi->gb_period / NSEC_PER_USEC;
+}
+
+#define MAX_GB_PERIOD USEC_PER_SEC /* 1s */
+
+static int gb_period_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 val)
+{
+ struct cpuset *cs = css_cs(css);
+ struct gb_info *gi = cs_gi(cs);
+
+ if (val > MAX_GB_PERIOD)
+ return -EINVAL;
+
+ percpu_down_write(&cpuset_rwsem);
+
+ gi->gb_period = val * NSEC_PER_USEC;
+
+ percpu_up_write(&cpuset_rwsem);
+ return 0;
+}
+
+static void gb_partition_read(struct seq_file *sf, struct gb_info *gi)
+{
+ seq_putc(sf, '\n');
+}
+
+static ssize_t gb_partition_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cgroup_subsys_state *css = of_css(of);
+ struct cpuset *cs = css_cs(css);
+ int retval = -ENODEV;
+
+ cpus_read_lock();
+ percpu_down_write(&cpuset_rwsem);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
+
+ retval = 0;
+
+out_unlock:
+ percpu_up_write(&cpuset_rwsem);
+ cpus_read_unlock();
+ return retval ?: nbytes;
+}
+#else
+static inline void init_gb(struct cpuset *cs) { }
+static inline void gb_partition_read(struct seq_file *sf, struct gb_info *gi) { }
+#endif /* CONFIG_GROUP_BALANCER */
+
/*
* These ascii lists should be read in a single call, by using a user
* buffer large enough to hold the entire map. If read in smaller
@@ -2542,6 +2633,9 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_SUBPARTS_CPULIST:
seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
break;
+ case FILE_GB_CPULIST:
+ gb_partition_read(sf, cs_gi(cs));
+ break;
default:
ret = -EINVAL;
}
@@ -2803,7 +2897,21 @@ static struct cftype dfl_files[] = {
.private = FILE_SUBPARTS_CPULIST,
.flags = CFTYPE_DEBUG,
},
+#ifdef CONFIG_GROUP_BALANCER
+ {
+ .name = "gb.period_us",
+ .read_u64 = gb_period_read_u64,
+ .write_u64 = gb_period_write_u64,
+ },

+ {
+ .name = "gb.partition",
+ .seq_show = cpuset_common_seq_show,
+ .write = gb_partition_write,
+ .max_write_len = (100U + 6 * NR_CPUS),
+ .private = FILE_GB_CPULIST,
+ },
+#endif
{ } /* terminate */
};

@@ -2870,6 +2978,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->effective_mems = parent->effective_mems;
cs->use_parent_ecpus = true;
parent->child_ecpus_count++;
+ init_gb(cs);
}
spin_unlock_irq(&callback_lock);

diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index c83b37af155b..cb5b86bb161e 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -40,3 +40,4 @@ obj-$(CONFIG_MEMBARRIER) += membarrier.o
obj-$(CONFIG_CPU_ISOLATION) += isolation.o
obj-$(CONFIG_PSI) += psi.o
obj-$(CONFIG_SCHED_CORE) += core_sched.o
+obj-$(CONFIG_GROUP_BALANCER) += gb.o
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 102d6f70e84d..1800bdfe1d61 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -299,7 +299,7 @@ static struct dentry *debugfs_sched;

static __init int sched_init_debug(void)
{
- struct dentry __maybe_unused *numa;
+ struct dentry __maybe_unused *numa, *gb;

debugfs_sched = debugfs_create_dir("sched", NULL);

@@ -336,6 +336,14 @@ static __init int sched_init_debug(void)
debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
#endif

+#ifdef CONFIG_GROUP_BALANCER
+ gb = debugfs_create_dir("group_balancer", debugfs_sched);
+
+ debugfs_create_u32("settle_period_ms", 0644, gb, &sysctl_gb_settle_period);
+ debugfs_create_u32("settle_period_max_ms", 0644, gb, &sysctl_gb_settle_period_max);
+ debugfs_create_u32("global_settle_period_ms", 0644, gb, &sysctl_gb_global_settle_period);
+#endif
+
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);

return 0;
diff --git a/kernel/sched/gb.c b/kernel/sched/gb.c
new file mode 100644
index 000000000000..4a2876ec1cca
--- /dev/null
+++ b/kernel/sched/gb.c
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Group Balancer code
+ *
+ * Copyright (C) 2021 Alibaba, Inc., Michael Wang <yun.wang@xxxxxxxxxxxxxxxxx>
+ */
+#include <linux/sched/gb.h>
+
+#include "sched.h"
+
+/* Params about settle period. (ms) */
+unsigned int sysctl_gb_settle_period = 200;
+unsigned int sysctl_gb_settle_period_max = 10000;
+unsigned int sysctl_gb_global_settle_period = 200;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ac1f9470c617..d6555aacdf78 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2433,6 +2433,10 @@ extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
+
+extern unsigned int sysctl_gb_settle_period;
+extern unsigned int sysctl_gb_settle_period_max;
+extern unsigned int sysctl_gb_global_settle_period;
#endif

#ifdef CONFIG_SCHED_HRTICK
--
2.27.0