[RFC PATCH 3/3] sched: introduce tunables to control soft affinity
From: subhra mazumdar
Date: Wed Jun 26 2019 - 18:53:16 EST
The optimal "softness" of soft affinity differs from workload to workload.
Introduce tunables sched_allowed and sched_preferred, settable via /proc,
which control at what degree of utilization imbalance the scheduler chooses
cpus_allowed over cpus_preferred in the first level of search. The optimal
crossover point may vary with the extent of data sharing, the cache
coherency overhead of the system, and so on.
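As an illustration (not part of the patch), here is a minimal userspace
sketch of the crossover rule that the select_task_rq_fair() hunk below
applies; util_preferred and util_allowed are hypothetical stand-ins for the
cfs.avg.util_avg of a CPU sampled from cpus_preferred and from cpus_allowed
minus cpus_preferred:

  #include <stdio.h>

  /* Defaults introduced by this patch. */
  static unsigned int sysctl_sched_preferred = 1;
  static unsigned int sysctl_sched_allowed = 100;

  /* Return 1 if the first-level search should widen to cpus_allowed. */
  static int use_allowed(unsigned long util_preferred,
                         unsigned long util_allowed)
  {
          return sysctl_sched_preferred * util_preferred >
                 sysctl_sched_allowed * util_allowed;
  }

  int main(void)
  {
          printf("%d\n", use_allowed(50, 1));  /* 0: stay in cpus_preferred */
          printf("%d\n", use_allowed(150, 1)); /* 1: widen to cpus_allowed */
          return 0;
  }

With the defaults (sched_preferred=1, sched_allowed=100) the preferred set
is abandoned only when the sampled preferred CPU is more than 100 times as
utilized as the sampled CPU outside it; both knobs appear under
/proc/sys/kernel/ since they are registered in kern_table.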
Signed-off-by: subhra mazumdar <subhra.mazumdar@xxxxxxxxxx>
---
 include/linux/sched/sysctl.h |  2 ++
 kernel/sched/fair.c          | 19 ++++++++++++++++++-
 kernel/sched/sched.h         |  2 ++
 kernel/sysctl.c              | 14 ++++++++++++++
 4 files changed, 36 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 99ce6d7..0e75602 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -41,6 +41,8 @@ extern unsigned int sysctl_numa_balancing_scan_size;
#ifdef CONFIG_SCHED_DEBUG
extern __read_mostly unsigned int sysctl_sched_migration_cost;
extern __read_mostly unsigned int sysctl_sched_nr_migrate;
+extern __read_mostly unsigned int sysctl_sched_preferred;
+extern __read_mostly unsigned int sysctl_sched_allowed;
int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 53aa7f2..d222d78 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -85,6 +85,8 @@ unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+const_debug unsigned int sysctl_sched_preferred = 1UL;
+const_debug unsigned int sysctl_sched_allowed = 100UL;
#ifdef CONFIG_SMP
/*
@@ -6739,7 +6741,22 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
int new_cpu = prev_cpu;
int want_affine = 0;
int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
-	struct cpumask *cpus = &p->cpus_preferred;
+	int cpux, cpuy;
+	struct cpumask *cpus;
+
+	if (!p->affinity_unequal) {
+		cpus = &p->cpus_allowed;
+	} else {
+		cpux = cpumask_any(&p->cpus_preferred);
+		cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+		cpumask_andnot(cpus, &p->cpus_allowed, &p->cpus_preferred);
+		cpuy = cpumask_any(cpus);
+		if (sysctl_sched_preferred * cpu_rq(cpux)->cfs.avg.util_avg >
+		    sysctl_sched_allowed * cpu_rq(cpuy)->cfs.avg.util_avg)
+			cpus = &p->cpus_allowed;
+		else
+			cpus = &p->cpus_preferred;
+	}
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b52ed1a..f856bdb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1863,6 +1863,8 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;
+extern const_debug unsigned int sysctl_sched_preferred;
+extern const_debug unsigned int sysctl_sched_allowed;
#ifdef CONFIG_SCHED_HRTICK
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7d1008b..bdffb48 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -383,6 +383,20 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+	{
+		.procname	= "sched_preferred",
+		.data		= &sysctl_sched_preferred,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sched_allowed",
+		.data		= &sysctl_sched_allowed,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
#ifdef CONFIG_SCHEDSTATS
{
.procname = "sched_schedstats",
--
2.9.3