[PATCH 5/6] delay-injection: resource management via procrastination

From: Konstantin Khlebnikov
Date: Thu Jan 15 2015 - 13:57:35 EST


From: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxxxxxx>

inject_delay() allows pausing the current task before it returns to
userspace, at a point where the kernel holds no locks, so the wait
cannot introduce any priority-inversion problems.

This code abuses the existing task-work infrastructure and the
'TASK_PARKED' state. Parked tasks are killable and do not contribute
to CPU load.

Together with percpu_ratelimit this can be used as follows:

	if (percpu_ratelimit_charge(&ratelimit, events))
		inject_delay(percpu_ratelimit_target(&ratelimit));
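
The target is an absolute CLOCK_MONOTONIC timestamp, so callers can
also compute one directly instead of going through percpu_ratelimit.
A minimal sketch (the helper below is hypothetical and not part of
this patch):

	/* Hypothetical helper: pause the current task for delay_ns,
	 * counted from now, on its way back to userspace. */
	static void throttle_current_task(u64 delay_ns)
	{
		ktime_t target = ktime_add_ns(ktime_get(), delay_ns);

		/* Does not sleep here: inject_delay() only records the
		 * target and queues a task-work; the actual pause happens
		 * in TASK_PARKED just before the return to userspace. */
		inject_delay(target);
	}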

Signed-off-by: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxxxxxx>
---
 include/linux/sched.h        |    7 ++++
 include/trace/events/sched.h |    7 ++++
 kernel/sched/core.c          |   66 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          |   12 ++++++++
 4 files changed, 92 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8db31ef..2363918 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1132,6 +1132,7 @@ struct sched_statistics {
 	u64			iowait_sum;
 
 	u64			sleep_start;
+	u64			delay_start;
 	u64			sleep_max;
 	s64			sum_sleep_runtime;
 
@@ -1662,6 +1663,10 @@ struct task_struct {
 	unsigned long timer_slack_ns;
 	unsigned long default_timer_slack_ns;
 
+	/* Pause the task until this time before returning to userspace */
+	ktime_t delay_injection_target;
+	struct callback_head delay_injection_work;
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	/* Index of current stored address in ret_stack */
 	int curr_ret_stack;
@@ -2277,6 +2282,8 @@ extern void set_curr_task(int cpu, struct task_struct *p);
 
 void yield(void);
 
+extern void inject_delay(ktime_t target);
+
 /*
  * The default (Linux) execution domain.
  */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 30fedaf..d35154e 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -365,6 +365,13 @@ DEFINE_EVENT(sched_stat_template, sched_stat_blocked,
 	     TP_ARGS(tsk, delay));
 
 /*
+ * Tracepoint for accounting delay-injection
+ */
+DEFINE_EVENT(sched_stat_template, sched_stat_delayed,
+	     TP_PROTO(struct task_struct *tsk, u64 delay),
+	     TP_ARGS(tsk, delay));
+
+/*
  * Tracepoint for accounting runtime (time the task is executing
  * on a CPU).
  */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c0accc0..7a9d6a1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -65,6 +65,7 @@
 #include <linux/unistd.h>
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
+#include <linux/task_work.h>
 #include <linux/tick.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
@@ -8377,3 +8378,68 @@ void dump_cpu_task(int cpu)
 	pr_info("Task dump for CPU %d:\n", cpu);
 	sched_show_task(cpu_curr(cpu));
 }
+
+#define DELAY_INJECTION_SLACK_NS	(NSEC_PER_SEC / 50)
+
+static enum hrtimer_restart delay_injection_wakeup(struct hrtimer *timer)
+{
+	struct hrtimer_sleeper *t =
+		container_of(timer, struct hrtimer_sleeper, timer);
+	struct task_struct *task = t->task;
+
+	t->task = NULL;
+	if (task)
+		wake_up_state(task, TASK_PARKED);
+
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * Here the delayed task sleeps in the 'P'arked state.
+ */
+static void delay_injection_sleep(struct callback_head *head)
+{
+	struct task_struct *task = current;
+	struct hrtimer_sleeper t;
+
+	head->func = NULL;
+	__set_task_state(task, TASK_WAKEKILL | TASK_PARKED);
+	hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	hrtimer_set_expires_range_ns(&t.timer, current->delay_injection_target,
+				     DELAY_INJECTION_SLACK_NS);
+
+	t.timer.function = delay_injection_wakeup;
+	t.task = task;
+
+	hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
+	if (!hrtimer_active(&t.timer))
+		t.task = NULL;
+
+	if (likely(t.task))
+		schedule();
+
+	hrtimer_cancel(&t.timer);
+	destroy_hrtimer_on_stack(&t.timer);
+
+	__set_task_state(task, TASK_RUNNING);
+}
+
+/*
+ * inject_delay - inject a delay before returning to userspace
+ * @target: absolute CLOCK_MONOTONIC timestamp to sleep until;
+ *          the task will not return to userspace before this time
+ */
+void inject_delay(ktime_t target)
+{
+	struct task_struct *task = current;
+
+	if (ktime_after(target, task->delay_injection_target)) {
+		task->delay_injection_target = target;
+		if (!task->delay_injection_work.func) {
+			init_task_work(&task->delay_injection_work,
+				       delay_injection_sleep);
+			task_work_add(task, &task->delay_injection_work, true);
+		}
+	}
+}
+EXPORT_SYMBOL(inject_delay);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40667cb..2e3269b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2944,6 +2944,15 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			account_scheduler_latency(tsk, delta >> 10, 0);
 		}
 	}
+	if (se->statistics.delay_start) {
+		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.delay_start;
+
+		if ((s64)delta < 0)
+			delta = 0;
+
+		se->statistics.delay_start = 0;
+		trace_sched_stat_delayed(tsk, delta);
+	}
 #endif
 }
 
@@ -3095,6 +3104,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
 			if (tsk->state & TASK_UNINTERRUPTIBLE)
 				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+			if ((tsk->state & TASK_PARKED) &&
+			    tsk->delay_injection_target.tv64)
+				se->statistics.delay_start = rq_clock(rq_of(cfs_rq));
 		}
 #endif
 	}

--