[PATCH v6 2/6] task_isolation: add initial support

From: Chris Metcalf
Date: Tue Aug 25 2015 - 15:57:28 EST


The existing nohz_full mode is designed as a "soft" isolation mode
that makes tradeoffs to minimize userspace interruptions while
still attempting to avoid overheads in the kernel entry/exit path,
to provide 100% kernel semantics, etc.

However, some applications require a "hard" commitment from the
kernel to avoid interruptions, in particular userspace device
driver style applications, such as high-speed networking code.

This change introduces a framework to allow applications
to elect to have the "hard" semantics as needed, specifying
prctl(PR_SET_TASK_ISOLATION, PR_TASK_ISOLATION_ENABLE) to do so.
Subsequent commits will add additional flags and additional
semantics.

The kernel must be built with the new TASK_ISOLATION Kconfig flag
to enable this mode, and the kernel booted with an appropriate
nohz_full=CPULIST boot argument. The "task_isolation" state is then
indicated by setting a new task struct field, task_isolation_flags,
to the value passed by prctl(). When the _ENABLE bit is set for a
task, and it is returning to userspace on a nohz_full core, it calls
the new task_isolation_enter() routine to take additional actions
to help the task avoid being interrupted in the future.

Initially, there are only three actions taken. First, the
task calls lru_add_drain() to prevent being interrupted by a
subsequent lru_add_drain_all() call on another core. Then, it calls
quiet_vmstat() to quieten the vmstat worker to avoid a follow-on
interrupt. Finally, the code checks for pending timer interrupts
and quiesces until they are no longer pending. As a result, sys
calls (and page faults, etc.) can be inordinately slow. However,
this quiescing guarantees that no unexpected interrupts will occur,
even if the application intentionally calls into the kernel.

Signed-off-by: Chris Metcalf <cmetcalf@xxxxxxxxxx>
---
arch/tile/kernel/process.c | 9 ++++++
include/linux/isolation.h | 24 +++++++++++++++
include/linux/sched.h | 3 ++
include/uapi/linux/prctl.h | 5 ++++
init/Kconfig | 20 +++++++++++++
kernel/Makefile | 1 +
kernel/context_tracking.c | 3 ++
kernel/isolation.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++
kernel/sys.c | 8 +++++
9 files changed, 148 insertions(+)
create mode 100644 include/linux/isolation.h
create mode 100644 kernel/isolation.c

diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index e036c0aa9792..1d9bd2320a50 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -70,6 +70,15 @@ void arch_cpu_idle(void)
_cpu_idle();
}

+#ifdef CONFIG_TASK_ISOLATION /* tile version of the __weak task_isolation_wait() */
+void task_isolation_wait(void)
+{
+ set_current_state(TASK_INTERRUPTIBLE);
+ _cpu_idle(); /* same idle primitive arch_cpu_idle() calls */
+ set_current_state(TASK_RUNNING); /* restore state before returning */
+}
+#endif /* CONFIG_TASK_ISOLATION */
+
/*
* Release a thread_info structure
*/
diff --git a/include/linux/isolation.h b/include/linux/isolation.h
new file mode 100644
index 000000000000..fd04011b1c1e
--- /dev/null
+++ b/include/linux/isolation.h
@@ -0,0 +1,24 @@
+/*
+ * Task isolation related global functions
+ */
+#ifndef _LINUX_ISOLATION_H
+#define _LINUX_ISOLATION_H
+
+#include <linux/tick.h>
+#include <linux/prctl.h>
+
+#ifdef CONFIG_TASK_ISOLATION
+static inline bool task_isolation_enabled(void)
+{
+ return tick_nohz_full_cpu(smp_processor_id()) && /* nohz_full cpu, and */
+ (current->task_isolation_flags & PR_TASK_ISOLATION_ENABLE); /* ENABLE bit set */
+}
+
+extern void task_isolation_enter(void);
+extern void task_isolation_wait(void);
+#else /* !CONFIG_TASK_ISOLATION: never enabled, enter is a no-op */
+static inline bool task_isolation_enabled(void) { return false; }
+static inline void task_isolation_enter(void) { }
+#endif /* CONFIG_TASK_ISOLATION */
+
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 04b5ada460b4..2acb618189d0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1776,6 +1776,9 @@ struct task_struct {
unsigned long task_state_change;
#endif
int pagefault_disabled;
+#ifdef CONFIG_TASK_ISOLATION
+ unsigned int task_isolation_flags;
+#endif
/* CPU-specific state of this task */
struct thread_struct thread;
/*
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 31891d9535e2..79da784fe17a 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -190,4 +190,9 @@ struct prctl_mm_map {
# define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */
# define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */

+/* Enable/disable or query task_isolation mode for NO_HZ_FULL kernels. */
+#define PR_SET_TASK_ISOLATION 47
+#define PR_GET_TASK_ISOLATION 48
+# define PR_TASK_ISOLATION_ENABLE (1 << 0)
+
#endif /* _LINUX_PRCTL_H */
diff --git a/init/Kconfig b/init/Kconfig
index af09b4fb43d2..82d313cbd70f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -795,6 +795,26 @@ config RCU_EXPEDITE_BOOT

endmenu # "RCU Subsystem"

+config TASK_ISOLATION
+ bool "Provide hard CPU isolation from the kernel on demand"
+ depends on NO_HZ_FULL
+ help
+ Allow userspace processes to place themselves on nohz_full
+ cores and run prctl(PR_SET_TASK_ISOLATION) to "isolate"
+ themselves from the kernel. On return to userspace,
+ isolated tasks will first arrange that no future kernel
+ activity will interrupt the task while the task is running
+ in userspace. This "hard" isolation from the kernel is
+ required for userspace tasks that have hard real-time
+ requirements, such as a 10 Gbit userspace network driver.
+
+ Without this option, but with NO_HZ_FULL enabled, the kernel
+ will make a good-faith, "soft" effort to shield a single userspace
+ process from interrupts, but makes no guarantees.
+
+ You should say "N" unless you are intending to run a
+ high-performance userspace driver or similar task.
+
config BUILD_BIN2C
bool
default n
diff --git a/kernel/Makefile b/kernel/Makefile
index 43c4c920f30a..9ffb5c021767 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -98,6 +98,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
+obj-$(CONFIG_TASK_ISOLATION) += isolation.o

$(obj)/configs.o: $(obj)/config_data.h

diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 0a495ab35bc7..c57c99f5c4d7 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -20,6 +20,7 @@
#include <linux/hardirq.h>
#include <linux/export.h>
#include <linux/kprobes.h>
+#include <linux/isolation.h>

#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
@@ -99,6 +100,8 @@ void context_tracking_enter(enum ctx_state state)
* on the tick.
*/
if (state == CONTEXT_USER) {
+ if (task_isolation_enabled())
+ task_isolation_enter();
trace_user_enter(0);
vtime_user_enter(current);
}
diff --git a/kernel/isolation.c b/kernel/isolation.c
new file mode 100644
index 000000000000..d4618cd9e23d
--- /dev/null
+++ b/kernel/isolation.c
@@ -0,0 +1,75 @@
+/*
+ * linux/kernel/isolation.c
+ *
+ * Implementation for task isolation.
+ *
+ * Distributed under GPLv2.
+ */
+
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/vmstat.h>
+#include <linux/isolation.h>
+#include "time/tick-sched.h"
+
+/*
+ * Default wait step for the polling loop in task_isolation_enter():
+ * just cpu_relax().  This is a __weak symbol, so an architecture can
+ * override it to save power by sleeping until an interrupt arrives.
+ *
+ * Note that it must be guaranteed for a particular architecture
+ * that if next_event is not KTIME_MAX, then a timer interrupt will
+ * occur, otherwise the sleep may never awaken.
+ */
+void __weak task_isolation_wait(void)
+{
+ cpu_relax();
+}
+
+/*
+ * Prepare to return to userspace in task_isolation mode.
+ *
+ * We drain the local LRU pagevecs, quieten vmstat, and then wait
+ * until no timer event is pending (next_event == KTIME_MAX), calling
+ * task_isolation_wait() between checks; a pending signal ends the wait.
+ *
+ * Note that if you schedule two task_isolation processes on the same
+ * core, neither will ever leave the kernel, and one will have to be
+ * killed manually. Otherwise in situations where another process is
+ * in the runqueue on this cpu, this task will just wait for that
+ * other task to go idle before returning to user space.
+ */
+void task_isolation_enter(void)
+{
+ struct clock_event_device *dev =
+ __this_cpu_read(tick_cpu_device.evtdev);
+ struct task_struct *task = current;
+ unsigned long start = jiffies;
+ bool warned = false;
+
+ /* Drain the pagevecs to avoid unnecessary IPI flushes later. */
+ lru_add_drain();
+
+ /* Quieten the vmstat worker so it won't interrupt us. */
+ quiet_vmstat();
+
+ while (READ_ONCE(dev->next_event.tv64) != KTIME_MAX) {
+ if (!warned && (jiffies - start) >= (5 * HZ)) {
+ pr_warn("%s/%d: cpu %d: task_isolation task blocked for %lu seconds\n",
+ task->comm, task->pid, smp_processor_id(),
+ (jiffies - start) / HZ);
+ warned = true; /* warn only once per call */
+ }
+ if (should_resched())
+ schedule();
+ if (test_thread_flag(TIF_SIGPENDING))
+ break; /* stop waiting while a signal is pending */
+ task_isolation_wait();
+ }
+ if (warned) {
+ pr_warn("%s/%d: cpu %d: task_isolation task unblocked after %lu seconds\n",
+ task->comm, task->pid, smp_processor_id(),
+ (jiffies - start) / HZ);
+ dump_stack();
+ }
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index 259fda25eb6b..c7024be2d79b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2267,6 +2267,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_GET_FP_MODE:
error = GET_FP_MODE(me);
break;
+#ifdef CONFIG_TASK_ISOLATION
+ case PR_SET_TASK_ISOLATION:
+ me->task_isolation_flags = arg2;
+ break;
+ case PR_GET_TASK_ISOLATION:
+ error = me->task_isolation_flags;
+ break;
+#endif
default:
error = -EINVAL;
break;
--
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/