[patch 1/4] add basic task isolation prctl interface

From: Marcelo Tosatti
Date: Tue Jul 27 2021 - 06:42:15 EST


Add a basic prctl task isolation interface, which allows
informing the kernel that the application is executing
latency-sensitive code (where interruptions are undesired).

Interface is described by task_isolation.rst (added by this patch).

Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx>

Index: linux-2.6-vmstat-update/Documentation/userspace-api/task_isolation.rst
===================================================================
--- /dev/null
+++ linux-2.6-vmstat-update/Documentation/userspace-api/task_isolation.rst
@@ -0,0 +1,52 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================
+Task isolation prctl interface
+==============================
+
+Set thread isolation mode and parameters, which allows
+informing the kernel that the application is
+executing latency-sensitive code (where interruptions
+are undesired).
+
+It is composed of 4 prctl commands (passed as arg1 to
+prctl):
+
+PR_ISOL_SET: set isolation parameters for the task
+
+PR_ISOL_GET: get isolation parameters for the task
+
+PR_ISOL_ENTER: indicate that task should be considered
+ isolated from this point on
+
+PR_ISOL_EXIT: indicate that task should not be considered
+ isolated from this point on
+
+The isolation parameters and mode are not inherited by
+children created by fork(2) and clone(2). The setting is
+preserved across execve(2).
+
+The meaning of isolated is specified as follows, when setting arg1 to
+PR_ISOL_SET or PR_ISOL_GET, with the following arguments passed as arg2.
+
+Isolation mode (PR_ISOL_MODE):
+------------------------------
+
+- PR_ISOL_MODE_NONE (arg3): no per-task isolation (default mode).
+ PR_ISOL_EXIT sets mode to PR_ISOL_MODE_NONE.
+
+- PR_ISOL_MODE_NORMAL (arg3): applications can perform system calls normally,
+ and in case of interruption events, the notifications can be collected
+ by BPF programs.
+ In this mode, if system calls are performed, deferred actions initiated
+ by the system call will be executed before return to userspace.
+
+Other modes, which for example send signals upon interruption events,
+can be implemented.
+
+Example
+=======
+
+The ``samples/task_isolation/`` directory contains a sample
+application.
+
Index: linux-2.6-vmstat-update/include/uapi/linux/prctl.h
===================================================================
--- linux-2.6-vmstat-update.orig/include/uapi/linux/prctl.h
+++ linux-2.6-vmstat-update/include/uapi/linux/prctl.h
@@ -267,4 +267,13 @@ struct prctl_mm_map {
# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */
# define PR_SCHED_CORE_MAX 4

+/* Task isolation control (62 is PR_SCHED_CORE; NOTE(review): confirm 63-66 are free) */
+#define PR_ISOL_SET			63	/* set isolation parameters */
+#define PR_ISOL_GET			64	/* get isolation parameters */
+#define PR_ISOL_ENTER			65	/* task is isolated from now on */
+#define PR_ISOL_EXIT			66	/* task is no longer isolated */
+# define PR_ISOL_MODE			1	/* arg2 of PR_ISOL_SET/PR_ISOL_GET */
+
+# define PR_ISOL_MODE_NONE		0	/* no per-task isolation (default) */
+# define PR_ISOL_MODE_NORMAL		1	/* isolated, system calls allowed */
#endif /* _LINUX_PRCTL_H */
Index: linux-2.6-vmstat-update/kernel/Makefile
===================================================================
--- linux-2.6-vmstat-update.orig/kernel/Makefile
+++ linux-2.6-vmstat-update/kernel/Makefile
@@ -132,6 +132,8 @@ obj-$(CONFIG_WATCH_QUEUE) += watch_queue
obj-$(CONFIG_RESOURCE_KUNIT_TEST) += resource_kunit.o
obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o

+obj-$(CONFIG_CPU_ISOLATION) += task_isolation.o
+
CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN)
obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
KASAN_SANITIZE_stackleak.o := n
Index: linux-2.6-vmstat-update/kernel/sys.c
===================================================================
--- linux-2.6-vmstat-update.orig/kernel/sys.c
+++ linux-2.6-vmstat-update/kernel/sys.c
@@ -58,6 +58,7 @@
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/sched/cputime.h>
+#include <linux/task_isolation.h>
#include <linux/rcupdate.h>
#include <linux/uidgid.h>
#include <linux/cred.h>
@@ -2567,6 +2568,18 @@ SYSCALL_DEFINE5(prctl, int, option, unsi
error = sched_core_share_pid(arg2, arg3, arg4, arg5);
break;
#endif
+ case PR_ISOL_SET:
+ error = prctl_task_isolation_set(arg2, arg3, arg4, arg5);
+ break;
+ case PR_ISOL_GET:
+ error = prctl_task_isolation_get(arg2, arg3, arg4, arg5);
+ break;
+ case PR_ISOL_ENTER:
+ error = prctl_task_isolation_enter(arg2, arg3, arg4, arg5);
+ break;
+ case PR_ISOL_EXIT:
+ error = prctl_task_isolation_exit(arg2, arg3, arg4, arg5);
+ break;
default:
error = -EINVAL;
break;
Index: linux-2.6-vmstat-update/samples/task_isolation/task_isolation.c
===================================================================
--- /dev/null
+++ linux-2.6-vmstat-update/samples/task_isolation/task_isolation.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Minimal user of the task isolation prctl interface: select
+ * PR_ISOL_MODE_NORMAL, enter isolation, run a busy loop and exit isolation.
+ */
+#include <sys/mman.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/prctl.h>
+#include <linux/prctl.h>
+
+#define BUF_SIZE	4096
+#define LOOP_COUNT	99999999UL
+
+int main(void)
+{
+	unsigned long i;
+	void *buf;
+
+	buf = malloc(BUF_SIZE);
+	if (!buf) {
+		perror("malloc");
+		return EXIT_FAILURE;
+	}
+
+	/* Write to the buffer and lock it in memory before isolating. */
+	memset(buf, 1, BUF_SIZE);
+	if (mlock(buf, BUF_SIZE)) {
+		perror("mlock");
+		return EXIT_FAILURE;
+	}
+
+	if (prctl(PR_ISOL_SET, PR_ISOL_MODE, PR_ISOL_MODE_NORMAL, 0, 0) == -1) {
+		perror("prctl PR_ISOL_SET");
+		return EXIT_FAILURE;
+	}
+
+	if (prctl(PR_ISOL_ENTER, 0, 0, 0, 0) == -1) {
+		perror("prctl PR_ISOL_ENTER");
+		return EXIT_FAILURE;
+	}
+
+	/* busy loop */
+	for (i = 0; i < LOOP_COUNT; i++)
+		memset(buf, 0, 10);
+
+	if (prctl(PR_ISOL_EXIT, 0, 0, 0, 0) == -1) {
+		perror("prctl PR_ISOL_EXIT");
+		return EXIT_FAILURE;
+	}
+
+	return EXIT_SUCCESS;
+}
Index: linux-2.6-vmstat-update/include/linux/sched.h
===================================================================
--- linux-2.6-vmstat-update.orig/include/linux/sched.h
+++ linux-2.6-vmstat-update/include/linux/sched.h
@@ -66,6 +66,7 @@ struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;
+struct isol_info;

/*
* Task state bitmask. NOTE! These bits are also
@@ -1400,6 +1401,10 @@ struct task_struct {
struct llist_head kretprobe_instances;
#endif

+#ifdef CONFIG_CPU_ISOLATION
+ struct isol_info *isol_info;
+#endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
Index: linux-2.6-vmstat-update/init/init_task.c
===================================================================
--- linux-2.6-vmstat-update.orig/init/init_task.c
+++ linux-2.6-vmstat-update/init/init_task.c
@@ -213,6 +213,9 @@ struct task_struct init_task
#ifdef CONFIG_SECCOMP_FILTER
.seccomp = { .filter_count = ATOMIC_INIT(0) },
#endif
+#ifdef CONFIG_CPU_ISOLATION
+ .isol_info = NULL,
+#endif
};
EXPORT_SYMBOL(init_task);

Index: linux-2.6-vmstat-update/kernel/fork.c
===================================================================
--- linux-2.6-vmstat-update.orig/kernel/fork.c
+++ linux-2.6-vmstat-update/kernel/fork.c
@@ -97,6 +97,7 @@
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
+#include <linux/task_isolation.h>

#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -734,6 +735,7 @@ void __put_task_struct(struct task_struc
WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);

+ tsk_isol_exit(tsk);
io_uring_free(tsk);
cgroup_free(tsk);
task_numa_free(tsk, true);
@@ -2084,7 +2086,9 @@ static __latent_entropy struct task_stru
#ifdef CONFIG_BPF_SYSCALL
RCU_INIT_POINTER(p->bpf_storage, NULL);
#endif
-
+#ifdef CONFIG_CPU_ISOLATION
+ p->isol_info = NULL;
+#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
if (retval)
Index: linux-2.6-vmstat-update/include/linux/task_isolation.h
===================================================================
--- /dev/null
+++ linux-2.6-vmstat-update/include/linux/task_isolation.h
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Per-task isolation state and the handlers behind the PR_ISOL_* prctl()
+ * commands.  When CONFIG_CPU_ISOLATION is not set, every handler returns
+ * -EOPNOTSUPP and tsk_isol_exit() does nothing.
+ */
+
+#ifndef __LINUX_TASK_ISOL_H
+#define __LINUX_TASK_ISOL_H
+
+/* Keep the header self-contained: it uses errno values, task_struct and u8. */
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_CPU_ISOLATION
+
+/**
+ * struct isol_info - isolation state attached to a task
+ * @mode: isolation mode, a PR_ISOL_MODE_* value
+ * @active: set to 1 by PR_ISOL_ENTER and back to 0 by PR_ISOL_EXIT
+ */
+struct isol_info {
+	u8 mode;
+	u8 active;
+};
+
+/* Frees tsk->isol_info and clears the pointer; use tsk_isol_exit(). */
+extern void __tsk_isol_exit(struct task_struct *tsk);
+
+/* Drop @tsk's isolation state; a no-op if none was ever allocated. */
+static inline void tsk_isol_exit(struct task_struct *tsk)
+{
+	if (tsk->isol_info)
+		__tsk_isol_exit(tsk);
+}
+
+/*
+ * Handlers for the PR_ISOL_* prctl() commands.  Each receives prctl()'s
+ * arg2..arg5 unchanged and returns 0 (or, for get, the mode) on success
+ * and a negative errno on failure.
+ */
+int prctl_task_isolation_get(unsigned long arg2, unsigned long arg3,
+			     unsigned long arg4, unsigned long arg5);
+
+int prctl_task_isolation_set(unsigned long arg2, unsigned long arg3,
+			     unsigned long arg4, unsigned long arg5);
+
+int prctl_task_isolation_enter(unsigned long arg2, unsigned long arg3,
+			       unsigned long arg4, unsigned long arg5);
+
+int prctl_task_isolation_exit(unsigned long arg2, unsigned long arg3,
+			      unsigned long arg4, unsigned long arg5);
+
+#else
+
+/* Task isolation is compiled out: there is no state to release. */
+static inline void tsk_isol_exit(struct task_struct *tsk)
+{
+}
+
+/* Stubs: the PR_ISOL_* commands are not supported in this configuration. */
+static inline int prctl_task_isolation_get(unsigned long arg2,
+					   unsigned long arg3,
+					   unsigned long arg4,
+					   unsigned long arg5)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int prctl_task_isolation_set(unsigned long arg2,
+					   unsigned long arg3,
+					   unsigned long arg4,
+					   unsigned long arg5)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int prctl_task_isolation_enter(unsigned long arg2,
+					     unsigned long arg3,
+					     unsigned long arg4,
+					     unsigned long arg5)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int prctl_task_isolation_exit(unsigned long arg2,
+					    unsigned long arg3,
+					    unsigned long arg4,
+					    unsigned long arg5)
+{
+	return -EOPNOTSUPP;
+}
+
+#endif /* CONFIG_CPU_ISOLATION */
+
+#endif /* __LINUX_TASK_ISOL_H */
Index: linux-2.6-vmstat-update/kernel/task_isolation.c
===================================================================
--- /dev/null
+++ linux-2.6-vmstat-update/kernel/task_isolation.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Implementation of task isolation.
+ *
+ * Authors:
+ * Chris Metcalf <cmetcalf@xxxxxxxxxxxx>
+ * Alex Belits <abelits@xxxxxxxxxxx>
+ * Yuri Norov <ynorov@xxxxxxxxxxx>
+ * Marcelo Tosatti <mtosatti@xxxxxxxxxx>
+ */
+
+#include <linux/sched.h>
+#include <linux/task_isolation.h>
+#include <linux/prctl.h>
+#include <linux/slab.h>
+
+/*
+ * Allocate a zeroed isol_info and attach it to @task.
+ *
+ * The pointer is overwritten unconditionally, so the caller must make sure
+ * @task has no isol_info yet.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails.
+ */
+static int tsk_isol_alloc_context(struct task_struct *task)
+{
+	struct isol_info *info;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (unlikely(!info))
+		return -ENOMEM;
+
+	task->isol_info = info;
+	return 0;
+}
+
+/* Free @tsk's isol_info and clear the pointer. */
+void __tsk_isol_exit(struct task_struct *tsk)
+{
+	kfree(tsk->isol_info);
+	tsk->isol_info = NULL;
+}
+
+/*
+ * PR_ISOL_GET: report the isolation mode of the calling task.
+ *
+ * @arg2 must be PR_ISOL_MODE; @arg3, @arg4 and @arg5 are not examined.
+ *
+ * Returns the stored PR_ISOL_MODE_* value, PR_ISOL_MODE_NONE if the task
+ * never set a mode, or -EOPNOTSUPP for any other @arg2.
+ */
+int prctl_task_isolation_get(unsigned long arg2, unsigned long arg3,
+			     unsigned long arg4, unsigned long arg5)
+{
+	if (arg2 != PR_ISOL_MODE)
+		return -EOPNOTSUPP;
+
+	if (current->isol_info != NULL)
+		return current->isol_info->mode;
+
+	return PR_ISOL_MODE_NONE;
+}
+
+/*
+ * PR_ISOL_SET: set the isolation mode of the calling task.
+ *
+ * @arg2 must be PR_ISOL_MODE and @arg3 must be PR_ISOL_MODE_NORMAL;
+ * @arg4 and @arg5 are not examined.
+ *
+ * Returns 0 on success, -EOPNOTSUPP for an unknown @arg2, -EINVAL for an
+ * unsupported mode, or -ENOMEM if the isolation context cannot be allocated.
+ */
+int prctl_task_isolation_set(unsigned long arg2, unsigned long arg3,
+			     unsigned long arg4, unsigned long arg5)
+{
+	int ret;
+
+	if (arg2 != PR_ISOL_MODE)
+		return -EOPNOTSUPP;
+
+	if (arg3 != PR_ISOL_MODE_NORMAL)
+		return -EINVAL;
+
+	/*
+	 * Allocate the context on first use only.  Allocating on every call
+	 * would overwrite current->isol_info and leak the previous one.
+	 */
+	if (!current->isol_info) {
+		ret = tsk_isol_alloc_context(current);
+		if (ret)
+			return ret;
+	}
+
+	current->isol_info->mode = arg3;
+	return 0;
+}
+
+/*
+ * PR_ISOL_ENTER: mark the calling task as isolated from this point on.
+ *
+ * No argument is examined.  Returns 0 on success, or -EINVAL if the task
+ * has not selected PR_ISOL_MODE_NORMAL with PR_ISOL_SET first.
+ */
+int prctl_task_isolation_enter(unsigned long arg2, unsigned long arg3,
+			       unsigned long arg4, unsigned long arg5)
+{
+	if (current->isol_info == NULL)
+		return -EINVAL;
+
+	if (current->isol_info->mode != PR_ISOL_MODE_NORMAL)
+		return -EINVAL;
+
+	current->isol_info->active = 1;
+
+	return 0;
+}
+
+/*
+ * PR_ISOL_EXIT: mark the calling task as no longer isolated.
+ *
+ * No argument is examined.  Returns 0 on success, or -EINVAL if the task
+ * has not selected PR_ISOL_MODE_NORMAL with PR_ISOL_SET first.
+ */
+int prctl_task_isolation_exit(unsigned long arg2, unsigned long arg3,
+			      unsigned long arg4, unsigned long arg5)
+{
+	if (current->isol_info == NULL)
+		return -EINVAL;
+
+	if (current->isol_info->mode != PR_ISOL_MODE_NORMAL)
+		return -EINVAL;
+
+	current->isol_info->active = 0;
+
+	return 0;
+}
Index: linux-2.6-vmstat-update/samples/Kconfig
===================================================================
--- linux-2.6-vmstat-update.orig/samples/Kconfig
+++ linux-2.6-vmstat-update/samples/Kconfig
@@ -223,4 +223,11 @@ config SAMPLE_WATCH_QUEUE
Build example userspace program to use the new mount_notify(),
sb_notify() syscalls and the KEYCTL_WATCH_KEY keyctl() function.

+config SAMPLE_TASK_ISOLATION
+ bool "task isolation sample"
+ depends on CC_CAN_LINK && HEADERS_INSTALL
+ help
+ Build example userspace program to use prctl task isolation
+ interface.
+
endif # SAMPLES
Index: linux-2.6-vmstat-update/samples/Makefile
===================================================================
--- linux-2.6-vmstat-update.orig/samples/Makefile
+++ linux-2.6-vmstat-update/samples/Makefile
@@ -30,3 +30,4 @@ obj-$(CONFIG_SAMPLE_INTEL_MEI) += mei/
subdir-$(CONFIG_SAMPLE_WATCHDOG) += watchdog
subdir-$(CONFIG_SAMPLE_WATCH_QUEUE) += watch_queue
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak/
+subdir-$(CONFIG_SAMPLE_TASK_ISOLATION) += task_isolation
Index: linux-2.6-vmstat-update/samples/task_isolation/Makefile
===================================================================
--- /dev/null
+++ linux-2.6-vmstat-update/samples/task_isolation/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+userprogs-always-y += task_isolation
+
+userccflags += -I usr/include