[PATCH 02/24] kernel: add a netlink interface to get information about tasks (v2)

From: Andrey Vagin
Date: Mon Jul 06 2015 - 04:50:45 EST


task_diag is based on netlink sockets and looks like socket-diag, which
is used to get information about sockets.

task_diag is a new interface which is going to replace the proc file
system in cases when we need to get information in a binary format.

A request message is described by the task_diag_pid structure:
struct task_diag_pid {
__u64 show_flags;
__u64 dump_strategy;

__u32 pid;
};

A response is a set of netlink messages. Each message describes one task.
All task properties are divided into groups. A message contains the
TASK_DIAG_PID group, and other groups if they have been requested in
show_flags. For example, if show_flags contains TASK_DIAG_SHOW_BASE, a
response will contain the TASK_DIAG_BASE group, which is described by the
task_diag_base structure.

struct task_diag_base {
__u32 tgid;
__u32 pid;
__u32 ppid;
__u32 tpid;
__u32 sid;
__u32 pgid;
__u8 state;
char comm[TASK_DIAG_COMM_LEN];
};

The dump_strategy field will be used in following patches to request
information for a group of processes.

v2: A few changes from David Ahern
Use a consistent name
Add max attr enum
task diag: Send pid as u32
Change _MSG/msg references to base
Fix 8-byte alignment

Cc: David Ahern <dsahern@xxxxxxxxx>
Signed-off-by: Andrey Vagin <avagin@xxxxxxxxxx>
---
include/linux/taskstats_kern.h | 7 ++
include/uapi/linux/task_diag.h | 60 +++++++++++++++
include/uapi/linux/taskstats.h | 2 +
init/Kconfig | 12 +++
kernel/Makefile | 1 +
kernel/taskdiag.c | 168 +++++++++++++++++++++++++++++++++++++++++
kernel/taskstats.c | 25 +++++-
7 files changed, 271 insertions(+), 4 deletions(-)
create mode 100644 include/uapi/linux/task_diag.h
create mode 100644 kernel/taskdiag.c

diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h
index 58de6ed..a1fd4f8 100644
--- a/include/linux/taskstats_kern.h
+++ b/include/linux/taskstats_kern.h
@@ -15,6 +15,8 @@
extern struct kmem_cache *taskstats_cache;
extern struct mutex taskstats_exit_mutex;

+extern struct genl_family taskstats_family;
+
static inline void taskstats_tgid_free(struct signal_struct *sig)
{
if (sig->stats)
@@ -23,6 +25,11 @@ static inline void taskstats_tgid_free(struct signal_struct *sig)

extern void taskstats_exit(struct task_struct *, int group_dead);
extern void taskstats_init_early(void);
+
+struct genl_info;
+struct sk_buff;
+int taskdiag_doit(struct sk_buff *skb, struct genl_info *info);
+
#else
static inline void taskstats_exit(struct task_struct *tsk, int group_dead)
{}
diff --git a/include/uapi/linux/task_diag.h b/include/uapi/linux/task_diag.h
new file mode 100644
index 0000000..3a1e6c4
--- /dev/null
+++ b/include/uapi/linux/task_diag.h
@@ -0,0 +1,60 @@
+#ifndef _LINUX_TASK_DIAG_H
+#define _LINUX_TASK_DIAG_H
+
+#include <linux/types.h>
+#include <linux/capability.h>
+
+/*
+ * Netlink attribute types carried in task_diag responses.
+ *
+ * The number of an optional attribute doubles as its bit position in the
+ * 64-bit show_flags request field (see TASK_DIAG_SHOW_BASE).
+ * NOTE(review): the always-present attributes presumably start at 64 so
+ * that they can never collide with a show_flags bit - confirm.
+ */
+enum {
+ /* optional attributes which can be specified in show_flags */
+ TASK_DIAG_BASE = 0, /* struct task_diag_base */
+
+ /* other attributes */
+ TASK_DIAG_PID = 64, /* u32 */
+
+ __TASK_DIAG_ATTR_MAX
+#define TASK_DIAG_ATTR_MAX (__TASK_DIAG_ATTR_MAX - 1)
+};
+
+#define TASK_DIAG_SHOW_BASE (1ULL << TASK_DIAG_BASE)
+
+/*
+ * Task state codes reported in the state field of struct task_diag_base.
+ * The order matters: kernel/taskdiag.c lists these values in the same
+ * order in its state translation table.
+ */
+enum {
+ TASK_DIAG_RUNNING,
+ TASK_DIAG_INTERRUPTIBLE,
+ TASK_DIAG_UNINTERRUPTIBLE,
+ TASK_DIAG_STOPPED,
+ TASK_DIAG_TRACE_STOP,
+ TASK_DIAG_DEAD,
+ TASK_DIAG_ZOMBIE,
+};
+
+#define TASK_DIAG_COMM_LEN 16
+
+/*
+ * Payload of the TASK_DIAG_BASE attribute: basic identity of one task.
+ * All ids are expressed in the pid namespace of the requesting task.
+ */
+struct task_diag_base {
+ __u32 tgid; /* thread group id */
+ __u32 pid; /* id of the task itself */
+ __u32 ppid; /* thread group id of the real parent, 0 if the task is no longer alive */
+ __u32 tpid; /* id of the ptrace tracer, 0 if the task is not traced */
+ __u32 sid; /* session id */
+ __u32 pgid; /* process group id */
+ __u8 state; /* one of TASK_DIAG_RUNNING ... TASK_DIAG_ZOMBIE */
+ char comm[TASK_DIAG_COMM_LEN]; /* command name, zero padded */
+};
+
+#define TASK_DIAG_DUMP_ALL 0
+
+/*
+ * Request payload, sent in the TASK_DIAG_CMD_ATTR_GET attribute of a
+ * TASK_DIAG_CMD_GET message.
+ */
+struct task_diag_pid {
+ __u64 show_flags; /* mask of TASK_DIAG_SHOW_* bits selecting optional groups */
+ __u64 dump_strategy; /* not read by the single-task handler in kernel/taskdiag.c */
+
+ __u32 pid; /* id of the task to describe */
+};
+
+/* Attribute types accepted in TASK_DIAG_CMD_GET requests. */
+enum {
+ TASK_DIAG_CMD_ATTR_UNSPEC = 0,
+ TASK_DIAG_CMD_ATTR_GET, /* struct task_diag_pid */
+ __TASK_DIAG_CMD_ATTR_MAX,
+};
+
+#define TASK_DIAG_CMD_ATTR_MAX (__TASK_DIAG_CMD_ATTR_MAX - 1)
+
+#endif /* _LINUX_TASK_DIAG_H */
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index a1cc91b..04b974a 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -181,6 +181,8 @@ enum {
CGROUPSTATS_CMD_GET, /* user->kernel request/get-response */
CGROUPSTATS_CMD_NEW, /* kernel->user event */

+ TASK_DIAG_CMD_GET,
+
__TASKSTATS_CMD_MAX,
};

diff --git a/init/Kconfig b/init/Kconfig
index 7d1ffd2..4d0483c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -432,6 +432,18 @@ config TASKSTATS

Say N if unsure.

+config TASK_DIAG
+ bool "Export task/process properties through netlink"
+ depends on NET && TASKSTATS
+ default n
+ help
+ Export selected properties for tasks/processes through the
+ generic netlink interface. Unlike the proc file system, task_diag
+ returns information in a binary format and allows the caller to
+ specify which information is required.
+
+ Say N if unsure.
+
config TASK_DELAY_ACCT
bool "Enable per-task delay accounting"
depends on TASKSTATS
diff --git a/kernel/Makefile b/kernel/Makefile
index 60c302c..ed6fed5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -98,6 +98,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
+obj-$(CONFIG_TASK_DIAG) += taskdiag.o

$(obj)/configs.o: $(obj)/config_data.h

diff --git a/kernel/taskdiag.c b/kernel/taskdiag.c
new file mode 100644
index 0000000..7327e08
--- /dev/null
+++ b/kernel/taskdiag.c
@@ -0,0 +1,168 @@
+#include <linux/kernel.h>
+#include <linux/taskstats_kern.h>
+#include <linux/task_diag.h>
+#include <net/genetlink.h>
+#include <linux/pid_namespace.h>
+#include <linux/ptrace.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+
+/*
+ * Compute the attribute space needed for a reply describing one task.
+ *
+ * The TASK_DIAG_PID attribute is always counted; room for the
+ * TASK_DIAG_BASE attribute is added only when TASK_DIAG_SHOW_BASE is set
+ * in @show_flags.
+ */
+static size_t taskdiag_packet_size(u64 show_flags)
+{
+	/* mandatory TASK_DIAG_PID attribute */
+	size_t len = nla_total_size(sizeof(u32));
+
+	if (!(show_flags & TASK_DIAG_SHOW_BASE))
+		return len;
+
+	/* optional TASK_DIAG_BASE attribute */
+	return len + nla_total_size(sizeof(struct task_diag_base));
+}
+
+/*
+ * Translation table from the kernel task state to the TASK_DIAG_* state
+ * codes exported to userspace.
+ *
+ * The kernel task state is a strange "bitmap" of reasons to sleep. Thus
+ * "running" is zero, and you can test for combinations of others with
+ * simple bit tests. get_task_state() indexes this table with fls() of the
+ * masked state, so entry 0 stands for "running" and every following entry
+ * stands for the next higher state bit.
+ */
+static const __u8 task_state_array[] = {
+ TASK_DIAG_RUNNING,
+ TASK_DIAG_INTERRUPTIBLE,
+ TASK_DIAG_UNINTERRUPTIBLE,
+ TASK_DIAG_STOPPED,
+ TASK_DIAG_TRACE_STOP,
+ TASK_DIAG_DEAD,
+ TASK_DIAG_ZOMBIE,
+};
+
+/*
+ * Translate the current state of @tsk into a TASK_DIAG_* state code.
+ *
+ * tsk->state and tsk->exit_state are combined and masked with TASK_REPORT;
+ * fls() of the result selects the entry of task_state_array to return.
+ */
+static inline const __u8 get_task_state(struct task_struct *tsk)
+{
+	unsigned int reported = (tsk->state | tsk->exit_state) & TASK_REPORT;
+	int slot;
+
+	/* compile-time check that the table size matches TASK_REPORT */
+	BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
+
+	slot = fls(reported);
+
+	return task_state_array[slot];
+}
+
+/*
+ * Append a TASK_DIAG_BASE attribute describing @p to @skb.
+ *
+ * Every id is translated into the active pid namespace of the requesting
+ * task (current). ppid is reported as 0 when @p is no longer alive and
+ * tpid is reported as 0 when @p has no ptrace tracer.
+ *
+ * Returns 0 on success or -EMSGSIZE when @skb has no room left for the
+ * attribute.
+ */
+static int fill_task_base(struct task_struct *p, struct sk_buff *skb)
+{
+	struct pid_namespace *ns = task_active_pid_ns(current);
+	struct task_diag_base *base;
+	struct nlattr *attr;
+	char tcomm[sizeof(p->comm)];
+	struct task_struct *tracer;
+
+	attr = nla_reserve(skb, TASK_DIAG_BASE, sizeof(struct task_diag_base));
+	if (!attr)
+		return -EMSGSIZE;
+
+	base = nla_data(attr);
+
+	/*
+	 * The reserved payload is not initialized, and struct task_diag_base
+	 * ends with compiler-inserted padding that none of the assignments
+	 * below touch. Clear the whole structure so that no stale kernel
+	 * memory is copied out to userspace.
+	 */
+	memset(base, 0, sizeof(*base));
+
+	/* real_parent and the ptrace parent may only be followed under RCU */
+	rcu_read_lock();
+	base->ppid = pid_alive(p) ?
+		task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
+
+	base->tpid = 0;
+	tracer = ptrace_parent(p);
+	if (tracer)
+		base->tpid = task_pid_nr_ns(tracer, ns);
+
+	base->tgid = task_tgid_nr_ns(p, ns);
+	base->pid = task_pid_nr_ns(p, ns);
+	base->sid = task_session_nr_ns(p, ns);
+	base->pgid = task_pgrp_nr_ns(p, ns);
+
+	rcu_read_unlock();
+
+	/* comm was zeroed above, so a shorter name stays zero padded */
+	get_task_comm(tcomm, p);
+	strncpy(base->comm, tcomm, TASK_DIAG_COMM_LEN);
+
+	base->state = get_task_state(p);
+
+	return 0;
+}
+
+/*
+ * Build one TASK_DIAG_CMD_GET reply message for @tsk in @skb.
+ *
+ * The message always carries the TASK_DIAG_PID attribute; the
+ * TASK_DIAG_BASE attribute is added when TASK_DIAG_SHOW_BASE is set in
+ * @show_flags. @portid and @seq are placed in the netlink header.
+ *
+ * Returns 0 on success. On failure the partially built message is
+ * cancelled and a negative error code is returned.
+ */
+static int task_diag_fill(struct task_struct *tsk, struct sk_buff *skb,
+ u64 show_flags, u32 portid, u32 seq)
+{
+	void *hdr;
+	int rc;
+
+	hdr = genlmsg_put(skb, portid, seq, &taskstats_family, 0, TASK_DIAG_CMD_GET);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	rc = nla_put_u32(skb, TASK_DIAG_PID, task_pid_vnr(tsk));
+	if (!rc && (show_flags & TASK_DIAG_SHOW_BASE))
+		rc = fill_task_base(tsk, skb);
+
+	if (rc) {
+		/* drop whatever was written for this message */
+		genlmsg_cancel(skb, hdr);
+		return rc;
+	}
+
+	genlmsg_end(skb, hdr);
+	return 0;
+}
+
+/*
+ * Handler for TASK_DIAG_CMD_GET: report information about a single task.
+ *
+ * The request carries a struct task_diag_pid in the
+ * TASK_DIAG_CMD_ATTR_GET attribute. The task named by its pid field is
+ * looked up with find_task_by_vpid() and, if the caller passes the
+ * PTRACE_MODE_READ access check, a reply containing the groups selected
+ * by show_flags is sent back to the requester.
+ *
+ * @skb is not used here.
+ *
+ * Returns the result of genlmsg_reply() on success, -EINVAL for a missing
+ * or too short request attribute, -ENOMEM if the reply cannot be
+ * allocated, -ESRCH if no such task exists, -EPERM if access is denied,
+ * or the error reported by task_diag_fill().
+ */
+int taskdiag_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *nla = info->attrs[TASK_DIAG_CMD_ATTR_GET];
+	struct task_struct *tsk = NULL;
+	struct task_diag_pid req;
+	struct sk_buff *msg;
+	size_t size;
+	int rc;
+
+	/*
+	 * A request without the attribute leaves this slot NULL. Test the
+	 * pointer itself: nla_data() only adds the header length to it, so
+	 * its result is never NULL and nla_len() below would dereference a
+	 * NULL attribute.
+	 */
+	if (!nla)
+		return -EINVAL;
+
+	if (nla_len(nla) < sizeof(req))
+		return -EINVAL;
+
+	/*
+	 * use a req variable to deal with alignment issues. task_diag_pid
+	 * contains u64 elements which means extended load operations can be
+	 * used and those can require 8-byte alignment (e.g., sparc)
+	 */
+	memcpy(&req, nla_data(nla), sizeof(req));
+
+	size = taskdiag_packet_size(req.show_flags);
+	msg = genlmsg_new(size, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	/* take a reference so the task stays valid after leaving RCU */
+	rcu_read_lock();
+	tsk = find_task_by_vpid(req.pid);
+	if (tsk)
+		get_task_struct(tsk);
+	rcu_read_unlock();
+	if (!tsk) {
+		rc = -ESRCH;
+		goto err;
+	}
+
+	if (!ptrace_may_access(tsk, PTRACE_MODE_READ)) {
+		put_task_struct(tsk);
+		rc = -EPERM;
+		goto err;
+	}
+
+	rc = task_diag_fill(tsk, msg, req.show_flags,
+			    info->snd_portid, info->snd_seq);
+	put_task_struct(tsk);
+	if (rc < 0)
+		goto err;
+
+	return genlmsg_reply(msg, info);
+err:
+	nlmsg_free(msg);
+	return rc;
+}
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 21f82c2..d70f1e5 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -18,6 +18,7 @@

#include <linux/kernel.h>
#include <linux/taskstats_kern.h>
+#include <linux/task_diag.h>
#include <linux/tsacct_kern.h>
#include <linux/delayacct.h>
#include <linux/cpumask.h>
@@ -41,7 +42,7 @@ static DEFINE_PER_CPU(__u32, taskstats_seqnum);
static int family_registered;
struct kmem_cache *taskstats_cache;

-static struct genl_family family = {
+struct genl_family taskstats_family = {
.id = GENL_ID_GENERATE,
.name = TASKSTATS_GENL_NAME,
.version = TASKSTATS_GENL_VERSION,
@@ -92,9 +93,9 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
if (!info) {
int seq = this_cpu_inc_return(taskstats_seqnum) - 1;

- reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
+ reply = genlmsg_put(skb, 0, seq, &taskstats_family, 0, cmd);
} else
- reply = genlmsg_put_reply(skb, info, &family, 0, cmd);
+ reply = genlmsg_put_reply(skb, info, &taskstats_family, 0, cmd);
if (reply == NULL) {
nlmsg_free(skb);
return -EINVAL;
@@ -664,6 +665,15 @@ err:
nlmsg_free(rep_skb);
}

+#ifdef CONFIG_TASK_DIAG
+static const struct nla_policy
+ taskdiag_cmd_get_policy[TASK_DIAG_CMD_ATTR_MAX+1] = {
+ [TASK_DIAG_CMD_ATTR_GET] = { .type = NLA_UNSPEC,
+ .len = sizeof(struct task_diag_pid)
+ },
+};
+#endif
+
static const struct genl_ops taskstats_ops[] = {
{
.cmd = TASKSTATS_CMD_GET,
@@ -676,6 +686,13 @@ static const struct genl_ops taskstats_ops[] = {
.doit = cgroupstats_user_cmd,
.policy = cgroupstats_cmd_get_policy,
},
+#ifdef CONFIG_TASK_DIAG
+ {
+ .cmd = TASK_DIAG_CMD_GET,
+ .doit = taskdiag_doit,
+ .policy = taskdiag_cmd_get_policy,
+ },
+#endif
};

/* Needed early in initialization */
@@ -694,7 +711,7 @@ static int __init taskstats_init(void)
{
int rc;

- rc = genl_register_family_with_ops(&family, taskstats_ops);
+ rc = genl_register_family_with_ops(&taskstats_family, taskstats_ops);
if (rc)
return rc;

--
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/