[PATCH 1/2] pidmap(2)

From: Alexey Dobriyan
Date: Tue Sep 05 2017 - 15:05:14 EST


From: Tatsiana Brouka <Tatsiana_Brouka@xxxxxxxx>

Implement a system call for bulk retrieval of PIDs in binary form.

Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
converting with atoi().

/proc may not be mounted, especially in containers. A natural extension of
the hidepid=2 efforts is to not mount /proc at all.

It could be used by programs like ps, top or CRIU. Speed increase will
become more drastic once combined with bulk retrieval of process statistics.

Sample program:

#include <stdio.h>
/*
 * Issue the pidmap system call (number 333 on x86-64) directly.
 *
 * rax carries the syscall number and receives the result; pid, n and start
 * travel in rdi, rsi and rdx; r10 carries the fourth argument (flags),
 * which is always 0 here. Returns the raw value the kernel leaves in rax.
 */
static inline long sys_pidmap(int *pid, unsigned int n, int start)
{
	register long flags asm ("r10") = 0;
	long ret;

	asm volatile ("syscall"
		      : "=a" (ret)
		      : "0" (333), "D" (pid), "S" (n), "d" (start), "r" (flags)
		      : "rcx", "r11", "cc", "memory");
	return ret;
}

/*
 * Print the PIDs returned by sys_pidmap(), at most five per output line,
 * calling it repeatedly until it returns 0 or a negative value.
 */
int main(void)
{
	int pid[5];
	unsigned int start = 0;
	int n;

	while ((n = sys_pidmap(pid, sizeof(pid) / sizeof(pid[0]), start)) > 0) {
		int i;

		for (i = 0; i < n; i++) {
			/* pid[] holds int values, so print them with %d. */
			printf(" %d", pid[i]);
		}
		printf("\n");
		/* Resume the scan just past the last PID returned. */
		start = pid[n - 1] + 1;
	}

	return 0;
}

Signed-off-by: Tatsiana Brouka <Tatsiana_Brouka@xxxxxxxx>
Signed-off-by: Alexey Dobriyan <adobriyan@xxxxxxxxx>
---

arch/x86/entry/syscalls/syscall_64.tbl | 1
include/linux/syscalls.h | 4
kernel/Makefile | 2
kernel/pidmap.c | 116 ++++++++++++++
tools/testing/selftests/Makefile | 1
tools/testing/selftests/pidmap/Makefile | 5
tools/testing/selftests/pidmap/pidmap.c | 263 ++++++++++++++++++++++++++++++++
7 files changed, 392 insertions(+)

--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
330 common pkey_alloc sys_pkey_alloc
331 common pkey_free sys_pkey_free
332 common statx sys_statx
+333 common pidmap sys_pidmap

#
# x32-specific system call numbers start at 512 to avoid cache impact
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -923,4 +923,8 @@ asmlinkage long sys_pkey_free(int pkey);
asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
unsigned mask, struct statx __user *buffer);

+asmlinkage long sys_pidmap(int __user *pids,
+ unsigned int pids_count,
+ unsigned int start_pid,
+ int flags);
#endif
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,8 @@ obj-y = fork.o exec_domain.o panic.o \
notifier.o ksysfs.o cred.o reboot.o \
async.o range.o smpboot.o ucount.o

+obj-y += pidmap.o
+
obj-$(CONFIG_MULTIUSER) += groups.o

ifdef CONFIG_FUNCTION_TRACER
--- /dev/null
+++ b/kernel/pidmap.c
@@ -0,0 +1,116 @@
+#include <linux/bitops.h>
+#include <linux/cred.h>
+#include <linux/kernel.h>
+#include <linux/pid.h>
+#include <linux/ptrace.h>
+#include <linux/rcupdate.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+
+/**
+ * pidmap - get allocated PIDs
+ * @pids: Destination buffer in user space.
+ * @pids_count: Number of elements the buffer can hold.
+ * @start_pid: PID to start from (inclusive); 0 is treated as 1.
+ * @flags: Flags, must be 0.
+ *
+ * Write allocated PIDs, in increasing order, to @pids starting from
+ * @start_pid. PIDs are reported as seen from the PID namespace of the
+ * calling process: unshare(CLONE_NEWPID)+fork+pidmap in child will always
+ * return 1/1.
+ *
+ * pidmap(2) hides PIDs that are inaccessible through a /proc mounted with
+ * the "hidepid" option.
+ *
+ * Note, pidmap(2) does not guarantee that any returned PID still exists
+ * by the time the system call exits.
+ *
+ * Return: number of PIDs written to the buffer (possibly 0), -EINVAL if
+ * @flags is non-zero, or -EFAULT if writing to @pids fails.
+ */
+SYSCALL_DEFINE4(pidmap, int __user *, pids, unsigned int, pids_count,
+ unsigned int, start_pid, int, flags)
+{
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ unsigned int start_page, start_elem;
+ unsigned int last_pos = 0; /* number of PIDs stored in @pids so far */
+ unsigned int last_set_pid = 0; /* most recent PID taken from the bitmap */
+ unsigned long mask;
+ bool has_perms = false;
+ unsigned int i;
+
+ if (flags)
+ return -EINVAL;
+
+ /*
+ * Pid 0 does not exist, however, corresponding bit is always set in
+ * ->pidmap[0].page, so we should skip it.
+ *
+ * NOTE(review): the early return further down compares @start_pid
+ * against ns->last_pid, which is presumably the most recently allocated
+ * PID rather than the highest one in use; after PID wrap-around, live
+ * PIDs above it would never be reported. Confirm this is intended.
+ */
+ if (start_pid == 0)
+ start_pid = 1;
+
+ if (start_pid > ns->last_pid)
+ return 0;
+
+ if (ns->hide_pid < HIDEPID_INVISIBLE || in_group_p(ns->pid_gid))
+ has_perms = true; /* report every PID without the per-task access check */
+
+ start_page = start_pid / BITS_PER_PAGE; /* bitmap page holding start_pid */
+ start_elem = (start_pid % BITS_PER_PAGE) / BITS_PER_LONG; /* word within that page */
+ mask = ~0UL << (start_pid % BITS_PER_LONG); /* clears bits below start_pid in that word */
+
+ for (i = start_page; i < PIDMAP_ENTRIES; i++) {
+ unsigned int j;
+
+ /*
+ * ->pidmap[].page is set once to a valid pointer,
+ * therefore do not take any locks.
+ *
+ * NOTE(review): the "continue" below skips the "start_elem = 0"
+ * reset at the bottom of this loop and leaves "mask" unchanged,
+ * so the start offset and mask would carry over to the next
+ * allocated page. Confirm whether a NULL page can precede an
+ * allocated one.
+ */
+ if (ns->pidmap[i].page == NULL)
+ continue;
+
+ for (j = start_elem; j < PAGE_SIZE/sizeof(unsigned long); j++) {
+ unsigned long val;
+
+ val = *((unsigned long *)ns->pidmap[i].page + j);
+ val &= mask; /* only trims the first word examined */
+ mask = ~0UL; /* every later word is scanned in full */
+ while (val != 0) {
+ struct task_struct *task;
+
+ if (last_pos == pids_count) /* user buffer is full */
+ return last_pos;
+
+ last_set_pid = i * BITS_PER_PAGE +
+ j * BITS_PER_LONG + __ffs(val); /* PID for the lowest set bit of val */
+
+ if (has_perms)
+ goto write;
+
+ rcu_read_lock();
+ task = find_task_by_pid_ns(last_set_pid, ns);
+ if (!task) { /* no task found for this PID: skip it */
+ rcu_read_unlock();
+ goto next;
+ }
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { /* caller may not inspect it: skip */
+ rcu_read_unlock();
+ goto next;
+ }
+ rcu_read_unlock();
+write:
+ if (put_user(last_set_pid, pids + last_pos))
+ return -EFAULT;
+ last_pos++;
+ if (last_set_pid == ns->last_pid) /* stop once ns->last_pid has been reported */
+ return last_pos;
+next:
+ val &= (val - 1); /* clear the lowest set bit */
+ }
+
+ }
+ start_elem = 0; /* following pages are scanned from their first word */
+ }
+ if (last_set_pid == 0)
+ return 0;
+ else
+ return last_pos;
+}
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -20,6 +20,7 @@ TARGETS += mount
TARGETS += mqueue
TARGETS += net
TARGETS += nsfs
+TARGETS += pidmap
TARGETS += powerpc
TARGETS += pstore
TARGETS += ptrace
--- /dev/null
+++ b/tools/testing/selftests/pidmap/Makefile
@@ -0,0 +1,5 @@
+CFLAGS = -Wall
+
+TEST_GEN_PROGS := pidmap
+
+include ../lib.mk
--- /dev/null
+++ b/tools/testing/selftests/pidmap/pidmap.c
@@ -0,0 +1,263 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <sched.h>
+#include <dirent.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <signal.h>
+#include "../kselftest_harness.h"
+
+#define SIZE 512
+
+/*
+ * Raw pidmap invocation (x86-64 syscall number 333).
+ *
+ * pid, count and start_pid are passed in rdi, rsi and rdx; r10 carries the
+ * fourth argument (flags), always 0 here. Returns the value left in rax.
+ */
+static inline long pidmap(int *pid, unsigned int count, unsigned int start_pid)
+{
+	register long flags asm("r10") = 0;
+	long rv;
+
+	asm volatile ("syscall"
+		      : "=a"(rv)
+		      : "0"(333), "D"(pid), "S"(count), "d"(start_pid), "r"(flags)
+		      : "rcx", "r11", "cc", "memory");
+	return rv;
+}
+
+/*
+ * qsort() comparator for int: returns a negative, zero or positive value
+ * when *a is less than, equal to or greater than *b.
+ */
+static int compare(const void *a, const void *b)
+{
+	int lhs = *(const int *)a;
+	int rhs = *(const int *)b;
+
+	/* Avoids the overflow a plain subtraction could hit. */
+	return (lhs > rhs) - (lhs < rhs);
+}
+
+/*
+ * Collect every PID reported by pidmap() into a heap buffer.
+ *
+ * On return *pid points to the buffer (the caller frees it) and *res_count
+ * holds the number of PIDs stored. Returns the result of the last pidmap()
+ * call: 0 once the list is exhausted, negative if the syscall failed, or -1
+ * if memory could not be allocated (then *pid is NULL and *res_count is 0).
+ */
+int pidmap_full(int **pid, unsigned int *res_count)
+{
+	int *buf;
+	int n;
+	int start_pid = 1;
+
+	*pid = NULL;
+	*res_count = 0;
+
+	buf = malloc(SIZE * sizeof(*buf));
+	if (!buf)
+		return -1;
+
+	/* Always keep room for SIZE more entries past the ones collected. */
+	while ((n = pidmap(buf + *res_count, SIZE, start_pid)) > 0) {
+		int *grown;
+
+		*res_count += n;
+		grown = realloc(buf, (*res_count + SIZE) * sizeof(*buf));
+		if (!grown) {
+			/* realloc() left buf intact; release it ourselves. */
+			free(buf);
+			*res_count = 0;
+			return -1;
+		}
+		buf = grown;
+		/* Resume just past the last PID returned. */
+		start_pid = buf[*res_count - 1] + 1;
+	}
+
+	*pid = buf;
+	return n;
+}
+
+/* Append @value to the array *@pid of length *@n; returns 0, or -1 on OOM. */
+static int append_pid(int **pid, unsigned int *n, int value)
+{
+	int *grown = realloc(*pid, (*n + 1) * sizeof(**pid));
+
+	if (!grown)
+		return -1;
+	grown[*n] = value;
+	*pid = grown;
+	*n += 1;
+	return 0;
+}
+
+/*
+ * Build the list of IDs found under /proc: for every numeric /proc entry,
+ * each numeric entry of its task/ directory is recorded; if task/ cannot be
+ * opened, the entry's own number is recorded instead.
+ *
+ * On success returns 0 with *pid pointing to a heap array of *n values (the
+ * caller frees it). On failure returns -1 with *pid == NULL and *n == 0.
+ */
+int pidmap_proc(int **pid, unsigned int *n)
+{
+	DIR *dir;
+	struct dirent *dirs;
+	int ret = 0;
+
+	*n = 0;
+	*pid = NULL;
+
+	dir = opendir("/proc");
+	if (!dir)
+		return -1;
+
+	while (ret == 0 && (dirs = readdir(dir))) {
+		char dname[32];
+		struct dirent *task_dirs;
+		DIR *task_dir;
+		int len;
+
+		if (dirs->d_name[0] < '0' || dirs->d_name[0] > '9')
+			continue;
+
+		/* Bounded formatting: skip names that would not fit in dname. */
+		len = snprintf(dname, sizeof(dname), "/proc/%s/task",
+			       dirs->d_name);
+		if (len < 0 || (size_t)len >= sizeof(dname))
+			continue;
+
+		task_dir = opendir(dname);
+		if (!task_dir) {
+			/* No task/ listing available: record the entry itself. */
+			ret = append_pid(pid, n, atoi(dirs->d_name));
+			continue;
+		}
+
+		while (ret == 0 && (task_dirs = readdir(task_dir))) {
+			if (task_dirs->d_name[0] < '0' ||
+			    task_dirs->d_name[0] > '9')
+				continue;
+
+			ret = append_pid(pid, n, atoi(task_dirs->d_name));
+		}
+		closedir(task_dir);
+	}
+	closedir(dir);
+
+	if (ret != 0) {
+		/* Do not hand a partial list back to the caller. */
+		free(*pid);
+		*pid = NULL;
+		*n = 0;
+	}
+	return ret;
+}
+
+TEST(bufsize)
+{
+ int pid[SIZE];
+
+ EXPECT_EQ(0, pidmap(pid, 0, 1)); /* buffer capacity 0: expect 0 PIDs to be reported */
+}
+
+/* Asking for one PID starting at our own PID is expected to return it. */
+TEST(get_pid)
+{
+	int pid = 0;
+	int ret;
+
+	ret = pidmap(&pid, 1, getpid());
+	/* Require that one entry was written before inspecting pid. */
+	ASSERT_EQ(1, ret);
+	EXPECT_EQ(getpid(), pid);
+}
+
+/*
+ * Out-of-range start values must not produce an error, and a start value
+ * of 0 is expected to begin the listing at PID 1.
+ */
+TEST(bad_start)
+{
+	int pid[SIZE];
+	int ret;
+
+	ASSERT_LE(0, pidmap(pid, SIZE, -1));
+	ASSERT_LE(0, pidmap(pid, SIZE, ~0U));
+
+	ret = pidmap(pid, SIZE, 0);
+	/* pid[0] is only meaningful if at least one entry was written. */
+	ASSERT_LT(0, ret);
+	EXPECT_EQ(1, pid[0]);
+}
+
+/* A freshly forked, still-running child is expected to show up in pidmap(). */
+TEST(child_pid)
+{
+	pid_t pid = fork();
+	int result = 0;
+	int ret;
+
+	/* With pid == -1, kill() below would signal every reachable process. */
+	ASSERT_NE(-1, pid);
+
+	if (pid == 0) {
+		/* Child: stay alive until the parent signals us. */
+		pause();
+		_exit(0);
+	}
+
+	ret = pidmap(&result, 1, pid);
+	EXPECT_LE(0, ret);
+	EXPECT_EQ(pid, result);
+
+	kill(pid, SIGTERM);
+	/* Reap the child so it does not linger as a zombie. */
+	waitpid(pid, NULL, 0);
+}
+
+/*
+ * Write @new_pidmax to /proc/sys/kernel/pid_max.
+ *
+ * Returns the previous contents of the file as parsed by atoi() (0 if they
+ * could not be read), or -1 if the file could not be opened.
+ */
+int write_pidmax(int new_pidmax)
+{
+	char old_pidmax[32] = "";
+	char new[32];
+	ssize_t len;
+	int fd = open("/proc/sys/kernel/pid_max", O_RDWR);
+
+	if (fd < 0) {
+		printf("Open failed\n");
+		return -1;
+	}
+
+	/* read() does not NUL-terminate; keep one byte for the terminator. */
+	len = read(fd, old_pidmax, sizeof(old_pidmax) - 1);
+	if (len <= 0) {
+		printf("Read failed\n");
+		len = 0;
+	}
+	old_pidmax[len] = '\0';
+
+	/* Rewind so the new value is written from the start of the file. */
+	lseek(fd, 0, SEEK_SET);
+	snprintf(new, sizeof(new), "%d", new_pidmax);
+	if (write(fd, new, strlen(new)) <= 0)
+		printf("Write failed\n");
+	close(fd);
+	return atoi(old_pidmax);
+}
+
+/*
+ * Fork and reap up to @n children, one at a time; each child exits
+ * immediately. Stops early if fork() fails.
+ */
+void do_forks(unsigned int n)
+{
+	while (n--) {
+		pid_t pid = fork();
+
+		if (pid == 0) {
+			/* _exit() skips atexit handlers and inherited stdio flushes. */
+			_exit(0);
+		}
+		if (pid < 0) {
+			/* waitpid(-1, ...) would wait for an unrelated child. */
+			break;
+		}
+		waitpid(pid, NULL, 0);
+	}
+}
+
+/*
+ * After setting pid_max to 50000 and running 40000 short-lived forks, a
+ * child forked last is expected to be the final entry pidmap() reports.
+ * The previous pid_max value is written back at the end.
+ */
+TEST(pid_max)
+{
+	int *pid = NULL;
+	unsigned int n = 0;
+	int ret;
+	pid_t p;
+	int old_pidmax;
+
+	old_pidmax = write_pidmax(50000);
+
+	do_forks(40000);
+
+	p = fork();
+	if (p == 0) {
+		/* Child: stay alive until the parent kills us. */
+		pause();
+		_exit(0);
+	}
+
+	/* With p == -1, kill() below would signal every reachable process. */
+	EXPECT_NE(-1, p);
+	if (p > 0) {
+		ret = pidmap_full(&pid, &n);
+
+		EXPECT_LE(0, ret);
+		EXPECT_LT(0U, n);
+		/* Only index the list when it is non-empty. */
+		if (pid != NULL && n > 0) {
+			EXPECT_EQ(p, pid[n - 1]);
+		}
+
+		kill(p, SIGKILL);
+		waitpid(p, NULL, 0);
+	}
+
+	free(pid);
+	write_pidmax(old_pidmax);
+}
+
+/*
+ * In a fresh PID and mount namespace, start 150 idle children and check
+ * that pidmap() reports the same IDs as the listing built from /proc.
+ *
+ * NOTE(review): the outer process never waits for the forked child, so
+ * expectations failing in the child may not reach the harness - confirm.
+ */
+TEST(compare_proc)
+{
+	pid_t pid;
+
+	/* Nothing is checked when the namespaces cannot be created. */
+	if (unshare(CLONE_NEWNS | CLONE_NEWPID) == -1)
+		return;
+
+	pid = fork();
+
+	if (pid == 0) {
+		int *pids = NULL, *pids_proc = NULL;
+		unsigned int n = 0;
+		unsigned int n_proc = 0;
+		unsigned int i;
+		int ret, ret_proc;
+
+		/* Make mounts private, then mount a /proc for this namespace. */
+		mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL);
+		mount("none", "/proc", NULL, MS_REC | MS_PRIVATE, NULL);
+		mount("proc", "/proc", "proc",
+		      MS_NOSUID | MS_NODEV | MS_NOEXEC, NULL);
+
+		for (i = 0; i < 150; i++) {
+			pid_t child = fork();
+
+			if (child == -1) {
+				wait(NULL);
+				umount("/proc");
+				return;
+			}
+			if (child == 0) {
+				pause();
+				return;
+			}
+		}
+
+		ret = pidmap_full(&pids, &n);
+
+		ret_proc = pidmap_proc(&pids_proc, &n_proc);
+		if (pids_proc != NULL && n_proc > 0)
+			qsort(pids_proc, n_proc, sizeof(int), compare);
+
+		EXPECT_LE(0, ret);
+		EXPECT_LE(0, ret_proc);
+		EXPECT_LT(0U, n);
+		EXPECT_EQ(n_proc, n);
+
+		/*
+		 * Both helpers return 0 on success, so only negative values
+		 * are errors; testing "<= 0" here would skip the comparison
+		 * on every successful run.
+		 */
+		if (ret >= 0 && ret_proc >= 0 && n == n_proc && n > 0) {
+			for (i = 0; i < n; i++) {
+				EXPECT_EQ(pids_proc[i], pids[i]);
+				if (pids_proc[i] != pids[i])
+					break;
+			}
+			EXPECT_EQ(1, pids[0]);
+		}
+
+		free(pids_proc);
+		free(pids);
+		/*
+		 * NOTE(review): killpg(0, ...) signals the caller's whole
+		 * process group and no setpgid() call is made here, so the
+		 * group presumably extends beyond the children forked above
+		 * - confirm.
+		 */
+		killpg(0, SIGTERM);
+		wait(NULL);
+		umount("/proc");
+	}
+}
+
+TEST_HARNESS_MAIN