[RFC PATCH v1 2/4] Move the user's process counter to ucounts

From: Alexey Gladkov
Date: Mon Nov 02 2020 - 11:52:43 EST


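Count a user's processes with the per-user ucounts counter
(UCOUNT_RLIMIT_NPROC) of the user namespace instead of the "processes"
field of struct user_struct, which is removed. For now every caller
looks the counter up in init_user_ns.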

Signed-off-by: Alexey Gladkov <gladkov.alexey@xxxxxxxxx>
---
fs/exec.c | 7 ++++---
fs/io-wq.c | 14 +++++++++-----
include/linux/sched/user.h | 1 -
include/linux/user_namespace.h | 8 ++++++++
kernel/cred.c | 8 ++++----
kernel/exit.c | 2 +-
kernel/fork.c | 9 +++++----
kernel/sys.c | 6 ++++--
kernel/ucount.c | 34 ++++++++++++++++++++++++++++++++++
kernel/user.c | 3 ++-
10 files changed, 71 insertions(+), 21 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index e6e8a9a70327..3f2071f7b9c7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1826,19 +1826,20 @@ static int __do_execve_file(int fd, struct filename *filename,
char *pathbuf = NULL;
struct linux_binprm *bprm;
struct files_struct *displaced;
- int retval;
+ int retval, processes;

if (IS_ERR(filename))
return PTR_ERR(filename);

+ processes = get_rlimit_counter(&init_user_ns, current_euid(), UCOUNT_RLIMIT_NPROC);
+
/*
* We move the actual failure in case of RLIMIT_NPROC excess from
* set*uid() to execve() because too many poorly written programs
* don't check setuid() return code. Here we additionally recheck
* whether NPROC limit is still exceeded.
*/
- if ((current->flags & PF_NPROC_EXCEEDED) &&
- atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
+ if ((current->flags & PF_NPROC_EXCEEDED) && processes > rlimit(RLIMIT_NPROC)) {
retval = -EAGAIN;
goto out_ret;
}
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 47c5f3aeb460..6170aee986db 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -17,6 +17,7 @@
#include <linux/rculist_nulls.h>
#include <linux/fs_struct.h>
#include <linux/task_work.h>
+#include <linux/user_namespace.h>

#include "io-wq.h"

@@ -216,7 +217,7 @@ static void io_worker_exit(struct io_worker *worker)
if (worker->flags & IO_WORKER_F_RUNNING)
atomic_dec(&acct->nr_running);
if (!(worker->flags & IO_WORKER_F_BOUND))
- atomic_dec(&wqe->wq->user->processes);
+ dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
worker->flags = 0;
preempt_enable();

@@ -349,12 +350,12 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
worker->flags |= IO_WORKER_F_BOUND;
wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
- atomic_dec(&wqe->wq->user->processes);
+ dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
} else {
worker->flags &= ~IO_WORKER_F_BOUND;
wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--;
- atomic_inc(&wqe->wq->user->processes);
+ inc_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
}
io_wqe_inc_running(wqe, worker);
}
@@ -671,7 +672,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
spin_unlock_irq(&wqe->lock);

if (index == IO_WQ_ACCT_UNBOUND)
- atomic_inc(&wq->user->processes);
+ inc_rlimit_counter(&init_user_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC);

wake_up_process(worker->task);
return true;
@@ -754,6 +755,7 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
struct io_wq_work *work)
{
bool free_worker;
+ int processes;

if (!(work->flags & IO_WQ_WORK_UNBOUND))
return true;
@@ -766,7 +768,9 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
if (free_worker)
return true;

- if (atomic_read(&wqe->wq->user->processes) >= acct->max_workers &&
+ processes = get_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
+
+ if (processes >= acct->max_workers &&
!(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
return false;

diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 917d88edb7b9..38e122bc3d07 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -12,7 +12,6 @@
*/
struct user_struct {
refcount_t __count; /* reference count */
- atomic_t processes; /* How many processes does this user have? */
atomic_t sigpending; /* How many pending signals does this user have? */
#ifdef CONFIG_FANOTIFY
atomic_t fanotify_listeners;
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index fc75af812d73..6d9d180b2c9d 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -50,9 +50,13 @@ enum ucount_type {
UCOUNT_INOTIFY_INSTANCES,
UCOUNT_INOTIFY_WATCHES,
#endif
+ UCOUNT_RLIMIT_NPROC,
UCOUNT_COUNTS,
};

+#define UCOUNT_MIN_RLIMIT UCOUNT_RLIMIT_NPROC
+#define UCOUNT_MAX_RLIMIT UCOUNT_RLIMIT_NPROC
+
struct user_namespace {
struct uid_gid_map uid_map;
struct uid_gid_map gid_map;
@@ -104,6 +108,10 @@ void retire_userns_sysctls(struct user_namespace *ns);
struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
void dec_ucount(struct ucounts *ucounts, enum ucount_type type);

+long get_rlimit_counter(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
+struct ucounts *inc_rlimit_counter(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
+void dec_rlimit_counter(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
+
#ifdef CONFIG_USER_NS

static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
diff --git a/kernel/cred.c b/kernel/cred.c
index 421b1149c651..b6694700e760 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -351,7 +351,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
kdebug("share_creds(%p{%d,%d})",
p->cred, atomic_read(&p->cred->usage),
read_cred_subscribers(p->cred));
- atomic_inc(&p->cred->user->processes);
+ inc_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC);
return 0;
}

@@ -384,7 +384,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
}
#endif

- atomic_inc(&new->user->processes);
+ inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC);
p->cred = p->real_cred = get_cred(new);
alter_cred_subscribers(new, 2);
validate_creds(new);
@@ -486,11 +486,11 @@ int commit_creds(struct cred *new)
*/
alter_cred_subscribers(new, 2);
if (new->user != old->user)
- atomic_inc(&new->user->processes);
+ inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC);
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
if (new->user != old->user)
- atomic_dec(&old->user->processes);
+ dec_rlimit_counter(&init_user_ns, old->euid, UCOUNT_RLIMIT_NPROC);
alter_cred_subscribers(old, -2);

/* send notifications */
diff --git a/kernel/exit.c b/kernel/exit.c
index 727150f28103..5a0d7dd1ad64 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -187,7 +187,7 @@ void release_task(struct task_struct *p)
/* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials. But shut RCU-lockdep up */
rcu_read_lock();
- atomic_dec(&__task_cred(p)->user->processes);
+ dec_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC);
rcu_read_unlock();

cgroup_release(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index efc5493203ae..2bc8bd45179f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1844,7 +1844,7 @@ static __latent_entropy struct task_struct *copy_process(
int node,
struct kernel_clone_args *args)
{
- int pidfd = -1, retval;
+ int pidfd = -1, retval, processes;
struct task_struct *p;
struct multiprocess_signals delayed;
struct file *pidfile = NULL;
@@ -1958,9 +1958,10 @@ static __latent_entropy struct task_struct *copy_process(
DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
+ processes = get_rlimit_counter(&init_user_ns, p->real_cred->euid,
+ UCOUNT_RLIMIT_NPROC);
retval = -EAGAIN;
- if (atomic_read(&p->real_cred->user->processes) >=
- task_rlimit(p, RLIMIT_NPROC)) {
+ if (processes >= task_rlimit(p, RLIMIT_NPROC)) {
if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
goto bad_fork_free;
@@ -2361,7 +2362,7 @@ static __latent_entropy struct task_struct *copy_process(
#endif
delayacct_tsk_free(p);
bad_fork_cleanup_count:
- atomic_dec(&p->cred->user->processes);
+ dec_rlimit_counter(&init_user_ns, p->cred->euid, UCOUNT_RLIMIT_NPROC);
exit_creds(p);
bad_fork_free:
p->state = TASK_DEAD;
diff --git a/kernel/sys.c b/kernel/sys.c
index 00a96746e28a..db780ec32d86 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -461,11 +461,14 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
static int set_user(struct cred *new)
{
struct user_struct *new_user;
+ int processes;

new_user = alloc_uid(new->uid);
if (!new_user)
return -EAGAIN;

+ processes = get_rlimit_counter(&init_user_ns, new_user->uid, UCOUNT_RLIMIT_NPROC);
+
/*
* We don't fail in case of NPROC limit excess here because too many
* poorly written programs don't check set*uid() return code, assuming
@@ -473,8 +476,7 @@ static int set_user(struct cred *new)
* for programs doing set*uid()+execve() by harmlessly deferring the
* failure to the execve() stage.
*/
- if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
- new_user != INIT_USER)
+ if (processes >= rlimit(RLIMIT_NPROC) && new_user != INIT_USER)
current->flags |= PF_NPROC_EXCEEDED;
else
current->flags &= ~PF_NPROC_EXCEEDED;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 7b2bca8582ef..e00d644e4ca5 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -74,6 +74,7 @@ static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_inotify_instances"),
UCOUNT_ENTRY("max_inotify_watches"),
#endif
+ { },
{ }
};
#endif /* CONFIG_SYSCTL */
@@ -222,6 +223,39 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
put_ucounts(ucounts);
}

+long get_rlimit_counter(struct user_namespace *ns, kuid_t uid, enum ucount_type type)
+{
+	struct ucounts *uc = get_ucounts(ns, uid);
+	long cur = LONG_MAX;	/* reported when get_ucounts() yields nothing */
+	if (uc) {
+		cur = atomic_long_read(&uc->ucount[type]);
+		put_ucounts(uc);	/* pairs with the get_ucounts() above */
+	}
+	return cur;
+}
+
+struct ucounts *inc_rlimit_counter(struct user_namespace *ns, kuid_t uid,
+				   enum ucount_type type)
+{
+	/* Only types inside [UCOUNT_MIN_RLIMIT, UCOUNT_MAX_RLIMIT] are charged. */
+	if (type >= UCOUNT_MIN_RLIMIT && type <= UCOUNT_MAX_RLIMIT)
+		return inc_ucount(ns, uid, type);
+	return NULL;
+}
+
+void dec_rlimit_counter(struct user_namespace *ns, kuid_t uid, enum ucount_type type)
+{
+	struct ucounts *ucounts;
+
+	if (type < UCOUNT_MIN_RLIMIT || type > UCOUNT_MAX_RLIMIT)
+		return;
+	ucounts = get_ucounts(ns, uid);
+	if (!ucounts)
+		return;
+	dec_ucount(ucounts, type);	/* its put_ucounts() only balances the lookup above */
+	put_ucounts(ucounts);		/* drop the ref presumably held since the inc - TODO confirm */
+}
+
static __init int user_namespace_sysctl_init(void)
{
#ifdef CONFIG_SYSCTL
diff --git a/kernel/user.c b/kernel/user.c
index b1635d94a1f2..5bb75ebdef4f 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
/* root_user.__count is 1, for init task cred */
struct user_struct root_user = {
.__count = REFCOUNT_INIT(1),
- .processes = ATOMIC_INIT(1),
.sigpending = ATOMIC_INIT(0),
.locked_shm = 0,
.uid = GLOBAL_ROOT_UID,
@@ -224,6 +223,8 @@ static int __init uid_cache_init(void)
uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
spin_unlock_irq(&uidhash_lock);

+ inc_rlimit_counter(&init_user_ns, GLOBAL_ROOT_UID, UCOUNT_RLIMIT_NPROC);
+
return 0;
}
subsys_initcall(uid_cache_init);
--
2.25.4