[PATCH RFC 08/13] core: Use kin_lock synchronizations between parent and child and for thread group

From: Kirill Tkhai
Date: Mon May 25 2015 - 13:45:49 EST


real_parent->kin_lock protects:
child's threads, place in sibling list etc.

parent->kin_lock protects:
ptrace operations

Note: sighand is still stable under tasklist_lock.

Signed-off-by: Kirill Tkhai <ktkhai@xxxxxxxx>
---
fs/exec.c | 16 ++++++++---
fs/proc/array.c | 4 +--
kernel/exit.c | 68 +++++++++++++++++++++++++++++++++++++---------
kernel/fork.c | 4 +++
kernel/ptrace.c | 45 ++++++++++++++++++++++++++----
kernel/signal.c | 9 ++++++
kernel/sys.c | 8 ++++-
mm/oom_kill.c | 9 ++++--
security/keys/keyctl.c | 4 +--
security/selinux/hooks.c | 4 +--
10 files changed, 136 insertions(+), 35 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 1977c2a..4de7770 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -932,14 +932,16 @@ static int de_thread(struct task_struct *tsk)
for (;;) {
threadgroup_change_begin(tsk);
write_lock_irq(&tasklist_lock);
+ write_real_parent_lock(tsk);
/*
- * Do this under tasklist_lock to ensure that
- * exit_notify() can't miss ->group_exit_task
+ * Do this under real_parent's kin_lock to ensure
+ * that exit_notify() can't miss ->group_exit_task.
*/
sig->notify_count = -1;
if (likely(leader->exit_state))
break;
__set_current_state(TASK_KILLABLE);
+ write_real_parent_unlock(tsk);
write_unlock_irq(&tasklist_lock);
threadgroup_change_end(tsk);
schedule();
@@ -995,9 +997,13 @@ static int de_thread(struct task_struct *tsk)
* We are going to release_task()->ptrace_unlink() silently,
* the tracer can sleep in do_wait(). EXIT_DEAD guarantees
* the tracer wont't block again waiting for this thread.
+ *
+ * Also, we not need hold leader->parent->kin_lock, because
+ * it can't change till we release real_parent->kin_lock.
*/
if (unlikely(leader->ptrace))
__wake_up_parent(leader, leader->parent);
+ write_real_parent_unlock(tsk);
write_unlock_irq(&tasklist_lock);
threadgroup_change_end(tsk);

@@ -1029,9 +1035,11 @@ static int de_thread(struct task_struct *tsk)
sizeof(newsighand->action));

write_lock_irq(&tasklist_lock);
+ write_parent_and_real_parent_lock(tsk);
spin_lock(&oldsighand->siglock);
rcu_assign_pointer(tsk->sighand, newsighand);
spin_unlock(&oldsighand->siglock);
+ write_parent_and_real_parent_unlock(tsk);
write_unlock_irq(&tasklist_lock);

__cleanup_sighand(oldsighand);
@@ -1042,10 +1050,10 @@ static int de_thread(struct task_struct *tsk)

killed:
/* protects against exit_notify() and __exit_signal() */
- read_lock(&tasklist_lock);
+ read_real_parent_lock(tsk);
sig->group_exit_task = NULL;
sig->notify_count = 0;
- read_unlock(&tasklist_lock);
+ read_real_parent_unlock(tsk);
return -EAGAIN;
}

diff --git a/fs/proc/array.c b/fs/proc/array.c
index e2f21c0..820611cc 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -588,7 +588,7 @@ get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
if (!task)
goto put_start;

- read_lock(&tasklist_lock);
+ read_lock(&start->kin_lock);
if (task->real_parent == start && !(list_empty(&task->sibling))) {
put_task_struct(task);
if (!list_is_last(&task->sibling, &start->children)) {
@@ -623,7 +623,7 @@ get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
}

unlock:
- read_unlock(&tasklist_lock);
+ read_unlock(&start->kin_lock);
put_start:
put_task_struct(start);
out:
diff --git a/kernel/exit.c b/kernel/exit.c
index 5e82787..a268093 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -180,6 +180,7 @@ void release_task(struct task_struct *p)
proc_flush_task(p);

write_lock_irq(&tasklist_lock);
+ write_parent_and_real_parent_lock(p);
ptrace_release_task(p);
__exit_signal(p);

@@ -202,6 +203,7 @@ void release_task(struct task_struct *p)
leader->exit_state = EXIT_DEAD;
}

+ write_parent_and_real_parent_unlock(p);
write_unlock_irq(&tasklist_lock);
release_thread(p);
call_rcu(&p->rcu, delayed_put_task_struct);
@@ -317,43 +319,50 @@ void mm_update_next_owner(struct mm_struct *mm)
return;
}

- read_lock(&tasklist_lock);
/*
* Search in the children
*/
+ read_lock(&p->kin_lock);
list_for_each_entry(c, &p->children, sibling) {
if (c->mm == mm) {
get_task_struct(c);
+ read_unlock(&p->kin_lock);
goto assign_new_owner;
}
}
+ read_unlock(&p->kin_lock);

/*
* Search in the siblings
*/
+ read_real_parent_lock(p);
list_for_each_entry(c, &p->real_parent->children, sibling) {
if (c->mm == mm) {
get_task_struct(c);
+ read_real_parent_unlock(p);
goto assign_new_owner;
}
}
+ read_real_parent_unlock(p);

/*
* Search through everything else, we should not get here often.
*/
+ rcu_read_lock();
for_each_process(g) {
if (g->flags & PF_KTHREAD)
continue;
for_each_thread(g, c) {
if (c->mm == mm) {
get_task_struct(c);
+ rcu_read_unlock();
goto assign_new_owner;
}
if (c->mm)
break;
}
}
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
/*
* We found no owner yet mm_users > 1: this implies that we are
* most likely racing with swapoff (try_to_unuse()) or /proc or
@@ -369,11 +378,6 @@ void mm_update_next_owner(struct mm_struct *mm)
* We always want mm->owner->mm == mm
*/
task_lock(c);
- /*
- * Delay read_unlock() till we have the task_lock()
- * to ensure that c does not slip away underneath us
- */
- read_unlock(&tasklist_lock);
if (c->mm != mm) {
task_unlock(c);
put_task_struct(c);
@@ -470,9 +474,13 @@ static void check_pid_ns_reaper_exit(struct task_struct *father)
return;

write_lock(&pid_ns->cr_lock);
+ read_real_parent_lock(father);
+
reaper = find_alive_thread(father);
if (reaper)
pid_ns->child_reaper = reaper;
+
+ read_real_parent_unlock(father);
write_unlock(&pid_ns->cr_lock);

if (reaper)
@@ -494,13 +502,22 @@ static void check_pid_ns_reaper_exit(struct task_struct *father)
* child_subreaper for its children (like a service manager)
* 3. give it to the init process (PID 1) in our pid namespace
*/
-static struct task_struct *find_new_reaper(struct task_struct *father)
+static struct task_struct *find_double_lock_new_reaper(struct task_struct *father)
{
struct task_struct *thread, *reaper, *child_reaper;

+ rcu_read_lock();
+again:
thread = find_alive_thread(father);
- if (thread)
+ if (thread) {
+ double_write_lock(&father->kin_lock, &thread->kin_lock);
+ if (thread->flags & PF_EXITING) {
+ double_write_unlock(&father->kin_lock, &thread->kin_lock);
+ goto again;
+ }
+ rcu_read_unlock();
return thread;
+ }

child_reaper = task_active_pid_ns(father)->child_reaper;

@@ -518,12 +535,26 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
break;
if (!reaper->signal->is_child_subreaper)
continue;
+find_again:
thread = find_alive_thread(reaper);
- if (thread)
- return thread;
+ if (thread) {
+ double_write_lock(&father->kin_lock,
+ &thread->kin_lock);
+ if (!(thread->flags & PF_EXITING)) {
+ rcu_read_unlock();
+ return thread;
+ }
+ double_write_unlock(&father->kin_lock,
+ &thread->kin_lock);
+ goto find_again;
+ }
}
}

+ rcu_read_unlock();
+
+ double_write_lock(&child_reaper->kin_lock, &father->kin_lock);
+
return child_reaper;
}

@@ -569,11 +600,18 @@ static void forget_original_parent(struct task_struct *father,

/* Can drop and reacquire tasklist_lock */
check_pid_ns_reaper_exit(father);
- if (list_empty(&father->children))
+
+ /* read_lock() guarantees concurrent thread sees our PF_EXITING */
+ read_lock(&father->kin_lock);
+ if (list_empty(&father->children)) {
+ read_unlock(&father->kin_lock);
return;
+ }
+ read_unlock(&father->kin_lock);

read_lock(&task_active_pid_ns(father)->cr_lock);
- reaper = find_new_reaper(father);
+ reaper = find_double_lock_new_reaper(father);
+
list_for_each_entry(p, &father->children, sibling) {
for_each_thread(p, t) {
t->real_parent = reaper;
@@ -592,6 +630,8 @@ static void forget_original_parent(struct task_struct *father,
reparent_leader(father, p, dead);
}
list_splice_tail_init(&father->children, &reaper->children);
+
+ double_write_unlock(&reaper->kin_lock, &father->kin_lock);
read_unlock(&task_active_pid_ns(father)->cr_lock);
}

@@ -608,6 +648,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
write_lock_irq(&tasklist_lock);
forget_original_parent(tsk, &dead);

+ write_parent_and_real_parent_lock(tsk);
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);

@@ -631,6 +672,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
wake_up_process(tsk->signal->group_exit_task);
+ write_parent_and_real_parent_unlock(tsk);
write_unlock_irq(&tasklist_lock);

list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
diff --git a/kernel/fork.c b/kernel/fork.c
index ee389ba..63e225b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1524,9 +1524,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,

/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
+ write_real_parent_lock(current);
p->real_parent = current->real_parent;
p->parent_exec_id = current->parent_exec_id;
} else {
+ write_lock(&current->kin_lock);
p->real_parent = current;
p->parent_exec_id = current->self_exec_id;
}
@@ -1550,6 +1552,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
recalc_sigpending();
if (signal_pending(current)) {
spin_unlock(&current->sighand->siglock);
+ write_real_parent_unlock(p);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
goto bad_fork_free_pid;
@@ -1591,6 +1594,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,

total_forks++;
spin_unlock(&current->sighand->siglock);
+ write_real_parent_unlock(p);
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c8e0e05..817288d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,6 +181,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
* be changed by us so it's not changing right after this.
*/
read_lock(&tasklist_lock);
+ read_parent_lock(child);
if (child->ptrace && child->parent == current) {
WARN_ON(child->state == __TASK_TRACED);
/*
@@ -190,6 +191,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
if (ignore_state || ptrace_freeze_traced(child))
ret = 0;
}
+ read_parent_unlock(child);
read_unlock(&tasklist_lock);

if (!ret && !ignore_state) {
@@ -275,6 +277,7 @@ static int ptrace_attach(struct task_struct *task, long request,
unsigned long flags)
{
bool seize = (request == PTRACE_SEIZE);
+ struct task_struct *old_parent;
int retval;

retval = -EIO;
@@ -312,11 +315,13 @@ static int ptrace_attach(struct task_struct *task, long request,
goto unlock_creds;

write_lock_irq(&tasklist_lock);
+ write_parent_and_given_lock(task, current);
+ old_parent = task->parent;
retval = -EPERM;
- if (unlikely(task->exit_state))
- goto unlock_tasklist;
- if (task->ptrace)
- goto unlock_tasklist;
+ if (unlikely(task->exit_state) || task->ptrace) {
+ write_unlock(&old_parent->kin_lock);
+ goto unlock_current;
+ }

if (seize)
flags |= PT_SEIZED;
@@ -327,6 +332,7 @@ static int ptrace_attach(struct task_struct *task, long request,
task->ptrace = flags;

__ptrace_link(task, current);
+ write_unlock(&old_parent->kin_lock);

/* SEIZE doesn't trap tracee on attach */
if (!seize)
@@ -358,7 +364,8 @@ static int ptrace_attach(struct task_struct *task, long request,
spin_unlock(&task->sighand->siglock);

retval = 0;
-unlock_tasklist:
+unlock_current:
+ write_unlock(&current->kin_lock);
write_unlock_irq(&tasklist_lock);
unlock_creds:
mutex_unlock(&task->signal->cred_guard_mutex);
@@ -383,6 +390,7 @@ static int ptrace_traceme(void)
int ret = -EPERM;

write_lock_irq(&tasklist_lock);
+ write_parent_lock(current);
/* Are we already being traced? */
if (!current->ptrace) {
ret = security_ptrace_traceme(current->parent);
@@ -392,10 +400,12 @@ static int ptrace_traceme(void)
* pretend ->real_parent untraces us right after return.
*/
if (!ret && !(current->real_parent->flags & PF_EXITING)) {
+ BUG_ON(current->parent != current->real_parent);
current->ptrace = PT_PTRACED;
__ptrace_link(current, current->real_parent);
}
}
+ write_parent_unlock(current);
write_unlock_irq(&tasklist_lock);

return ret;
@@ -456,6 +466,7 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)

static int ptrace_detach(struct task_struct *child, unsigned int data)
{
+ struct task_struct *old_parent;
if (!valid_signal(data))
return -EIO;

@@ -464,6 +475,8 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);

write_lock_irq(&tasklist_lock);
+ write_parent_and_real_parent_lock(child);
+ old_parent = child->parent;
/*
* We rely on ptrace_freeze_traced(). It can't be killed and
* untraced by another thread, it can't be a zombie.
@@ -475,6 +488,9 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
*/
child->exit_code = data;
__ptrace_detach(current, child);
+ if (old_parent != child->real_parent)
+ write_unlock(&old_parent->kin_lock);
+ write_real_parent_unlock(child);
write_unlock_irq(&tasklist_lock);

proc_ptrace_connector(child, PTRACE_DETACH);
@@ -488,15 +504,30 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
*/
void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
{
- struct task_struct *p, *n;
+ struct task_struct *p;
+
+ rcu_read_lock();
+ for (;;) {
+ p = list_first_or_null_rcu(&tracer->ptraced, struct task_struct,
+ ptrace_entry);
+ if (!p)
+ break;
+
+ write_parent_and_real_parent_lock(p);
+ if (p->parent != tracer) {
+ write_parent_and_real_parent_unlock(p);
+ continue;
+ }

- list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
if (unlikely(p->ptrace & PT_EXITKILL))
send_sig_info(SIGKILL, SEND_SIG_FORCED, p);

if (__ptrace_detach(tracer, p))
list_add(&p->ptrace_entry, dead);
+
+ write_parent_and_real_parent_unlock(p);
}
+ rcu_read_unlock();
}

int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
diff --git a/kernel/signal.c b/kernel/signal.c
index f19833b..8288019 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1882,6 +1882,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)

spin_unlock_irq(&current->sighand->siglock);
read_lock(&tasklist_lock);
+ read_parent_and_real_parent_lock(current);
if (may_ptrace_stop()) {
/*
* Notify parents of the stop.
@@ -1904,6 +1905,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
* XXX: implement read_unlock_no_resched().
*/
preempt_disable();
+ read_parent_and_real_parent_unlock(current);
read_unlock(&tasklist_lock);
preempt_enable_no_resched();
freezable_schedule();
@@ -1925,6 +1927,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
__set_current_state(TASK_RUNNING);
if (clear_code)
current->exit_code = 0;
+ read_parent_and_real_parent_unlock(current);
read_unlock(&tasklist_lock);
}

@@ -2079,7 +2082,9 @@ static bool do_signal_stop(int signr)
*/
if (notify) {
read_lock(&tasklist_lock);
+ read_parent_and_real_parent_lock(current);
do_notify_parent_cldstop(current, false, notify);
+ read_parent_and_real_parent_unlock(current);
read_unlock(&tasklist_lock);
}

@@ -2225,11 +2230,13 @@ int get_signal(struct ksignal *ksig)
* a duplicate.
*/
read_lock(&tasklist_lock);
+ read_parent_and_real_parent_lock(current->group_leader);
do_notify_parent_cldstop(current, false, why);

if (ptrace_reparented(current->group_leader))
do_notify_parent_cldstop(current->group_leader,
true, why);
+ read_parent_and_real_parent_unlock(current->group_leader);
read_unlock(&tasklist_lock);

goto relock;
@@ -2476,7 +2483,9 @@ void exit_signals(struct task_struct *tsk)
*/
if (unlikely(group_stop)) {
read_lock(&tasklist_lock);
+ read_parent_and_real_parent_lock(tsk);
do_notify_parent_cldstop(tsk, false, group_stop);
+ read_parent_and_real_parent_unlock(tsk);
read_unlock(&tasklist_lock);
}
}
diff --git a/kernel/sys.c b/kernel/sys.c
index a4e372b..783aec4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -939,7 +939,9 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
err = -ESRCH;
p = find_task_by_vpid(pid);
if (!p)
- goto out;
+ goto out2;
+
+ write_real_parent_lock(p);

err = -EINVAL;
if (!thread_group_leader(p))
@@ -981,7 +983,9 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)

err = 0;
out:
- /* All paths lead to here, thus we are safe. -DaveM */
+ /* All (almost) paths lead to here, thus we are safe. -DaveM */
+ write_real_parent_unlock(p);
+out2:
write_unlock_irq(&tasklist_lock);
rcu_read_unlock();
return err;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2b665da..aa89beb 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -538,13 +538,14 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* parent. This attempts to lose the minimal amount of work done while
* still freeing memory.
*/
- read_lock(&tasklist_lock);
+ rcu_read_lock();
for_each_thread(p, t) {
+ read_lock(&t->kin_lock);
list_for_each_entry(child, &t->children, sibling) {
unsigned int child_points;

if (child->mm == p->mm)
- continue;
+ goto unlock;
/*
* oom_badness() returns 0 if the thread is unkillable
*/
@@ -557,8 +558,10 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
get_task_struct(victim);
}
}
+unlock:
+ read_unlock(&t->kin_lock);
}
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();

p = find_lock_task_mm(victim);
if (!p) {
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 0b9ec78..64eea7b 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1491,7 +1491,7 @@ long keyctl_session_to_parent(void)

me = current;
rcu_read_lock();
- write_lock_irq(&tasklist_lock);
+ write_real_parent_lock_irq(me);

ret = -EPERM;
oldwork = NULL;
@@ -1540,7 +1540,7 @@ long keyctl_session_to_parent(void)
if (!ret)
newwork = NULL;
unlock:
- write_unlock_irq(&tasklist_lock);
+ write_real_parent_unlock_irq(me);
rcu_read_unlock();
if (oldwork)
put_cred(container_of(oldwork, struct cred, rcu));
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 7dade28..0feda4b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2461,9 +2461,9 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm)

/* Wake up the parent if it is waiting so that it can recheck
* wait permission to the new task SID. */
- read_lock(&tasklist_lock);
+ read_real_parent_lock(current);
__wake_up_parent(current, current->real_parent);
- read_unlock(&tasklist_lock);
+ read_real_parent_unlock(current);
}

/* superblock security operations */



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/