[PATCH RFC 11/13] exit: Synchronize on kin_lock when calling do_notify_parent()

From: Kirill Tkhai
Date: Mon May 25 2015 - 13:46:17 EST


Note: do_wait() (operating on tsk) may catch a child of tsk that is being
traced by a thread from tsk's thread group. For proper synchronization, we
must hold both the parent's and the real_parent's kin_locks when calling
do_notify_parent().
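
A minimal sketch of what such a dual-lock helper must do, assuming kin_lock
is a per-task rwlock_t (the helper name matches the one used in the diff
below, but this body is illustrative only, not the series' actual code):

/*
 * Illustrative sketch only. Take both ->parent's and ->real_parent's
 * kin_locks for writing. If both pointers name the same task, lock it
 * once; otherwise acquire in pointer order so that two concurrent
 * callers cannot ABBA-deadlock. A real implementation would also need
 * a lockdep nesting annotation for the second acquisition.
 */
static void write_parent_and_real_parent_lock(struct task_struct *p)
{
        struct task_struct *a = p->parent;
        struct task_struct *b = p->real_parent;

        if (a == b) {
                write_lock(&a->kin_lock);
                return;
        }
        if (a > b)
                swap(a, b);
        write_lock(&a->kin_lock);
        write_lock(&b->kin_lock);
}

With both locks held across do_notify_parent(), a do_wait()er walking either
the parent's or the real_parent's child list sees a stable parent pointer.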

Also, remove the dependence on tasklist_lock in ptrace_attach() etc., because
everything there is already synchronized on kin_lock (due to the previous
patches in this series).
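
Condensed, the new do_wait() walk looks like this (extracted from the hunks
below; wo->held_lock records which kin_lock was taken, so the wait_task_*()
helpers can drop it, together with the RCU read lock, before they sleep or
fault on user memory):

        rcu_read_lock();                        /* protects the thread-list walk */
        for_each_thread(current, tsk) {
                read_lock(&tsk->kin_lock);      /* per-thread lock replaces tasklist_lock */
                wo->held_lock = &tsk->kin_lock;
                retval = do_wait_thread(wo, tsk);
                if (retval)
                        goto end;               /* helper already dropped both locks */
                retval = ptrace_do_wait(wo, tsk);
                if (retval)
                        goto end;
                read_unlock(&tsk->kin_lock);
                if (wo->wo_flags & __WNOTHREAD)
                        break;
        }
        rcu_read_unlock();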

Signed-off-by: Kirill Tkhai <ktkhai@xxxxxxxx>
---
kernel/exit.c | 42 ++++++++++++++++++++++++++++--------------
kernel/ptrace.c | 20 ++++++--------------
kernel/signal.c | 11 ++---------
3 files changed, 36 insertions(+), 37 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index bb9d165..aeded00 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -183,6 +183,7 @@ void release_task(struct task_struct *p)
write_parent_and_real_parent_lock(p);
ptrace_release_task(p);
__exit_signal(p);
+ write_unlock(&tasklist_lock);

/*
* If we are the last non-leader member of the thread
@@ -203,8 +204,7 @@ void release_task(struct task_struct *p)
leader->exit_state = EXIT_DEAD;
}

- write_parent_and_real_parent_unlock(p);
- write_unlock_irq(&tasklist_lock);
+ write_parent_and_real_parent_unlock_irq(p);
release_thread(p);
call_rcu(&p->rcu, delayed_put_task_struct);

@@ -1016,7 +1016,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,

/*
* Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
- * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
+ * read_lock(wo->held_lock) on entry. If we return zero, we still hold
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
@@ -1036,6 +1036,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)

get_task_struct(p);
read_unlock(wo->held_lock);
+ rcu_read_unlock();
sched_annotate_sleep();

if ((exit_code & 0x7f) == 0) {
@@ -1058,6 +1059,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* We own this thread, nobody else can reap it.
*/
read_unlock(wo->held_lock);
+ rcu_read_unlock();
sched_annotate_sleep();

/*
@@ -1152,16 +1154,21 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
retval = pid;

if (state == EXIT_TRACE) {
- write_lock_irq(&tasklist_lock);
+ struct task_struct *old_parent;
+ write_parent_and_real_parent_lock_irq(p);
+ old_parent = p->parent;
/* We dropped tasklist, ptracer could die and untrace */
ptrace_unlink(p);

+ if (p->parent != old_parent)
+ write_unlock(&old_parent->kin_lock);
+
/* If parent wants a zombie, don't release it now */
state = EXIT_ZOMBIE;
if (do_notify_parent(p, p->exit_signal))
state = EXIT_DEAD;
p->exit_state = state;
- write_unlock_irq(&tasklist_lock);
+ write_parent_unlock_irq(p);
}
if (state == EXIT_DEAD)
release_task(p);
@@ -1191,13 +1198,13 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
* Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
*
* CONTEXT:
- * read_lock(&tasklist_lock), which is released if return value is
+ * read_lock(wo->held_lock), which is released if return value is
* non-zero. Also, grabs and releases @p->sighand->siglock.
*
* RETURNS:
* 0 if wait condition didn't exist and search for other wait conditions
* should continue. Non-zero return, -errno on failure and @p's pid on
- * success, implies that tasklist_lock is released and wait condition
+ * success, implies that wo->held_lock is released and wait condition
* search should terminate.
*/
static int wait_task_stopped(struct wait_opts *wo,
@@ -1241,13 +1248,14 @@ static int wait_task_stopped(struct wait_opts *wo,
* Now we are pretty sure this task is interesting.
* Make sure it doesn't get reaped out from under us while we
* give up the lock and then examine it below. We don't want to
- * keep holding onto the tasklist_lock while we call getrusage and
+ * keep holding onto wo->held_lock while we call getrusage and
* possibly take page faults for user memory.
*/
get_task_struct(p);
pid = task_pid_vnr(p);
why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
read_unlock(wo->held_lock);
+ rcu_read_unlock();
sched_annotate_sleep();

if (unlikely(wo->wo_flags & WNOWAIT))
@@ -1281,7 +1289,7 @@ static int wait_task_stopped(struct wait_opts *wo,

/*
* Handle do_wait work for one task in a live, non-stopped state.
- * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
+ * read_lock(wo->held_lock) on entry. If we return zero, we still hold
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
@@ -1311,6 +1319,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
pid = task_pid_vnr(p);
get_task_struct(p);
read_unlock(wo->held_lock);
+ rcu_read_unlock();
sched_annotate_sleep();

if (!wo->wo_info) {
@@ -1334,7 +1343,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
* Consider @p for a wait by @parent.
*
* -ECHILD should be in ->notask_error before the first call.
- * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns nonzero for a final return, when we have unlocked wo->held_lock.
* Returns zero if the search for a child should continue;
* then ->notask_error is 0 if @p is an eligible child,
* or another error from security_task_wait(), or still -ECHILD.
@@ -1392,6 +1401,9 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
* If a ptracer wants to distinguish these two events for its
* own children it should create a separate process which takes
* the role of real parent.
+ *
+ * Since we call do_notify_parent() under both the parent's and
+ * real_parent's kin_locks, we are protected from it.
*/
if (!ptrace_reparented(p))
ptrace = 1;
@@ -1460,7 +1472,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
* Do the work of do_wait() for one thread in the group, @tsk.
*
* -ECHILD should be in ->notask_error before the first call.
- * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns nonzero for a final return, when we have unlocked wo->held_lock.
* Returns zero if the search for a child should continue; then
* ->notask_error is 0 if there were any eligible children,
* or another error from security_task_wait(), or still -ECHILD.
@@ -1538,9 +1550,10 @@ static long do_wait(struct wait_opts *wo)
goto notask;

set_current_state(TASK_INTERRUPTIBLE);
- read_lock(&tasklist_lock);
- wo->held_lock = &tasklist_lock;
+ rcu_read_lock();
for_each_thread(current, tsk) {
+ read_lock(&tsk->kin_lock);
+ wo->held_lock = &tsk->kin_lock;
retval = do_wait_thread(wo, tsk);
if (retval)
goto end;
@@ -1548,11 +1561,12 @@ static long do_wait(struct wait_opts *wo)
retval = ptrace_do_wait(wo, tsk);
if (retval)
goto end;
+ read_unlock(&tsk->kin_lock);

if (wo->wo_flags & __WNOTHREAD)
break;
}
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();

notask:
retval = wo->notask_error;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 817288d..6785f66 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -180,7 +180,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
* we are sure that this is our traced child and that can only
* be changed by us so it's not changing right after this.
*/
- read_lock(&tasklist_lock);
read_parent_lock(child);
if (child->ptrace && child->parent == current) {
WARN_ON(child->state == __TASK_TRACED);
@@ -192,7 +191,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
ret = 0;
}
read_parent_unlock(child);
- read_unlock(&tasklist_lock);

if (!ret && !ignore_state) {
if (!wait_task_inactive(child, __TASK_TRACED)) {
@@ -314,8 +312,7 @@ static int ptrace_attach(struct task_struct *task, long request,
if (retval)
goto unlock_creds;

- write_lock_irq(&tasklist_lock);
- write_parent_and_given_lock(task, current);
+ write_parent_and_given_lock_irq(task, current);
old_parent = task->parent;
retval = -EPERM;
if (unlikely(task->exit_state) || task->ptrace) {
@@ -365,8 +362,7 @@ static int ptrace_attach(struct task_struct *task, long request,

retval = 0;
unlock_current:
- write_unlock(&current->kin_lock);
- write_unlock_irq(&tasklist_lock);
+ write_unlock_irq(&current->kin_lock);
unlock_creds:
mutex_unlock(&task->signal->cred_guard_mutex);
out:
@@ -389,8 +385,7 @@ static int ptrace_traceme(void)
{
int ret = -EPERM;

- write_lock_irq(&tasklist_lock);
- write_parent_lock(current);
+ write_parent_lock_irq(current);
/* Are we already being traced? */
if (!current->ptrace) {
ret = security_ptrace_traceme(current->parent);
@@ -405,8 +400,7 @@ static int ptrace_traceme(void)
__ptrace_link(current, current->real_parent);
}
}
- write_parent_unlock(current);
- write_unlock_irq(&tasklist_lock);
+ write_parent_unlock_irq(current);

return ret;
}
@@ -474,8 +468,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
ptrace_disable(child);
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);

- write_lock_irq(&tasklist_lock);
- write_parent_and_real_parent_lock(child);
+ write_parent_and_real_parent_lock_irq(child);
old_parent = child->parent;
/*
* We rely on ptrace_freeze_traced(). It can't be killed and
@@ -490,8 +483,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
__ptrace_detach(current, child);
if (old_parent != child->real_parent)
write_unlock(&old_parent->kin_lock);
- write_real_parent_unlock(child);
- write_unlock_irq(&tasklist_lock);
+ write_real_parent_unlock_irq(child);

proc_ptrace_connector(child, PTRACE_DETACH);

diff --git a/kernel/signal.c b/kernel/signal.c
index 8288019..2e7b622 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1732,6 +1732,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
struct sighand_struct *sighand;
cputime_t utime, stime;

+ BUG_ON(!same_thread_group(tsk, current));
+
if (for_ptracer) {
parent = tsk->parent;
} else {
@@ -1881,7 +1883,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
task_clear_jobctl_trapping(current);

spin_unlock_irq(&current->sighand->siglock);
- read_lock(&tasklist_lock);
read_parent_and_real_parent_lock(current);
if (may_ptrace_stop()) {
/*
@@ -1906,7 +1907,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
*/
preempt_disable();
read_parent_and_real_parent_unlock(current);
- read_unlock(&tasklist_lock);
preempt_enable_no_resched();
freezable_schedule();
} else {
@@ -1928,7 +1928,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
if (clear_code)
current->exit_code = 0;
read_parent_and_real_parent_unlock(current);
- read_unlock(&tasklist_lock);
}

/*
@@ -2081,11 +2080,9 @@ static bool do_signal_stop(int signr)
* TASK_TRACED.
*/
if (notify) {
- read_lock(&tasklist_lock);
read_parent_and_real_parent_lock(current);
do_notify_parent_cldstop(current, false, notify);
read_parent_and_real_parent_unlock(current);
- read_unlock(&tasklist_lock);
}

/* Now we don't run again until woken by SIGCONT or SIGKILL */
@@ -2229,7 +2226,6 @@ int get_signal(struct ksignal *ksig)
* the ptracer of the group leader too unless it's gonna be
* a duplicate.
*/
- read_lock(&tasklist_lock);
read_parent_and_real_parent_lock(current->group_leader);
do_notify_parent_cldstop(current, false, why);

@@ -2237,7 +2233,6 @@ int get_signal(struct ksignal *ksig)
do_notify_parent_cldstop(current->group_leader,
true, why);
read_parent_and_real_parent_unlock(current->group_leader);
- read_unlock(&tasklist_lock);

goto relock;
}
@@ -2482,11 +2477,9 @@ void exit_signals(struct task_struct *tsk)
* should always go to the real parent of the group leader.
*/
if (unlikely(group_stop)) {
- read_lock(&tasklist_lock);
read_parent_and_real_parent_lock(tsk);
do_notify_parent_cldstop(tsk, false, group_stop);
read_parent_and_real_parent_unlock(tsk);
- read_unlock(&tasklist_lock);
}
}



