Re: [PATCH] nbd: Add locking for tasks

From: Ben Hutchings
Date: Thu Oct 08 2015 - 16:14:55 EST


On Tue, 2015-10-06 at 20:03 +0200, Markus Pargmann wrote:
> The timeout handling introduced in
> > 7e2893a16d3e (nbd: Fix timeout detection)
> introduces a race condition which may lead to killing of tasks that are
> not in nbd context anymore. This was not observed or reproducible yet.
>
> This patch adds locking to critical use of task_recv and task_send to
> avoid killing tasks that already left the NBD thread functions. This
> lock is only acquired if a timeout occurs or the nbd device
> starts/stops.
>
> Reported-by: Ben Hutchings <ben@xxxxxxxxxxxxxxx>
> Signed-off-by: Markus Pargmann <mpa@xxxxxxxxxxxxxx>

Reviewed-by: Ben Hutchings <ben@xxxxxxxxxxxxxxx>

You could add 'Fixes: 7e2893a16d3e ("nbd: Fix timeout detection")' to
the commit message as well.

nbd_dbg_tasks_show() can still race with thread exit and two tasks can
race to become the receive thread, but those aren't new bugs.

Ben.

> ---
> drivers/block/nbd.c | 36 ++++++++++++++++++++++++++++++------
> 1 file changed, 30 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
> index 039c7c4f0539..1a70852ac808 100644
> --- a/drivers/block/nbd.c
> +++ b/drivers/block/nbd.c
> @@ -60,6 +60,7 @@ struct nbd_device {
> > > bool disconnect; /* a disconnect has been requested by user */
>
> > > struct timer_list timeout_timer;
> +> > spinlock_t tasks_lock;
> > > struct task_struct *task_recv;
> > > struct task_struct *task_send;
>
> @@ -140,21 +141,23 @@ static void sock_shutdown(struct nbd_device *nbd)
> static void nbd_xmit_timeout(unsigned long arg)
> {
> > > struct nbd_device *nbd = (struct nbd_device *)arg;
> -> > struct task_struct *task;
> +> > unsigned long flags;
>
> > > if (list_empty(&nbd->queue_head))
> > > > return;
>
> > > nbd->disconnect = true;
>
> -> > task = READ_ONCE(nbd->task_recv);
> -> > if (task)
> -> > > force_sig(SIGKILL, task);
> +> > spin_lock_irqsave(&nbd->tasks_lock, flags);
> +
> +> > if (nbd->task_recv)
> +> > > force_sig(SIGKILL, nbd->task_recv);
>
> -> > task = READ_ONCE(nbd->task_send);
> -> > if (task)
> +> > if (nbd->task_send)
> > > > force_sig(SIGKILL, nbd->task_send);
>
> +> > spin_unlock_irqrestore(&nbd->tasks_lock, flags);
> +
> > > dev_err(nbd_to_dev(nbd), "Connection timed out, killed receiver and sender, shutting down connection\n");
> }
>
> @@ -403,17 +406,24 @@ static int nbd_thread_recv(struct nbd_device *nbd)
> {
> > > struct request *req;
> > > int ret;
> +> > unsigned long flags;
>
> > > BUG_ON(nbd->magic != NBD_MAGIC);
>
> > > sk_set_memalloc(nbd->sock->sk);
>
> +> > spin_lock_irqsave(&nbd->tasks_lock, flags);
> > > nbd->task_recv = current;
> +> > spin_unlock_irqrestore(&nbd->tasks_lock, flags);
>
> > > ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
> > > if (ret) {
> > > > dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
> +
> +> > > spin_lock_irqsave(&nbd->tasks_lock, flags);
> > > > nbd->task_recv = NULL;
> +> > > spin_unlock_irqrestore(&nbd->tasks_lock, flags);
> +
> > > > return ret;
> > > }
>
> @@ -429,7 +439,9 @@ static int nbd_thread_recv(struct nbd_device *nbd)
>
> > > device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
>
> +> > spin_lock_irqsave(&nbd->tasks_lock, flags);
> > > nbd->task_recv = NULL;
> +> > spin_unlock_irqrestore(&nbd->tasks_lock, flags);
>
> > > if (signal_pending(current)) {
> > > > siginfo_t info;
> @@ -534,8 +546,11 @@ static int nbd_thread_send(void *data)
> {
> > > struct nbd_device *nbd = data;
> > > struct request *req;
> +> > unsigned long flags;
>
> +> > spin_lock_irqsave(&nbd->tasks_lock, flags);
> > > nbd->task_send = current;
> +> > spin_unlock_irqrestore(&nbd->tasks_lock, flags);
>
> > > set_user_nice(current, MIN_NICE);
> > > while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
> @@ -572,7 +587,15 @@ static int nbd_thread_send(void *data)
> > > > nbd_handle_req(nbd, req);
> > > }
>
> +> > spin_lock_irqsave(&nbd->tasks_lock, flags);
> > > nbd->task_send = NULL;
> +> > spin_unlock_irqrestore(&nbd->tasks_lock, flags);
> +
> +> > /* Clear maybe pending signals */
> +> > if (signal_pending(current)) {
> +> > > siginfo_t info;
> +> > > dequeue_signal_lock(current, &current->blocked, &info);
> +> > }
>
> > > return 0;
> }
> @@ -1027,6 +1050,7 @@ static int __init nbd_init(void)
> > > > nbd_dev[i].magic = NBD_MAGIC;
> > > > INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
> > > > spin_lock_init(&nbd_dev[i].queue_lock);
> +> > > spin_lock_init(&nbd_dev[i].tasks_lock);
> > > > INIT_LIST_HEAD(&nbd_dev[i].queue_head);
> > > > mutex_init(&nbd_dev[i].tx_lock);
> > > > init_timer(&nbd_dev[i].timeout_timer);
--
Ben Hutchings
Once a job is fouled up, anything done to improve it makes it worse.

Attachment: signature.asc
Description: This is a digitally signed message part