Re: ipv6: tunnel: hang when destroying ipv6 tunnel

From: Tetsuo Handa
Date: Fri Apr 06 2012 - 14:24:33 EST


> Tetsuo Handa wrote:
> > Maybe you can get more useful information with below untested printk() patch.
> >
> > diff --git a/net/9p/client.c b/net/9p/client.c
> > index b23a17c..2dd447a 100644
> > --- a/net/9p/client.c
> > +++ b/net/9p/client.c
> > @@ -734,7 +734,9 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
> > } else
> > sigpending = 0;
> >
> > + printk("%u:Calling %pS\n", current->pid, c->trans_mod->request);
> > err = c->trans_mod->request(c, req);
> > + printk("%u:%pS = %d\n", current->pid, c->trans_mod->request, err);
> > if (err < 0) {
> > if (err != -ERESTARTSYS && err != -EFAULT)
> > c->status = Disconnected;
> > @@ -742,8 +744,10 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
> > }
> > again:
> > /* Wait for the response */
> > + printk("%u:req->status = %u\n", current->pid, req->status);
> > err = wait_event_interruptible(*req->wq,
> > req->status >= REQ_STATUS_RCVD);
> > + printk("%u:wait = %d\n", current->pid, err);
> >
> > if ((err == -ERESTARTSYS) && (c->status == Connected)
> > && (type == P9_TFLUSH)) {
> >

Sasha Levin wrote:
> Heya,
>
> The output from the printk confirmed that there are several threads
> waiting for RPC to complete, with the last two having and odd 'wait'
> result. This is just before the hang:
>
> [ 809.165663] 19964:Calling p9_virtio_request+0x0/0x200
> [ 809.166951] 19964:p9_virtio_request+0x0/0x200 = 0
> [ 809.167878] 19964:req->status = 3
> [ 809.803535] 19957:Calling p9_virtio_request+0x0/0x200
> [ 809.804506] 19957:p9_virtio_request+0x0/0x200 = 0
> [ 809.805332] 19957:req->status = 3
> [ 809.868591] 19955:Calling p9_virtio_request+0x0/0x200
> [ 809.869493] 19955:p9_virtio_request+0x0/0x200 = 0
> [ 809.870331] 19955:req->status = 3
> [ 811.364554] 19985:Calling p9_virtio_request+0x0/0x200
> [ 811.365498] 19985:p9_virtio_request+0x0/0x200 = 0
> [ 811.366386] 19985:req->status = 3
> [ 811.458600] 19999:wait = -512
> [ 811.459171] 19999:Calling p9_virtio_request+0x0/0x200
> [ 811.459992] 19999:p9_virtio_request+0x0/0x200 = 0
> [ 811.460822] 19999:req->status = 3
> [ 811.472175] 19994:wait = -512
> [ 811.472943] 19994:Calling p9_virtio_request+0x0/0x200
> [ 811.474195] 19994:p9_virtio_request+0x0/0x200 = 0
> [ 811.474955] 19994:req->status = 3
> [... Hang 120 sec later here]
>

Good. -512 is -ERESTARTSYS, and this hang occurs after -ERESTARTSYS is
returned. It indicates that c->trans_mod->request() is interrupted by signal.
Since c->trans_mod->request is pointing at p9_virtio_request, the location
returning that error would be

254 static int
255 p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
256 {
257 int err;
258 int in, out;
259 unsigned long flags;
260 struct virtio_chan *chan = client->trans;
261
262 p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n");
263
264 req->status = REQ_STATUS_SENT;
265 req_retry:
266 spin_lock_irqsave(&chan->lock, flags);
267
268 /* Handle out VirtIO ring buffers */
269 out = pack_sg_list(chan->sg, 0,
270 VIRTQUEUE_NUM, req->tc->sdata, req->tc->size);
271
272 in = pack_sg_list(chan->sg, out,
273 VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity);
274
275 err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc,
276 GFP_ATOMIC);
277 if (err < 0) {
278 if (err == -ENOSPC) {
279 chan->ring_bufs_avail = 0;
280 spin_unlock_irqrestore(&chan->lock, flags);
281 err = wait_event_interruptible(*chan->vc_wq,
282 chan->ring_bufs_avail);

here.

283 if (err == -ERESTARTSYS)
284 return err;
285
286 p9_debug(P9_DEBUG_TRANS, "Retry virtio request\n");
287 goto req_retry;
288 } else {
289 spin_unlock_irqrestore(&chan->lock, flags);
290 p9_debug(P9_DEBUG_TRANS,
291 "virtio rpc add_buf returned failure\n");
292 return -EIO;
293 }
294 }
295 virtqueue_kick(chan->vq);
296 spin_unlock_irqrestore(&chan->lock, flags);
297
298 p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n");
299 return 0;
300 }

Comparing 3.3.1 and linux-next in my environment, there are several changes.

# diff -ur linux-3.3.1/drivers/virtio/ linux-next/drivers/virtio/ | diffstat
config.c | 1
virtio_balloon.c | 14 ----------
virtio_pci.c | 74 +++++--------------------------------------------------
3 files changed, 8 insertions(+), 81 deletions(-)
# diff -urp linux-3.3.1/fs/9p/ linux-next/fs/9p/ | diffstat
v9fs.c | 16 ++++++++--------
vfs_super.c | 5 ++---
2 files changed, 10 insertions(+), 11 deletions(-)
# diff -ur linux-3.3.1/net/9p/ linux-next/net/9p/ | diffstat
client.c | 26 +++++++++++++++++++++++---
1 file changed, 23 insertions(+), 3 deletions(-)

Most suspicious change is net/9p/client.c because it is changing handling of
ERESTARTSYS case.

--- linux-3.3.1/net/9p/client.c
+++ linux-next/net/9p/client.c
@@ -740,10 +740,18 @@
c->status = Disconnected;
goto reterr;
}
+again:
/* Wait for the response */
err = wait_event_interruptible(*req->wq,
req->status >= REQ_STATUS_RCVD);

+ if ((err == -ERESTARTSYS) && (c->status == Connected)
+ && (type == P9_TFLUSH)) {
+ sigpending = 1;
+ clear_thread_flag(TIF_SIGPENDING);
+ goto again;
+ }
+
if (req->status == REQ_STATUS_ERROR) {
p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
err = req->t_err;
@@ -1420,6 +1428,7 @@
int err;
struct p9_client *clnt;
struct p9_req_t *req;
+ int retries = 0;

if (!fid) {
pr_warn("%s (%d): Trying to clunk with NULL fid\n",
@@ -1428,7 +1437,9 @@
return 0;
}

- p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d\n", fid->fid);
+again:
+ p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d (try %d)\n", fid->fid,
+ retries);
err = 0;
clnt = fid->clnt;

@@ -1444,8 +1455,14 @@
error:
/*
* Fid is not valid even after a failed clunk
+ * If interrupted, retry once then give up and
+ * leak fid until umount.
*/
- p9_fid_destroy(fid);
+ if (err == -ERESTARTSYS) {
+ if (retries++ == 0)
+ goto again;
+ } else
+ p9_fid_destroy(fid);
return err;
}
EXPORT_SYMBOL(p9_client_clunk);
@@ -1470,7 +1487,10 @@

p9_free_req(clnt, req);
error:
- p9_fid_destroy(fid);
+ if (err == -ERESTARTSYS)
+ p9_client_clunk(fid);
+ else
+ p9_fid_destroy(fid);
return err;
}
EXPORT_SYMBOL(p9_client_remove);

Maybe commit a314f274 "net/9p: don't allow Tflush to be interrupted" or nearby.



By the way, have you already tried 3.4-rc1?
In my environment, there is no difference between linux-next and 3.4-rc1.

# diff -ur linux-3.4.0-rc1/net/9p/ linux-next/net/9p/
# diff -ur linux-3.4.0-rc1/drivers/virtio/ linux-next/drivers/virtio/
# diff -ur linux-3.4.0-rc1/fs/9p/ linux-next/fs/9p/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/