Re: splice/tee bugs?

From: Jens Axboe
Date: Sun Jul 09 2006 - 07:15:31 EST


On Sun, Jul 09 2006, Jens Axboe wrote:
> On Sat, Jul 08 2006, Luiz Fernando N. Capitulino wrote:
> >
> > Hi Jens,
> >
> > On Sat, 8 Jul 2006 08:41:32 +0200
> > Jens Axboe <axboe@xxxxxxx> wrote:
> >
> > | On Fri, Jul 07 2006, Luiz Fernando N. Capitulino wrote:
> > | > On Fri, 7 Jul 2006 04:07:49 -0700
> > | > Andrew Morton <akpm@xxxxxxxx> wrote:
> > | >
> > | > | On Fri, 07 Jul 2006 09:07:03 +0200
> > | > | "Michael Kerrisk" <mtk-manpages@xxxxxxx> wrote:
> > | > |
> > | > | > c) Occasionally the command line just hangs, producing no output.
> > | > | > In this case I can't kill it with ^C or ^\. This is a
> > | > | > hard-to-reproduce behaviour on my (x86) system, but I have
> > | > | > seen it several times by now.
> > | > |
> > | > | aka local DoS. Please capture sysrq-T output next time.
> > | >
> > | > If I run lots of them in parallel, I get the following OOPs in a few
> > | > seconds:
> > |
> > | With the patch posted? You need the i vs nrbufs fix.
> >
> > Yes, it fixes the problem. I didn't try it before because I thought
> > you were going to double check it [1].
>
> Yeah the patch needs reworking, however the isolated i vs nrbufs fix is
> safe enough on its own. I'll post a full patch for inclusion, I'm afraid
> I wont be able to fully test it enough for submitting it until tomorrow
> though.

Something like this, testing would be appreciated! Michael, can you
repeat your testing as well? Thanks.

diff --git a/fs/splice.c b/fs/splice.c
index 05fd278..ecf72bc 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1307,6 +1307,69 @@ asmlinkage long sys_splice(int fd_in, lo
}

/*
+ * Make sure there's data to read. Wait for input if we can, otherwise
+ * return an appropriate error.
+ */
+static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
+{
+ int ret = 0;
+
+ mutex_lock(&pipe->inode->i_mutex);
+
+ while (!pipe->nrbufs) {
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ if (!pipe->writers)
+ break;
+ if (!pipe->waiting_writers) {
+ if (flags & SPLICE_F_NONBLOCK) {
+ ret = -EAGAIN;
+ break;
+ }
+ }
+ pipe_wait(pipe);
+ }
+
+ mutex_unlock(&pipe->inode->i_mutex);
+ return ret;
+}
+
+/*
+ * Make sure there's writeable room. Wait for room if we can, otherwise
+ * return an appropriate error.
+ */
+static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
+{
+ int ret = 0;
+
+ mutex_lock(&pipe->inode->i_mutex);
+
+ while (pipe->nrbufs >= PIPE_BUFFERS) {
+ if (!pipe->readers) {
+ send_sig(SIGPIPE, current, 0);
+ ret = -EPIPE;
+ break;
+ }
+ if (flags & SPLICE_F_NONBLOCK) {
+ ret = -EAGAIN;
+ break;
+ }
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ pipe->waiting_writers++;
+ pipe_wait(pipe);
+ pipe->waiting_writers--;
+ }
+
+ mutex_unlock(&pipe->inode->i_mutex);
+ return ret;
+}
+
+/*
* Link contents of ipipe to opipe.
*/
static int link_pipe(struct pipe_inode_info *ipipe,
@@ -1314,9 +1377,9 @@ static int link_pipe(struct pipe_inode_i
size_t len, unsigned int flags)
{
struct pipe_buffer *ibuf, *obuf;
- int ret, do_wakeup, i, ipipe_first;
+ int ret, i, nbuf;

- ret = do_wakeup = ipipe_first = 0;
+ i = ret = 0;

/*
* Potential ABBA deadlock, work around it by ordering lock
@@ -1324,131 +1387,65 @@ static int link_pipe(struct pipe_inode_i
* could deadlock (one doing tee from A -> B, the other from B -> A).
*/
if (ipipe->inode < opipe->inode) {
- ipipe_first = 1;
- mutex_lock(&ipipe->inode->i_mutex);
- mutex_lock(&opipe->inode->i_mutex);
+ mutex_lock_nested(&ipipe->inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&opipe->inode->i_mutex, I_MUTEX_CHILD);
} else {
- mutex_lock(&opipe->inode->i_mutex);
- mutex_lock(&ipipe->inode->i_mutex);
+ mutex_lock_nested(&opipe->inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&ipipe->inode->i_mutex, I_MUTEX_CHILD);
}

- for (i = 0;; i++) {
+ do {
if (!opipe->readers) {
send_sig(SIGPIPE, current, 0);
if (!ret)
ret = -EPIPE;
break;
}
- if (ipipe->nrbufs - i) {
- ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
-
- /*
- * If we have room, fill this buffer
- */
- if (opipe->nrbufs < PIPE_BUFFERS) {
- int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);

- /*
- * Get a reference to this pipe buffer,
- * so we can copy the contents over.
- */
- ibuf->ops->get(ipipe, ibuf);
-
- obuf = opipe->bufs + nbuf;
- *obuf = *ibuf;
-
- /*
- * Don't inherit the gift flag, we need to
- * prevent multiple steals of this page.
- */
- obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
-
- if (obuf->len > len)
- obuf->len = len;
-
- opipe->nrbufs++;
- do_wakeup = 1;
- ret += obuf->len;
- len -= obuf->len;
-
- if (!len)
- break;
- if (opipe->nrbufs < PIPE_BUFFERS)
- continue;
- }
-
- /*
- * We have input available, but no output room.
- * If we already copied data, return that. If we
- * need to drop the opipe lock, it must be ordered
- * last to avoid deadlocks.
- */
- if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) {
- if (!ret)
- ret = -EAGAIN;
- break;
- }
- if (signal_pending(current)) {
- if (!ret)
- ret = -ERESTARTSYS;
- break;
- }
- if (do_wakeup) {
- smp_mb();
- if (waitqueue_active(&opipe->wait))
- wake_up_interruptible(&opipe->wait);
- kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
- do_wakeup = 0;
- }
+ /*
+ * If we have iterated all input buffers or ran out of
+ * output room, break.
+ */
+ if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
+ break;

- opipe->waiting_writers++;
- pipe_wait(opipe);
- opipe->waiting_writers--;
- continue;
- }
+ ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
+ nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);

/*
- * No input buffers, do the usual checks for available
- * writers and blocking and wait if necessary
+ * Get a reference to this pipe buffer,
+ * so we can copy the contents over.
*/
- if (!ipipe->writers)
- break;
- if (!ipipe->waiting_writers) {
- if (ret)
- break;
- }
+ ibuf->ops->get(ipipe, ibuf);
+
+ obuf = opipe->bufs + nbuf;
+ *obuf = *ibuf;
+
/*
- * pipe_wait() drops the ipipe mutex. To avoid deadlocks
- * with another process, we can only safely do that if
- * the ipipe lock is ordered last.
+ * Don't inherit the gift flag, we need to
+ * prevent multiple steals of this page.
*/
- if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) {
- if (!ret)
- ret = -EAGAIN;
- break;
- }
- if (signal_pending(current)) {
- if (!ret)
- ret = -ERESTARTSYS;
- break;
- }
+ obuf->flags &= ~PIPE_BUF_FLAG_GIFT;

- if (waitqueue_active(&ipipe->wait))
- wake_up_interruptible_sync(&ipipe->wait);
- kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT);
+ if (obuf->len > len)
+ obuf->len = len;

- pipe_wait(ipipe);
- }
+ opipe->nrbufs++;
+ ret += obuf->len;
+ len -= obuf->len;
+ i++;
+ } while (len);

mutex_unlock(&ipipe->inode->i_mutex);
mutex_unlock(&opipe->inode->i_mutex);

- if (do_wakeup) {
+ if (ret) {
smp_mb();
if (waitqueue_active(&opipe->wait))
wake_up_interruptible(&opipe->wait);
kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
- }
+ } else if (flags & SPLICE_F_NONBLOCK)
+ ret = -EAGAIN;

return ret;
}
@@ -1464,12 +1461,23 @@ static long do_tee(struct file *in, stru
{
struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe;
struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe;
+ int ret;

/*
- * Link ipipe to the two output pipes, consuming as we go along.
+ * Duplicate the contents of ipipe to opipe without actually
+ * copying the data.
*/
- if (ipipe && opipe)
+ if (ipipe && opipe && ipipe != opipe) {
+ ret = link_ipipe_prep(ipipe, flags);
+ if (unlikely(ret))
+ return ret;
+
+ ret = link_opipe_prep(opipe, flags);
+ if (unlikely(ret))
+ return ret;
+
return link_pipe(ipipe, opipe, len, flags);
+ }

return -EINVAL;
}

--
Jens Axboe

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/