Re: [PATCH 0/2] pipe: Fixes [ver #2]

From: Linus Torvalds
Date: Sun Dec 08 2019 - 22:08:19 EST


On Sun, Dec 8, 2019 at 10:04 AM Linus Torvalds
<torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
>
> On Sun, Dec 8, 2019 at 8:46 AM Akemi Yagi <toracat@xxxxxxxxxx> wrote:
> >
> > I forgot to mention that the aforementioned bug in make was originally
> > reported for Fedora. They now have a patched version of make
> > (make-4.2.1-15.fc32) in Rawhide.
>
> Yes, as mentioned I do expect it's some jobserver interaction - I've
> seen bad jobserver behavior before. But my 'make' hasn't been upgraded
> since May, as far as I can tell from my logs, so the huge performance
> regression was new.

Interestingly, when I'm trying to do the non-thundering-herd pipe
reads and writes, a side effect of that is that the read target is
"fair" (ie when there are multiple concurrent waiting read() calls,
it's always the first one that gets it).

And that seems to really mess up the jobserver bug, and triggers it
every single time.

Fairness is often bad for locking throughput, but this is on another
level entirely, and yeah, I see a lot of "defunct" shell and make
processes, so it does seem to be the make bug.

(Adding scheduler people to the participants list, because this patch
is the "avoid using the return value of
wait_event_interruptible_exclusive()" version because I think
wait_event_interruptible_exclusive() is itself buggy).

I built a new version of 'make' from source, and that one seems to
work fine with the attached patch. But the standard Fedora 'make'
binary completely hates this patch.

Sad. The patch does seem to be the RightThing(tm) to do, but this make
bug makes it inadvisable to apply it unless you want to play with
things.

Linus
fs/coredump.c | 4 +--
fs/pipe.c | 67 +++++++++++++++++++++++++++++++----------------
fs/splice.c | 8 +++---
include/linux/pipe_fs_i.h | 2 +-
4 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/fs/coredump.c b/fs/coredump.c
index b1ea7dfbd149..f8296a82d01d 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -517,7 +517,7 @@ static void wait_for_dump_helpers(struct file *file)
pipe_lock(pipe);
pipe->readers++;
pipe->writers--;
- wake_up_interruptible_sync(&pipe->wait);
+ wake_up_interruptible_sync(&pipe->rd_wait);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
pipe_unlock(pipe);

@@ -525,7 +525,7 @@ static void wait_for_dump_helpers(struct file *file)
* We actually want wait_event_freezable() but then we need
* to clear TIF_SIGPENDING and improve dump_interrupted().
*/
- wait_event_interruptible(pipe->wait, pipe->readers == 1);
+ wait_event_interruptible(pipe->rd_wait, pipe->readers == 1);

pipe_lock(pipe);
pipe->readers--;
diff --git a/fs/pipe.c b/fs/pipe.c
index 87109e761fa5..1eb64cab63b5 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -108,16 +108,19 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
- DEFINE_WAIT(wait);
+ DEFINE_WAIT(rdwait);
+ DEFINE_WAIT(wrwait);

/*
* Pipes are system-local resources, so sleeping on them
* is considered a noninteractive wait:
*/
- prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE);
pipe_unlock(pipe);
schedule();
- finish_wait(&pipe->wait, &wait);
+ finish_wait(&pipe->rd_wait, &rdwait);
+ finish_wait(&pipe->wr_wait, &wrwait);
pipe_lock(pipe);
}

@@ -286,7 +289,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
size_t total_len = iov_iter_count(to);
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
- bool was_full;
+ bool was_full, wake_next_reader = false;
ssize_t ret;

/* Null read succeeds. */
@@ -344,10 +347,10 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)

if (!buf->len) {
pipe_buf_release(pipe, buf);
- spin_lock_irq(&pipe->wait.lock);
+ spin_lock_irq(&pipe->rd_wait.lock);
tail++;
pipe->tail = tail;
- spin_unlock_irq(&pipe->wait.lock);
+ spin_unlock_irq(&pipe->rd_wait.lock);
}
total_len -= chars;
if (!total_len)
@@ -371,19 +374,24 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
}
__pipe_unlock(pipe);
if (was_full) {
- wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
+ wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
- wait_event_interruptible(pipe->wait, pipe_readable(pipe));
+ wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe));
__pipe_lock(pipe);
was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
+ wake_next_reader = true;
}
+ if (pipe_empty(pipe->head, pipe->tail))
+ wake_next_reader = false;
__pipe_unlock(pipe);

if (was_full) {
- wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
+ wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
+ if (wake_next_reader)
+ wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
if (ret > 0)
file_accessed(filp);
return ret;
@@ -415,6 +423,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
size_t total_len = iov_iter_count(from);
ssize_t chars;
bool was_empty = false;
+ bool wake_next_writer = false;

/* Null write succeeds. */
if (unlikely(total_len == 0))
@@ -493,16 +502,16 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
* it, either the reader will consume it or it'll still
* be there for the next write.
*/
- spin_lock_irq(&pipe->wait.lock);
+ spin_lock_irq(&pipe->rd_wait.lock);

head = pipe->head;
if (pipe_full(head, pipe->tail, pipe->max_usage)) {
- spin_unlock_irq(&pipe->wait.lock);
+ spin_unlock_irq(&pipe->rd_wait.lock);
continue;
}

pipe->head = head + 1;
- spin_unlock_irq(&pipe->wait.lock);
+ spin_unlock_irq(&pipe->rd_wait.lock);

/* Insert it into the buffer array */
buf = &pipe->bufs[head & mask];
@@ -554,14 +563,17 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
*/
__pipe_unlock(pipe);
if (was_empty) {
- wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
+ wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
- wait_event_interruptible(pipe->wait, pipe_writable(pipe));
+ wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
__pipe_lock(pipe);
was_empty = pipe_empty(head, pipe->tail);
+ wake_next_writer = true;
}
out:
+ if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ wake_next_writer = false;
__pipe_unlock(pipe);

/*
@@ -574,9 +586,11 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
* wake up pending jobs
*/
if (was_empty) {
- wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
+ wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
+ if (wake_next_writer)
+ wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
int err = file_update_time(filp);
if (err)
@@ -620,12 +634,15 @@ pipe_poll(struct file *filp, poll_table *wait)
unsigned int head, tail;

/*
- * Reading only -- no need for acquiring the semaphore.
+ * Reading pipe state only -- no need for acquiring the semaphore.
*
* But because this is racy, the code has to add the
* entry to the poll table _first_ ..
*/
- poll_wait(filp, &pipe->wait, wait);
+ if (filp->f_mode & FMODE_READ)
+ poll_wait(filp, &pipe->rd_wait, wait);
+ if (filp->f_mode & FMODE_WRITE)
+ poll_wait(filp, &pipe->wr_wait, wait);

/*
* .. and only then can you do the racy tests. That way,
@@ -684,7 +701,8 @@ pipe_release(struct inode *inode, struct file *file)
pipe->writers--;

if (pipe->readers || pipe->writers) {
- wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM | EPOLLERR | EPOLLHUP);
+ wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLHUP);
+ wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM | EPOLLERR | EPOLLHUP);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
@@ -767,7 +785,8 @@ struct pipe_inode_info *alloc_pipe_info(void)
GFP_KERNEL_ACCOUNT);

if (pipe->bufs) {
- init_waitqueue_head(&pipe->wait);
+ init_waitqueue_head(&pipe->rd_wait);
+ init_waitqueue_head(&pipe->wr_wait);
pipe->r_counter = pipe->w_counter = 1;
pipe->max_usage = pipe_bufs;
pipe->ring_size = pipe_bufs;
@@ -985,7 +1004,8 @@ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)

static void wake_up_partner(struct pipe_inode_info *pipe)
{
- wake_up_interruptible(&pipe->wait);
+ wake_up_interruptible(&pipe->rd_wait);
+ wake_up_interruptible(&pipe->wr_wait);
}

static int fifo_open(struct inode *inode, struct file *filp)
@@ -1096,13 +1116,13 @@ static int fifo_open(struct inode *inode, struct file *filp)

err_rd:
if (!--pipe->readers)
- wake_up_interruptible(&pipe->wait);
+ wake_up_interruptible(&pipe->wr_wait);
ret = -ERESTARTSYS;
goto err;

err_wr:
if (!--pipe->writers)
- wake_up_interruptible(&pipe->wait);
+ wake_up_interruptible(&pipe->rd_wait);
ret = -ERESTARTSYS;
goto err;

@@ -1229,7 +1249,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
pipe->max_usage = nr_slots;
pipe->tail = tail;
pipe->head = head;
- wake_up_interruptible_all(&pipe->wait);
+ wake_up_interruptible_all(&pipe->rd_wait);
+ wake_up_interruptible_all(&pipe->wr_wait);
return pipe->max_usage * PAGE_SIZE;

out_revert_acct:
diff --git a/fs/splice.c b/fs/splice.c
index 3009652a41c8..d671936d0aad 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -165,8 +165,8 @@ static const struct pipe_buf_operations user_page_pipe_buf_ops = {
static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
smp_mb();
- if (waitqueue_active(&pipe->wait))
- wake_up_interruptible(&pipe->wait);
+ if (waitqueue_active(&pipe->rd_wait))
+ wake_up_interruptible(&pipe->rd_wait);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}

@@ -462,8 +462,8 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
smp_mb();
- if (waitqueue_active(&pipe->wait))
- wake_up_interruptible(&pipe->wait);
+ if (waitqueue_active(&pipe->wr_wait))
+ wake_up_interruptible(&pipe->wr_wait);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index dbcfa6892384..d5765039652a 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -47,7 +47,7 @@ struct pipe_buffer {
**/
struct pipe_inode_info {
struct mutex mutex;
- wait_queue_head_t wait;
+ wait_queue_head_t rd_wait, wr_wait;
unsigned int head;
unsigned int tail;
unsigned int max_usage;