Re: [PATCH 0/2] pipe: Fixes [ver #2]

From: Linus Torvalds
Date: Mon Dec 09 2019 - 12:49:12 EST


[ Added DJ to the participants, since he seems to be the Fedora make
maintainer - DJ, any chance that this absolutely horrid 'make' buf can
be fixed in older versions too, not just rawhide? The bugfix is two
and a half years old by now, and the bug looks real and very serious ]

On Mon, Dec 9, 2019 at 1:54 AM Vincent Guittot
<vincent.guittot@xxxxxxxxxx> wrote:
>
> Which version of make should I use to reproduce the problem ?

So the problematic one is "make-4.2.1-13.fc30.x86_64" in Fedora 30.
I'm assuming it's fairly plain 4.2.1, but I didn't try to look into
the source rpm or anything like that.

The working one for me was just the top of -git from

https://git.savannah.gnu.org/git/make.git

which is 4.2.92 right now.

The fix is presumably commit b552b05 ("[SV 51159] Use a non-blocking
read with pselect to avoid hangs") as per Akemi. That is indeed after
4.2.1, and it looks real.

Before that commit the buggy jobserver code basically does

(1) use pselect() to wait for readable and see child deaths atomically
(2) use blocking read to get the token

and while (1) is atomic, if the child death happens between the two,
it goes into the blocking read and has SIGCHLD blocked, so it will try
to read the token from the token pipe, but it will never react to the
child death - and the child death is what is going to _release_ a
token.

So what seems to happen is that when the right timing triggers, you
end up with a lot of sub-makes waiting for a token, but they are also
all supposed to _release_ a token. So you don't have enough tokens to
go around. In the worst case, _everybody_ who has a token is also not
releasing it, and then you end up triggering the timeout code (after
one second), which will make things really go into a crawl.

And by a crawl I mean that worst-case you really end up with just one
job per second per sub-make. It will take _hours_ to compile the
kernel at that speed, when it normally finishes in 15 minutes on my
machine even when I do a from-scratch allmodconfig build.

It does seem to be a major bug in the jobserver code. In particular
with the trial fair and exclusive wakeup patch that I sent out in the
other thread, it seems to be _reliably_ much worse and triggers 100%
of the time for me.

It's possible that my trial patch is buggy, but everything else looks
fine, and with a fixed make the trial patch works for me.

I'll include the trial patch here too, I think I cc'd you on the other
thread too, but hey..

Anyway, it looks like the sync wakeup thing is more of a "get timing
right by luck" thing than anything else. Possibly it actually causes
the reverse order of reader wakeups more often (ie the most _recent_
reader is most likely to get woken up synchronously) and that may be
what really ends up masking the jobserver problem, since apparently
doing wakeups in the fair and proper order makes things much worse..

What a horrible pain that pipe rework ended up being. But I think
we're in better shape now than we used to be, it just had very
unfortunate timing issues and several real bugs.

But sadly, there's no way I can push that fair pipe wakeup thing as
long as this horribly buggy version of make is widespread.

Linus
fs/coredump.c | 4 +--
fs/pipe.c | 67 +++++++++++++++++++++++++++++++----------------
fs/splice.c | 8 +++---
include/linux/pipe_fs_i.h | 2 +-
4 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/fs/coredump.c b/fs/coredump.c
index b1ea7dfbd149..f8296a82d01d 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -517,7 +517,7 @@ static void wait_for_dump_helpers(struct file *file)
pipe_lock(pipe);
pipe->readers++;
pipe->writers--;
- wake_up_interruptible_sync(&pipe->wait);
+ wake_up_interruptible_sync(&pipe->rd_wait);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
pipe_unlock(pipe);

@@ -525,7 +525,7 @@ static void wait_for_dump_helpers(struct file *file)
* We actually want wait_event_freezable() but then we need
* to clear TIF_SIGPENDING and improve dump_interrupted().
*/
- wait_event_interruptible(pipe->wait, pipe->readers == 1);
+ wait_event_interruptible(pipe->rd_wait, pipe->readers == 1);

pipe_lock(pipe);
pipe->readers--;
diff --git a/fs/pipe.c b/fs/pipe.c
index 87109e761fa5..1eb64cab63b5 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -108,16 +108,19 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
- DEFINE_WAIT(wait);
+ DEFINE_WAIT(rdwait);
+ DEFINE_WAIT(wrwait);

/*
* Pipes are system-local resources, so sleeping on them
* is considered a noninteractive wait:
*/
- prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE);
pipe_unlock(pipe);
schedule();
- finish_wait(&pipe->wait, &wait);
+ finish_wait(&pipe->rd_wait, &rdwait);
+ finish_wait(&pipe->wr_wait, &wrwait);
pipe_lock(pipe);
}

@@ -286,7 +289,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
size_t total_len = iov_iter_count(to);
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
- bool was_full;
+ bool was_full, wake_next_reader = false;
ssize_t ret;

/* Null read succeeds. */
@@ -344,10 +347,10 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)

if (!buf->len) {
pipe_buf_release(pipe, buf);
- spin_lock_irq(&pipe->wait.lock);
+ spin_lock_irq(&pipe->rd_wait.lock);
tail++;
pipe->tail = tail;
- spin_unlock_irq(&pipe->wait.lock);
+ spin_unlock_irq(&pipe->rd_wait.lock);
}
total_len -= chars;
if (!total_len)
@@ -371,19 +374,24 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
}
__pipe_unlock(pipe);
if (was_full) {
- wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
+ wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
- wait_event_interruptible(pipe->wait, pipe_readable(pipe));
+ wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe));
__pipe_lock(pipe);
was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
+ wake_next_reader = true;
}
+ if (pipe_empty(pipe->head, pipe->tail))
+ wake_next_reader = false;
__pipe_unlock(pipe);

if (was_full) {
- wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
+ wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
+ if (wake_next_reader)
+ wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
if (ret > 0)
file_accessed(filp);
return ret;
@@ -415,6 +423,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
size_t total_len = iov_iter_count(from);
ssize_t chars;
bool was_empty = false;
+ bool wake_next_writer = false;

/* Null write succeeds. */
if (unlikely(total_len == 0))
@@ -493,16 +502,16 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
* it, either the reader will consume it or it'll still
* be there for the next write.
*/
- spin_lock_irq(&pipe->wait.lock);
+ spin_lock_irq(&pipe->rd_wait.lock);

head = pipe->head;
if (pipe_full(head, pipe->tail, pipe->max_usage)) {
- spin_unlock_irq(&pipe->wait.lock);
+ spin_unlock_irq(&pipe->rd_wait.lock);
continue;
}

pipe->head = head + 1;
- spin_unlock_irq(&pipe->wait.lock);
+ spin_unlock_irq(&pipe->rd_wait.lock);

/* Insert it into the buffer array */
buf = &pipe->bufs[head & mask];
@@ -554,14 +563,17 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
*/
__pipe_unlock(pipe);
if (was_empty) {
- wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
+ wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
- wait_event_interruptible(pipe->wait, pipe_writable(pipe));
+ wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
__pipe_lock(pipe);
was_empty = pipe_empty(head, pipe->tail);
+ wake_next_writer = true;
}
out:
+ if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ wake_next_writer = false;
__pipe_unlock(pipe);

/*
@@ -574,9 +586,11 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
* wake up pending jobs
*/
if (was_empty) {
- wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
+ wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
+ if (wake_next_writer)
+ wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
int err = file_update_time(filp);
if (err)
@@ -620,12 +634,15 @@ pipe_poll(struct file *filp, poll_table *wait)
unsigned int head, tail;

/*
- * Reading only -- no need for acquiring the semaphore.
+ * Reading pipe state only -- no need for acquiring the semaphore.
*
* But because this is racy, the code has to add the
* entry to the poll table _first_ ..
*/
- poll_wait(filp, &pipe->wait, wait);
+ if (filp->f_mode & FMODE_READ)
+ poll_wait(filp, &pipe->rd_wait, wait);
+ if (filp->f_mode & FMODE_WRITE)
+ poll_wait(filp, &pipe->wr_wait, wait);

/*
* .. and only then can you do the racy tests. That way,
@@ -684,7 +701,8 @@ pipe_release(struct inode *inode, struct file *file)
pipe->writers--;

if (pipe->readers || pipe->writers) {
- wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM | EPOLLERR | EPOLLHUP);
+ wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLHUP);
+ wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM | EPOLLERR | EPOLLHUP);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
@@ -767,7 +785,8 @@ struct pipe_inode_info *alloc_pipe_info(void)
GFP_KERNEL_ACCOUNT);

if (pipe->bufs) {
- init_waitqueue_head(&pipe->wait);
+ init_waitqueue_head(&pipe->rd_wait);
+ init_waitqueue_head(&pipe->wr_wait);
pipe->r_counter = pipe->w_counter = 1;
pipe->max_usage = pipe_bufs;
pipe->ring_size = pipe_bufs;
@@ -985,7 +1004,8 @@ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)

static void wake_up_partner(struct pipe_inode_info *pipe)
{
- wake_up_interruptible(&pipe->wait);
+ wake_up_interruptible(&pipe->rd_wait);
+ wake_up_interruptible(&pipe->wr_wait);
}

static int fifo_open(struct inode *inode, struct file *filp)
@@ -1096,13 +1116,13 @@ static int fifo_open(struct inode *inode, struct file *filp)

err_rd:
if (!--pipe->readers)
- wake_up_interruptible(&pipe->wait);
+ wake_up_interruptible(&pipe->wr_wait);
ret = -ERESTARTSYS;
goto err;

err_wr:
if (!--pipe->writers)
- wake_up_interruptible(&pipe->wait);
+ wake_up_interruptible(&pipe->rd_wait);
ret = -ERESTARTSYS;
goto err;

@@ -1229,7 +1249,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
pipe->max_usage = nr_slots;
pipe->tail = tail;
pipe->head = head;
- wake_up_interruptible_all(&pipe->wait);
+ wake_up_interruptible_all(&pipe->rd_wait);
+ wake_up_interruptible_all(&pipe->wr_wait);
return pipe->max_usage * PAGE_SIZE;

out_revert_acct:
diff --git a/fs/splice.c b/fs/splice.c
index 3009652a41c8..d671936d0aad 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -165,8 +165,8 @@ static const struct pipe_buf_operations user_page_pipe_buf_ops = {
static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
smp_mb();
- if (waitqueue_active(&pipe->wait))
- wake_up_interruptible(&pipe->wait);
+ if (waitqueue_active(&pipe->rd_wait))
+ wake_up_interruptible(&pipe->rd_wait);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}

@@ -462,8 +462,8 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
smp_mb();
- if (waitqueue_active(&pipe->wait))
- wake_up_interruptible(&pipe->wait);
+ if (waitqueue_active(&pipe->wr_wait))
+ wake_up_interruptible(&pipe->wr_wait);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index dbcfa6892384..d5765039652a 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -47,7 +47,7 @@ struct pipe_buffer {
**/
struct pipe_inode_info {
struct mutex mutex;
- wait_queue_head_t wait;
+ wait_queue_head_t rd_wait, wr_wait;
unsigned int head;
unsigned int tail;
unsigned int max_usage;