Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression

From: Eric W. Biederman
Date: Sat May 27 2023 - 05:50:00 EST


Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> writes:

> So I'd really like to finish this. Even if we end up with a hack or
> two in signal handling that we can hopefully fix up later by having
> vhost fix up some of its current assumptions.


The real sticky widget for me is how to handle one of these processes
coredumping. It really looks like it will result in a reliable hang.

Limiting ourselves to changes that will only affect vhost, all I can
see would be allowing the vhost_worker thread to exit as soon as
get_signal reports the process is exiting. Then vhost_dev_flush
would need to process the pending work.

Something like this:

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index a92af08e7864..fb5ebc50c553 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -234,14 +234,31 @@ EXPORT_SYMBOL_GPL(vhost_poll_stop);
void vhost_dev_flush(struct vhost_dev *dev)
{
struct vhost_flush_struct flush;
+ struct vhost_worker *worker = dev->worker;
+ struct llist_node *node, *head;
+
+ if (!worker)
+ return;
+
+ init_completion(&flush.wait_event);
+ vhost_work_init(&flush.work, vhost_flush_work);

- if (dev->worker) {
- init_completion(&flush.wait_event);
- vhost_work_init(&flush.work, vhost_flush_work);
+ vhost_work_queue(dev, &flush.work);

- vhost_work_queue(dev, &flush.work);
- wait_for_completion(&flush.wait_event);
+ /* Either vhost_worker runs the pending work or we do */
+ node = llist_del_all(&worker->work_list);
+ if (node) {
+ node = llist_reverse_order(node);
+ /* make sure flag is seen after deletion */
+ smp_wmb();
+ llist_for_each_entry_safe(work, work_next, node, node) {
+ clear_bit(VHOST_WORK_QUEUED, &work->flags);
+ work->fn(work);
+ cond_resched();
+ }
}
+
+ wait_for_completion(&flush.wait_event);
}
EXPORT_SYMBOL_GPL(vhost_dev_flush);

@@ -338,6 +355,7 @@ static int vhost_worker(void *data)
struct vhost_worker *worker = data;
struct vhost_work *work, *work_next;
struct llist_node *node;
+ struct ksignal ksig;

for (;;) {
/* mb paired w/ kthread_stop */
@@ -348,6 +366,9 @@ static int vhost_worker(void *data)
break;
}

+ if (get_signal(&ksig))
+ break;
+
node = llist_del_all(&worker->work_list);
if (!node)
schedule();
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index b7cbd66f889e..613d52f01c07 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -47,6 +47,7 @@ void vhost_task_stop(struct vhost_task *vtsk)
* not exiting then reap the task.
*/
kernel_wait4(pid, NULL, __WCLONE, NULL);
+ put_task_struct(vtsk->task);
kfree(vtsk);
}
EXPORT_SYMBOL_GPL(vhost_task_stop);
@@ -101,7 +102,7 @@ struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
return NULL;
}

- vtsk->task = tsk;
+ vtsk->task = get_task_struct(tsk);
return vtsk;
}
EXPORT_SYMBOL_GPL(vhost_task_create);

Eric