Re: [PATCHv4 3/3] vfio/mdev: Synchronize device create/remove with parent removal

From: Alex Williamson
Date: Wed May 29 2019 - 11:00:03 EST


On Fri, 24 May 2019 08:57:38 -0500
Parav Pandit <parav@xxxxxxxxxxxx> wrote:

> In following sequences, child devices created while removing mdev parent
> device can be left out, or it may lead to race of removing half
> initialized child mdev devices.
>
> issue-1:
> --------
> cpu-0 cpu-1
> ----- -----
> mdev_unregister_device()
> device_for_each_child()
> mdev_device_remove_cb()
> mdev_device_remove()
> create_store()
> mdev_device_create() [...]
> device_add()
> parent_remove_sysfs_files()
>
> /* BUG: device added by cpu-0
> * whose parent is getting removed
> * and it won't process this mdev.
> */
>
> issue-2:
> --------
> Below crash is observed when user initiated remove is in progress
> and mdev_unregister_driver() completes parent unregistration.
>
> cpu-0 cpu-1
> ----- -----
> remove_store()
> mdev_device_remove()
> active = false;
> mdev_unregister_device()
> parent device removed.
> [...]
> parents->ops->remove()
> /*
> * BUG: Accessing invalid parent.
> */
>
> This is similar race like create() racing with mdev_unregister_device().
>
> BUG: unable to handle kernel paging request at ffffffffc0585668
> PGD e8f618067 P4D e8f618067 PUD e8f61a067 PMD 85adca067 PTE 0
> Oops: 0000 [#1] SMP PTI
> CPU: 41 PID: 37403 Comm: bash Kdump: loaded Not tainted 5.1.0-rc6-vdevbus+ #6
> Hardware name: Supermicro SYS-6028U-TR4+/X10DRU-i+, BIOS 2.0b 08/09/2016
> RIP: 0010:mdev_device_remove+0xfa/0x140 [mdev]
> Call Trace:
> remove_store+0x71/0x90 [mdev]
> kernfs_fop_write+0x113/0x1a0
> vfs_write+0xad/0x1b0
> ksys_write+0x5a/0xe0
> do_syscall_64+0x5a/0x210
> entry_SYSCALL_64_after_hwframe+0x49/0xbe
>
> Therefore, mdev core is improved as below to overcome above issues.
>
> Wait for any ongoing mdev create() and remove() to finish before
> unregistering parent device.
> This continues to allow multiple create and remove to progress in
> parallel for different mdev devices as most common case.
> At the same time guard parent removal while parent is being access by
> create() and remove callbacks.
> create()/remove() and unregister_device() are synchronized by the rwsem.
>
> Refactor device removal code to mdev_device_remove_common() to avoid
> acquiring unreg_sem of the parent.
>
> Fixes: 7b96953bc640 ("vfio: Mediated device Core driver")
> Signed-off-by: Parav Pandit <parav@xxxxxxxxxxxx>
> ---
> drivers/vfio/mdev/mdev_core.c | 61 ++++++++++++++++++++++++--------
> drivers/vfio/mdev/mdev_private.h | 2 ++
> 2 files changed, 49 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
> index 0bef0cae1d4b..c5401a8c6843 100644
> --- a/drivers/vfio/mdev/mdev_core.c
> +++ b/drivers/vfio/mdev/mdev_core.c
> @@ -102,11 +102,36 @@ static void mdev_put_parent(struct mdev_parent *parent)
> kref_put(&parent->ref, mdev_release_parent);
> }
>

Some sort of locking semantics comment would be useful here, ex:

/* Caller holds parent unreg_sem read or write lock */

> +static void mdev_device_remove_common(struct mdev_device *mdev)
> +{
> + struct mdev_parent *parent;
> + struct mdev_type *type;
> + int ret;
> +
> + type = to_mdev_type(mdev->type_kobj);
> + mdev_remove_sysfs_files(&mdev->dev, type);
> + device_del(&mdev->dev);
> + parent = mdev->parent;
> + ret = parent->ops->remove(mdev);
> + if (ret)
> + dev_err(&mdev->dev, "Remove failed: err=%d\n", ret);
> +
> + /* Balances with device_initialize() */
> + put_device(&mdev->dev);
> + mdev_put_parent(parent);
> +}
> +
> static int mdev_device_remove_cb(struct device *dev, void *data)
> {
> - if (dev_is_mdev(dev))
> - mdev_device_remove(dev);
> + struct mdev_parent *parent;
> + struct mdev_device *mdev;
>
> + if (!dev_is_mdev(dev))
> + return 0;
> +
> + mdev = to_mdev_device(dev);
> + parent = mdev->parent;
> + mdev_device_remove_common(mdev);

'parent' is unused here and we only use mdev once, so we probably don't
need to put it in a local variable.

> return 0;
> }
>
> @@ -148,6 +173,7 @@ int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops)
> }
>
> kref_init(&parent->ref);
> + init_rwsem(&parent->unreg_sem);
>
> parent->dev = dev;
> parent->ops = ops;
> @@ -206,13 +232,17 @@ void mdev_unregister_device(struct device *dev)
> dev_info(dev, "MDEV: Unregistering\n");
>
> list_del(&parent->next);
> + mutex_unlock(&parent_list_lock);
> +
> + down_write(&parent->unreg_sem);
> +
> class_compat_remove_link(mdev_bus_compat_class, dev, NULL);
>
> device_for_each_child(dev, NULL, mdev_device_remove_cb);
>
> parent_remove_sysfs_files(parent);
> + up_write(&parent->unreg_sem);
>
> - mutex_unlock(&parent_list_lock);
> mdev_put_parent(parent);
> }
> EXPORT_SYMBOL(mdev_unregister_device);
> @@ -265,6 +295,12 @@ int mdev_device_create(struct kobject *kobj,
>
> mdev->parent = parent;
>
> + ret = down_read_trylock(&parent->unreg_sem);
> + if (!ret) {
> + ret = -ENODEV;

I would have expected -EAGAIN or -EBUSY here, but I guess that since we
consider the lock-out to deterministically be the parent going away
that -ENODEV makes sense. Ok.

> + goto mdev_fail;
> + }
> +
> device_initialize(&mdev->dev);
> mdev->dev.parent = dev;
> mdev->dev.bus = &mdev_bus_type;
> @@ -287,6 +323,7 @@ int mdev_device_create(struct kobject *kobj,
>
> mdev->active = true;
> dev_dbg(&mdev->dev, "MDEV: created\n");
> + up_read(&parent->unreg_sem);
>
> return 0;
>
> @@ -295,6 +332,7 @@ int mdev_device_create(struct kobject *kobj,
> add_fail:
> parent->ops->remove(mdev);
> ops_create_fail:
> + up_read(&parent->unreg_sem);
> put_device(&mdev->dev);
> mdev_fail:
> mdev_put_parent(parent);
> @@ -305,7 +343,6 @@ int mdev_device_remove(struct device *dev)
> {
> struct mdev_device *mdev, *tmp;
> struct mdev_parent *parent;
> - struct mdev_type *type;
> int ret;
>
> mdev = to_mdev_device(dev);
> @@ -329,18 +366,14 @@ int mdev_device_remove(struct device *dev)
> mdev->active = false;
> mutex_unlock(&mdev_list_lock);
>
> - type = to_mdev_type(mdev->type_kobj);
> - mdev_remove_sysfs_files(dev, type);
> - device_del(&mdev->dev);
> parent = mdev->parent;
> - ret = parent->ops->remove(mdev);
> - if (ret)
> - dev_err(&mdev->dev, "Remove failed: err=%d\n", ret);
> -
> - /* Balances with device_initialize() */
> - put_device(&mdev->dev);
> - mdev_put_parent(parent);
> + /* Check if parent unregistration has started */
> + ret = down_read_trylock(&parent->unreg_sem);
> + if (!ret)
> + return -ENODEV;
>
> + mdev_device_remove_common(mdev);
> + up_read(&parent->unreg_sem);
> return 0;
> }
>
> diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
> index 924ed2274941..398767526276 100644
> --- a/drivers/vfio/mdev/mdev_private.h
> +++ b/drivers/vfio/mdev/mdev_private.h
> @@ -23,6 +23,8 @@ struct mdev_parent {
> struct list_head next;
> struct kset *mdev_types_kset;
> struct list_head type_list;
> + /* Synchronize device creation/removal with parent unregistration */
> + struct rw_semaphore unreg_sem;
> };
>
> struct mdev_device {