Re: [PATCH v5 6/7] s390: vfio-ap: add logging to vfio_ap driver
From: Cornelia Huck
Date: Mon Aug 12 2019 - 06:35:29 EST
On Wed, 31 Jul 2019 18:41:16 -0400
Tony Krowiak <akrowiak@xxxxxxxxxxxxx> wrote:
> Added two DBF log files for logging events and errors; one for the vfio_ap
> driver, and one for each matrix mediated device.
While the s390dbf is useful (especially for accessing the information
in dumps), trace points are a more standard interface. Have you
evaluated that as well? (We probably should add something to the
vfio/mdev code as well; tracing there is a good complement to tracing
in vendor drivers.)
Also, isn't this independent of the rest of the series?
>
> Signed-off-by: Tony Krowiak <akrowiak@xxxxxxxxxxxxx>
> ---
> drivers/s390/crypto/vfio_ap_drv.c | 34 +++++++
> drivers/s390/crypto/vfio_ap_ops.c | 187 ++++++++++++++++++++++++++++++----
> drivers/s390/crypto/vfio_ap_private.h | 20 ++++
> 3 files changed, 223 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c
> index d8da520ae1fa..04a77246c22a 100644
> --- a/drivers/s390/crypto/vfio_ap_drv.c
> +++ b/drivers/s390/crypto/vfio_ap_drv.c
> @@ -22,6 +22,10 @@ MODULE_AUTHOR("IBM Corporation");
> MODULE_DESCRIPTION("VFIO AP device driver, Copyright IBM Corp. 2018");
> MODULE_LICENSE("GPL v2");
>
> +uint dbglvl = 3;
> +module_param(dbglvl, uint, 0444);
> +MODULE_PARM_DESC(dbglvl, "VFIO_AP driver debug level.");
More the default debug level, isn't it? (IIRC, you can change the level
of the s390dbfs dynamically.)
> +
> static struct ap_driver vfio_ap_drv;
>
> struct ap_matrix_dev *matrix_dev;
> @@ -158,6 +162,21 @@ static void vfio_ap_matrix_dev_destroy(void)
> root_device_unregister(root_device);
> }
>
> +static void vfio_ap_log_queues_in_use(struct ap_matrix_mdev *matrix_mdev,
> + unsigned long *apm, unsigned long *aqm)
> +{
> + unsigned long apid, apqi;
> +
> + for_each_set_bit_inv(apid, apm, AP_DEVICES) {
> + for_each_set_bit_inv(apqi, aqm, AP_DOMAINS) {
> + VFIO_AP_DBF(matrix_dev, DBF_ERR,
> + "queue %02lx.%04lx in use by mdev %s\n",
> + apid, apqi,
> + dev_name(mdev_dev(matrix_mdev->mdev)));
I remember some issues wrt %s in s390dbfs (lifetime); will this dbf
potentially outlive the mdev? Or is the string copied? (Or has s390dbf
been changed to avoid that trap? If so, please disregard my comments.)
> + }
> + }
> +}
> +
> static bool vfio_ap_resource_in_use(unsigned long *apm, unsigned long *aqm)
> {
> bool in_use = false;
> @@ -179,6 +198,8 @@ static bool vfio_ap_resource_in_use(unsigned long *apm, unsigned long *aqm)
> continue;
>
> in_use = true;
> + vfio_ap_log_queues_in_use(matrix_mdev, apm_intrsctn,
> + aqm_intrsctn);
> }
>
> mutex_unlock(&matrix_dev->lock);
> @@ -186,6 +207,16 @@ static bool vfio_ap_resource_in_use(unsigned long *apm, unsigned long *aqm)
> return in_use;
> }
>
> +static int __init vfio_ap_debug_init(void)
> +{
> + matrix_dev->dbf = debug_register(VFIO_AP_DRV_NAME, 1, 1,
> + DBF_SPRINTF_MAX_ARGS * sizeof(long));
It seems that debug_register() can possibly fail? (Unlikely, but we
should check.)
> + debug_register_view(matrix_dev->dbf, &debug_sprintf_view);
> + debug_set_level(matrix_dev->dbf, dbglvl);
> +
> + return 0;
> +}
> +
> static int __init vfio_ap_init(void)
> {
> int ret;
> @@ -219,6 +250,8 @@ static int __init vfio_ap_init(void)
> return ret;
> }
>
> + vfio_ap_debug_init();
> +
> return 0;
> }
>
> @@ -227,6 +260,7 @@ static void __exit vfio_ap_exit(void)
> vfio_ap_mdev_unregister();
> ap_driver_unregister(&vfio_ap_drv);
> vfio_ap_matrix_dev_destroy();
> + debug_unregister(matrix_dev->dbf);
> }
>
> module_init(vfio_ap_init);
> diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
> index 0e748819abb6..1aa18eba43d0 100644
> --- a/drivers/s390/crypto/vfio_ap_ops.c
> +++ b/drivers/s390/crypto/vfio_ap_ops.c
> @@ -167,17 +167,23 @@ static struct ap_queue_status vfio_ap_irq_disable(struct vfio_ap_queue *q)
> case AP_RESPONSE_INVALID_ADDRESS:
> default:
> /* All cases in default means AP not operational */
> - WARN_ONCE(1, "%s: ap_aqic status %d\n", __func__,
> - status.response_code);
> goto end_free;
> }
> } while (retries--);
>
> - WARN_ONCE(1, "%s: ap_aqic status %d\n", __func__,
> - status.response_code);
> end_free:
> vfio_ap_free_aqic_resources(q);
> q->matrix_mdev = NULL;
> + if (status.response_code) {
If I read the code correctly, we consider AP_RESPONSE_OTHERWISE_CHANGED
a success as well, don't we? (Not sure what that means, though.)
> + VFIO_AP_MDEV_DBF(q->matrix_mdev, DBF_WARN,
> + "IRQ disable failed for queue %02x.%04x: status response code=%u\n",
> + AP_QID_CARD(q->apqn), AP_QID_QUEUE(q->apqn),
> + status.response_code);
> + } else {
> + VFIO_AP_MDEV_DBF(q->matrix_mdev, DBF_INFO,
> + "IRQ disabled for queue %02x.%04x\n",
> + AP_QID_CARD(q->apqn), AP_QID_QUEUE(q->apqn));
> + }
> return status;
> }
>
(...)
> @@ -321,8 +340,29 @@ static void vfio_ap_matrix_init(struct ap_config_info *info,
> matrix->adm_max = info->apxa ? info->Nd : 15;
> }
>
> +static int vfio_ap_mdev_debug_init(struct ap_matrix_mdev *matrix_mdev)
> +{
> + int ret;
> +
> + matrix_mdev->dbf = debug_register(dev_name(mdev_dev(matrix_mdev->mdev)),
> + 1, 1,
> + DBF_SPRINTF_MAX_ARGS * sizeof(long));
> +
> + if (!matrix_mdev->dbf)
> + return -ENOMEM;
Ok, here we do check for the result of debug_register().
> +
> + ret = debug_register_view(matrix_mdev->dbf, &debug_sprintf_view);
> + if (ret)
> + return ret;
Don't we need to clean up ->dbf in the failure case?
Also, we probably need to check this as well for the other dbf.
> +
> + debug_set_level(matrix_mdev->dbf, dbglvl);
> +
> + return 0;
> +}
> +
> static int vfio_ap_mdev_create(struct kobject *kobj, struct mdev_device *mdev)
> {
> + int ret;
> struct ap_matrix_mdev *matrix_mdev;
>
> if ((atomic_dec_if_positive(&matrix_dev->available_instances) < 0))
> @@ -335,6 +375,13 @@ static int vfio_ap_mdev_create(struct kobject *kobj, struct mdev_device *mdev)
> }
>
> matrix_mdev->mdev = mdev;
> +
> + ret = vfio_ap_mdev_debug_init(matrix_mdev);
> + if (ret) {
> + kfree(matrix_mdev);
> + return ret;
You also should bump available_instances again in the failure case.
> + }
> +
> vfio_ap_matrix_init(&matrix_dev->info, &matrix_mdev->matrix);
> mdev_set_drvdata(mdev, matrix_mdev);
> matrix_mdev->pqap_hook.hook = handle_pqap;
> @@ -350,14 +397,19 @@ static int vfio_ap_mdev_remove(struct mdev_device *mdev)
> {
> struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
>
> - if (matrix_mdev->kvm)
> + if (matrix_mdev->kvm) {
> + VFIO_AP_MDEV_DBF(matrix_mdev, DBF_ERR,
> + "remove rejected, mdev in use by %s",
> + matrix_mdev->kvm->debugfs_dentry->d_iname);
Can this be a problem when the kvm goes away (and the d_iname is gone)?
Regardless of s390dbf implementation details, is d_iname even valid in
all cases (no debugfs)?
> return -EBUSY;
> + }
>
> mutex_lock(&matrix_dev->lock);
> vfio_ap_mdev_reset_queues(mdev);
> list_del(&matrix_mdev->node);
> mutex_unlock(&matrix_dev->lock);
>
> + debug_unregister(matrix_mdev->dbf);
> kfree(matrix_mdev);
> mdev_set_drvdata(mdev, NULL);
> atomic_inc(&matrix_dev->available_instances);
> @@ -406,6 +458,22 @@ static struct attribute_group *vfio_ap_mdev_type_groups[] = {
> NULL,
> };
>
> +static void vfio_ap_mdev_log_sharing_error(struct ap_matrix_mdev *logdev,
> + const char *assigned_to,
> + unsigned long *apm,
> + unsigned long *aqm)
> +{
> + unsigned long apid, apqi;
> +
> + for_each_set_bit_inv(apid, apm, AP_DEVICES) {
> + for_each_set_bit_inv(apqi, aqm, AP_DOMAINS) {
> + VFIO_AP_MDEV_DBF(logdev, DBF_ERR,
> + "queue %02lx.%04lx already assigned to %s\n",
I'm also not 100% sure about string lifetimes here.
> + apid, apqi, assigned_to);
> + }
> + }
> +}
> +
> /**
> * vfio_ap_mdev_verify_no_sharing
> *
> @@ -448,22 +516,39 @@ static int vfio_ap_mdev_verify_no_sharing(struct ap_matrix_mdev *matrix_mdev,
> if (!bitmap_and(aqm, mdev_aqm, lstdev->matrix.aqm, AP_DOMAINS))
> continue;
>
> + vfio_ap_mdev_log_sharing_error(matrix_mdev,
> + dev_name(mdev_dev(lstdev->mdev)),
> + apm, aqm);
> +
> return -EADDRINUSE;
> }
>
> return 0;
> }
>
> -static int vfio_ap_mdev_validate_masks(struct ap_matrix_mdev *matrix_mdev,
> +static int vfio_ap_mdev_validate_masks(struct ap_matrix_mdev *logdev,
> unsigned long *apm, unsigned long *aqm)
> {
> - int ret;
> + int ret = 0;
> + unsigned long apid, apqi;
> +
> + for_each_set_bit_inv(apid, apm, AP_DEVICES) {
> + for_each_set_bit_inv(apqi, aqm, AP_DEVICES) {
> + if (!ap_owned_by_def_drv(apid, apqi))
> + continue;
> +
> + VFIO_AP_MDEV_DBF(logdev, DBF_ERR,
> + "queue %02lx.%04lx owned by zcrypt\n",
s/zcrypt/default driver/ ?
> + apid, apqi);
> +
> + ret = -EADDRNOTAVAIL;
> + }
> + }
>
> - ret = ap_apqn_in_matrix_owned_by_def_drv(apm, aqm);
> if (ret)
> - return (ret == 1) ? -EADDRNOTAVAIL : ret;
> + return ret;
>
> - return vfio_ap_mdev_verify_no_sharing(matrix_mdev, apm, aqm);
> + return vfio_ap_mdev_verify_no_sharing(logdev, apm, aqm);
> }
>
> static void vfio_ap_mdev_update_crycb(struct ap_matrix_mdev *matrix_mdev)
(...)
> @@ -1013,9 +1132,10 @@ static void vfio_ap_mdev_wait_for_qempty(ap_qid_t qid)
> msleep(20);
> break;
> default:
> - pr_warn("%s: tapq response %02x waiting for queue %04x.%02x empty\n",
> - __func__, status.response_code,
> - AP_QID_CARD(qid), AP_QID_QUEUE(qid));
> + WARN_ONCE(1,
> + "%s: tapq response %02x waiting for queue %04x.%02x empty\n",
> + __func__, status.response_code,
> + AP_QID_CARD(qid), AP_QID_QUEUE(qid));
Why this change?
> return;
> }
> } while (--retry);
(...)
> diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h
> index 5cc3c2ebf151..f717e43e10cf 100644
> --- a/drivers/s390/crypto/vfio_ap_private.h
> +++ b/drivers/s390/crypto/vfio_ap_private.h
> @@ -24,6 +24,21 @@
> #define VFIO_AP_MODULE_NAME "vfio_ap"
> #define VFIO_AP_DRV_NAME "vfio_ap"
>
> +#define DBF_ERR 3 /* error conditions */
> +#define DBF_WARN 4 /* warning conditions */
> +#define DBF_INFO 5 /* informational */
> +#define DBF_DEBUG 6 /* for debugging only */
Can you reuse the LOGLEVEL_* constants instead of rolling your own?
> +
> +#define DBF_SPRINTF_MAX_ARGS 5
> +
> +#define VFIO_AP_DBF(d_matrix_dev, ...) \
> + debug_sprintf_event(d_matrix_dev->dbf, ##__VA_ARGS__)
> +
> +#define VFIO_AP_MDEV_DBF(d_matrix_mdev, ...) \
> + debug_sprintf_event(d_matrix_mdev->dbf, ##__VA_ARGS__)
> +
> +extern uint dbglvl;
> +
> /**
> * ap_matrix_dev - the AP matrix device structure
> * @device: generic device structure associated with the AP matrix device
> @@ -43,6 +58,7 @@ struct ap_matrix_dev {
> struct list_head mdev_list;
> struct mutex lock;
> struct ap_driver *vfio_ap_drv;
> + debug_info_t *dbf;
> };
>
> extern struct ap_matrix_dev *matrix_dev;
> @@ -77,6 +93,9 @@ struct ap_matrix {
> * @group_notifier: notifier block used for specifying callback function for
> * handling the VFIO_GROUP_NOTIFY_SET_KVM event
> * @kvm: the struct holding guest's state
> + * @pqap_hook: handler for PQAP instruction
> + * @mdev: the matrix mediated device
Should updating the description for these two go into a trivial
separate patch?
> + * @dbf: the debug info log
> */
> struct ap_matrix_mdev {
> struct list_head node;
> @@ -86,6 +105,7 @@ struct ap_matrix_mdev {
> struct kvm *kvm;
> struct kvm_s390_module_hook pqap_hook;
> struct mdev_device *mdev;
> + debug_info_t *dbf;
> };
>
> extern int vfio_ap_mdev_register(void);