Re: [PATCH] vfio: align capability structures

From: Stefan Hajnoczi
Date: Fri Aug 04 2023 - 09:35:24 EST


On Thu, Aug 03, 2023 at 03:18:23PM -0600, Alex Williamson wrote:
> On Thu, 3 Aug 2023 10:41:09 -0400
> Stefan Hajnoczi <stefanha@xxxxxxxxxx> wrote:
>
> > The VFIO_DEVICE_GET_INFO, VFIO_DEVICE_GET_REGION_INFO, and
> > VFIO_IOMMU_GET_INFO ioctls fill in an info struct followed by capability
> > structs:
> >
> > +------+---------+---------+-----+
> > | info | caps[0] | caps[1] | ... |
> > +------+---------+---------+-----+
> >
> > Both the info and capability struct sizes are not always multiples of
> > sizeof(u64), leaving u64 fields in later capability structs misaligned.
> >
> > Userspace applications currently need to handle misalignment manually in
> > order to support CPU architectures and programming languages with strict
> > alignment requirements.
> >
> > Make life easier for userspace by ensuring alignment in the kernel.
> > The new layout is as follows:
> >
> > +------+---+---------+---------+---+-----+
> > | info | 0 | caps[0] | caps[1] | 0 | ... |
> > +------+---+---------+---------+---+-----+
> >
> > In this example info and caps[1] have sizes that are not multiples of
> > sizeof(u64), so zero padding is added to align the subsequent structure.
> >
> > Adding zero padding between structs does not break the uapi. The memory
> > layout is specified by the info.cap_offset and caps[i].next fields
> > filled in by the kernel. Applications use these field values to locate
> > structs and are therefore unaffected by the addition of zero padding.
> >
> > Signed-off-by: Stefan Hajnoczi <stefanha@xxxxxxxxxx>
> > ---
> > include/linux/vfio.h | 2 +-
> > drivers/gpu/drm/i915/gvt/kvmgt.c | 7 +++--
> > drivers/s390/cio/vfio_ccw_ops.c | 7 +++--
> > drivers/vfio/pci/vfio_pci_core.c | 14 ++++++---
> > drivers/vfio/vfio_iommu_type1.c | 7 +++--
> > drivers/vfio/vfio_main.c | 53 +++++++++++++++++++++++++++-----
> > 6 files changed, 71 insertions(+), 19 deletions(-)
> >
> > diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> > index 2c137ea94a3e..ff0864e73cc3 100644
> > --- a/include/linux/vfio.h
> > +++ b/include/linux/vfio.h
> > @@ -272,7 +272,7 @@ struct vfio_info_cap {
> > struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
> > size_t size, u16 id,
> > u16 version);
> > -void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset);
> > +ssize_t vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset);
> >
> > int vfio_info_add_capability(struct vfio_info_cap *caps,
> > struct vfio_info_cap_header *cap, size_t size);
> > diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
> > index de675d799c7d..9060e9c6ac7c 100644
> > --- a/drivers/gpu/drm/i915/gvt/kvmgt.c
> > +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
> > @@ -1297,7 +1297,10 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
> > info.argsz = sizeof(info) + caps.size;
> > info.cap_offset = 0;
> > } else {
> > - vfio_info_cap_shift(&caps, sizeof(info));
> > + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
> > + if (cap_offset < 0)
> > + return cap_offset;
> > +
> > if (copy_to_user((void __user *)arg +
> > sizeof(info), caps.buf,
> > caps.size)) {
> > @@ -1305,7 +1308,7 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
> > kfree(sparse);
> > return -EFAULT;
> > }
> > - info.cap_offset = sizeof(info);
> > + info.cap_offset = cap_offset;
>
> The copy_to_user() above needs to be modified to make this true:
>
> copy_to_user((void __user *)arg + cap_offset,...
>
> Same for all similar below.

vfio_info_cap_shift() inserts zero padding before the first capability
in order to achieve alignment. The zero padding is part of caps.buf, not
the info struct. Therefore the copy_to_user((void __user *)arg +
sizeof(info), ...) is correct.

It's confusing though. Your suggestion for simplifying alignment below
looks good and will avoid this issue.

>
> > }
> >
> > kfree(caps.buf);
> > diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
> > index 5b53b94f13c7..63d5163376a5 100644
> > --- a/drivers/s390/cio/vfio_ccw_ops.c
> > +++ b/drivers/s390/cio/vfio_ccw_ops.c
> > @@ -361,13 +361,16 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private,
> > info->argsz = sizeof(*info) + caps.size;
> > info->cap_offset = 0;
> > } else {
> > - vfio_info_cap_shift(&caps, sizeof(*info));
> > + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(*info));
> > + if (cap_offset < 0)
> > + return cap_offset;
> > +
> > if (copy_to_user((void __user *)arg + sizeof(*info),
> > caps.buf, caps.size)) {
> > kfree(caps.buf);
> > return -EFAULT;
> > }
> > - info->cap_offset = sizeof(*info);
> > + info->cap_offset = cap_offset;
> > }
> >
> > kfree(caps.buf);
> > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> > index 20d7b69ea6ff..92c093b99187 100644
> > --- a/drivers/vfio/pci/vfio_pci_core.c
> > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > @@ -966,12 +966,15 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev,
> > if (info.argsz < sizeof(info) + caps.size) {
> > info.argsz = sizeof(info) + caps.size;
> > } else {
> > - vfio_info_cap_shift(&caps, sizeof(info));
> > + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
> > + if (cap_offset < 0)
> > + return cap_offset;
> > +
> > if (copy_to_user(arg + 1, caps.buf, caps.size)) {
> > kfree(caps.buf);
> > return -EFAULT;
> > }
> > - info.cap_offset = sizeof(*arg);
> > + info.cap_offset = cap_offset;
> > }
> >
> > kfree(caps.buf);
> > @@ -1107,12 +1110,15 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
> > info.argsz = sizeof(info) + caps.size;
> > info.cap_offset = 0;
> > } else {
> > - vfio_info_cap_shift(&caps, sizeof(info));
> > + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
> > + if (cap_offset < 0)
> > + return cap_offset;
> > +
> > if (copy_to_user(arg + 1, caps.buf, caps.size)) {
> > kfree(caps.buf);
> > return -EFAULT;
> > }
> > - info.cap_offset = sizeof(*arg);
> > + info.cap_offset = cap_offset;
> > }
> >
> > kfree(caps.buf);
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index ebe0ad31d0b0..ab64b9e3ed7c 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -2808,14 +2808,17 @@ static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
> > if (info.argsz < sizeof(info) + caps.size) {
> > info.argsz = sizeof(info) + caps.size;
> > } else {
> > - vfio_info_cap_shift(&caps, sizeof(info));
> > + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
> > + if (cap_offset < 0)
> > + return cap_offset;
> > +
> > if (copy_to_user((void __user *)arg +
> > sizeof(info), caps.buf,
> > caps.size)) {
> > kfree(caps.buf);
> > return -EFAULT;
> > }
> > - info.cap_offset = sizeof(info);
> > + info.cap_offset = cap_offset;
> > }
> >
> > kfree(caps.buf);
> > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> > index f0ca33b2e1df..4fc8698577a7 100644
> > --- a/drivers/vfio/vfio_main.c
> > +++ b/drivers/vfio/vfio_main.c
> > @@ -1171,8 +1171,18 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
> > {
> > void *buf;
> > struct vfio_info_cap_header *header, *tmp;
> > + size_t header_offset;
> > + size_t new_size;
> >
> > - buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
> > + /*
> > + * Reserve extra space when the previous capability was not a multiple of
> > + * the largest field size. This ensures that capabilities are properly
> > + * aligned.
> > + */
>
> If we simply start with:
>
> size = ALIGN(size, sizeof(u64));
>
> then shouldn't there never be a previous misaligned size to correct?

Yes, I applied padding at the beginning but it could be applied at the
end instead.

> I wonder if we really need all this complexity, we're drawing from a
> finite set of info structs for the initial alignment, we can pad those
> without breaking the uapi and we can introduce a warning to avoid such
> poor alignment in the future. Allocating an aligned size for each
> capability is then sufficiently trivial to handle runtime. ex:
>
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index 902f06e52c48..2d074cbd371d 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -1362,6 +1362,8 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
> void *buf;
> struct vfio_info_cap_header *header, *tmp;
>
> + size = ALIGN(size, sizeof(u64));
> +
> buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
> if (!buf) {
> kfree(caps->buf);
> @@ -1395,6 +1397,8 @@ void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
> struct vfio_info_cap_header *tmp;
> void *buf = (void *)caps->buf;
>
> + WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
> +
> for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
> tmp->next += offset;
> }
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index fa06e3eb4955..fd2761841ffe 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -217,6 +217,7 @@ struct vfio_device_info {
> __u32 num_regions; /* Max region index + 1 */
> __u32 num_irqs; /* Max IRQ index + 1 */
> __u32 cap_offset; /* Offset within info struct of first cap */
> + __u32 pad; /* Size must be aligned for caps */
> };
> #define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7)
>
> @@ -1444,6 +1445,7 @@ struct vfio_iommu_type1_info {
> #define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */
> __u64 iova_pgsizes; /* Bitmap of supported page sizes */
> __u32 cap_offset; /* Offset within info struct of first cap */
> + __u32 pad; /* Size must be aligned for caps */
> };
>
> /*

Yes, that's much simpler. I'll send another revision that follows that
approach.

Stefan

Attachment: signature.asc
Description: PGP signature