Re: [PATCH] vfio: align capability structures
From: Alex Williamson
Date: Thu Aug 03 2023 - 17:19:20 EST
On Thu, 3 Aug 2023 10:41:09 -0400
Stefan Hajnoczi <stefanha@xxxxxxxxxx> wrote:
> The VFIO_DEVICE_GET_INFO, VFIO_DEVICE_GET_REGION_INFO, and
> VFIO_IOMMU_GET_INFO ioctls fill in an info struct followed by capability
> structs:
>
> +------+---------+---------+-----+
> | info | caps[0] | caps[1] | ... |
> +------+---------+---------+-----+
>
> Both the info and capability struct sizes are not always multiples of
> sizeof(u64), leaving u64 fields in later capability structs misaligned.
>
> Userspace applications currently need to handle misalignment manually in
> order to support CPU architectures and programming languages with strict
> alignment requirements.
>
> Make life easier for userspace by ensuring alignment in the kernel.
> The new layout is as follows:
>
> +------+---+---------+---------+---+-----+
> | info | 0 | caps[0] | caps[1] | 0 | ... |
> +------+---+---------+---------+---+-----+
>
> In this example info and caps[1] have sizes that are not multiples of
> sizeof(u64), so zero padding is added to align the subsequent structure.
>
> Adding zero padding between structs does not break the uapi. The memory
> layout is specified by the info.cap_offset and caps[i].next fields
> filled in by the kernel. Applications use these field values to locate
> structs and are therefore unaffected by the addition of zero padding.
>
> Signed-off-by: Stefan Hajnoczi <stefanha@xxxxxxxxxx>
> ---
> include/linux/vfio.h | 2 +-
> drivers/gpu/drm/i915/gvt/kvmgt.c | 7 +++--
> drivers/s390/cio/vfio_ccw_ops.c | 7 +++--
> drivers/vfio/pci/vfio_pci_core.c | 14 ++++++---
> drivers/vfio/vfio_iommu_type1.c | 7 +++--
> drivers/vfio/vfio_main.c | 53 +++++++++++++++++++++++++++-----
> 6 files changed, 71 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 2c137ea94a3e..ff0864e73cc3 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -272,7 +272,7 @@ struct vfio_info_cap {
> struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
> size_t size, u16 id,
> u16 version);
> -void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset);
> +ssize_t vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset);
>
> int vfio_info_add_capability(struct vfio_info_cap *caps,
> struct vfio_info_cap_header *cap, size_t size);
> diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
> index de675d799c7d..9060e9c6ac7c 100644
> --- a/drivers/gpu/drm/i915/gvt/kvmgt.c
> +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
> @@ -1297,7 +1297,10 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
> info.argsz = sizeof(info) + caps.size;
> info.cap_offset = 0;
> } else {
> - vfio_info_cap_shift(&caps, sizeof(info));
> + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
> + if (cap_offset < 0)
> + return cap_offset;
> +
> if (copy_to_user((void __user *)arg +
> sizeof(info), caps.buf,
> caps.size)) {
> @@ -1305,7 +1308,7 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
> kfree(sparse);
> return -EFAULT;
> }
> - info.cap_offset = sizeof(info);
> + info.cap_offset = cap_offset;
The copy_to_user() above needs to be modified to make this true:
copy_to_user((void __user *)arg + cap_offset,...
The same applies to all of the similar call sites below.
> }
>
> kfree(caps.buf);
> diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
> index 5b53b94f13c7..63d5163376a5 100644
> --- a/drivers/s390/cio/vfio_ccw_ops.c
> +++ b/drivers/s390/cio/vfio_ccw_ops.c
> @@ -361,13 +361,16 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private,
> info->argsz = sizeof(*info) + caps.size;
> info->cap_offset = 0;
> } else {
> - vfio_info_cap_shift(&caps, sizeof(*info));
> + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(*info));
> + if (cap_offset < 0)
> + return cap_offset;
> +
> if (copy_to_user((void __user *)arg + sizeof(*info),
> caps.buf, caps.size)) {
> kfree(caps.buf);
> return -EFAULT;
> }
> - info->cap_offset = sizeof(*info);
> + info->cap_offset = cap_offset;
> }
>
> kfree(caps.buf);
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 20d7b69ea6ff..92c093b99187 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -966,12 +966,15 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev,
> if (info.argsz < sizeof(info) + caps.size) {
> info.argsz = sizeof(info) + caps.size;
> } else {
> - vfio_info_cap_shift(&caps, sizeof(info));
> + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
> + if (cap_offset < 0)
> + return cap_offset;
> +
> if (copy_to_user(arg + 1, caps.buf, caps.size)) {
> kfree(caps.buf);
> return -EFAULT;
> }
> - info.cap_offset = sizeof(*arg);
> + info.cap_offset = cap_offset;
> }
>
> kfree(caps.buf);
> @@ -1107,12 +1110,15 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
> info.argsz = sizeof(info) + caps.size;
> info.cap_offset = 0;
> } else {
> - vfio_info_cap_shift(&caps, sizeof(info));
> + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
> + if (cap_offset < 0)
> + return cap_offset;
> +
> if (copy_to_user(arg + 1, caps.buf, caps.size)) {
> kfree(caps.buf);
> return -EFAULT;
> }
> - info.cap_offset = sizeof(*arg);
> + info.cap_offset = cap_offset;
> }
>
> kfree(caps.buf);
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index ebe0ad31d0b0..ab64b9e3ed7c 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -2808,14 +2808,17 @@ static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
> if (info.argsz < sizeof(info) + caps.size) {
> info.argsz = sizeof(info) + caps.size;
> } else {
> - vfio_info_cap_shift(&caps, sizeof(info));
> + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
> + if (cap_offset < 0)
> + return cap_offset;
> +
> if (copy_to_user((void __user *)arg +
> sizeof(info), caps.buf,
> caps.size)) {
> kfree(caps.buf);
> return -EFAULT;
> }
> - info.cap_offset = sizeof(info);
> + info.cap_offset = cap_offset;
> }
>
> kfree(caps.buf);
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index f0ca33b2e1df..4fc8698577a7 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -1171,8 +1171,18 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
> {
> void *buf;
> struct vfio_info_cap_header *header, *tmp;
> + size_t header_offset;
> + size_t new_size;
>
> - buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
> + /*
> + * Reserve extra space when the previous capability was not a multiple of
> + * the largest field size. This ensures that capabilities are properly
> + * aligned.
> + */
If we simply start with:
size = ALIGN(size, sizeof(u64));
then there should never be a previous misaligned size to correct, right?
I wonder if we really need all this complexity. We're drawing from a
finite set of info structs for the initial alignment; we can pad those
without breaking the uapi, and we can introduce a warning to avoid such
poor alignment in the future. Allocating an aligned size for each
capability is then sufficiently trivial to handle at runtime, e.g.:
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 902f06e52c48..2d074cbd371d 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -1362,6 +1362,8 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
void *buf;
struct vfio_info_cap_header *header, *tmp;
+ size = ALIGN(size, sizeof(u64));
+
buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
if (!buf) {
kfree(caps->buf);
@@ -1395,6 +1397,8 @@ void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
struct vfio_info_cap_header *tmp;
void *buf = (void *)caps->buf;
+ WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
+
for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
tmp->next += offset;
}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index fa06e3eb4955..fd2761841ffe 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -217,6 +217,7 @@ struct vfio_device_info {
__u32 num_regions; /* Max region index + 1 */
__u32 num_irqs; /* Max IRQ index + 1 */
__u32 cap_offset; /* Offset within info struct of first cap */
+ __u32 pad; /* Size must be aligned for caps */
};
#define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7)
@@ -1444,6 +1445,7 @@ struct vfio_iommu_type1_info {
#define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */
__u64 iova_pgsizes; /* Bitmap of supported page sizes */
__u32 cap_offset; /* Offset within info struct of first cap */
+ __u32 pad; /* Size must be aligned for caps */
};
/*
Thanks,
Alex
> + header_offset = ALIGN(caps->size, sizeof(u64));
> + new_size = header_offset + size;
> +
> + buf = krealloc(caps->buf, new_size, GFP_KERNEL);
> if (!buf) {
> kfree(caps->buf);
> caps->buf = NULL;
> @@ -1181,10 +1191,10 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
> }
>
> caps->buf = buf;
> - header = buf + caps->size;
> + header = buf + header_offset;
>
> /* Eventually copied to user buffer, zero */
> - memset(header, 0, size);
> + memset(buf + caps->size, 0, new_size - caps->size);
>
> header->id = id;
> header->version = version;
> @@ -1193,20 +1203,47 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
> for (tmp = buf; tmp->next; tmp = buf + tmp->next)
> ; /* nothing */
>
> - tmp->next = caps->size;
> - caps->size += size;
> + tmp->next = header_offset;
> + caps->size = new_size;
>
> return header;
> }
> EXPORT_SYMBOL_GPL(vfio_info_cap_add);
>
> -void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
> +/*
> + * Adjust the capability next fields to account for the given offset at which
> + * capability structures start and any padding added for alignment. Returns the
> + * cap_offset or -errno.
> + */
> +ssize_t vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
> {
> struct vfio_info_cap_header *tmp;
> + struct vfio_info_cap_header *next_tmp;
> void *buf = (void *)caps->buf;
> + size_t pad = ALIGN(offset, sizeof(u64)) - offset;
> + size_t cap_offset = offset + pad;
>
> - for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
> - tmp->next += offset;
> + /* Shift the next fields to account for offset and pad */
> + for (tmp = buf; tmp->next; tmp = next_tmp) {
> + next_tmp = buf + tmp->next;
> + tmp->next += cap_offset;
> + }
> +
> + /* Pad with zeroes so capabilities start with proper alignment */
> + buf = krealloc(caps->buf, caps->size + pad, GFP_KERNEL);
> + if (!buf) {
> + kfree(caps->buf);
> + caps->buf = NULL;
> + caps->size = 0;
> + return -ENOMEM;
> + }
> +
> + memmove(buf + pad, buf, caps->size);
> + memset(buf, 0, pad);
> +
> + caps->buf = buf;
> + caps->size += pad;
> + return cap_offset;
> }
> EXPORT_SYMBOL(vfio_info_cap_shift);
>