[PATCH v2 vfio 6/6] vfio/pds: Add multi-region support

From: Brett Creeley
Date: Thu Nov 16 2023 - 19:13:05 EST


Only supporting a single region/range is limiting,
wasteful, and in some cases broken (i.e. when there
are large gaps in the iova memory ranges). Fix this
by adding support for multiple regions based on
what the device tells the driver it can support.

Signed-off-by: Brett Creeley <brett.creeley@xxxxxxx>
Signed-off-by: Shannon Nelson <shannon.nelson@xxxxxxx>
---
drivers/vfio/pci/pds/dirty.c | 220 ++++++++++++++++++++++++-----------
drivers/vfio/pci/pds/dirty.h | 4 +-
2 files changed, 156 insertions(+), 68 deletions(-)

diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c
index 4824cdfe01ed..8ddf4346fcd5 100644
--- a/drivers/vfio/pci/pds/dirty.c
+++ b/drivers/vfio/pci/pds/dirty.c
@@ -70,7 +70,7 @@ pds_vfio_print_guest_region_info(struct pds_vfio_pci_device *pds_vfio,
kfree(region_info);
}

-static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_dirty *dirty,
+static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_region *region,
unsigned long bytes)
{
unsigned long *host_seq_bmp, *host_ack_bmp;
@@ -85,20 +85,27 @@ static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_dirty *dirty,
return -ENOMEM;
}

- dirty->region.host_seq = host_seq_bmp;
- dirty->region.host_ack = host_ack_bmp;
- dirty->region.bmp_bytes = bytes;
+ region->host_seq = host_seq_bmp;
+ region->host_ack = host_ack_bmp;
+ region->bmp_bytes = bytes;

return 0;
}

static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty)
{
- vfree(dirty->region.host_seq);
- vfree(dirty->region.host_ack);
- dirty->region.host_seq = NULL;
- dirty->region.host_ack = NULL;
- dirty->region.bmp_bytes = 0;
+ if (!dirty->regions)
+ return;
+
+ for (int i = 0; i < dirty->num_regions; i++) {
+ struct pds_vfio_region *region = &dirty->regions[i];
+
+ vfree(region->host_seq);
+ vfree(region->host_ack);
+ region->host_seq = NULL;
+ region->host_ack = NULL;
+ region->bmp_bytes = 0;
+ }
}

static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio,
@@ -119,10 +126,17 @@ static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio,

static void pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio)
{
- struct pds_vfio_region *region = &pds_vfio->dirty.region;
+ struct pds_vfio_dirty *dirty = &pds_vfio->dirty;

- if (region->sgl)
- __pds_vfio_dirty_free_sgl(pds_vfio, region);
+ if (!dirty->regions)
+ return;
+
+ for (int i = 0; i < dirty->num_regions; i++) {
+ struct pds_vfio_region *region = &dirty->regions[i];
+
+ if (region->sgl)
+ __pds_vfio_dirty_free_sgl(pds_vfio, region);
+ }
}

static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
@@ -156,22 +170,90 @@ static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
return 0;
}

+static void pds_vfio_dirty_free_regions(struct pds_vfio_dirty *dirty)
+{
+ vfree(dirty->regions);
+ dirty->regions = NULL;
+ dirty->num_regions = 0;
+}
+
+static int pds_vfio_dirty_alloc_regions(struct pds_vfio_pci_device *pds_vfio,
+ struct pds_lm_dirty_region_info *region_info,
+ u64 region_page_size, u8 num_regions)
+{
+ struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
+ struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
+ u32 dev_bmp_offset_byte = 0;
+ int err;
+
+ dirty->regions = vcalloc(num_regions, sizeof(struct pds_vfio_region));
+ if (!dirty->regions)
+ return -ENOMEM;
+ dirty->num_regions = num_regions;
+
+ for (int i = 0; i < num_regions; i++) {
+ struct pds_lm_dirty_region_info *ri = &region_info[i];
+ struct pds_vfio_region *region = &dirty->regions[i];
+ u64 region_size, region_start;
+ u32 page_count;
+
+ /* page_count might be adjusted by the device */
+ page_count = le32_to_cpu(ri->page_count);
+ region_start = le64_to_cpu(ri->dma_base);
+ region_size = page_count * region_page_size;
+
+ err = pds_vfio_dirty_alloc_bitmaps(region,
+ page_count / BITS_PER_BYTE);
+ if (err) {
+ dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n",
+ ERR_PTR(err));
+ goto out_free_regions;
+ }
+
+ err = pds_vfio_dirty_alloc_sgl(pds_vfio, region, page_count);
+ if (err) {
+ dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n",
+ ERR_PTR(err));
+ goto out_free_regions;
+ }
+
+ region->size = region_size;
+ region->start = region_start;
+ region->page_size = region_page_size;
+ region->dev_bmp_offset_start_byte = dev_bmp_offset_byte;
+
+ dev_bmp_offset_byte += page_count / BITS_PER_BYTE;
+ if (dev_bmp_offset_byte % BITS_PER_BYTE) {
+ dev_err(&pdev->dev, "Device bitmap offset is mis-aligned\n");
+ err = -EINVAL;
+ goto out_free_regions;
+ }
+ }
+
+ return 0;
+
+out_free_regions:
+ pds_vfio_dirty_free_bitmaps(dirty);
+ pds_vfio_dirty_free_sgl(pds_vfio);
+ pds_vfio_dirty_free_regions(dirty);
+
+ return err;
+}
+
static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
struct rb_root_cached *ranges, u32 nnodes,
u64 *page_size)
{
struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
struct device *pdsc_dev = &pci_physfn(pdev)->dev;
- struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
- u64 region_start, region_size, region_page_size;
struct pds_lm_dirty_region_info *region_info;
struct interval_tree_node *node = NULL;
+ u64 region_page_size = *page_size;
u8 max_regions = 0, num_regions;
dma_addr_t regions_dma = 0;
u32 num_ranges = nnodes;
- u32 page_count;
- u16 len;
int err;
+ u16 len;

dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n",
pds_vfio->vf_id);
@@ -198,39 +280,38 @@ static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
return -EOPNOTSUPP;
}

- /*
- * Only support 1 region for now. If there are any large gaps in the
- * VM's address regions, then this would be a waste of memory as we are
- * generating 2 bitmaps (ack/seq) from the min address to the max
- * address of the VM's address regions. In the future, if we support
- * more than one region in the device/driver we can split the bitmaps
- * on the largest address region gaps. We can do this split up to the
- * max_regions times returned from the dirty_status command.
- */
- max_regions = 1;
if (num_ranges > max_regions) {
vfio_combine_iova_ranges(ranges, nnodes, max_regions);
num_ranges = max_regions;
}

+ region_info = kcalloc(num_ranges, sizeof(*region_info), GFP_KERNEL);
+ if (!region_info)
+ return -ENOMEM;
+ len = num_ranges * sizeof(*region_info);
+
node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
if (!node)
return -EINVAL;
+ for (int i = 0; i < num_ranges; i++) {
+ struct pds_lm_dirty_region_info *ri = &region_info[i];
+ u64 region_size = node->last - node->start + 1;
+ u64 region_start = node->start;
+ u32 page_count;

- region_size = node->last - node->start + 1;
- region_start = node->start;
- region_page_size = *page_size;
+ page_count = DIV_ROUND_UP(region_size, region_page_size);

- len = sizeof(*region_info);
- region_info = kzalloc(len, GFP_KERNEL);
- if (!region_info)
- return -ENOMEM;
+ ri->dma_base = cpu_to_le64(region_start);
+ ri->page_count = cpu_to_le32(page_count);
+ ri->page_size_log2 = ilog2(region_page_size);

- page_count = DIV_ROUND_UP(region_size, region_page_size);
+ dev_dbg(&pdev->dev,
+ "region_info[%d]: region_start 0x%llx region_end 0x%lx region_size 0x%llx page_count %u page_size %llu\n",
+ i, region_start, node->last, region_size, page_count,
+ region_page_size);

- region_info->dma_base = cpu_to_le64(region_start);
- region_info->page_count = cpu_to_le32(page_count);
- region_info->page_size_log2 = ilog2(region_page_size);
+ node = interval_tree_iter_next(node, 0, ULONG_MAX);
+ }

regions_dma = dma_map_single(pdsc_dev, (void *)region_info, len,
DMA_BIDIRECTIONAL);
@@ -239,39 +320,20 @@ static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
goto out_free_region_info;
}

- err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, max_regions);
+ err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, num_ranges);
dma_unmap_single(pdsc_dev, regions_dma, len, DMA_BIDIRECTIONAL);
if (err)
goto out_free_region_info;

- /*
- * page_count might be adjusted by the device,
- * update it before freeing region_info DMA
- */
- page_count = le32_to_cpu(region_info->page_count);
-
- dev_dbg(&pdev->dev,
- "region_info: regions_dma 0x%llx dma_base 0x%llx page_count %u page_size_log2 %u\n",
- regions_dma, region_start, page_count,
- (u8)ilog2(region_page_size));
-
- err = pds_vfio_dirty_alloc_bitmaps(dirty, page_count / BITS_PER_BYTE);
- if (err) {
- dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n",
- ERR_PTR(err));
- goto out_free_region_info;
- }
-
- err = pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->region, page_count);
+ err = pds_vfio_dirty_alloc_regions(pds_vfio, region_info,
+ region_page_size, num_ranges);
if (err) {
- dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n",
- ERR_PTR(err));
- goto out_free_bitmaps;
+ dev_err(&pdev->dev,
+ "Failed to allocate %d regions for tracking dirty regions: %pe\n",
+ num_regions, ERR_PTR(err));
+ goto out_dirty_disable;
}

- dirty->region.start = region_start;
- dirty->region.size = region_size;
- dirty->region.page_size = region_page_size;
pds_vfio_dirty_set_enabled(pds_vfio);

pds_vfio_print_guest_region_info(pds_vfio, max_regions);
@@ -280,8 +342,8 @@ static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,

return 0;

-out_free_bitmaps:
- pds_vfio_dirty_free_bitmaps(dirty);
+out_dirty_disable:
+ pds_vfio_dirty_disable_cmd(pds_vfio);
out_free_region_info:
kfree(region_info);
return err;
@@ -295,6 +357,7 @@ void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd)
pds_vfio_dirty_disable_cmd(pds_vfio);
pds_vfio_dirty_free_sgl(pds_vfio);
pds_vfio_dirty_free_bitmaps(&pds_vfio->dirty);
+ pds_vfio_dirty_free_regions(&pds_vfio->dirty);
}

if (send_cmd)
@@ -365,6 +428,7 @@ static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio,

num_sge = sg_table.nents;
size = num_sge * sizeof(struct pds_lm_sg_elem);
+ offset += region->dev_bmp_offset_start_byte;
dma_sync_single_for_device(pdsc_dev, region->sgl_addr, size, dma_dir);
err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, region->sgl_addr, num_sge,
offset, bmp_bytes, read_seq);
@@ -437,13 +501,28 @@ static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio,
return 0;
}

+static struct pds_vfio_region *
+pds_vfio_get_region(struct pds_vfio_pci_device *pds_vfio, unsigned long iova)
+{
+ struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
+
+ for (int i = 0; i < dirty->num_regions; i++) {
+ struct pds_vfio_region *region = &dirty->regions[i];
+
+ if (iova >= region->start &&
+ iova < (region->start + region->size))
+ return region;
+ }
+
+ return NULL;
+}
+
static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio,
struct iova_bitmap *dirty_bitmap,
unsigned long iova, unsigned long length)
{
struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
- struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
- struct pds_vfio_region *region = &dirty->region;
+ struct pds_vfio_region *region;
u64 bmp_offset, bmp_bytes;
u64 bitmap_size, pages;
int err;
@@ -456,6 +535,13 @@ static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio,
return -EINVAL;
}

+ region = pds_vfio_get_region(pds_vfio, iova);
+ if (!region) {
+ dev_err(dev, "vf%u: Failed to find region that contains iova 0x%lx length 0x%lx\n",
+ pds_vfio->vf_id, iova, length);
+ return -EINVAL;
+ }
+
pages = DIV_ROUND_UP(length, region->page_size);
bitmap_size =
round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE;
diff --git a/drivers/vfio/pci/pds/dirty.h b/drivers/vfio/pci/pds/dirty.h
index a1f6d894f913..c8e23018b801 100644
--- a/drivers/vfio/pci/pds/dirty.h
+++ b/drivers/vfio/pci/pds/dirty.h
@@ -13,11 +13,13 @@ struct pds_vfio_region {
u64 page_size;
struct pds_lm_sg_elem *sgl;
dma_addr_t sgl_addr;
+ u32 dev_bmp_offset_start_byte;
u16 num_sge;
};

struct pds_vfio_dirty {
- struct pds_vfio_region region;
+ struct pds_vfio_region *regions;
+ u8 num_regions;
bool is_enabled;
};

--
2.17.1