[PATCH 2/3] iopmem: Add a block device driver for PCIe attached IO memory.

From: Stephen Bates
Date: Tue Oct 18 2016 - 18:46:46 EST


Add a new block device driver that binds to PCIe devices and turns
PCIe BARs into DAX-capable block devices.

Signed-off-by: Stephen Bates <sbates@xxxxxxxxxxxx>
Signed-off-by: Logan Gunthorpe <logang@xxxxxxxxxxxx>
---
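For illustration, here is a minimal userspace sketch of the DAX access
path this driver enables. It assumes the device has enumerated as
/dev/iopmem0 and has been formatted and mounted with a DAX capable
filesystem; the device node, mount point and file name are examples
only and not part of this patch:

  /* assumes: mkfs.ext4 /dev/iopmem0; mount -o dax /dev/iopmem0 /mnt/iopmem */
  #include <fcntl.h>
  #include <string.h>
  #include <sys/mman.h>
  #include <unistd.h>

  int main(void)
  {
  	int fd = open("/mnt/iopmem/data", O_CREAT | O_RDWR, 0600);

  	if (fd < 0 || ftruncate(fd, 4096) < 0)
  		return 1;

  	/* With -o dax, loads and stores through this mapping hit the
  	 * PCIe BAR directly, with no page cache copy in between. */
  	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
  		       MAP_SHARED, fd, 0);
  	if (p == MAP_FAILED)
  		return 1;

  	strcpy(p, "hello iopmem");
  	munmap(p, 4096);
  	close(fd);
  	return 0;
  }
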
MAINTAINERS | 7 ++
drivers/block/Kconfig | 27 ++++
drivers/block/Makefile | 1 +
drivers/block/iopmem.c | 343 ++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 378 insertions(+)
create mode 100644 drivers/block/iopmem.c
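
As the Kconfig text below notes, the driver can also be bound to an
arbitrary PCIe function through the standard sysfs bind entry
(/sys/bus/pci/drivers/iopmem/bind, since the driver registers as
"iopmem"). A small C sketch of that; the B:D.F value is an example:

  #include <fcntl.h>
  #include <string.h>
  #include <unistd.h>

  /* Equivalent to: echo 0000:01:00.0 > /sys/bus/pci/drivers/iopmem/bind */
  int bind_iopmem(const char *bdf)
  {
  	int ret, fd = open("/sys/bus/pci/drivers/iopmem/bind", O_WRONLY);

  	if (fd < 0)
  		return -1;
  	ret = write(fd, bdf, strlen(bdf)) < 0 ? -1 : 0;
  	close(fd);
  	return ret;
  }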

diff --git a/MAINTAINERS b/MAINTAINERS
index 1cd38a7..c379f9d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6510,6 +6510,13 @@ S: Maintained
F: Documentation/devicetree/bindings/iommu/
F: drivers/iommu/

+IOPMEM BLOCK DEVICE DRIVER
+M: Stephen Bates <sbates@xxxxxxxxxxxx>
+L: linux-block@xxxxxxxxxxxxxxx
+S: Maintained
+F: drivers/block/iopmem.c
+F: Documentation/blockdev/iopmem.txt
+
IP MASQUERADING
M: Juanjo Ciarlante <jjciarla@xxxxxxxxxxxxxxxx>
S: Maintained
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 39dd30b..13ae1e7 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -537,4 +537,31 @@ config BLK_DEV_RSXX
To compile this driver as a module, choose M here: the
module will be called rsxx.

+config BLK_DEV_IOPMEM
+ tristate "Persistent block device backed by PCIe Memory"
+ depends on ZONE_DEVICE
+ default n
+ help
+ Say Y here if you want to include a generic device driver
+ that can create a block device from persistent PCIe attached
+ IO memory.
+
+ To compile this driver as a module, choose M here: the
+ module will be called iopmem. A block device will be created
+ for each PCIe attached device that matches the vendor and
+ device ID specified in the module. Alternatively, this
+ driver can be bound to an arbitrary PCIe function using the
+ sysfs bind entry.
+
+ This block device supports direct access (DAX) file systems
+ and provides struct page backing for the IO memory. This
+ makes the underlying memory suitable for things like RDMA
+ Memory Regions and Direct IO, which is useful for PCIe
+ peer-to-peer DMA operations.
+
+ Note that persistence is only assured if the memory on the
+ PCIe card has some form of power loss protection. This could
+ be provided via a battery, a supercap/NAND combo, or some
+ exciting new persistent memory technology.
+
endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 1e9661e..1f4f69b 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/
obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o
obj-$(CONFIG_ZRAM) += zram/
+obj-$(CONFIG_BLK_DEV_IOPMEM) += iopmem.o

skd-y := skd_main.o
swim_mod-y := swim.o swim_asm.o
diff --git a/drivers/block/iopmem.c b/drivers/block/iopmem.c
new file mode 100644
index 0000000..4a1e693
--- /dev/null
+++ b/drivers/block/iopmem.c
@@ -0,0 +1,343 @@
+/*
+ * IOPMEM Block Device Driver
+ * Copyright (c) 2016, Microsemi Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * This driver is heavily based on drivers/block/pmem.c.
+ * Copyright (c) 2014, Intel Corporation.
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/pfn_t.h>
+#include <linux/memremap.h>
+
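+/* The BAR whose IO memory is exposed as the block device */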
+static const int BAR_ID = 4;
+
+static const struct pci_device_id iopmem_id_table[] = {
+ { PCI_DEVICE(0x11f8, 0xf115) },
+ { 0, }
+};
+MODULE_DEVICE_TABLE(pci, iopmem_id_table);
+
+struct iopmem_device {
+ struct request_queue *queue;
+ struct gendisk *disk;
+ struct device *dev;
+
+ int instance;
+
+ /* One contiguous memory region per device */
+ phys_addr_t phys_addr;
+ void *virt_addr;
+ size_t size;
+};
+
+/*
+ * We can only access the iopmem device with full word accesses
+ * (64-bit in the loop below), which the regular memcpy does not
+ * guarantee.
+ */
+
+static void memcpy_from_iopmem(void *dst, const void *src, size_t sz)
+{
+ u64 *wdst = dst;
+ const u64 *wsrc = src;
+ u64 tmp;
+
+ while (sz >= sizeof(*wdst)) {
+ *wdst++ = *wsrc++;
+ sz -= sizeof(*wdst);
+ }
+
+ if (!sz)
+ return;
+
+ tmp = *wsrc;
+ memcpy(wdst, &tmp, sz);
+}
+
+static void write_iopmem(void *iopmem_addr, struct page *page,
+ unsigned int off, unsigned int len)
+{
+ void *mem = kmap_atomic(page);
+
+ memcpy(iopmem_addr, mem + off, len);
+ kunmap_atomic(mem);
+}
+
+static void read_iopmem(struct page *page, unsigned int off,
+ void *iopmem_addr, unsigned int len)
+{
+ void *mem = kmap_atomic(page);
+
+ memcpy_from_iopmem(mem + off, iopmem_addr, len);
+ kunmap_atomic(mem);
+}
+
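+/*
+ * Copy one bio_vec worth of data between a host page and the device
+ * memory at the given 512-byte sector offset.
+ */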
+static void iopmem_do_bvec(struct iopmem_device *iopmem, struct page *page,
+ unsigned int len, unsigned int off, bool is_write,
+ sector_t sector)
+{
+ phys_addr_t iopmem_off = sector * 512;
+ void *iopmem_addr = iopmem->virt_addr + iopmem_off;
+
+ if (!is_write) {
+ read_iopmem(page, off, iopmem_addr, len);
+ flush_dcache_page(page);
+ } else {
+ flush_dcache_page(page);
+ write_iopmem(iopmem_addr, page, off, len);
+ }
+}
+
+static blk_qc_t iopmem_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct iopmem_device *iopmem = q->queuedata;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+
+ bio_for_each_segment(bvec, bio, iter) {
+ iopmem_do_bvec(iopmem, bvec.bv_page, bvec.bv_len,
+ bvec.bv_offset, op_is_write(bio_op(bio)),
+ iter.bi_sector);
+ }
+
+ bio_endio(bio);
+ return BLK_QC_T_NONE;
+}
+
+static int iopmem_rw_page(struct block_device *bdev, sector_t sector,
+ struct page *page, bool is_write)
+{
+ struct iopmem_device *iopmem = bdev->bd_queue->queuedata;
+
+ iopmem_do_bvec(iopmem, page, PAGE_SIZE, 0, is_write, sector);
+ page_endio(page, is_write, 0);
+ return 0;
+}
+
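+/*
+ * DAX entry point: return the kernel address and pfn of the device
+ * memory at @sector and how many bytes are addressable from there.
+ */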
+static long iopmem_direct_access(struct block_device *bdev, sector_t sector,
+ void **kaddr, pfn_t *pfn, long size)
+{
+ struct iopmem_device *iopmem = bdev->bd_queue->queuedata;
+ resource_size_t offset = sector * 512;
+
+ if (!iopmem)
+ return -ENODEV;
+
+ *kaddr = iopmem->virt_addr + offset;
+ *pfn = phys_to_pfn_t(iopmem->phys_addr + offset, PFN_DEV | PFN_MAP);
+
+ return iopmem->size - offset;
+}
+
+static const struct block_device_operations iopmem_fops = {
+ .owner = THIS_MODULE,
+ .rw_page = iopmem_rw_page,
+ .direct_access = iopmem_direct_access,
+};
+
+static DEFINE_IDA(iopmem_instance_ida);
+static DEFINE_SPINLOCK(ida_lock);
+
+static int iopmem_set_instance(struct iopmem_device *iopmem)
+{
+ int instance, error;
+
+ do {
+ if (!ida_pre_get(&iopmem_instance_ida, GFP_KERNEL))
+ return -ENODEV;
+
+ spin_lock(&ida_lock);
+ error = ida_get_new(&iopmem_instance_ida, &instance);
+ spin_unlock(&ida_lock);
+
+ } while (error == -EAGAIN);
+
+ if (error)
+ return -ENODEV;
+
+ iopmem->instance = instance;
+ return 0;
+}
+
+static void iopmem_release_instance(struct iopmem_device *iopmem)
+{
+ spin_lock(&ida_lock);
+ ida_remove(&iopmem_instance_ida, iopmem->instance);
+ spin_unlock(&ida_lock);
+}
+
+static int iopmem_attach_disk(struct iopmem_device *iopmem)
+{
+ struct gendisk *disk;
+ int nid = dev_to_node(iopmem->dev);
+ struct request_queue *q = iopmem->queue;
+
+ blk_queue_write_cache(q, true, true);
+ blk_queue_make_request(q, iopmem_make_request);
+ blk_queue_physical_block_size(q, PAGE_SIZE);
+ blk_queue_max_hw_sectors(q, UINT_MAX);
+ blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+ queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
+ q->queuedata = iopmem;
+
+ disk = alloc_disk_node(0, nid);
+ if (unlikely(!disk))
+ return -ENOMEM;
+
+ disk->fops = &iopmem_fops;
+ disk->queue = q;
+ disk->flags = GENHD_FL_EXT_DEVT;
+ sprintf(disk->disk_name, "iopmem%d", iopmem->instance);
+ set_capacity(disk, iopmem->size / 512);
+ iopmem->disk = disk;
+
+ device_add_disk(iopmem->dev, disk);
+ revalidate_disk(disk);
+
+ return 0;
+}
+
+static void iopmem_detach_disk(struct iopmem_device *iopmem)
+{
+ del_gendisk(iopmem->disk);
+ put_disk(iopmem->disk);
+}
+
+static int iopmem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ struct iopmem_device *iopmem;
+ struct device *dev;
+ int err = 0;
+ int nid = dev_to_node(&pdev->dev);
+
+ err = pci_enable_device_mem(pdev);
+ if (err) {
+ dev_err(&pdev->dev, "unable to enable device!\n");
+ goto out;
+ }
+
+ iopmem = kzalloc(sizeof(*iopmem), GFP_KERNEL);
+ if (unlikely(!iopmem)) {
+ err = -ENOMEM;
+ goto out_disable_device;
+ }
+
+ iopmem->phys_addr = pci_resource_start(pdev, BAR_ID);
+ iopmem->size = pci_resource_len(pdev, BAR_ID);
+ iopmem->dev = dev = get_device(&pdev->dev);
+ pci_set_drvdata(pdev, iopmem);
+
+ err = iopmem_set_instance(iopmem);
+ if (err)
+ goto out_put_device;
+
+ dev_info(dev, "bar space 0x%llx len %lld\n",
+ (unsigned long long) iopmem->phys_addr,
+ (unsigned long long) iopmem->size);
+
+ if (!devm_request_mem_region(dev, iopmem->phys_addr,
+ iopmem->size, dev_name(dev))) {
+ dev_warn(dev, "could not reserve region [%pa:0x%zx]\n",
+ &iopmem->phys_addr, iopmem->size);
+ err = -EBUSY;
+ goto out_release_instance;
+ }
+
+ iopmem->queue = blk_alloc_queue_node(GFP_KERNEL, nid);
+ if (!iopmem->queue) {
+ err = -ENOMEM;
+ goto out_release_instance;
+ }
+
+ iopmem->virt_addr = devm_memremap_pages(dev, &pdev->resource[BAR_ID],
+ &iopmem->queue->q_usage_counter,
+ NULL, MEMREMAP_WC);
+ if (IS_ERR(iopmem->virt_addr)) {
+ err = -ENXIO;
+ goto out_free_queue;
+ }
+
+ err = iopmem_attach_disk(iopmem);
+ if (err)
+ goto out_free_queue;
+
+ return 0;
+
+out_free_queue:
+ blk_cleanup_queue(iopmem->queue);
+out_release_instance:
+ iopmem_release_instance(iopmem);
+out_put_device:
+ put_device(&pdev->dev);
+ kfree(iopmem);
+out_disable_device:
+ pci_disable_device(pdev);
+out:
+ return err;
+}
+
+static void iopmem_remove(struct pci_dev *pdev)
+{
+ struct iopmem_device *iopmem = pci_get_drvdata(pdev);
+
+ blk_set_queue_dying(iopmem->queue);
+ iopmem_detach_disk(iopmem);
+ blk_cleanup_queue(iopmem->queue);
+ iopmem_release_instance(iopmem);
+ put_device(iopmem->dev);
+ kfree(iopmem);
+ pci_disable_device(pdev);
+}
+
+static struct pci_driver iopmem_pci_driver = {
+ .name = "iopmem",
+ .id_table = iopmem_id_table,
+ .probe = iopmem_probe,
+ .remove = iopmem_remove,
+};
+
+static int __init iopmem_init(void)
+{
+ int rc;
+
+ rc = pci_register_driver(&iopmem_pci_driver);
+ if (rc)
+ return rc;
+
+ pr_info("iopmem: module loaded\n");
+ return 0;
+}
+
+static void __exit iopmem_exit(void)
+{
+ pci_unregister_driver(&iopmem_pci_driver);
+ pr_info("iopmem: module unloaded\n");
+}
+
+MODULE_AUTHOR("Logan Gunthorpe <logang@xxxxxxxxxxxx>");
+MODULE_LICENSE("GPL");
+module_init(iopmem_init);
+module_exit(iopmem_exit);
--
2.1.4