[PATCH v2 2/2] dax,pmem: Implement pmem based dax data recovery

From: Jane Chu
Date: Fri Nov 05 2021 - 21:18:45 EST


For /dev/pmem based dax, enable DAX_OP_RECOVERY mode for
dax_direct_access to translate 'kaddr' over a range that
may contain poison(s); and enable dax_copy_to_iter to
read as much data as possible up till a poisoned page is
encountered; and enable dax_copy_from_iter to clear poison
among a page-aligned range, and then write the good data over.

Signed-off-by: Jane Chu <jane.chu@xxxxxxxxxx>
---
drivers/md/dm.c | 2 ++
drivers/nvdimm/pmem.c | 75 ++++++++++++++++++++++++++++++++++++++++---
fs/dax.c | 24 +++++++++++---
3 files changed, 92 insertions(+), 9 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dc354db22ef9..9b3dac916f22 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1043,6 +1043,7 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
if (!ti)
goto out;
if (!ti->type->dax_copy_from_iter) {
+ WARN_ON(mode == DAX_OP_RECOVERY);
ret = copy_from_iter(addr, bytes, i);
goto out;
}
@@ -1067,6 +1068,7 @@ static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
if (!ti)
goto out;
if (!ti->type->dax_copy_to_iter) {
+ WARN_ON(mode == DAX_OP_RECOVERY);
ret = copy_to_iter(addr, bytes, i);
goto out;
}
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 3dc99e0bf633..8ae6aa678c51 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -260,7 +260,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;

if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
- PFN_PHYS(nr_pages))))
+ PFN_PHYS(nr_pages)) && mode == DAX_OP_NORMAL))
return -EIO;

if (kaddr)
@@ -303,20 +303,85 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
}

/*
- * Use the 'no check' versions of copy_from_iter_flushcache() and
- * copy_mc_to_iter() to bypass HARDENED_USERCOPY overhead. Bounds
- * checking, both file offset and device offset, is handled by
- * dax_iomap_actor()
+ * Even though the 'no check' versions of copy_from_iter_flushcache()
+ * and copy_mc_to_iter() are used to bypass HARDENED_USERCOPY overhead,
+ * 'read'/'write' aren't always safe when poison is consumed. They happen
+ * to be safe because the 'read'/'write' range has been guaranteed
+ * be free of poison(s) by a prior call to dax_direct_access() on the
+ * caller stack.
+ * But on a data recovery code path, the 'read'/'write' range is expected
+ * to contain poison(s), and so poison(s) is explicit checked, such that
+ * 'read' can fetch data from clean page(s) up till the first poison is
+ * encountered, and 'write' requires the range be page aligned in order
+ * to restore the poisoned page's memory type back to "rw" after clearing
+ * the poison(s).
+ * In the event of poison related failure, (size_t) -EIO is returned and
+ * caller may check the return value after casting it to (ssize_t).
+ *
+ * TODO: add support for CPUs that support MOVDIR64B instruction for
+ * faster poison clearing, and possibly smaller error blast radius.
*/
static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i, int mode)
{
+ phys_addr_t pmem_off;
+ size_t len, lead_off;
+ struct pmem_device *pmem = dax_get_private(dax_dev);
+ struct device *dev = pmem->bb.dev;
+
+ if (unlikely(mode == DAX_OP_RECOVERY)) {
+ lead_off = (unsigned long)addr & ~PAGE_MASK;
+ len = PFN_PHYS(PFN_UP(lead_off + bytes));
+ if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
+ if (lead_off || !(PAGE_ALIGNED(bytes))) {
+ dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
+ addr, bytes);
+ return (size_t) -EIO;
+ }
+ pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
+ if (pmem_clear_poison(pmem, pmem_off, bytes) !=
+ BLK_STS_OK)
+ return (size_t) -EIO;
+ }
+ }
+
return _copy_from_iter_flushcache(addr, bytes, i);
}

static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i, int mode)
{
+ int num_bad;
+ size_t len, lead_off;
+ unsigned long bad_pfn;
+ bool bad_pmem = false;
+ size_t adj_len = bytes;
+ sector_t sector, first_bad;
+ struct pmem_device *pmem = dax_get_private(dax_dev);
+ struct device *dev = pmem->bb.dev;
+
+ if (unlikely(mode == DAX_OP_RECOVERY)) {
+ sector = PFN_PHYS(pgoff) / 512;
+ lead_off = (unsigned long)addr & ~PAGE_MASK;
+ len = PFN_PHYS(PFN_UP(lead_off + bytes));
+ if (pmem->bb.count)
+ bad_pmem = !!badblocks_check(&pmem->bb, sector,
+ len / 512, &first_bad, &num_bad);
+ if (bad_pmem) {
+ bad_pfn = PHYS_PFN(first_bad * 512);
+ if (bad_pfn == pgoff) {
+ dev_warn(dev, "Found poison in page: pgoff(%#lx)\n",
+ pgoff);
+ return -EIO;
+ }
+ adj_len = PFN_PHYS(bad_pfn - pgoff) - lead_off;
+ dev_WARN_ONCE(dev, (adj_len > bytes),
+ "out-of-range first_bad?");
+ }
+ if (adj_len == 0)
+ return (size_t) -EIO;
+ }
+
return _copy_mc_to_iter(addr, bytes, i);
}

diff --git a/fs/dax.c b/fs/dax.c
index bea6df1498c3..7640be6b6a97 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1219,6 +1219,8 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
unsigned offset = pos & (PAGE_SIZE - 1);
const size_t size = ALIGN(length + offset, PAGE_SIZE);
const sector_t sector = dax_iomap_sector(iomap, pos);
+ long nr_page = PHYS_PFN(size);
+ int dax_mode = DAX_OP_NORMAL;
ssize_t map_len;
pgoff_t pgoff;
void *kaddr;
@@ -1232,8 +1234,13 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
if (ret)
break;

- map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
- DAX_OP_NORMAL, &kaddr, NULL);
+ map_len = dax_direct_access(dax_dev, pgoff, nr_page, dax_mode,
+ &kaddr, NULL);
+ if (unlikely(map_len == -EIO)) {
+ dax_mode = DAX_OP_RECOVERY;
+ map_len = dax_direct_access(dax_dev, pgoff, nr_page,
+ dax_mode, &kaddr, NULL);
+ }
if (map_len < 0) {
ret = map_len;
break;
@@ -1252,11 +1259,20 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
*/
if (iov_iter_rw(iter) == WRITE)
xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
- map_len, iter, DAX_OP_NORMAL);
+ map_len, iter, dax_mode);
else
xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
- map_len, iter, DAX_OP_NORMAL);
+ map_len, iter, dax_mode);

+ /*
+ * If dax data recovery is enacted via DAX_OP_RECOVERY,
+ * recovery failure would be indicated by a -EIO return
+ * in 'xfer' casted as (size_t).
+ */
+ if ((ssize_t)xfer == -EIO) {
+ ret = -EIO;
+ break;
+ }
pos += xfer;
length -= xfer;
done += xfer;
--
2.18.4