[PATCH 08/20] drbd: add DAX/PMEM support for metadata access
From: Christoph Böhmwalder
Date: Fri Mar 27 2026 - 18:43:19 EST
When DRBD's metadata device resides on persistent memory (PMEM/NVDIMM),
accessing it by reading and writing full blocks is unnecessarily
costly.
Add a DAX-based metadata path that directly maps the metadata region,
enabling byte-granular, IRQ-safe access without having to go through
the block layer.
The PMEM path also introduces a more efficient activity log layout:
instead of writing journal transactions, the in-memory LRU-cache hash
table is stored directly in persistent memory and updated in-place.
Similarly, the resync bitmap is accessed directly from PMEM rather than
being loaded into and flushed from DRAM.
This is compiled in only when CONFIG_DEV_DAX_PMEM is enabled.
Co-developed-by: Philipp Reisner <philipp.reisner@xxxxxxxxxx>
Signed-off-by: Philipp Reisner <philipp.reisner@xxxxxxxxxx>
Co-developed-by: Lars Ellenberg <lars.ellenberg@xxxxxxxxxx>
Signed-off-by: Lars Ellenberg <lars.ellenberg@xxxxxxxxxx>
Co-developed-by: Joel Colledge <joel.colledge@xxxxxxxxxx>
Signed-off-by: Joel Colledge <joel.colledge@xxxxxxxxxx>
Co-developed-by: Christoph Böhmwalder <christoph.boehmwalder@xxxxxxxxxx>
Signed-off-by: Christoph Böhmwalder <christoph.boehmwalder@xxxxxxxxxx>
---
drivers/block/drbd/Makefile | 1 +
drivers/block/drbd/drbd_dax_pmem.c | 158 +++++++++++++++++++++++++++++
drivers/block/drbd/drbd_dax_pmem.h | 40 ++++++++
3 files changed, 199 insertions(+)
create mode 100644 drivers/block/drbd/drbd_dax_pmem.c
create mode 100644 drivers/block/drbd/drbd_dax_pmem.h
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
index 7f2655a206aa..4b58eb83fc22 100644
--- a/drivers/block/drbd/Makefile
+++ b/drivers/block/drbd/Makefile
@@ -5,6 +5,7 @@ drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
drbd-y += drbd_interval.o drbd_state.o
drbd-y += drbd_nla.o
drbd-y += drbd_transport.o
+drbd-$(CONFIG_DEV_DAX_PMEM) += drbd_dax_pmem.o
drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
diff --git a/drivers/block/drbd/drbd_dax_pmem.c b/drivers/block/drbd/drbd_dax_pmem.c
new file mode 100644
index 000000000000..6f29dfd763a3
--- /dev/null
+++ b/drivers/block/drbd/drbd_dax_pmem.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ drbd_dax_pmem.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2017, LINBIT HA-Solutions GmbH.
+
+
+ */
+
+/*
+ In case DRBD's meta-data resides in persistent memory, do a few things
+ differently.
+
+ 1 Directly access the bitmap in place. Do not load it into DRAM, do not
+ write it back from DRAM.
+ 2 Use a better fitting format for the on-disk activity log. Instead of
+ writing transactions, the unmangled LRU-cache hash table is stored there.
+*/
+
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/dax.h>
+#include <linux/libnvdimm.h>
+#include <linux/blkdev.h>
+#include <asm/barrier.h>
+#include "drbd_int.h"
+#include "drbd_dax_pmem.h"
+#include "drbd_meta_data.h"
+
+/*
+ * Map the one page that contains the metadata superblock and store the
+ * address of the superblock in bdev->md_on_pmem.
+ *
+ * Returns 0 on success, or -EIO if dax_direct_access() fails or provides
+ * less than one page.
+ */
+static int map_superblock_for_dax(struct drbd_backing_dev *bdev, struct dax_device *dax_dev)
+{
+	const unsigned int page_sect_shift = PAGE_SHIFT - SECTOR_SHIFT;
+	const sector_t page_sect_mask = ((sector_t)1 << page_sect_shift) - 1;
+	/*
+	 * dax_direct_access() takes a page offset into the whole DAX device,
+	 * while md_offset is a sector offset on md_bdev. Add the start sector
+	 * of md_bdev so that a metadata device which is a partition maps its
+	 * own superblock. For a whole device get_start_sect() is 0.
+	 * NOTE(review): assumes md_offset is relative to md_bdev - confirm.
+	 */
+	sector_t sb_sector = get_start_sect(bdev->md_bdev) + bdev->md.md_offset;
+	pgoff_t pgoff = sb_sector >> page_sect_shift;
+	/*
+	 * kaddr refers to the start of page pgoff. Keep the remainder that the
+	 * shift above dropped; it is 0 whenever sb_sector is page aligned.
+	 */
+	long in_page_byte = (sb_sector & page_sect_mask) << SECTOR_SHIFT;
+	long want = 1;
+	void *kaddr;
+	long len;
+	int id;
+
+	id = dax_read_lock();
+	len = dax_direct_access(dax_dev, pgoff, want, DAX_ACCESS, &kaddr, NULL);
+	dax_read_unlock(id);
+
+	/* A negative errno from dax_direct_access() is also caught here. */
+	if (len < want)
+		return -EIO;
+
+	bdev->md_on_pmem = kaddr + in_page_byte;
+
+	return 0;
+}
+
+/**
+ * drbd_dax_open() - Open device for dax and map metadata superblock
+ * @bdev: backing device to be opened
+ *
+ * Looks up the DAX device behind @bdev->md_bdev and maps the superblock via
+ * map_superblock_for_dax(). On success the DAX device is stored in
+ * @bdev->dax_dev; on a mapping failure its reference is dropped again.
+ *
+ * Return: 0 on success, -ENODEV if fs_dax_get_by_bdev() returns no device,
+ * otherwise the error from map_superblock_for_dax().
+ */
+int drbd_dax_open(struct drbd_backing_dev *bdev)
+{
+	/* Out-parameter required by fs_dax_get_by_bdev(); not used afterwards. */
+	u64 part_off;
+	struct dax_device *dax_dev = fs_dax_get_by_bdev(bdev->md_bdev, &part_off, NULL, NULL);
+	int err;
+
+	if (!dax_dev)
+		return -ENODEV;
+
+	err = map_superblock_for_dax(bdev, dax_dev);
+	if (err) {
+		put_dax(dax_dev);
+		return err;
+	}
+
+	bdev->dax_dev = dax_dev;
+	return 0;
+}
+
+/*
+ * Drop the DAX device reference stored in bdev->dax_dev.
+ *
+ * bdev->dax_dev itself is left unchanged, so drbd_md_dax_active() still
+ * reports true for this bdev afterwards.
+ * NOTE(review): confirm no caller inspects bdev after closing it.
+ */
+void drbd_dax_close(struct drbd_backing_dev *bdev)
+{
+	struct dax_device *dax_dev = bdev->dax_dev;
+
+	put_dax(dax_dev);
+}
+
+/**
+ * drbd_dax_map() - Map metadata for dax
+ * @bdev: backing device whose metadata is to be mapped
+ *
+ * Maps the pages that span drbd_md_first_sector() through
+ * drbd_md_last_sector(), then points @bdev->md_on_pmem at the superblock and
+ * @bdev->al_on_pmem at the activity log inside that mapping.
+ *
+ * Return: 0 on success, -EIO if dax_direct_access() fails or covers fewer
+ * pages than requested.
+ */
+int drbd_dax_map(struct drbd_backing_dev *bdev)
+{
+	const unsigned int page_sect_shift = PAGE_SHIFT - SECTOR_SHIFT;
+	const sector_t page_sect_mask = ((sector_t)1 << page_sect_shift) - 1;
+	struct dax_device *dax_dev = bdev->dax_dev;
+	/*
+	 * dax_direct_access() takes page offsets into the whole DAX device,
+	 * so shift every md_bdev sector by the start sector of md_bdev. This
+	 * is 0 unless md_bdev is a partition.
+	 * NOTE(review): assumes the md sectors are relative to md_bdev - confirm.
+	 */
+	sector_t part_start = get_start_sect(bdev->md_bdev);
+	sector_t first_sector = part_start + drbd_md_first_sector(bdev);
+	sector_t end_sector = part_start + drbd_md_last_sector(bdev) + 1;
+	sector_t md_sector = part_start + bdev->md.md_offset;
+	sector_t al_sector = md_sector + bdev->md.al_offset;
+	/* First sector of the page that kaddr will refer to. */
+	sector_t map_sector = first_sector & ~page_sect_mask;
+	pgoff_t pgoff = first_sector >> page_sect_shift;
+	/* Round up so that a partially used last page is mapped as well. */
+	long want = (end_sector - map_sector + page_sect_mask) >> page_sect_shift;
+	/* Byte offsets are taken from the mapped page start, not first_sector. */
+	long md_offset_byte = (md_sector - map_sector) << SECTOR_SHIFT;
+	long al_offset_byte = (al_sector - map_sector) << SECTOR_SHIFT;
+	void *kaddr;
+	long len;
+	int id;
+
+	id = dax_read_lock();
+	len = dax_direct_access(dax_dev, pgoff, want, DAX_ACCESS, &kaddr, NULL);
+	dax_read_unlock(id);
+
+	/* A negative errno from dax_direct_access() is also caught here. */
+	if (len < want)
+		return -EIO;
+
+	bdev->md_on_pmem = kaddr + md_offset_byte;
+	bdev->al_on_pmem = kaddr + al_offset_byte;
+
+	return 0;
+}
+
+/*
+ * Record the new extent number of one activity log element in PMEM.
+ *
+ * The slot is selected by al_ext->lc_index, the value stored is
+ * al_ext->lc_new_number in big-endian byte order, and the written bytes are
+ * handed to arch_wb_cache_pmem() afterwards.
+ */
+void drbd_dax_al_update(struct drbd_device *device, struct lc_element *al_ext)
+{
+	struct al_on_pmem *al = device->ldev->al_on_pmem;
+	__be32 *entry = al->slots + al_ext->lc_index;
+	__be32 new_nr = cpu_to_be32(al_ext->lc_new_number);
+
+	*entry = new_nr;
+	arch_wb_cache_pmem(entry, sizeof(*entry));
+}
+
+
+/*
+ * Write every element on the to_be_changed list of device->act_log to its
+ * PMEM slot and then mark the change committed with lc_committed().
+ *
+ * The whole sequence runs under device->al_lock with interrupts disabled.
+ */
+void drbd_dax_al_begin_io_commit(struct drbd_device *device)
+{
+	struct lc_element *e;
+
+	spin_lock_irq(&device->al_lock);
+
+	list_for_each_entry(e, &device->act_log->to_be_changed, list)
+		drbd_dax_al_update(device, e);
+
+	/*
+	 * arch_wb_cache_pmem() is not guaranteed to contain a store fence (on
+	 * x86 it only issues cache write-back instructions). Fence once for
+	 * the whole batch so the slots are durable before the in-memory cache
+	 * treats the change as committed.
+	 */
+	pmem_wmb();
+
+	lc_committed(device->act_log);
+
+	spin_unlock_irq(&device->al_lock);
+}
+
+/*
+ * Write a fresh activity log image to PMEM: the magic value followed by one
+ * big-endian extent number per slot.
+ *
+ * Slots below device->act_log->nr_elements receive the lc_number of the
+ * matching in-memory element; all remaining slots are set to LC_FREE.
+ *
+ * Always returns 0.
+ */
+int drbd_dax_al_initialize(struct drbd_device *device)
+{
+	struct al_on_pmem *al_on_pmem = device->ldev->al_on_pmem;
+	__be32 *slots = al_on_pmem->slots;
+	/*
+	 * al_size_4k << (12 - 2) is the number of 32-bit words in al_size_4k
+	 * blocks of 4 KiB; one word is subtracted (presumably for the magic).
+	 */
+	int i, al_slots = (device->ldev->md.al_size_4k << (12 - 2)) - 1;
+	/* Bytes from the start of the structure up to the first slot. */
+	size_t al_bytes = (char *)slots - (char *)al_on_pmem;
+
+	al_on_pmem->magic = cpu_to_be32(DRBD_AL_PMEM_MAGIC);
+	/* initialize all slots rather than just the configured number in case
+	 * the configuration is later changed */
+	for (i = 0; i < al_slots; i++) {
+		unsigned int extent_nr = LC_FREE;
+
+		if (i < device->act_log->nr_elements)
+			extent_nr = lc_element_by_index(device->act_log, i)->lc_number;
+		slots[i] = cpu_to_be32(extent_nr);
+	}
+
+	/*
+	 * The stores above go through the CPU cache. Write them back and fence,
+	 * as drbd_dax_al_update() does per slot, so that the initial image
+	 * (magic included) is not lost on power failure.
+	 */
+	if (al_slots > 0)
+		al_bytes += (size_t)al_slots * sizeof(*slots);
+	arch_wb_cache_pmem(al_on_pmem, al_bytes);
+	pmem_wmb();
+
+	return 0;
+}
+
+/*
+ * Return the address that lies md.bm_offset sectors away from
+ * device->ldev->md_on_pmem.
+ *
+ * The @want argument is not used by this function.
+ */
+void *drbd_dax_bitmap(struct drbd_device *device, unsigned long want)
+{
+	struct drbd_backing_dev *ldev = device->ldev;
+	long bm_byte_offset = (long)ldev->md.bm_offset * SECTOR_SIZE;
+	unsigned char *superblock = (unsigned char *)ldev->md_on_pmem;
+
+	return superblock + bm_byte_offset;
+}
diff --git a/drivers/block/drbd/drbd_dax_pmem.h b/drivers/block/drbd/drbd_dax_pmem.h
new file mode 100644
index 000000000000..9a929969ff27
--- /dev/null
+++ b/drivers/block/drbd/drbd_dax_pmem.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef DRBD_DAX_H
+#define DRBD_DAX_H
+
+#include <linux/kconfig.h>
+
+#if IS_ENABLED(CONFIG_DEV_DAX_PMEM)
+
+/* Look up the DAX device of bdev->md_bdev and map the metadata superblock. */
+int drbd_dax_open(struct drbd_backing_dev *bdev);
+/* Drop the DAX device reference stored in bdev->dax_dev. */
+void drbd_dax_close(struct drbd_backing_dev *bdev);
+/* Map the whole metadata area; sets bdev->md_on_pmem and bdev->al_on_pmem. */
+int drbd_dax_map(struct drbd_backing_dev *bdev);
+/* Store al_ext->lc_new_number in the PMEM slot selected by al_ext->lc_index. */
+void drbd_dax_al_update(struct drbd_device *device, struct lc_element *al_ext);
+/* Write all to_be_changed activity log elements to PMEM, then lc_committed(). */
+void drbd_dax_al_begin_io_commit(struct drbd_device *device);
+/* Write the magic and every activity log slot to PMEM; returns 0. */
+int drbd_dax_al_initialize(struct drbd_device *device);
+/* Return md_on_pmem offset by md.bm_offset sectors; @want is unused. */
+void *drbd_dax_bitmap(struct drbd_device *device, unsigned long want);
+
+/* Report whether a DAX device is recorded in bdev->dax_dev. */
+static inline bool drbd_md_dax_active(struct drbd_backing_dev *bdev)
+{
+	return bdev->dax_dev ? true : false;
+}
+/* Return bdev->md_on_pmem as a pointer to the on-disk metadata layout. */
+static inline struct meta_data_on_disk_9 *drbd_dax_md_addr(struct drbd_backing_dev *bdev)
+{
+	struct meta_data_on_disk_9 *md = bdev->md_on_pmem;
+
+	return md;
+}
+#else
+
+/* Stubs for builds without DAX/PMEM metadata support. */
+
+struct drbd_backing_dev;
+
+/*
+ * A function rather than a statement macro: like the real drbd_dax_open() it
+ * yields an int, so callers may either test or ignore the result. Reports
+ * -ENODEV, the value the real function returns when no DAX device is found.
+ */
+static inline int drbd_dax_open(struct drbd_backing_dev *bdev)
+{
+	return -ENODEV;
+}
+#define drbd_dax_close(B) do { } while (0)
+#define drbd_dax_map(B) (-ENOTSUPP)
+/* drbd_dax_al_update() is part of the public interface and needs a stub too. */
+#define drbd_dax_al_update(D, E) do { } while (0)
+#define drbd_dax_al_begin_io_commit(D) do { } while (0)
+#define drbd_dax_al_initialize(D) (-EIO)
+#define drbd_dax_bitmap(D, L) (NULL)
+#define drbd_md_dax_active(B) (false)
+#define drbd_dax_md_addr(B) (NULL)
+
+#define arch_wb_cache_pmem(A, L) do { } while (0)
+
+#endif /* IS_ENABLED(CONFIG_DEV_DAX_PMEM) */
+
+#endif /* DRBD_DAX_H */
--
2.53.0