[PATCH 03/13] libmultipath: Add path selection support

From: John Garry

Date: Wed Feb 25 2026 - 10:35:11 EST


Add code for path selection.

NVMe ANA is abstracted into enum mpath_access_state so that SCSI ALUA states
can be mapped onto the same abstraction. Callbacks .is_disabled, .is_optimized,
and .get_access_state are added to query the path access state.

Path selection modes round-robin, NUMA, and queue-depth are added, same
as NVMe supports.

The new functions have almost like-for-like NVMe equivalents:
- __mpath_find_path() -> __nvme_find_path()
- mpath_find_path() -> nvme_find_path()

and similar for all introduced callee functions.

Functions mpath_set_iopolicy() and mpath_get_iopolicy() are added for
setting default iopolicy.

A separate mpath_iopolicy structure is introduced. There is no iopolicy
member included in the mpath_head structure as it may not suit NVMe, where
iopolicy is per-subsystem and not per namespace.

Signed-off-by: John Garry <john.g.garry@xxxxxxxxxx>
---
include/linux/multipath.h | 36 ++++++
lib/multipath.c | 251 ++++++++++++++++++++++++++++++++++++++
2 files changed, 287 insertions(+)

diff --git a/include/linux/multipath.h b/include/linux/multipath.h
index be9dd9fb83345..c964a1aba9c42 100644
--- a/include/linux/multipath.h
+++ b/include/linux/multipath.h
@@ -7,6 +7,22 @@

extern const struct block_device_operations mpath_ops;

+/*
+ * I/O policy used to select the path to issue I/O on: closest NUMA
+ * distance, rotate across usable paths, or shallowest request queue.
+ * Values also index mpath_iopolicy_names[] in lib/multipath.c.
+ */
+enum mpath_iopolicy_e {
+ MPATH_IOPOLICY_NUMA,
+ MPATH_IOPOLICY_RR,
+ MPATH_IOPOLICY_QD,
+};
+
+/*
+ * Holder for an iopolicy setting. Kept separate from mpath_head so a
+ * transport can scope the policy as it needs (e.g. NVMe iopolicy is
+ * per-subsystem, not per-namespace).
+ */
+struct mpath_iopolicy {
+ enum mpath_iopolicy_e iopolicy;
+};
+
+/*
+ * Transport-neutral path access state (modelled on NVMe ANA). Path
+ * selection only considers OPTIMIZED and ACTIVE paths as candidates,
+ * preferring OPTIMIZED; any other value makes the path ineligible.
+ */
+enum mpath_access_state {
+ MPATH_STATE_OPTIMIZED,
+ MPATH_STATE_ACTIVE,
+ MPATH_STATE_INVALID = 0xFF
+};
+
struct mpath_disk {
struct gendisk *disk;
struct kref ref;
@@ -18,10 +34,16 @@ struct mpath_disk {

struct mpath_device {
struct list_head siblings;
+ /* in-flight request count on this path; read by the queue-depth policy */
+ atomic_t nr_active;
struct gendisk *disk;
+ /* NUMA node of the path (NUMA_NO_NODE if unknown); used by numa policy */
+ int numa_node;
};

struct mpath_head_template {
+ /*
+ * Transport-provided callbacks driving path selection: whether a
+ * path may be used at all, whether it is in the optimized state,
+ * its full access state, and the iopolicy for this head.
+ */
+ bool (*is_disabled)(struct mpath_device *);
+ bool (*is_optimized)(struct mpath_device *);
+ enum mpath_access_state (*get_access_state)(struct mpath_device *);
+ enum mpath_iopolicy_e (*get_iopolicy)(struct mpath_head *);
const struct attribute_group **device_groups;
};

@@ -50,6 +72,14 @@ static inline struct mpath_disk *mpath_gendisk_to_disk(struct gendisk *disk)
return mpath_bd_device_to_disk(disk_to_dev(disk));
}

+/*
+ * Lockless snapshot of the current iopolicy. READ_ONCE since the
+ * policy may be updated concurrently (no writer is visible in this
+ * patch - presumably updated locklessly by the policy owner).
+ */
+static inline enum mpath_iopolicy_e mpath_read_iopolicy(
+ struct mpath_iopolicy *mpath_iopolicy)
+{
+ return READ_ONCE(mpath_iopolicy->iopolicy);
+}
+void mpath_synchronize(struct mpath_head *mpath_head);
+int mpath_set_iopolicy(const char *val, int *iopolicy);
+int mpath_get_iopolicy(char *buf, int iopolicy);
int mpath_get_head(struct mpath_head *mpath_head);
void mpath_put_head(struct mpath_head *mpath_head);
struct mpath_head *mpath_alloc_head(void);
@@ -66,4 +96,10 @@ static inline bool is_mpath_head(struct gendisk *disk)
{
return disk->fops == &mpath_ops;
}
+
+/* True when the queue-depth iopolicy is currently selected. */
+static inline bool mpath_qd_iopolicy(struct mpath_iopolicy *mpath_iopolicy)
+{
+ return mpath_read_iopolicy(mpath_iopolicy) == MPATH_IOPOLICY_QD;
+}
+
#endif // _LIBMULTIPATH_H
diff --git a/lib/multipath.c b/lib/multipath.c
index 88efb0ae16acb..65a0d2d2bf524 100644
--- a/lib/multipath.c
+++ b/lib/multipath.c
@@ -6,8 +6,243 @@
#include <linux/module.h>
#include <linux/multipath.h>

+static struct mpath_device *mpath_find_path(struct mpath_head *mpath_head);
+
static struct workqueue_struct *mpath_wq;

+/* Policy names for set/get; indexed by enum mpath_iopolicy_e - keep in sync. */
+static const char *mpath_iopolicy_names[] = {
+ [MPATH_IOPOLICY_NUMA] = "numa",
+ [MPATH_IOPOLICY_RR] = "round-robin",
+ [MPATH_IOPOLICY_QD] = "queue-depth",
+};
+
+/*
+ * Parse @val into an iopolicy, stored in *@iopolicy. Matching is by
+ * name prefix (so e.g. a trailing newline from a sysfs write is
+ * tolerated). Returns 0 on success, -EINVAL on NULL or unknown input.
+ */
+int mpath_set_iopolicy(const char *val, int *iopolicy)
+{
+ unsigned int i;
+
+ if (!val)
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(mpath_iopolicy_names); i++) {
+ if (!strncmp(val, mpath_iopolicy_names[i],
+ strlen(mpath_iopolicy_names[i]))) {
+ *iopolicy = i;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(mpath_set_iopolicy);
+
+/*
+ * Format the iopolicy name into @buf (newline-terminated), returning
+ * the byte count. No bounds check is done here: the caller must pass a
+ * valid enum mpath_iopolicy_e value and a sufficiently large buffer.
+ */
+int mpath_get_iopolicy(char *buf, int iopolicy)
+{
+ return sprintf(buf, "%s\n", mpath_iopolicy_names[iopolicy]);
+}
+EXPORT_SYMBOL_GPL(mpath_get_iopolicy);
+
+/* Wait for all SRCU read-side path-selection sections on this head to finish. */
+void mpath_synchronize(struct mpath_head *mpath_head)
+{
+ synchronize_srcu(&mpath_head->srcu);
+}
+EXPORT_SYMBOL_GPL(mpath_synchronize);
+
+/* Ask the transport whether this path is currently ineligible for I/O. */
+static bool mpath_path_is_disabled(struct mpath_head *mpath_head,
+ struct mpath_device *mpath_device)
+{
+ return mpath_head->mpdt->is_disabled(mpath_device);
+}
+
+/*
+ * Scan all paths and cache the best one for @node in current_path[node].
+ * OPTIMIZED paths are preferred over ACTIVE ones; under the NUMA policy
+ * ties are broken by node distance. Returns NULL when no usable path
+ * exists. Runs inside an SRCU read-side section on mpath_head->srcu.
+ */
+static struct mpath_device *__mpath_find_path(struct mpath_head *mpath_head,
+ enum mpath_iopolicy_e iopolicy, int node)
+{
+ int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
+ /*
+ * Must start out NULL: if the list is empty, or every path is
+ * disabled or in an unusable access state, these are read below
+ * without ever having been assigned.
+ */
+ struct mpath_device *mpath_dev_found = NULL, *mpath_dev_fallback = NULL,
+ *mpath_device;
+
+ list_for_each_entry_srcu(mpath_device, &mpath_head->dev_list, siblings,
+ srcu_read_lock_held(&mpath_head->srcu)) {
+ if (mpath_path_is_disabled(mpath_head, mpath_device))
+ continue;
+
+ if (mpath_device->numa_node != NUMA_NO_NODE &&
+ (iopolicy == MPATH_IOPOLICY_NUMA))
+ distance = node_distance(node, mpath_device->numa_node);
+ else
+ distance = LOCAL_DISTANCE;
+
+ switch (mpath_head->mpdt->get_access_state(mpath_device)) {
+ case MPATH_STATE_OPTIMIZED:
+ if (distance < found_distance) {
+ found_distance = distance;
+ mpath_dev_found = mpath_device;
+ }
+ break;
+ case MPATH_STATE_ACTIVE:
+ if (distance < fallback_distance) {
+ fallback_distance = distance;
+ mpath_dev_fallback = mpath_device;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ /* No optimized path: fall back to the best merely-active one. */
+ if (!mpath_dev_found)
+ mpath_dev_found = mpath_dev_fallback;
+
+ if (mpath_dev_found)
+ rcu_assign_pointer(mpath_head->current_path[node],
+ mpath_dev_found);
+
+ return mpath_dev_found;
+}
+
+/*
+ * Return the path after @mpath_dev in the head's list, wrapping around
+ * to the first entry when @mpath_dev is the last one.
+ */
+static struct mpath_device *mpath_next_dev(struct mpath_head *mpath_head,
+ struct mpath_device *mpath_dev)
+{
+ struct mpath_device *next;
+
+ next = list_next_or_null_rcu(&mpath_head->dev_list,
+ &mpath_dev->siblings, struct mpath_device,
+ siblings);
+ if (!next)
+ next = list_first_or_null_rcu(&mpath_head->dev_list,
+ struct mpath_device, siblings);
+
+ return next;
+}
+
+/*
+ * Round-robin selection: walk the list starting after the cached
+ * current_path[node], take the first OPTIMIZED path found and remember
+ * the last ACTIVE path seen as a fallback. The cached path itself is
+ * only reconsidered per the fallback rules documented below.
+ */
+static struct mpath_device *mpath_round_robin_path(
+ struct mpath_head *mpath_head,
+ enum mpath_iopolicy_e iopolicy)
+{
+ struct mpath_device *mpath_device, *found = NULL;
+ int node = numa_node_id();
+ enum mpath_access_state access_state_old;
+ struct mpath_device *old =
+ srcu_dereference(mpath_head->current_path[node],
+ &mpath_head->srcu);
+
+ /* No cached path for this node yet: do a full scan. */
+ if (unlikely(!old))
+ return __mpath_find_path(mpath_head, iopolicy, node);
+
+ /* Sole path on the list: usable unless disabled. */
+ if (list_is_singular(&mpath_head->dev_list)) {
+ if (mpath_path_is_disabled(mpath_head, old))
+ return NULL;
+ return old;
+ }
+
+ for (mpath_device = mpath_next_dev(mpath_head, old);
+ mpath_device && mpath_device != old;
+ mpath_device = mpath_next_dev(mpath_head, mpath_device)) {
+ enum mpath_access_state access_state;
+
+ if (mpath_path_is_disabled(mpath_head, mpath_device))
+ continue;
+ access_state = mpath_head->mpdt->get_access_state(mpath_device);
+ if (access_state == MPATH_STATE_OPTIMIZED) {
+ found = mpath_device;
+ goto out;
+ }
+ if (access_state == MPATH_STATE_ACTIVE)
+ found = mpath_device;
+ }
+
+ /*
+ * The loop above skips the current path for round-robin semantics.
+ * Fall back to the current path if either:
+ * - no other optimized path found and current is optimized,
+ * - no other usable path found and current is usable.
+ */
+ access_state_old = mpath_head->mpdt->get_access_state(old);
+ if (!mpath_path_is_disabled(mpath_head, old) &&
+ (access_state_old == MPATH_STATE_OPTIMIZED ||
+ (!found && access_state_old == MPATH_STATE_ACTIVE)))
+ return old;
+
+ if (!found)
+ return NULL;
+out:
+ rcu_assign_pointer(mpath_head->current_path[node], found);
+
+ return found;
+}
+
+/*
+ * Queue-depth policy: pick the usable path with the fewest requests in
+ * flight (nr_active), preferring OPTIMIZED paths over ACTIVE ones.
+ */
+static struct mpath_device *mpath_queue_depth_path(struct mpath_head *mpath_head)
+{
+ struct mpath_device *best_opt = NULL, *mpath_device;
+ struct mpath_device *best_nonopt = NULL;
+ unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
+ unsigned int depth;
+
+ list_for_each_entry_srcu(mpath_device, &mpath_head->dev_list, siblings,
+ srcu_read_lock_held(&mpath_head->srcu)) {
+
+ if (mpath_path_is_disabled(mpath_head, mpath_device))
+ continue;
+
+ depth = atomic_read(&mpath_device->nr_active);
+
+ switch (mpath_head->mpdt->get_access_state(mpath_device)) {
+ case MPATH_STATE_OPTIMIZED:
+ if (depth < min_depth_opt) {
+ min_depth_opt = depth;
+ best_opt = mpath_device;
+ }
+ break;
+ case MPATH_STATE_ACTIVE:
+ if (depth < min_depth_nonopt) {
+ min_depth_nonopt = depth;
+ best_nonopt = mpath_device;
+ }
+ break;
+ default:
+ break;
+ }
+
+ /* An idle optimized path cannot be beaten; stop scanning. */
+ if (min_depth_opt == 0)
+ return best_opt;
+ }
+
+ return best_opt ? best_opt : best_nonopt;
+}
+
+/* Ask the transport whether this path is in the optimized access state. */
+static inline bool mpath_path_is_optimized(struct mpath_head *mpath_head,
+ struct mpath_device *mpath_device)
+{
+ return mpath_head->mpdt->is_optimized(mpath_device);
+}
+
+/*
+ * NUMA policy: keep using the cached per-node path while it remains
+ * optimized; otherwise rescan for the closest usable path.
+ */
+static struct mpath_device *mpath_numa_path(struct mpath_head *mpath_head,
+ enum mpath_iopolicy_e iopolicy)
+{
+ struct mpath_device *mpath_device;
+ int node = numa_node_id();
+
+ mpath_device = srcu_dereference(mpath_head->current_path[node],
+ &mpath_head->srcu);
+ if (likely(mpath_device &&
+ mpath_path_is_optimized(mpath_head, mpath_device)))
+ return mpath_device;
+
+ return __mpath_find_path(mpath_head, iopolicy, node);
+}
+
+/*
+ * Select a path for I/O according to the head's iopolicy. The helpers
+ * use srcu_dereference(), so callers are expected to be inside an SRCU
+ * read-side critical section on mpath_head->srcu.
+ * __maybe_unused: no caller exists in this patch - presumably wired up
+ * to I/O submission by a later patch in the series (TODO confirm).
+ */
+__maybe_unused
+static struct mpath_device *mpath_find_path(struct mpath_head *mpath_head)
+{
+ enum mpath_iopolicy_e iopolicy =
+ mpath_head->mpdt->get_iopolicy(mpath_head);
+
+ switch (iopolicy) {
+ case MPATH_IOPOLICY_QD:
+ return mpath_queue_depth_path(mpath_head);
+ case MPATH_IOPOLICY_RR:
+ return mpath_round_robin_path(mpath_head, iopolicy);
+ default:
+ return mpath_numa_path(mpath_head, iopolicy);
+ }
+}
+
static void mpath_free_head(struct kref *ref)
{
struct mpath_head *mpath_head =
@@ -99,6 +334,7 @@ void mpath_remove_disk(struct mpath_disk *mpath_disk)
if (test_and_clear_bit(MPATH_HEAD_DISK_LIVE, &mpath_head->flags)) {
struct gendisk *disk = mpath_disk->disk;

+ mpath_synchronize(mpath_head);
del_gendisk(disk);
}
}
@@ -158,6 +394,21 @@ void mpath_device_set_live(struct mpath_disk *mpath_disk,
}
queue_work(mpath_wq, &mpath_disk->partition_scan_work);
}
+
+ mutex_lock(&mpath_head->lock);
+ if (mpath_path_is_optimized(mpath_head, mpath_device)) {
+ int node, srcu_idx;
+
+ srcu_idx = srcu_read_lock(&mpath_head->srcu);
+ for_each_online_node(node)
+ __mpath_find_path(mpath_head,
+ mpath_head->mpdt->get_iopolicy(mpath_head),
+ node);
+ srcu_read_unlock(&mpath_head->srcu, srcu_idx);
+ }
+ mutex_unlock(&mpath_head->lock);
+
+ mpath_synchronize(mpath_head);
}
EXPORT_SYMBOL_GPL(mpath_device_set_live);

--
2.43.5