Re: [PATCH v5 4/7] cxl: Add multi-function sibling coordination for CXL reset
From: Alex Williamson
Date: Fri Mar 06 2026 - 18:35:59 EST
On Fri, 6 Mar 2026 09:23:19 +0000
<smadhavan@xxxxxxxxxx> wrote:
> From: Srirangan Madhavan <smadhavan@xxxxxxxxxx>
>
> Add sibling PCI function save/disable/restore coordination for CXL
> reset. Before reset, all CXL.cachemem sibling functions are locked,
> saved, and disabled; after reset they are restored. The Non-CXL Function
> Map DVSEC and per-function DVSEC capability register are consulted to
> skip non-CXL and CXL.io-only functions. A global mutex serializes
> concurrent resets to prevent deadlocks between sibling functions.
>
> Signed-off-by: Srirangan Madhavan <smadhavan@xxxxxxxxxx>
> ---
> drivers/cxl/core/pci.c | 137 +++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 137 insertions(+)
>
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 9e6f0c4b3cb6..b6f10a2cb404 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -15,6 +15,9 @@
> #include "core.h"
> #include "trace.h"
>
> +/* Initial sibling array capacity: covers max non-ARI functions per slot */
> +#define CXL_RESET_SIBLINGS_INIT 8
> +
> /**
> * DOC: cxl core pci
> *
> @@ -979,3 +982,137 @@ static int __maybe_unused cxl_reset_flush_cpu_caches(struct cxl_memdev *cxlmd)
> device_for_each_child(&endpoint->dev, NULL, cxl_decoder_flush_cache);
> return 0;
> }
> +
> +/*
> + * Serialize all CXL reset operations globally.
> + */
> +static DEFINE_MUTEX(cxl_reset_mutex);
> +
> +struct cxl_reset_context {
> + struct pci_dev *target;
> + struct pci_dev **pci_functions;
> + int pci_func_count;
> + int pci_func_cap;
> +};
> +
> +/*
> + * Check if a sibling function is non-CXL using the Non-CXL Function Map
> + * DVSEC. Returns true if fn is listed as non-CXL, false otherwise (including
> + * on any read failure).
> + */
> +static bool cxl_is_non_cxl_function(struct pci_dev *pdev,
> + u16 func_map_dvsec, int fn)
I think we can do better, how about:
static bool is_cxl_sibling(struct pci_dev *pdev,
struct pci_dev *sibling,
unsigned long *non_cxl_func_map)
{
if (pci_ari_enabled(pdev->bus))
return !test_bit(sibling->devfn, non_cxl_func_map);
return PCI_SLOT(pdev->devfn) == PCI_SLOT(sibling->devfn) ?
!test_bit(PCI_FUNC(sibling->devfn) * 32 +
PCI_SLOT(sibling->devfn), non_cxl_func_map) :
false;
}
To get this we'd eliminate ari and func_map_dvsec from the walk data
and replace with with a bitmap of all 8 non-cxl functions pre-filled
before the walk.
> +{
> + int reg, bit;
> + u32 map;
> +
> + if (pci_ari_enabled(pdev->bus)) {
> + reg = fn / 32;
> + bit = fn % 32;
> + } else {
> + reg = fn;
> + bit = PCI_SLOT(pdev->devfn);
> + }
> +
> + if (pci_read_config_dword(pdev,
> + func_map_dvsec + PCI_DVSEC_CXL_FUNCTION_MAP_REG + (reg * 4),
> + &map))
> + return false;
> +
> + return map & BIT(bit);
> +}
> +
> +struct cxl_reset_walk_ctx {
> + struct cxl_reset_context *ctx;
> + u16 func_map_dvsec;
> + bool ari;
> +};
> +
> +static int cxl_reset_collect_sibling(struct pci_dev *func, void *data)
> +{
> + struct cxl_reset_walk_ctx *wctx = data;
> + struct cxl_reset_context *ctx = wctx->ctx;
> + struct pci_dev *pdev = ctx->target;
> + u16 dvsec, cap;
> + int fn;
> +
> + if (func == pdev)
> + return 0;
> +
> + if (!wctx->ari &&
> + PCI_SLOT(func->devfn) != PCI_SLOT(pdev->devfn))
> + return 0;
> +
> + fn = wctx->ari ? func->devfn : PCI_FUNC(func->devfn);
> + if (wctx->func_map_dvsec &&
> + cxl_is_non_cxl_function(pdev, wctx->func_map_dvsec, fn))
> + return 0;
The above, since the identity check, becomes:
if (!is_cxl_sibling(pdev, func, non_cxl_func_map))
return 0;
> +
> + /* Only coordinate with siblings that have CXL.cachemem */
Is this meant to read as "cache/mem" since we test for either below?
> + dvsec = pci_find_dvsec_capability(func, PCI_VENDOR_ID_CXL,
> + PCI_DVSEC_CXL_DEVICE);
> + if (!dvsec)
> + return 0;
> + if (pci_read_config_word(func, dvsec + PCI_DVSEC_CXL_CAP, &cap))
> + return 0;
> + if (!(cap & (PCI_DVSEC_CXL_CACHE_CAPABLE |
> + PCI_DVSEC_CXL_MEM_CAPABLE)))
> + return 0;
> +
> + /* Grow sibling array; double capacity for ARI devices when running out of space */
> + if (ctx->pci_func_count >= ctx->pci_func_cap) {
> + struct pci_dev **new;
> + int new_cap = ctx->pci_func_cap ? ctx->pci_func_cap * 2
> + : CXL_RESET_SIBLINGS_INIT;
> +
> + new = krealloc(ctx->pci_functions,
> + new_cap * sizeof(*new), GFP_KERNEL);
> + if (!new)
> + return 1;
> + ctx->pci_functions = new;
> + ctx->pci_func_cap = new_cap;
> + }
> +
> + pci_dev_get(func);
> + ctx->pci_functions[ctx->pci_func_count++] = func;
> + return 0;
> +}
> +
> +static void __maybe_unused cxl_pci_functions_reset_prepare(struct cxl_reset_context *ctx)
> +{
> + struct pci_dev *pdev = ctx->target;
> + struct cxl_reset_walk_ctx wctx;
> + int i;
> +
> + ctx->pci_func_count = 0;
> + ctx->pci_functions = NULL;
> + ctx->pci_func_cap = 0;
> +
> + wctx.ctx = ctx;
> + wctx.ari = pci_ari_enabled(pdev->bus);
> + wctx.func_map_dvsec = pci_find_dvsec_capability(pdev,
> + PCI_VENDOR_ID_CXL, PCI_DVSEC_CXL_FUNCTION_MAP);
> +
> + /* Collect CXL.cachemem siblings under pci_bus_sem */
> + pci_walk_bus(pdev->bus, cxl_reset_collect_sibling, &wctx);
> +
> + /* Lock and save/disable siblings outside pci_bus_sem */
> + for (i = 0; i < ctx->pci_func_count; i++) {
> + pci_dev_lock(ctx->pci_functions[i]);
> + pci_dev_save_and_disable(ctx->pci_functions[i]);
> + }
We also need to trigger the pci_dev_reset_iommu_{prepare,done}() hook
around reset.
Also, I think this whole path needs to be able to return error. We've
got the krealloc in the walk function that can fail and is just
silently ignored and also the pci_dev_lock() here, where we've
eliminated the PCI bus semaphore issue, but it's still prone to dead-
lock. We should use trylock and support unwind when it fails. Return
an errno to the sysfs interface.
An example deadlock that could occur is a CXL reset initiated on fn1,
where we then try to lock fn0, fn2, etc (fn1 already locked) while
another interface walks fn0, fn1, etc. Thanks,
Alex
> +}
> +
> +static void __maybe_unused cxl_pci_functions_reset_done(struct cxl_reset_context *ctx)
> +{
> + int i;
> +
> + for (i = 0; i < ctx->pci_func_count; i++) {
> + pci_dev_restore(ctx->pci_functions[i]);
> + pci_dev_unlock(ctx->pci_functions[i]);
> + pci_dev_put(ctx->pci_functions[i]);
> + }
> + kfree(ctx->pci_functions);
> + ctx->pci_functions = NULL;
> + ctx->pci_func_count = 0;
> +}
> --
> 2.43.0
>