RE: [PATCH 3/5] cxl/pci: Discover and cache pointer to RCD dport's CXL RAS registers
From: Dan Williams
Date: Sat Oct 22 2022 - 18:45:13 EST
Terry Bowman wrote:
> CXL RAS information resides in a RAS capability structure located in
> CXL.cache and CXL.mem registers.[1] The RAS capability provides CXL
> specific error information that can be helpful in debugging. This
> information is not currently logged but needs to be logged during PCIe AER
> error handling.
>
> Update the CXL driver to find and cache a pointer to the CXL RAS
> capability. The RAS registers reside in the downport's component register
> block. Note: RAS registers are not in the upport. The component registers
> can be found by first using the RCRB to go to the downport. Next, the
> downport's 64-bit BAR[0] will point to the component register block.
>
> [1] CXL3.0 Spec, '8.2.5 CXL.cache and CXL.mem Registers'
>
> Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx>
> ---
> drivers/cxl/cxl.h | 4 +++
> drivers/cxl/cxlmem.h | 1 +
> drivers/cxl/pci.c | 72 ++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 77 insertions(+)
>
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 7d507ab80a78..69b50131ad86 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -36,6 +36,10 @@
> #define CXL_CM_CAP_CAP_ID_HDM 0x5
> #define CXL_CM_CAP_CAP_HDM_VERSION 1
>
> +/* CXL 3.0 8.2.4.2 CXL RAS Capability Header */
> +#define CXL_CM_CAP_ID_RAS 0x2
> +#define CXL_CM_CAP_SIZE_RAS 0x5C
> +
> /* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */
> #define CXL_HDM_DECODER_CAP_OFFSET 0x0
> #define CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0)
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 079db2e15acc..515273e224ea 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -243,6 +243,7 @@ struct cxl_dev_state {
> u64 next_persistent_bytes;
>
> struct cxl_register_map aer_map;
> + struct cxl_register_map ras_map;
>
> resource_size_t component_reg_phys;
> u64 serial;
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 2287b5225862..7f717fb47a36 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -586,6 +586,78 @@ void cxl_pci_aer_init(struct cxl_memdev *cxlmd)
> }
> EXPORT_SYMBOL_NS_GPL(cxl_pci_aer_init, CXL);
>
> +static resource_size_t cxl_get_dport_ras_base(struct cxl_memdev *cxlmd)
> +{
> + resource_size_t component_reg_phys, offset = 0;
> + struct cxl_dev_state *cxlds = cxlmd->cxlds;
> + void *cap_hdr_addr, *comp_reg_mapped;
> + u32 cap_hdr, ras_cap_hdr;
> + int cap_ndx;
> +
> + comp_reg_mapped = ioremap(cxlds->component_reg_phys +
> + CXL_CM_OFFSET, CXL_COMPONENT_REG_BLOCK_SIZE);
> + if (!comp_reg_mapped)
> + return 0;
> +
> + cap_hdr_addr = comp_reg_mapped;
> + cap_hdr = readl(cap_hdr_addr);
> + for (cap_ndx = 0;
> + cap_ndx < FIELD_GET(CXL_CM_CAP_HDR_ARRAY_SIZE_MASK, cap_hdr);
> + cap_ndx++) {
> + ras_cap_hdr = readl(cap_hdr_addr + cap_ndx*sizeof(u32));
> +
> + if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, ras_cap_hdr) == CXL_CM_CAP_ID_RAS) {
> + pr_debug("RAS cap header = %X @ %pa, cap_ndx = %d\n",
> + ras_cap_hdr, cap_hdr_addr, cap_ndx);
> + break;
> + }
> + }
> +
> + offset = CXL_CM_OFFSET + PCI_EXT_CAP_NEXT(ras_cap_hdr);
> +
> + iounmap(comp_reg_mapped);
> +
> + if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, ras_cap_hdr) != CXL_CM_CAP_ID_RAS)
> + return 0;
> +
> + pr_debug("Found RAS capability @ %llX (%X)\n",
> + component_reg_phys + offset, *((u32 *)(comp_reg_mapped + offset)));
> +
> + return component_reg_phys + offset;
For the RAS capability in the cxl_pci device this patch needs to be
reconciled with this effort:
https://lore.kernel.org/linux-cxl/166336972295.3803215.1047199449525031921.stgit@xxxxxxxxxxxxxxxxxxxxxxxxxx/
I think we will want RCD and VH RAS capability reporting to happen in
the same place, and that cannot be cxl_pci because cxl_pci has no way
to find the RAS registers on its own. It needs help from cxl_mem to
do the upstream cxl_port association first.
Given CXL switches will have their own RAS capabilities to report it
feels like the cxl_port driver is where all of this should be
centralized.
> +}
> +
> +static int cxl_setup_dport_ras(struct cxl_memdev *cxlmd, resource_size_t resource)
> +{
> + struct cxl_register_map *map = &cxlmd->cxlds->ras_map;
> + struct pci_dev *pdev = to_pci_dev(&cxlmd->dev);
> +
> + if (!resource) {
> + pr_err("%s():%d: RAS resource ptr is NULL\n", __func__, __LINE__);
> + return -EINVAL;
> + }
> +
> + map->base = devm_cxl_iomap_block(&pdev->dev, resource, CXL_CM_CAP_SIZE_RAS);
> + if (!map->base)
> + return -ENOMEM;
> +
> + return 0;
> +}
> +
> +void cxl_pci_ras_init(struct cxl_memdev *cxlmd)
> +{
> + resource_size_t cap;
> +
> + /*
> + * TODO - CXL2.0 will need change to support PCI config space.
> + */
> + if (!is_rcd(cxlmd))
> + return;
> +
> + cap = cxl_get_dport_ras_base(cxlmd);
> + cxl_setup_dport_ras(cxlmd, cap);
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_pci_ras_init, CXL);
> +
> static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> {
> struct cxl_register_map map;
> --
> 2.34.1
>