Re: [PATCH 14/26] cxl/region: Read existing extents on region creation

From: Ira Weiny
Date: Wed Apr 10 2024 - 02:29:28 EST


Jørgen Hansen wrote:
> On 3/25/24 00:18, ira.weiny@xxxxxxxxx wrote:
> > From: Navneet Singh <navneet.singh@xxxxxxxxx>
> >
> > Dynamic capacity device extents may be left in an accepted state on a
> > device due to an unexpected host crash. In this case creation of a new
> > region on top of the DC partition (region) is expected to expose those
> > extents for continued use.
> >
> > Once all endpoint decoders are part of a region and the region is being
> > realized read the device extent list. For ease of review, this patch
> > stops after reading the extent list and leaves realization of the region
> > extents to a future patch.
> >
> > Signed-off-by: Navneet Singh <navneet.singh@xxxxxxxxx>
> > Co-developed-by: Ira Weiny <ira.weiny@xxxxxxxxx>
> > Signed-off-by: Ira Weiny <ira.weiny@xxxxxxxxx>
> >
> > ---
> > Changes for v1:
> > [iweiny: remove extent list xarray]
> > [iweiny: Update spec references to 3.1]
> > [iweiny: use struct range in extents]
> > [iweiny: remove all reference tracking and let regions track extents
> > through the extent devices.]
> > [djbw/Jonathan/Fan: move extent tracking to endpoint decoders]
> > ---
> > drivers/cxl/core/core.h | 9 +++
> > drivers/cxl/core/mbox.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++
> > drivers/cxl/core/region.c | 29 +++++++
> > drivers/cxl/cxlmem.h | 49 ++++++++++++
> > 4 files changed, 279 insertions(+)
> >
> > diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> > index 91abeffbe985..119b12362977 100644
> > --- a/drivers/cxl/core/core.h
> > +++ b/drivers/cxl/core/core.h
> > @@ -4,6 +4,8 @@
> > #ifndef __CXL_CORE_H__
> > #define __CXL_CORE_H__
> >
> > +#include <cxlmem.h>
> > +
> > extern const struct device_type cxl_nvdimm_bridge_type;
> > extern const struct device_type cxl_nvdimm_type;
> > extern const struct device_type cxl_pmu_type;
> > @@ -28,6 +30,8 @@ void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled);
> > int cxl_region_init(void);
> > void cxl_region_exit(void);
> > int cxl_get_poison_by_endpoint(struct cxl_port *port);
> > +int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled,
> > + struct cxl_dc_extent *dc_extent);
> > #else
> > static inline int cxl_get_poison_by_endpoint(struct cxl_port *port)
> > {
> > @@ -43,6 +47,11 @@ static inline int cxl_region_init(void)
> > static inline void cxl_region_exit(void)
> > {
> > }
> > +static inline int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled,
> > + struct cxl_dc_extent *dc_extent)
> > +{
> > + return 0;
> > +}
> > #define CXL_REGION_ATTR(x) NULL
> > #define CXL_REGION_TYPE(x) NULL
> > #define SET_CXL_REGION_ATTR(x)
> > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> > index 58b31fa47b93..9e33a0976828 100644
> > --- a/drivers/cxl/core/mbox.c
> > +++ b/drivers/cxl/core/mbox.c
> > @@ -870,6 +870,53 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds)
> > }
> > EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL);
> >
> > +static int cxl_validate_extent(struct cxl_memdev_state *mds,
> > + struct cxl_dc_extent *dc_extent)
> > +{
> > + struct device *dev = mds->cxlds.dev;
> > + uint64_t start, len;
> > +
> > + start = le64_to_cpu(dc_extent->start_dpa);
> > + len = le64_to_cpu(dc_extent->length);
> > +
> > + /* Extents must not cross region boundary's */
> > + for (int i = 0; i < mds->nr_dc_region; i++) {
> > + struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> > +
> > + if (dcr->base <= start &&
> > + (start + len) <= (dcr->base + dcr->decode_len)) {
> > + dev_dbg(dev, "DC extent DPA %#llx - %#llx (DCR:%d:%#llx)\n",
> > + start, start + len - 1, i, start - dcr->base);
> > + return 0;
> > + }
> > + }
> > +
> > + dev_err_ratelimited(dev,
> > + "DC extent DPA %#llx - %#llx is not in any DC region\n",
> > + start, start + len - 1);
> > + return -EINVAL;
> > +}
> > +
> > +static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled,
> > + struct cxl_dc_extent *extent)
> > +{
> > + uint64_t start = le64_to_cpu(extent->start_dpa);
> > + uint64_t length = le64_to_cpu(extent->length);
> > + struct range ext_range = (struct range){
> > + .start = start,
> > + .end = start + length - 1,
> > + };
> > + struct range ed_range = (struct range) {
> > + .start = cxled->dpa_res->start,
> > + .end = cxled->dpa_res->end,
> > + };
> > +
> > + dev_dbg(&cxled->cxld.dev, "Checking ED (%pr) for extent DPA:%#llx LEN:%#llx\n",
> > + cxled->dpa_res, start, length);
> > +
> > + return range_contains(&ed_range, &ext_range);
> > +}
> > +
> > void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> > enum cxl_event_log_type type,
> > enum cxl_event_type event_type,
> > @@ -973,6 +1020,15 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds,
> > return rc;
> > }
> >
> > +static struct cxl_memdev_state *
> > +cxled_to_mds(struct cxl_endpoint_decoder *cxled)
> > +{
> > + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
> > + struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > +
> > + return container_of(cxlds, struct cxl_memdev_state, cxlds);
> > +}
> > +
> > static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
> > enum cxl_event_log_type type)
> > {
> > @@ -1406,6 +1462,142 @@ int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds)
> > }
> > EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL);
> >
> > +static int cxl_dev_get_dc_extent_cnt(struct cxl_memdev_state *mds,
> > + unsigned int *extent_gen_num)
> > +{
> > + struct cxl_mbox_get_dc_extent_in get_dc_extent;
> > + struct cxl_mbox_get_dc_extent_out dc_extents;
> > + struct cxl_mbox_cmd mbox_cmd;
> > + unsigned int count;
> > + int rc;
> > +
> > + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) {
> > + .extent_cnt = cpu_to_le32(0),
> > + .start_extent_index = cpu_to_le32(0),
> > + };
> > +
> > + mbox_cmd = (struct cxl_mbox_cmd) {
> > + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST,
> > + .payload_in = &get_dc_extent,
> > + .size_in = sizeof(get_dc_extent),
> > + .size_out = sizeof(dc_extents),
> > + .payload_out = &dc_extents,
> > + .min_out = 1,
> > + };
> > +
> > + rc = cxl_internal_send_cmd(mds, &mbox_cmd);
> > + if (rc < 0)
> > + return rc;
> > +
> > + count = le32_to_cpu(dc_extents.total_extent_cnt);
> > + *extent_gen_num = le32_to_cpu(dc_extents.extent_list_num);
> > +
> > + return count;
> > +}
> > +
> > +static int cxl_dev_get_dc_extents(struct cxl_endpoint_decoder *cxled,
> > + unsigned int start_gen_num,
> > + unsigned int exp_cnt)
> > +{
> > + struct cxl_memdev_state *mds = cxled_to_mds(cxled);
> > + unsigned int start_index, total_read;
> > + struct device *dev = mds->cxlds.dev;
> > + struct cxl_mbox_cmd mbox_cmd;
> > +
> > + struct cxl_mbox_get_dc_extent_out *dc_extents __free(kfree) =
> > + kvmalloc(mds->payload_size, GFP_KERNEL);
> > + if (!dc_extents)
> > + return -ENOMEM;
> > +
> > + total_read = 0;
> > + start_index = 0;
> > + do {
> > + unsigned int nr_ext, total_extent_cnt, gen_num;
> > + struct cxl_mbox_get_dc_extent_in get_dc_extent;
> > + int rc;
> > +
> > + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) {
> > + .extent_cnt = cpu_to_le32(exp_cnt - start_index),
> > + .start_extent_index = cpu_to_le32(start_index),
> > + };
> > +
> > + mbox_cmd = (struct cxl_mbox_cmd) {
> > + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST,
> > + .payload_in = &get_dc_extent,
> > + .size_in = sizeof(get_dc_extent),
> > + .size_out = mds->payload_size,
> > + .payload_out = dc_extents,
> > + .min_out = 1,
> > + };
> > +
> > + rc = cxl_internal_send_cmd(mds, &mbox_cmd);
> > + if (rc < 0)
> > + return rc;
> > +
> > + nr_ext = le32_to_cpu(dc_extents->ret_extent_cnt);
> > + total_read += nr_ext;
> > + total_extent_cnt = le32_to_cpu(dc_extents->total_extent_cnt);
> > + gen_num = le32_to_cpu(dc_extents->extent_list_num);
> > +
> > + dev_dbg(dev, "Get extent list count:%d generation Num:%d\n",
> > + total_extent_cnt, gen_num);
> > +
> > + if (gen_num != start_gen_num || exp_cnt != total_extent_cnt) {
> > + dev_err(dev, "Possible incomplete extent list; gen %u != %u : cnt %u != %u\n",
> > + gen_num, start_gen_num, exp_cnt, total_extent_cnt);
> > + return -EIO;
> > + }
> > +
> > + for (int i = 0; i < nr_ext ; i++) {
> > + dev_dbg(dev, "Processing extent %d/%d\n",
> > + start_index + i, exp_cnt);
> > + rc = cxl_validate_extent(mds, &dc_extents->extent[i]);
> > + if (rc)
> > + continue;
> > + if (!cxl_dc_extent_in_ed(cxled, &dc_extents->extent[i]))
> > + continue;
> > + rc = cxl_ed_add_one_extent(cxled, &dc_extents->extent[i]);
> > + if (rc)
> > + return rc;
> > + }
> > +
> > + start_index += nr_ext;
> > + } while (exp_cnt > total_read);
> > +
> > + return 0;
> > +}
> > +
> > +/**
> > + * cxl_read_dc_extents() - Read any existing extents
> > + * @cxled: Endpoint decoder which is part of a region
> > + *
> > + * Issue the Get Dynamic Capacity Extent List command to the device
> > + * and add any existing extents found which belong to this decoder.
> > + *
> > + * Return: 0 if command was executed successfully, -ERRNO on error.
> > + */
> > +int cxl_read_dc_extents(struct cxl_endpoint_decoder *cxled)
> > +{
> > + struct cxl_memdev_state *mds = cxled_to_mds(cxled);
> > + struct device *dev = mds->cxlds.dev;
> > + unsigned int extent_gen_num;
> > + int rc;
> > +
> > + if (!cxl_dcd_supported(mds)) {
> > + dev_dbg(dev, "DCD unsupported\n");
> > + return 0;
> > + }
> > +
> > + rc = cxl_dev_get_dc_extent_cnt(mds, &extent_gen_num);
> > + dev_dbg(mds->cxlds.dev, "Extent count: %d Generation Num: %d\n",
> > + rc, extent_gen_num);
> > + if (rc <= 0) /* 0 == no records found */
> > + return rc;
> > +
> > + return cxl_dev_get_dc_extents(cxled, extent_gen_num, rc);
>
> Is it necessary to spend a device interaction to get the generation
> number?

Not completely necessary no.

> Couldn't cxl_dev_get_dc_extents obtain that as part of the first
> call to the device, and then use it to ensure the consistency of any
> remaining calls, if any are necessary?

.. However, this is not a critical path and the extra query to hardware makes
the code a bit easier to follow IMO. There are 2 distinct steps.

1) get expected number of extents and the current generation number
2) query for that number whilst checking that the gen number is stable

Doing what you suggest results in special casing the first query within the
loop which is kind of ugly IMO.

That said, with the new retry requirement Fan pointed out I'll consider this in
that new algorithm context.

Ira