Re: [PATCH RFC v2 12/18] cxl/region: Notify regions of DC changes
From: Ira Weiny
Date: Mon Sep 18 2023 - 13:46:39 EST
Jørgen Hansen wrote:
> On 8/29/23 07:21, Ira Weiny wrote:
> >
> > In order for a user to use dynamic capacity effectively they need to
> > know when dynamic capacity is available. Thus when Dynamic Capacity
> > (DC) extents are added or removed by a DC device the regions affected
> > need to be notified. Ultimately the DAX region uses the memory
> > associated with DC extents. However, remember that CXL DAX regions
> > maintain any interleave details between devices.
> >
> > When a DCD event occurs, iterate all CXL endpoint decoders and notify
> > regions which contain the endpoints affected by the event. In turn
> > notify the DAX regions of the changes to the DAX region extents.
> >
> > For now interleave is handled by creating simple 1:1 mappings between
> > the CXL DAX region and DAX region layers. Future implementations will
> > need to resolve when to actually surface a DAX region extent and pass
> > the notification along.
> >
> > Remember that adding capacity is safe because there is no chance of the
> > memory being in use. Also remember at this point releasing capacity is
> > straightforward because DAX devices do not yet have references to the
> > extents. Future patches will handle that complication.
> >
> > Signed-off-by: Ira Weiny <ira.weiny@xxxxxxxxx>
> >
> > ---
> > Changes from v1:
> > [iweiny: Rewrite]
> > ---
> > drivers/cxl/core/mbox.c | 39 +++++++++++++--
> > drivers/cxl/core/region.c | 123 +++++++++++++++++++++++++++++++++++++++++-----
> > drivers/cxl/cxl.h | 22 +++++++++
> > drivers/cxl/mem.c | 50 +++++++++++++++++++
> > drivers/dax/cxl.c | 99 ++++++++++++++++++++++++++++++-------
> > drivers/dax/dax-private.h | 3 ++
> > drivers/dax/extent.c | 14 ++++++
> > 7 files changed, 317 insertions(+), 33 deletions(-)
> >
> > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> > index 5472ab1d0370..9d9c13e13ecf 100644
>
> [snip]
>
> > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> > index 0aeea50550f6..a0c1f2793dd7 100644
> > --- a/drivers/cxl/core/region.c
> > +++ b/drivers/cxl/core/region.c
> > @@ -1547,8 +1547,8 @@ static int cxl_region_validate_position(struct cxl_region *cxlr,
> > return 0;
> > }
> >
> > -static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled,
> > - struct cxl_dc_extent_data *extent)
> > +bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled,
> > + struct cxl_dc_extent_data *extent)
> > {
> > struct range dpa_range = (struct range){
> > .start = extent->dpa_start,
> > @@ -1567,14 +1567,66 @@ static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled,
> > return (cxled->dpa_res->start <= dpa_range.start &&
> > dpa_range.end <= cxled->dpa_res->end);
> > }
> > +EXPORT_SYMBOL_NS_GPL(cxl_dc_extent_in_ed, CXL);
> > +
> > +static int cxl_region_notify_extent(struct cxl_endpoint_decoder *cxled,
> > + enum dc_event event,
> > + struct cxl_dr_extent *cxl_dr_ext)
> > +{
> > + struct cxl_dax_region *cxlr_dax;
> > + struct device *dev;
> > + int rc = 0;
> > +
> > + cxlr_dax = cxled->cxld.region->cxlr_dax;
> > + dev = &cxlr_dax->dev;
> > + dev_dbg(dev, "Trying notify: type %d HPA:%llx LEN:%llx\n",
> > + event, cxl_dr_ext->hpa_offset, cxl_dr_ext->hpa_length);
> > +
> > + device_lock(dev);
> > + if (dev->driver) {
> > + struct cxl_driver *reg_drv = to_cxl_drv(dev->driver);
> > + struct cxl_drv_nd nd = (struct cxl_drv_nd) {
> > + .event = event,
> > + .cxl_dr_ext = cxl_dr_ext,
> > + };
> > +
> > + if (reg_drv->notify) {
> > + dev_dbg(dev, "Notify: type %d HPA:%llx LEN:%llx\n",
> > + event, cxl_dr_ext->hpa_offset,
> > + cxl_dr_ext->hpa_length);
> > + rc = reg_drv->notify(dev, &nd);
> > + }
> > + }
> > + device_unlock(dev);
> > + return rc;
> > +}
> > +
> > +static resource_size_t
> > +cxl_dc_extent_to_hpa_offset(struct cxl_endpoint_decoder *cxled,
> > + struct cxl_dc_extent_data *extent)
> > +{
> > + struct cxl_dax_region *cxlr_dax;
> > + resource_size_t dpa_offset, hpa;
> > + struct range *ed_hpa_range;
> > +
> > + cxlr_dax = cxled->cxld.region->cxlr_dax;
> > +
> > + /*
> > + * Without interleave...
> > + * HPA offset == DPA offset
> > + * ... but do the math anyway
> > + */
> > + dpa_offset = extent->dpa_start - cxled->dpa_res->start;
> > + ed_hpa_range = &cxled->cxld.hpa_range;
> > + hpa = ed_hpa_range->start + dpa_offset;
> > + return hpa - cxlr_dax->hpa_range.start;
> > +}
> >
> > static int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled,
> > struct cxl_dc_extent_data *extent)
> > {
> > struct cxl_dr_extent *cxl_dr_ext;
> > struct cxl_dax_region *cxlr_dax;
> > - resource_size_t dpa_offset, hpa;
> > - struct range *ed_hpa_range;
> > struct device *dev;
> > int rc;
> >
> > @@ -1601,15 +1653,7 @@ static int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled,
> > cxl_dr_ext->extent = extent;
> > kref_init(&cxl_dr_ext->region_ref);
> >
> > - /*
> > - * Without interleave...
> > - * HPA offset == DPA offset
> > - * ... but do the math anyway
> > - */
> > - dpa_offset = extent->dpa_start - cxled->dpa_res->start;
> > - ed_hpa_range = &cxled->cxld.hpa_range;
> > - hpa = ed_hpa_range->start + dpa_offset;
> > - cxl_dr_ext->hpa_offset = hpa - cxlr_dax->hpa_range.start;
> > + cxl_dr_ext->hpa_offset = cxl_dc_extent_to_hpa_offset(cxled, extent);
> >
> > /* Without interleave carry length and label through */
> > cxl_dr_ext->hpa_length = extent->length;
> > @@ -1626,6 +1670,7 @@ static int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled,
> > }
> > /* Put in cxl_dr_release() */
> > cxl_dc_extent_get(cxl_dr_ext->extent);
> > + cxl_region_notify_extent(cxled, DCD_ADD_CAPACITY, cxl_dr_ext);
> > return 0;
> > }
> >
> > @@ -1663,6 +1708,58 @@ static int cxl_ed_add_extents(struct cxl_endpoint_decoder *cxled)
> > return 0;
> > }
> >
> > +static int cxl_ed_rm_dc_extent(struct cxl_endpoint_decoder *cxled,
> > + enum dc_event event,
> > + struct cxl_dc_extent_data *extent)
> > +{
> > + struct cxl_region *cxlr = cxled->cxld.region;
> > + struct cxl_dax_region *cxlr_dax = cxlr->cxlr_dax;
> > + struct cxl_dr_extent *cxl_dr_ext;
> > + resource_size_t hpa_offset;
> > +
> > + hpa_offset = cxl_dc_extent_to_hpa_offset(cxled, extent);
> > +
> > + /*
> > + * NOTE on Interleaving: There is no need to 'break up' the cxl_dr_ext.
> > + * If one of the extents comprising it is gone it should be removed
> > + * from the region to prevent future use. Later code may save other
> > + * extents for future processing. But for now the correlation is 1:1:1
> > + * so just erase the extent.
> > + */
> > + cxl_dr_ext = xa_erase(&cxlr_dax->extents, hpa_offset);
> > +
> > + dev_dbg(&cxlr_dax->dev, "Remove DAX region ext HPA:%llx\n",
> > + cxl_dr_ext->hpa_offset);
> > + cxl_region_notify_extent(cxled, event, cxl_dr_ext);
> > + cxl_dr_extent_put(cxl_dr_ext);
> > + return 0;
> > +}
> > +
> > +int cxl_ed_notify_extent(struct cxl_endpoint_decoder *cxled,
> > + struct cxl_drv_nd *nd)
> > +{
> > + int rc = 0;
> > +
> > + switch (nd->event) {
> > + case DCD_ADD_CAPACITY:
> > + if (cxl_dc_extent_get_not_zero(nd->extent)) {
> > + rc = cxl_ed_add_one_extent(cxled, nd->extent);
> > + if (rc)
> > + cxl_dc_extent_put(nd->extent);
>
> Hi,
> when playing around with adding and releasing DCD extents through the
> qmp interface for the QEMU DCD emulation, I noticed that extents weren't
> handed back to the device. It looks like there is a refcounting issue,
> as the kref never drops below 2 for the dc extents. So I was wondering
> whether we should only put the dc extent here on error or maybe always
> put it? cxl_ed_add_one_extent() also grabs a reference to the dc
> extent, and that one is put in cxl_dr_release(), but I couldn't find a
> matching put for this get_not_zero.
This is a bug I have fixed in the next version.
Yes, the put needs to happen regardless of the return value.
...
case DCD_ADD_CAPACITY:
if (cxl_dc_extent_get_not_zero(nd->extent)) {
rc = cxl_ed_add_one_extent(cxled, nd->extent);
cxl_dc_extent_put(nd->extent);
}
...
Please let me know if that does not work. And thanks for the testing,
Ira