Re: [PATCH v3 24/25] tools/testing/cxl: Make event logs dynamic
From: Ira Weiny
Date: Mon Sep 09 2024 - 10:00:21 EST
Jonathan Cameron wrote:
> On Fri, 16 Aug 2024 09:44:32 -0500
> Ira Weiny <ira.weiny@xxxxxxxxx> wrote:
>
> > The test event logs were created as static arrays as an easy way to mock
> > events. Dynamic Capacity Device (DCD) test support requires events be
> > generated dynamically when extents are created or destroyed.
> >
> > Modify the event log storage to be dynamically allocated. Reuse the
> > static event data to create the dynamic events in the new logs without
> > inventing complex event injection for the previous tests. Simplify the
> > processing of the logs by using the event log array index as the handle.
> > Add a lock to manage concurrency required when user space is allowed to
> > control DCD extents
> >
> > Signed-off-by: Ira Weiny <ira.weiny@xxxxxxxxx>
> Probably make sense to spinkle some guard() magic in here
> to avoid all the places where you goto end of function to release the lock
Yes. Sorry this patch did not get as much self-review as it should have.
> >
> > ---
> > Changes:
> > [iweiny: rebase]
> > ---
> > tools/testing/cxl/test/mem.c | 278 ++++++++++++++++++++++++++-----------------
> > 1 file changed, 171 insertions(+), 107 deletions(-)
> >
> > diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c
> > index 129f179b0ac5..674fc7f086cd 100644
> > --- a/tools/testing/cxl/test/mem.c
> > +++ b/tools/testing/cxl/test/mem.c
> > @@ -125,18 +125,27 @@ static struct {
> >
> > #define PASS_TRY_LIMIT 3
> >
> > -#define CXL_TEST_EVENT_CNT_MAX 15
> > +#define CXL_TEST_EVENT_CNT_MAX 17
>
> Seems you added a couple more. Don't do that in a patch
> just changing allocation approach.
>
> I could find 1 but not sure where other one came from!
I wasn't sure either. see below...
[snip]
> > @@ -233,8 +254,8 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd)
> > {
> > struct cxl_get_event_payload *pl;
> > struct mock_event_log *log;
> > - u16 nr_overflow;
> > u8 log_type;
> > + u16 handle;
> > int i;
> >
> > if (cmd->size_in != sizeof(log_type))
> > @@ -254,29 +275,39 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd)
> > memset(cmd->payload_out, 0, struct_size(pl, records, 0));
> >
> > log = event_find_log(dev, log_type);
> > - if (!log || event_log_empty(log))
> > + if (!log)
> > return 0;
> >
> > pl = cmd->payload_out;
> >
> > - for (i = 0; i < ret_limit && !event_log_empty(log); i++) {
> > - memcpy(&pl->records[i], event_get_current(log),
> > - sizeof(pl->records[i]));
> > - pl->records[i].event.generic.hdr.handle =
> > - event_get_cur_event_handle(log);
> > - log->cur_idx++;
> > + read_lock(&log->lock);
> > +
> > + handle = log->cur_handle;
> > + dev_dbg(dev, "Get log %d handle %u next %u\n",
> > + log_type, handle, log->next_handle);
> > + for (i = 0;
> > + i < ret_limit && handle != log->next_handle;
> As below, maybe combine 2 lines above into 1.
Ok. done.
>
>
> > + i++, event_inc_handle(&handle)) {
> > + struct cxl_event_record_raw *cur;
> > +
> > + cur = log->events[handle];
> > + dev_dbg(dev, "Sending event log %d handle %d idx %u\n",
> > + log_type, le16_to_cpu(cur->event.generic.hdr.handle),
> > + handle);
> > + memcpy(&pl->records[i], cur, sizeof(pl->records[i]));
> > + pl->records[i].event.generic.hdr.handle = cpu_to_le16(handle);
> > }
> >
> > cmd->size_out = struct_size(pl, records, i);
> > pl->record_count = cpu_to_le16(i);
> > - if (!event_log_empty(log))
> > + if (log->nr_events > i)
> > pl->flags |= CXL_GET_EVENT_FLAG_MORE_RECORDS;
> >
> > if (log->nr_overflow) {
> > u64 ns;
> >
> > pl->flags |= CXL_GET_EVENT_FLAG_OVERFLOW;
> > - pl->overflow_err_count = cpu_to_le16(nr_overflow);
> > + pl->overflow_err_count = cpu_to_le16(log->nr_overflow);
> > ns = ktime_get_real_ns();
> > ns -= 5000000000; /* 5s ago */
> > pl->first_overflow_timestamp = cpu_to_le64(ns);
> > @@ -285,16 +316,17 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd)
> > pl->last_overflow_timestamp = cpu_to_le64(ns);
> > }
> >
> > + read_unlock(&log->lock);
> Another one maybe for guard()
done.
>
> > return 0;
> > }
> >
> > static int mock_clear_event(struct device *dev, struct cxl_mbox_cmd *cmd)
> > {
> > struct cxl_mbox_clear_event_payload *pl = cmd->payload_in;
> > - struct mock_event_log *log;
> > u8 log_type = pl->event_log;
> > + struct mock_event_log *log;
> > + int nr, rc = 0;
> > u16 handle;
> > - int nr;
> >
> > if (log_type >= CXL_EVENT_TYPE_MAX)
> > return -EINVAL;
> > @@ -303,24 +335,23 @@ static int mock_clear_event(struct device *dev, struct cxl_mbox_cmd *cmd)
> > if (!log)
> > return 0; /* No mock data in this log */
> >
> > - /*
> > - * This check is technically not invalid per the specification AFAICS.
> > - * (The host could 'guess' handles and clear them in order).
> > - * However, this is not good behavior for the host so test it.
> > - */
> > - if (log->clear_idx + pl->nr_recs > log->cur_idx) {
> > - dev_err(dev,
> > - "Attempting to clear more events than returned!\n");
> > - return -EINVAL;
> > - }
> > + write_lock(&log->lock);
> Use a guard()?
done.
> >
> > /* Check handle order prior to clearing events */
> > - for (nr = 0, handle = event_get_clear_handle(log);
> > - nr < pl->nr_recs;
> > - nr++, handle++) {
> > + handle = log->cur_handle;
> > + for (nr = 0;
> > + nr < pl->nr_recs && handle != log->next_handle;
>
> I'd combine the two lines above.
Ok. done.
>
> > + nr++, event_inc_handle(&handle)) {
> > +
> > + dev_dbg(dev, "Checking clear of %d handle %u plhandle %u\n",
> > + log_type, handle,
> > + le16_to_cpu(pl->handles[nr]));
> > +
> > if (handle != le16_to_cpu(pl->handles[nr])) {
> > - dev_err(dev, "Clearing events out of order\n");
> > - return -EINVAL;
> > + dev_err(dev, "Clearing events out of order %u %u\n",
> > + handle, le16_to_cpu(pl->handles[nr]));
> > + rc = -EINVAL;
> > + goto unlock;
> > }
> > }
> >
> > @@ -328,25 +359,12 @@ static int mock_clear_event(struct device *dev, struct cxl_mbox_cmd *cmd)
> > log->nr_overflow = 0;
> >
> > /* Clear events */
> > - log->clear_idx += pl->nr_recs;
> > - return 0;
> > -}
>
> >
> > struct cxl_event_record_raw maint_needed = {
> > @@ -475,8 +493,27 @@ static int mock_set_timestamp(struct cxl_dev_state *cxlds,
> > return 0;
> > }
> >
>
> > +static void cxl_mock_add_event_logs(struct cxl_mockmem_data *mdata)
> > {
> > + struct mock_event_store *mes = &mdata->mes;
> > + struct device *dev = mdata->mds->cxlds.dev;
> > +
> > put_unaligned_le16(CXL_GMER_VALID_CHANNEL | CXL_GMER_VALID_RANK,
> > &gen_media.rec.media_hdr.validity_flags);
> >
> > @@ -484,43 +521,60 @@ static void cxl_mock_add_event_logs(struct mock_event_store *mes)
> > CXL_DER_VALID_BANK | CXL_DER_VALID_COLUMN,
> > &dram.rec.media_hdr.validity_flags);
> >
> > - mes_add_event(mes, CXL_EVENT_TYPE_INFO, &maint_needed);
> > - mes_add_event(mes, CXL_EVENT_TYPE_INFO,
> > + dev_dbg(dev, "Generating fake event logs %d\n",
> > + CXL_EVENT_TYPE_INFO);
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_INFO, &maint_needed);
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_INFO,
> > (struct cxl_event_record_raw *)&gen_media);
> > - mes_add_event(mes, CXL_EVENT_TYPE_INFO,
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_INFO,
> > (struct cxl_event_record_raw *)&mem_module);
> > mes->ev_status |= CXLDEV_EVENT_STATUS_INFO;
> >
> > - mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &maint_needed);
> > - mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > - mes_add_event(mes, CXL_EVENT_TYPE_FAIL,
> > + dev_dbg(dev, "Generating fake event logs %d\n",
> > + CXL_EVENT_TYPE_FAIL);
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FAIL, &maint_needed);
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FAIL,
> > + (struct cxl_event_record_raw *)&mem_module);
>
> So this one is new? I can't spot the other one...
Its coming back to me now. The cxl-events.sh test relied on an expected number
of each type of event (Including an overflow count) which were completely
fabricated previous to this patch.
num_overflow_expected=1
num_fatal_expected=2
num_failure_expected=16
num_info_expected=3
To maintain backwards compatibility this new code needed to preserve those
counts. The buffers and number of entries were adjusted to make the output
match. However now the logs need to actually over flow to create the overflow
error. Furthermore, the handles are the array entries. cxl-events.sh passes
before and after this patch.
That said, my math was wrong. A max of 16 with 16+ entries added to the
failure log should result in the counts above. I added a couple extra to the
overflow though.
Good catch on this. I basically hacked it to match and moved on. I've cleaned
it up for the next version.
>
>
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FAIL,
> > (struct cxl_event_record_raw *)&dram);
> > - mes_add_event(mes, CXL_EVENT_TYPE_FAIL,
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FAIL,
> > (struct cxl_event_record_raw *)&gen_media);
> > - mes_add_event(mes, CXL_EVENT_TYPE_FAIL,
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FAIL,
> > (struct cxl_event_record_raw *)&mem_module);
> > - mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > - mes_add_event(mes, CXL_EVENT_TYPE_FAIL,
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FAIL,
> > (struct cxl_event_record_raw *)&dram);
> > /* Overflow this log */
> > - mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > - mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > - mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > - mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > - mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > - mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > - mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > - mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > - mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > - mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FAIL, &hardware_replace);
> > mes->ev_status |= CXLDEV_EVENT_STATUS_FAIL;
> >
> > - mes_add_event(mes, CXL_EVENT_TYPE_FATAL, &hardware_replace);
> > - mes_add_event(mes, CXL_EVENT_TYPE_FATAL,
> > + dev_dbg(dev, "Generating fake event logs %d\n",
> > + CXL_EVENT_TYPE_FATAL);
> The dev_dbg() fine but not really part of making it dynamic, so adds
> a bit of noise. Maybe not worth splitting out though.
It's just debugging that we are indeed adding these to the now dynamic list. I
added a print for each type. I've added even more debugging with the clean up.
I'm going to leave it in for now because it is part of ensuring the dynamic
events work.
Thanks,
Ira
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FATAL, &hardware_replace);
> > + add_event_from_static(mdata, CXL_EVENT_TYPE_FATAL,
> > (struct cxl_event_record_raw *)&dram);
> > mes->ev_status |= CXLDEV_EVENT_STATUS_FATAL;
> > }
>
>