Re: [RFC 07/10] platform/x86/intel/ifs: Create kthreads for online cpus for scan test

From: Williams, Dan J
Date: Wed Mar 02 2022 - 23:18:00 EST


On Tue, 2022-03-01 at 11:54 -0800, Jithu Joseph wrote:
> Create scan kthreads for online logical cpus. Once scan test is triggered,
> it wakes up the corresponding thread and its sibling threads to execute
> the test. Once the scan test is done, the threads go back to thread wait
> for next signal to start a new scan.
>
> In a core, the scan engine is shared between siblings. When a scan test
> is triggered on a core, all the siblings rendezvous before the test execution.
> The scan results are same for all siblings.
>
> Scan may be aborted by some reasons. Scan test will be aborted in certain
> circumstances such as when interrupt occurred or cpu does not have enough
> power budget for scan. In this case, the kernel restart scan from the chunk
> where it stopped. Scan will also be aborted when the test is failed. In
> this case, the test is immediately stopped without retry.

This needs to explain why existing kernel facilities for running code
on a given target CPU are insufficient and a new open-coded thread-pool
needed to be created. For example, what's wrong with queue_work_on()?

>
> Originally-by: Kyung Min Park <kyung.min.park@xxxxxxxxx>
> Signed-off-by: Jithu Joseph <jithu.joseph@xxxxxxxxx>
> Reviewed-by: Ashok Raj <ashok.raj@xxxxxxxxx>
> Reviewed-by: Tony Luck <tony.luck@xxxxxxxxx>
> ---
>  drivers/platform/x86/intel/ifs/core.c | 317 ++++++++++++++++++++++++++
>  drivers/platform/x86/intel/ifs/ifs.h  |  91 ++++++++
>  2 files changed, 408 insertions(+)
>
> diff --git a/drivers/platform/x86/intel/ifs/core.c b/drivers/platform/x86/intel/ifs/core.c
> index 765d9a2c4683..6747b523587a 100644
> --- a/drivers/platform/x86/intel/ifs/core.c
> +++ b/drivers/platform/x86/intel/ifs/core.c
> @@ -4,11 +4,38 @@
>   * Author: Jithu Joseph <jithu.joseph@xxxxxxxxx>
>   */
>  
> +#include <linux/cpu.h>
> +#include <linux/delay.h>
> +#include <linux/kthread.h>
>  #include <linux/module.h>
> +#include <linux/nmi.h>
>  #include <asm/cpu_device_id.h>
>  
>  #include "ifs.h"
> +
> +static enum cpuhp_state cpuhp_scan_state;
>  struct ifs_params ifs_params;
> +int cpu_sibl_ct;
> +atomic_t siblings_in;  /* sibling count for joining rendezvous.*/
> +atomic_t siblings_out; /* sibling count for exiting rendezvous.*/
> +struct completion test_thread_done; /* set when scan are done for all siblings threads.*/
> +
> +DEFINE_PER_CPU(struct ifs_state, ifs_state);
> +
> +static int ifs_retry_set(const char *val, const struct kernel_param *kp);
> +static const struct kernel_param_ops ifs_retry_ops = {
> +       .set = ifs_retry_set,
> +       .get = param_get_int,
> +};
> +
> +static int retry = 5;
> +module_param_cb(retry, &ifs_retry_ops, &retry, 0644);
> +
> +MODULE_PARM_DESC(retry, "Maximum retry count when the test is not executed");

Why does this retry need to be in the kernel? Can't the test runner
retry the test if they want? If it stays in the kernel, why a module
parameter and not a sysfs attribute?

> +
> +static bool noint = 1;
> +module_param(noint, bool, 0644);
> +MODULE_PARM_DESC(noint, "Option to enable/disable interrupt during test");

Same sysfs vs module parameter question...

>  
>  #define X86_MATCH(model)                                       \
>         X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6,            \
> @@ -21,6 +48,273 @@ static const struct x86_cpu_id ifs_cpu_ids[] __initconst = {
>  
>  MODULE_DEVICE_TABLE(x86cpu, ifs_cpu_ids);
>  
> +static int ifs_retry_set(const char *val, const struct kernel_param *kp)
> +{
> +       int var = 0;
> +
> +       if (kstrtoint(val, 0, &var)) {
> +               pr_err("unable to parse retry\n");
> +               return -EINVAL;
> +       }
> +
> +       /* validate retry value for sanity */
> +       if (var < 1 || var > 20) {
> +               pr_err("retry parameter should be between 1 and 20\n");
> +               return -EINVAL;
> +       }
> +
> +       return param_set_int(val, kp);
> +}
> +
> +static unsigned long msec_to_tsc(unsigned long msec)
> +{
> +       return tsc_khz * 1000 * msec / MSEC_PER_SEC;
> +}
> +
> +static const char * const scan_test_status[] = {
> +       "SCAN no error",
> +       "Other thread could not join.",
> +       "Interrupt occurred prior to SCAN coordination.",
> +       "Core Abort SCAN Response due to power management condition.",
> +       "Non valid chunks in the range",
> +       "Mismatch in arguments between threads T0/T1.",
> +       "Core not capable of performing SCAN currently",
> +       "Unassigned error code 0x7",
> +       "Exceeded number of Logical Processors (LP) allowed to run Scan-At-Field concurrently",
> +       "Interrupt occurred prior to SCAN start",

This looks unmaintainable...

/me finds large comment block around IFS_* error codes and suggests
killing 2 birds with one stone, i.e. delete that comment and make this
self documenting:

static const char * const scan_test_status[] = {
[IFS_NO_ERROR] = "SCAN no error",
[IFS_OTHER_THREAD_DID_NOT_JOIN] = "Other thread could not join.",
...etc...
};

Btw, which is it "did not join" and "could not join"? If the symbol
name is going to be that long might as well make it match the log
message verbatim.

That way you can add / delete error codes without wondering if you
managed to match the code number to the right position in the array.

Even better would be to kick this out of the kernel and let the user
tool translate the error codes to test result log messages.

> +};
> +
> +static void message_not_tested(int cpu, union ifs_status status)
> +{
> +       if (status.error_code < ARRAY_SIZE(scan_test_status))
> +               pr_warn("CPU %d: SCAN operation did not start. %s\n", cpu,
> +                       scan_test_status[status.error_code]);
> +       else if (status.error_code == IFS_SW_TIMEOUT)
> +               pr_warn("CPU %d: software timeout during scan\n", cpu);
> +       else if (status.error_code == IFS_SW_PARTIAL_COMPLETION)
> +               pr_warn("CPU %d: %s\n", cpu,
> +                       "Not all scan chunks were executed. Maximum forward progress retries exceeded");

Why are these codes not in the scan_test_status set? I see that
IFS_SW_PARTIAL_COMPLETION and IFS_SW_TIMEOUT are defined with larger
values, but why?

> +       else
> +               pr_warn("CPU %d: SCAN unknown status %llx\n", cpu, status.data);
> +}
> +
> +static void message_fail(int cpu, union ifs_status status)
> +{
> +       if (status.control_error) {
> +               pr_err("CPU %d: scan failed. %s\n", cpu,
> +                      "Suggest reload scan file: # echo 1 > /sys/devices/system/cpu/ifs/reload");
> +       }
> +       if (status.signature_error) {
> +               pr_err("CPU %d: test signature incorrect. %s\n", cpu,
> +                      "Suggest retry scan to check if problem is transient");
> +       }

This looks and feels more like tools/testing/selftests/ style code
printing information for a kernel developer to read. For a production
capability I would expect these debug messages to be elided and have an
ifs user tool that knows when to "Suggest reload scan file". Otherwise,
it's not scalable to use the kernel log buffer like a utility's stdout.

> +}
> +
> +static bool can_restart(union ifs_status status)
> +{
> +       /* Signature for chunk is bad, or scan test failed */
> +       if (status.signature_error || status.control_error)
> +               return false;
> +
> +       switch (status.error_code) {
> +       case IFS_POWER_MGMT_INADEQUATE_FOR_SCAN:
> +               mdelay(1);
> +               fallthrough;
> +       case IFS_NO_ERROR:
> +       case IFS_OTHER_THREAD_DID_NOT_JOIN:
> +       case IFS_INTERRUPTED_BEFORE_RENDEZVOUS:
> +       case IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT:
> +       case IFS_INTERRUPTED_DURING_EXECUTION:
> +               return true;

If the error codes were an enum compiler would be able to check here
that all possible codes were handled, as is this needs manual review
which is a maintenance tax.

> +       }
> +       return false;
> +}
> +
> +static bool wait_for_siblings(atomic_t *t, long long timeout)
> +{
> +       atomic_inc(t);
> +       while (atomic_read(t) < cpu_sibl_ct) {
> +               if (timeout < SPINUNIT) {
> +                       pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n",
> +                              cpu_sibl_ct - atomic_read(t));
> +                       return false;
> +               }
> +
> +               ndelay(SPINUNIT);
> +               timeout -= SPINUNIT;
> +
> +               touch_nmi_watchdog();

Live spin-waiting for threads to rendevous with active defeat of the
lockup detector? Is this attempting to replicate an open coded SMM
entry?

> +       }
> +
> +       return true;
> +}
> +
> +/*
> + * Scan test kthreads bound with each logical cpu.
> + * Wait for the sibling thread to join before the execution.
> + * Execute the scan test by running wrmsr(MSR_ACTIVATE_SCAN).
> + */
> +static int scan_test_worker(void *info)
> +{
> +       int cpu = smp_processor_id();
> +       union ifs_scan activate;
> +       union ifs_status status;
> +       unsigned long timeout;
> +       int retries;
> +       u32 first;
> +
> +       activate.rsvd = 0;
> +       activate.delay = msec_to_tsc(THREAD_WAIT);
> +       activate.sigmce = 0;
> +
> +       while (1) {
> +               /* wait event until cpumask set from user */
> +               wait_event_interruptible(per_cpu(ifs_state, cpu).scan_wq,
> +                                        (cpumask_test_cpu(cpu, &per_cpu(ifs_state, cpu).mask) ||
> +                                        kthread_should_stop()));
> +
> +               if (kthread_should_stop())
> +                       break;
> +
> +               /*
> +                * Need to get (and keep) the threads on this core executing close together
> +                * so that the writes to MSR_ACTIVATE_SCAN below will succeed in entering
> +                * IFS test mode on this core. Interrupts on each thread are expected to be
> +                * brief. But preemption would be a problem.

What is this requirement to try to synchronize CPU execution? Comments
should explain the "why", the code usually explains the "what".

> +                */
> +               preempt_disable();
> +
> +               /* wait for the sibling threads to join */
> +               first = cpumask_first(topology_sibling_cpumask(cpu));
> +               if (!wait_for_siblings(&siblings_in, NSEC_PER_SEC)) {
> +                       preempt_enable();
> +                       return -1;
> +               }
> +
> +               activate.start = 0;
> +               activate.stop = ifs_params.valid_chunks - 1;
> +               timeout = jiffies + HZ / 2;
> +               retries = retry;
> +
> +               while (activate.start <= activate.stop) {
> +                       if (time_after(jiffies, timeout)) {
> +                               status.error_code = IFS_SW_TIMEOUT;
> +                               break;
> +                       }
> +
> +                       if (noint)
> +                               local_irq_disable();
> +                       /* scan start */
> +                       wrmsrl(MSR_ACTIVATE_SCAN, activate.data);
> +
> +                       if (noint)
> +                               local_irq_enable();
> +
> +                       /*
> +                        * All logical CPUs on this core are now running IFS test. When it completes
> +                        * execution or is interrupted, the following RDMSR gets the scan status.
> +                        */
> +
> +                       rdmsrl(MSR_SCAN_STATUS, status.data);
> +
> +                       /* Some cases can be retried, give up for others */
> +                       if (!can_restart(status))
> +                               break;
> +
> +                       if (status.chunk_num == activate.start) {
> +                               /* Check for forward progress */
> +                               if (retries-- == 0) {
> +                                       if (status.error_code == IFS_NO_ERROR)
> +                                               status.error_code = IFS_SW_PARTIAL_COMPLETION;
> +                                       break;
> +                               }
> +                       } else {
> +                               retries = retry;
> +                               activate.start = status.chunk_num;
> +                       }
> +               }
> +
> +               preempt_enable();
> +
> +               /* Update status for this core */
> +               per_cpu(ifs_state, cpu).scan_details = status.data;
> +
> +               if (status.control_error || status.signature_error) {
> +                       per_cpu(ifs_state, cpu).status = SCAN_TEST_FAIL;
> +                       cpumask_set_cpu(cpu, &ifs_params.fail_mask);
> +                       cpumask_clear_cpu(cpu, &ifs_params.not_tested_mask);
> +                       cpumask_clear_cpu(cpu, &ifs_params.pass_mask);
> +                       message_fail(cpu, status);
> +               } else if (status.error_code) {
> +                       per_cpu(ifs_state, cpu).status = SCAN_NOT_TESTED;
> +                       cpumask_set_cpu(cpu, &ifs_params.not_tested_mask);
> +                       cpumask_clear_cpu(cpu, &ifs_params.fail_mask);
> +                       cpumask_clear_cpu(cpu, &ifs_params.pass_mask);
> +                       message_not_tested(cpu, status);
> +               } else {
> +                       per_cpu(ifs_state, cpu).status = SCAN_TEST_PASS;
> +                       cpumask_set_cpu(cpu, &ifs_params.pass_mask);
> +                       cpumask_clear_cpu(cpu, &ifs_params.not_tested_mask);
> +                       cpumask_clear_cpu(cpu, &ifs_params.fail_mask);
> +               }
> +
> +               cpumask_clear_cpu(cpu, &per_cpu(ifs_state, cpu).mask);
> +
> +               if (!wait_for_siblings(&siblings_out, NSEC_PER_SEC))
> +                       return -1;
> +
> +               if (cpu == first)
> +                       complete(&test_thread_done);
> +       }
> +
> +       return 0;
> +}
> +
> +static void ifs_first_time(unsigned int cpu)
> +{
> +       init_waitqueue_head(&(per_cpu(ifs_state, cpu).scan_wq));
> +
> +       per_cpu(ifs_state, cpu).first_time = 1;
> +       per_cpu(ifs_state, cpu).status = SCAN_NOT_TESTED;
> +       cpumask_set_cpu(cpu, &ifs_params.not_tested_mask);
> +       cpumask_clear_cpu(cpu, &ifs_params.fail_mask);
> +       cpumask_clear_cpu(cpu, &ifs_params.pass_mask);
> +}
> +
> +static int ifs_online_cpu(unsigned int cpu)
> +{
> +       /* If the CPU is coming online for the first time*/
> +       if (per_cpu(ifs_state, cpu).first_time == 0)
> +               ifs_first_time(cpu);
> +
> +       cpumask_clear_cpu(cpu, &(per_cpu(ifs_state, cpu).mask));
> +
> +       per_cpu(ifs_state, cpu).scan_task = kthread_create_on_node(scan_test_worker, (void *)&cpu,
> +                                                                  cpu_to_node(cpu), "ifsCpu/%u",
> +                                                                  cpu);
> +       if (IS_ERR(per_cpu(ifs_state, cpu).scan_task)) {
> +               pr_err("scan_test_worker task create failed\n");
> +               return PTR_ERR(per_cpu(ifs_state, cpu).scan_task);
> +       }
> +       kthread_bind(per_cpu(ifs_state, cpu).scan_task, cpu);
> +       wake_up_process(per_cpu(ifs_state, cpu).scan_task);
> +
> +       return 0;
> +}
> +
> +static int ifs_offline_cpu(unsigned int cpu)
> +{
> +       struct task_struct *thread;
> +
> +       thread = per_cpu(ifs_state, cpu).scan_task;
> +       per_cpu(ifs_state, cpu).scan_task = NULL;
> +
> +       if (thread)
> +               kthread_stop(thread);

Per-cpu workqueues already handle cpu-online / offline and as far as I
can see a dedicated workqueue for ifs could allow you jettison some of
this infrastructure code.

> +
> +       return 0;
> +}
> +
>  static int __init ifs_init(void)
>  {
>         const struct x86_cpu_id *m;
> @@ -42,11 +336,34 @@ static int __init ifs_init(void)
>                 return ret;
>         }
>  
> +       init_completion(&test_thread_done);
> +       ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/ifs:online",
> +                               ifs_online_cpu, ifs_offline_cpu);
> +
> +       if (ret < 0) {
> +               pr_err("cpuhp_setup_failed\n");
> +               return ret;
> +       }
> +       cpuhp_scan_state = ret;
> +
>         return 0;
>  }
>  
>  static void __exit ifs_exit(void)
>  {
> +       struct task_struct *thread;
> +       int cpu;
> +
> +       cpus_read_lock();
> +       for_each_online_cpu(cpu) {
> +               thread = per_cpu(ifs_state, cpu).scan_task;
> +               per_cpu(ifs_state, cpu).scan_task = NULL;
> +               if (thread)
> +                       kthread_stop(thread);
> +       }
> +       cpus_read_unlock();
> +       cpuhp_remove_state(cpuhp_scan_state);
> +
>         pr_info("unloaded 'In-Field Scan' module\n");
>  }
>  
> diff --git a/drivers/platform/x86/intel/ifs/ifs.h b/drivers/platform/x86/intel/ifs/ifs.h
> index 8f9abdb304b0..fcbbb49faa19 100644
> --- a/drivers/platform/x86/intel/ifs/ifs.h
> +++ b/drivers/platform/x86/intel/ifs/ifs.h
> @@ -18,6 +18,13 @@
>  #define MSR_SCAN_HASHES_STATUS                 0x000002c3
>  #define MSR_AUTHENTICATE_AND_COPY_CHUNK                0x000002c4
>  #define MSR_CHUNKS_AUTHENTICATION_STATUS       0x000002c5
> +#define MSR_ACTIVATE_SCAN                      0x000002c6
> +#define MSR_SCAN_STATUS                                0x000002c7
> +#define SCAN_TEST_PASS                         0
> +#define SCAN_TEST_FAIL                         1
> +#define SCAN_NOT_TESTED                                2
> +#define SPINUNIT                               100
> +#define THREAD_WAIT                            5
>  
>  /* MSR_SCAN_HASHES_STATUS bit fields */
>  union ifs_scan_hashes_status {
> @@ -45,16 +52,100 @@ union ifs_chunks_auth_status {
>         };
>  };
>  
> +/* MSR_ACTIVATE_SCAN bit fields */
> +union ifs_scan {
> +       u64     data;
> +       struct {
> +               u64     start   :8;
> +               u64     stop    :8;
> +               u64     rsvd    :16;
> +               u64     delay   :31;
> +               u64     sigmce  :1;
> +       };
> +};
> +
> +/* MSR_SCAN_STATUS bit fields */
> +union ifs_status {
> +       u64     data;
> +       struct {
> +               u64     chunk_num               :8;
> +               u64     chunk_stop_index        :8;
> +               u64     rsvd1                   :16;
> +               u64     error_code              :8;
> +               u64     rsvd2                   :22;
> +               u64     control_error           :1;
> +               u64     signature_error         :1;
> +       };
> +};
> +
> +/*
> + * ifs_status.error_code values after rdmsr(SCAN_STATUS)
> + * 0x0: no error.
> + * 0x1: scan did not start because all sibling threads did not join.
> + * 0x2: scan did not start because interrupt occurred prior to threads rendezvous
> + * 0x3: scan did not start because power management conditions are inadequate.
> + * 0x4: scan did not start because non-valid chunks in range stop_index:start_index.
> + * 0x5: scan did not start because of mismatches in arguments between sibling threads.
> + * 0x6: scan did not start because core is not capable of performing scan currently.
> + * 0x7: not assigned.
> + * 0x8: scan did not start because of exceed number of concurrent CPUs attempt to run scan.
> + * 0x9: interrupt occurred. Scan operation aborted prematurely. Not all chunks executed.
> + * 0xFE: not all scan chunks were executed. Maximum forward progress retries exceeded.
> + *      This is a driver populated error-code as hardware returns success in this scenario.
> + */
> +#define IFS_NO_ERROR                           0x0
> +#define IFS_OTHER_THREAD_DID_NOT_JOIN          0x1
> +#define IFS_INTERRUPTED_BEFORE_RENDEZVOUS      0x2
> +#define IFS_POWER_MGMT_INADEQUATE_FOR_SCAN     0x3
> +#define IFS_INVALID_CHUNK_RANGE                        0x4
> +#define IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS 0x5
> +#define IFS_CORE_NOT_CAPABLE_CURRENTLY         0x6
> +/* Code 0x7 not assigned */
> +#define IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT        0x8
> +#define IFS_INTERRUPTED_DURING_EXECUTION       0x9
> +
> +#define IFS_SW_TIMEOUT                         0xFD
> +#define IFS_SW_PARTIAL_COMPLETION              0xFE
> +
>  /**
>   * struct ifs_params - global ifs parameter for all cpus.
>   * @loaded_version: stores the currently loaded ifs image version.
>   * @valid_chunks: number of chunks which could be validated.
> + * @fail_mask: stores the cpus which failed the scan.
> + * @not_tested_mask: stores the cpus which have not been tested.
>   */
>  struct ifs_params {
>         int loaded_version;
>         int valid_chunks;
> +       struct cpumask fail_mask;
> +       struct cpumask pass_mask;
> +       struct cpumask not_tested_mask;
>  };
>  
> +/**
> + * struct ifs_state - per-cpu ifs parameter.
> + * @scan_task: scan_task for kthread to run scan test on each cpu.
> + * @first_time: to track if cpu is coming online for the first time.
> + * @status: it holds simple status pass/fail/untested.
> + * @scan_details: opaque scan status code from h/w.
> + * @scan_wq: kthread task wait queue.
> + * @mask: triggering the test by setting the mask.
> + */
> +struct ifs_state {
> +       struct task_struct *scan_task;
> +       int first_time;
> +       int status;
> +       u64 scan_details;
> +       wait_queue_head_t scan_wq;
> +       struct cpumask mask;
> +};
> +
> +DECLARE_PER_CPU(struct ifs_state, ifs_state);
> +
>  int load_ifs_binary(void);
>  extern struct ifs_params ifs_params;
> +extern atomic_t siblings_in;
> +extern atomic_t siblings_out;
> +extern struct completion test_thread_done;
> +extern int cpu_sibl_ct;
>  #endif