Re: [PATCH 3/3] thermal/intel_powerclamp: Convert to CPU hotplug state

From: Jacob Pan
Date: Fri Oct 21 2016 - 16:19:42 EST


On Mon, 17 Oct 2016 14:32:52 +0200
Petr Mladek <pmladek@xxxxxxxx> wrote:

> From: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
>
Hi Sebastian,
I applied this patchset on 4.9-rc1 and ran some CPU online/offline
loop tests while injecting idle, e.g. 25%. I got a system hang after a
few cycles. Still looking into the root cause.

Thanks,

Jacob
> This is a conversion to the new hotplug state machine with
> the difference that CPU_DEAD becomes CPU_PREDOWN.
>
> At the same time it makes the handling of the two states symmetrical.
> stop_power_clamp_worker() is called unconditionally and the
> controversial error message is removed.
>
> Finally, the hotplug state callbacks are removed after the
> powerclamping is stopped to avoid a potential race.
>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
> [pmladek@xxxxxxxx: Fixed the possible race in powerclamp_exit()]
> Signed-off-by: Petr Mladek <pmladek@xxxxxxxx>
> ---
> drivers/thermal/intel_powerclamp.c | 69
> +++++++++++++++++++------------------- 1 file changed, 35
> insertions(+), 34 deletions(-)
>
> diff --git a/drivers/thermal/intel_powerclamp.c
> b/drivers/thermal/intel_powerclamp.c index a94f7c849a4e..390e50b97324
> 100644 --- a/drivers/thermal/intel_powerclamp.c
> +++ b/drivers/thermal/intel_powerclamp.c
> @@ -622,43 +622,35 @@ static void end_power_clamp(void)
> }
> }
>
> -static int powerclamp_cpu_callback(struct notifier_block *nfb,
> - unsigned long action, void *hcpu)
> +static int powerclamp_cpu_online(unsigned int cpu)
> {
> - unsigned long cpu = (unsigned long)hcpu;
> + if (clamping == false)
> + return 0;
> + start_power_clamp_worker(cpu);
> + /* prefer BSP as controlling CPU */
> + if (cpu == 0) {
> + control_cpu = 0;
> + smp_mb();
> + }
> + return 0;
> +}
>
> - if (false == clamping)
> - goto exit_ok;
> +static int powerclamp_cpu_predown(unsigned int cpu)
> +{
> + if (clamping == false)
> + return 0;
>
> - switch (action) {
> - case CPU_ONLINE:
> - start_power_clamp_worker(cpu);
> - /* prefer BSP as controlling CPU */
> - if (cpu == 0) {
> - control_cpu = 0;
> - smp_mb();
> - }
> - break;
> - case CPU_DEAD:
> - if (test_bit(cpu, cpu_clamping_mask)) {
> - pr_err("cpu %lu dead but powerclamping
> thread is not\n",
> - cpu);
> - stop_power_clamp_worker(cpu);
> - }
> - if (cpu == control_cpu) {
> - control_cpu = smp_processor_id();
> - smp_mb();
> - }
> - }
> + stop_power_clamp_worker(cpu);
> + if (cpu != control_cpu)
> + return 0;
>
> -exit_ok:
> - return NOTIFY_OK;
> + control_cpu = cpumask_first(cpu_online_mask);
> + if (control_cpu == cpu)
> + control_cpu = cpumask_next(cpu, cpu_online_mask);
> + smp_mb();
> + return 0;
> }
>
> -static struct notifier_block powerclamp_cpu_notifier = {
> - .notifier_call = powerclamp_cpu_callback,
> -};
> -
> static int powerclamp_get_max_state(struct thermal_cooling_device
> *cdev, unsigned long *state)
> {
> @@ -788,6 +780,8 @@ static inline void
> powerclamp_create_debug_files(void)
> debugfs_remove_recursive(debug_dir); }
>
> +static enum cpuhp_state hp_state;
> +
> static int __init powerclamp_init(void)
> {
> int retval;
> @@ -805,7 +799,14 @@ static int __init powerclamp_init(void)
>
> /* set default limit, maybe adjusted during runtime based on
> feedback */ window_size = 2;
> - register_hotcpu_notifier(&powerclamp_cpu_notifier);
> + retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
> +
> "thermal/intel_powerclamp:online",
> + powerclamp_cpu_online,
> + powerclamp_cpu_predown);
> + if (retval < 0)
> + goto exit_free;
> +
> + hp_state = retval;
>
> worker_data = alloc_percpu(struct powerclamp_worker_data);
> if (!worker_data) {
> @@ -830,7 +831,7 @@ static int __init powerclamp_init(void)
> exit_free_thread:
> free_percpu(worker_data);
> exit_unregister:
> - unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
> + cpuhp_remove_state_nocalls(hp_state);
> exit_free:
> kfree(cpu_clamping_mask);
> return retval;
> @@ -839,8 +840,8 @@ static int __init powerclamp_init(void)
>
> static void __exit powerclamp_exit(void)
> {
> - unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
> end_power_clamp();
> + cpuhp_remove_state_nocalls(hp_state);
> free_percpu(worker_data);
> thermal_cooling_device_unregister(cooling_dev);
> kfree(cpu_clamping_mask);

[Jacob Pan]