Re: [PATCH v4 10/30] thermal/drivers/exynos: Replace of_thermal_is_trip_valid() by thermal_zone_get_trip()

From: Marek Szyprowski
Date: Fri Sep 23 2022 - 10:09:12 EST


Hi Daniel,

On 21.09.2022 11:42, Daniel Lezcano wrote:
> The thermal_zone_get_trip() does the same check as
> of_thermal_is_trip_valid(). Replace the call to
> of_thermal_is_trip_valid() by thermal_zone_get_trip().
>
> Signed-off-by: Daniel Lezcano <daniel.lezcano@xxxxxxxxxx>
> ---

This patch landed in linux next-20220923 as commit 4a71bb8005ba
("thermal/drivers/exynos: Replace of_thermal_is_trip_valid() by
thermal_zone_get_trip()"). Unfortunately it causes deadlock on all
Exynos based board:

============================================
WARNING: possible recursive locking detected
6.0.0-rc1-00062-g4a71bb8005ba #12855 Not tainted
--------------------------------------------
swapper/0/1 is trying to acquire lock:
c263c394 (&tz->lock){+.+.}-{3:3}, at:
thermal_zone_device_update.part.0+0x114/0x538

but task is already holding lock:
c263c394 (&tz->lock){+.+.}-{3:3}, at:
thermal_zone_device_update.part.0+0x3c/0x538

other info that might help us debug this:
 Possible unsafe locking scenario:

       CPU0
       ----
  lock(&tz->lock);
  lock(&tz->lock);

 *** DEADLOCK ***

 May be due to missing lock nesting notation

2 locks held by swapper/0/1:
 #0: c1d5248c (&dev->mutex){....}-{3:3}, at: __driver_attach+0xe4/0x1f0
 #1: c263c394 (&tz->lock){+.+.}-{3:3}, at:
thermal_zone_device_update.part.0+0x3c/0x538

stack backtrace:
CPU: 1 PID: 1 Comm: swapper/0 Not tainted 6.0.0-rc1-00062-g4a71bb8005ba
#12855
Hardware name: Samsung Exynos (Flattened Device Tree)
 unwind_backtrace from show_stack+0x10/0x14
 show_stack from dump_stack_lvl+0x58/0x70
 dump_stack_lvl from __lock_acquire+0x146c/0x2a7c
 __lock_acquire from lock_acquire+0x124/0x3e4
 lock_acquire from __mutex_lock+0x90/0x948
 __mutex_lock from mutex_lock_nested+0x1c/0x24
 mutex_lock_nested from thermal_zone_device_update.part.0+0x114/0x538
 thermal_zone_device_update.part.0 from
thermal_zone_device_set_mode+0x70/0x98
 thermal_zone_device_set_mode from thermal_of_zone_register+0x424/0x69c
 thermal_of_zone_register from devm_thermal_of_zone_register+0x58/0x94
 devm_thermal_of_zone_register from exynos_tmu_probe+0x29c/0x728
 exynos_tmu_probe from platform_probe+0x5c/0xb8
 platform_probe from really_probe+0xe0/0x414
 really_probe from __driver_probe_device+0xa0/0x208
 __driver_probe_device from driver_probe_device+0x30/0xc0
 driver_probe_device from __driver_attach+0xf0/0x1f0
 __driver_attach from bus_for_each_dev+0x70/0xb0
 bus_for_each_dev from bus_add_driver+0x174/0x218
 bus_add_driver from driver_register+0x88/0x11c
 driver_register from do_one_initcall+0x64/0x380
 do_one_initcall from kernel_init_freeable+0x1c0/0x224
 kernel_init_freeable from kernel_init+0x18/0x12c
 kernel_init from ret_from_fork+0x14/0x2c
Exception stack(0xf082dfb0 to 0xf082dff8)

[deadlock]

Something is wrong in locking in the functions from the above stacktrace.


> drivers/thermal/samsung/exynos_tmu.c | 9 ++++++---
> 1 file changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/thermal/samsung/exynos_tmu.c b/drivers/thermal/samsung/exynos_tmu.c
> index 91e6860b5ec4..34b460092308 100644
> --- a/drivers/thermal/samsung/exynos_tmu.c
> +++ b/drivers/thermal/samsung/exynos_tmu.c
> @@ -554,13 +554,14 @@ static void exynos4210_tmu_control(struct platform_device *pdev, bool on)
> {
> struct exynos_tmu_data *data = platform_get_drvdata(pdev);
> struct thermal_zone_device *tz = data->tzd;
> + struct thermal_trip trip;
> unsigned int con, interrupt_en = 0, i;
>
> con = get_con_reg(data, readl(data->base + EXYNOS_TMU_REG_CONTROL));
>
> if (on) {
> for (i = 0; i < data->ntrip; i++) {
> - if (!of_thermal_is_trip_valid(tz, i))
> + if (thermal_zone_get_trip(tz, i, &trip))
> continue;
>
> interrupt_en |=
> @@ -584,13 +585,14 @@ static void exynos5433_tmu_control(struct platform_device *pdev, bool on)
> {
> struct exynos_tmu_data *data = platform_get_drvdata(pdev);
> struct thermal_zone_device *tz = data->tzd;
> + struct thermal_trip trip;
> unsigned int con, interrupt_en = 0, pd_det_en, i;
>
> con = get_con_reg(data, readl(data->base + EXYNOS_TMU_REG_CONTROL));
>
> if (on) {
> for (i = 0; i < data->ntrip; i++) {
> - if (!of_thermal_is_trip_valid(tz, i))
> + if (thermal_zone_get_trip(tz, i, &trip))
> continue;
>
> interrupt_en |=
> @@ -615,13 +617,14 @@ static void exynos7_tmu_control(struct platform_device *pdev, bool on)
> {
> struct exynos_tmu_data *data = platform_get_drvdata(pdev);
> struct thermal_zone_device *tz = data->tzd;
> + struct thermal_trip trip;
> unsigned int con, interrupt_en = 0, i;
>
> con = get_con_reg(data, readl(data->base + EXYNOS_TMU_REG_CONTROL));
>
> if (on) {
> for (i = 0; i < data->ntrip; i++) {
> - if (!of_thermal_is_trip_valid(tz, i))
> + if (thermal_zone_get_trip(tz, i, &trip))
> continue;
>
> interrupt_en |=

Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland