Re: [PATCH] regulator: core: avoid regulator_resolve_supply() race condition

From: Marek Szyprowski
Date: Tue Jan 12 2021 - 16:50:59 EST


Hi,

On 08.01.2021 02:16, David Collins wrote:
> The final step in regulator_register() is to call
> regulator_resolve_supply() for each registered regulator
> (including the one in the process of being registered). The
> regulator_resolve_supply() function first checks if rdev->supply
> is NULL, then it performs various steps to try to find the supply.
> If successful, rdev->supply is set inside of set_supply().
>
> This procedure can encounter a race condition if two concurrent
> tasks call regulator_register() near to each other on separate CPUs
> and one of the regulators has rdev->supply_name specified. There
> is currently nothing guaranteeing atomicity between the rdev->supply
> check and set steps. Thus, both tasks can observe rdev->supply==NULL
> in their regulator_resolve_supply() calls. This then results in
> both creating a struct regulator for the supply. One ends up
> actually stored in rdev->supply and the other is lost (though still
> present in the supply's consumer_list).
>
> Here is a kernel log snippet showing the issue:
>
> [ 12.421768] gpu_cc_gx_gdsc: supplied by pm8350_s5_level
> [ 12.425854] gpu_cc_gx_gdsc: supplied by pm8350_s5_level
> [ 12.429064] debugfs: Directory 'regulator.4-SUPPLY' with parent
> '17a00000.rsc:rpmh-regulator-gfxlvl-pm8350_s5_level'
> already present!
>
> Avoid this race condition by holding the rdev->mutex lock inside
> of regulator_resolve_supply() while checking and setting
> rdev->supply.
>
> Signed-off-by: David Collins <collinsd@xxxxxxxxxxxxxx>

This patch landed in linux next-20210112 as commit eaa7995c529b
("regulator: core: avoid regulator_resolve_supply() race condition"). I
found that it triggers a following lockdep warning during the DWC3
driver registration on some Exynos based boards (this log is from
Samsung Exynos5420-based Peach-Pit board):

======================================================
WARNING: possible circular locking dependency detected
5.11.0-rc1-00008-geaa7995c529b #10095 Not tainted
------------------------------------------------------
swapper/0/1 is trying to acquire lock:
c12e1b80 (regulator_list_mutex){+.+.}-{3:3}, at:
regulator_lock_dependent+0x4c/0x2b0

but task is already holding lock:
df7190c0 (regulator_ww_class_mutex){+.+.}-{3:3}, at:
regulator_resolve_supply+0x44/0x318

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #2 (regulator_ww_class_mutex){+.+.}-{3:3}:
       ww_mutex_lock+0x48/0x88
       regulator_lock_recursive+0x84/0x1f4
       regulator_lock_dependent+0x184/0x2b0
       regulator_enable+0x30/0xe4
       dwc3_exynos_probe+0x17c/0x2c0
       platform_probe+0x80/0xc0
       really_probe+0x1c4/0x4e4
       driver_probe_device+0x78/0x1d8
       device_driver_attach+0x58/0x60
       __driver_attach+0xfc/0x160
       bus_for_each_dev+0x6c/0xb8
       bus_add_driver+0x170/0x20c
       driver_register+0x78/0x10c
       do_one_initcall+0x88/0x438
       kernel_init_freeable+0x18c/0x1dc
       kernel_init+0x8/0x118
       ret_from_fork+0x14/0x38
       0x0

-> #1 (regulator_ww_class_acquire){+.+.}-{0:0}:
       regulator_enable+0x30/0xe4
       dwc3_exynos_probe+0x17c/0x2c0
       platform_probe+0x80/0xc0
       really_probe+0x1c4/0x4e4
       driver_probe_device+0x78/0x1d8
       device_driver_attach+0x58/0x60
       __driver_attach+0xfc/0x160
       bus_for_each_dev+0x6c/0xb8
       bus_add_driver+0x170/0x20c
       driver_register+0x78/0x10c
       do_one_initcall+0x88/0x438
       kernel_init_freeable+0x18c/0x1dc
       kernel_init+0x8/0x118
       ret_from_fork+0x14/0x38
       0x0

-> #0 (regulator_list_mutex){+.+.}-{3:3}:
       lock_acquire+0x2e4/0x5dc
       __mutex_lock+0xa4/0xb60
       mutex_lock_nested+0x1c/0x24
       regulator_lock_dependent+0x4c/0x2b0
       regulator_enable+0x30/0xe4
       regulator_resolve_supply+0x1cc/0x318
       regulator_register_resolve_supply+0x14/0x78
       class_for_each_device+0x68/0xe8
       regulator_register+0xa2c/0xc9c
       devm_regulator_register+0x40/0x70
       tps65090_regulator_probe+0x150/0x648
       platform_probe+0x80/0xc0
       really_probe+0x1c4/0x4e4
       driver_probe_device+0x78/0x1d8
       bus_for_each_drv+0x78/0xbc
       __device_attach+0xe8/0x180
       bus_probe_device+0x88/0x90
       device_add+0x4c4/0x7e8
       platform_device_add+0x120/0x25c
       mfd_add_devices+0x580/0x60c
       tps65090_i2c_probe+0xb8/0x184
       i2c_device_probe+0x234/0x2a4
       really_probe+0x1c4/0x4e4
       driver_probe_device+0x78/0x1d8
       bus_for_each_drv+0x78/0xbc
       __device_attach+0xe8/0x180
       bus_probe_device+0x88/0x90
       device_add+0x4c4/0x7e8
       i2c_new_client_device+0x15c/0x27c
       of_i2c_register_devices+0x114/0x184
       i2c_register_adapter+0x1d8/0x6dc
       ec_i2c_probe+0xc8/0x124
       platform_probe+0x80/0xc0
       really_probe+0x1c4/0x4e4
       driver_probe_device+0x78/0x1d8
       bus_for_each_drv+0x78/0xbc
       __device_attach+0xe8/0x180
       bus_probe_device+0x88/0x90
       device_add+0x4c4/0x7e8
       of_platform_device_create_pdata+0x90/0xc8
       of_platform_bus_create+0x1a0/0x4ec
       of_platform_populate+0x88/0x120
       devm_of_platform_populate+0x40/0x80
       cros_ec_register+0x174/0x308
       cros_ec_spi_probe+0x16c/0x1ec
       spi_probe+0x88/0xac
       really_probe+0x1c4/0x4e4
       driver_probe_device+0x78/0x1d8
       device_driver_attach+0x58/0x60
       __driver_attach+0xfc/0x160
       bus_for_each_dev+0x6c/0xb8
       bus_add_driver+0x170/0x20c
       driver_register+0x78/0x10c
       do_one_initcall+0x88/0x438
       kernel_init_freeable+0x18c/0x1dc
       kernel_init+0x8/0x118
       ret_from_fork+0x14/0x38
       0x0

other info that might help us debug this:

Chain exists of:
  regulator_list_mutex --> regulator_ww_class_acquire -->
regulator_ww_class_mutex

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(regulator_ww_class_mutex);
                               lock(regulator_ww_class_acquire);
                               lock(regulator_ww_class_mutex);
  lock(regulator_list_mutex);

 *** DEADLOCK ***

5 locks held by swapper/0/1:
 #0: dfb6e4c8 (&dev->mutex){....}-{3:3}, at: device_driver_attach+0x18/0x60
 #1: c1fedcd8 (&dev->mutex){....}-{3:3}, at: __device_attach+0x34/0x180
 #2: df53a4e8 (&dev->mutex){....}-{3:3}, at: __device_attach+0x34/0x180
 #3: df5224d8 (&dev->mutex){....}-{3:3}, at: __device_attach+0x34/0x180
 #4: df7190c0 (regulator_ww_class_mutex){+.+.}-{3:3}, at:
regulator_resolve_supply+0x44/0x318

stack backtrace:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.11.0-rc1-00008-geaa7995c529b
#10095
Hardware name: Samsung Exynos (Flattened Device Tree)
[<c01116e8>] (unwind_backtrace) from [<c010cf58>] (show_stack+0x10/0x14)
[<c010cf58>] (show_stack) from [<c0b38ffc>] (dump_stack+0xa4/0xc4)
[<c0b38ffc>] (dump_stack) from [<c0193458>] (check_noncircular+0x14c/0x164)
[<c0193458>] (check_noncircular) from [<c0196b90>]
(__lock_acquire+0x1830/0x31cc)
[<c0196b90>] (__lock_acquire) from [<c01991e4>] (lock_acquire+0x2e4/0x5dc)
[<c01991e4>] (lock_acquire) from [<c0b4043c>] (__mutex_lock+0xa4/0xb60)
[<c0b4043c>] (__mutex_lock) from [<c0b40f14>] (mutex_lock_nested+0x1c/0x24)
[<c0b40f14>] (mutex_lock_nested) from [<c05ccd94>]
(regulator_lock_dependent+0x4c/0x2b0)
[<c05ccd94>] (regulator_lock_dependent) from [<c05d220c>]
(regulator_enable+0x30/0xe4)
[<c05d220c>] (regulator_enable) from [<c05d248c>]
(regulator_resolve_supply+0x1cc/0x318)
[<c05d248c>] (regulator_resolve_supply) from [<c05d2974>]
(regulator_register_resolve_supply+0x14/0x78)
[<c05d2974>] (regulator_register_resolve_supply) from [<c06a3000>]
(class_for_each_device+0x68/0xe8)
[<c06a3000>] (class_for_each_device) from [<c05d3e20>]
(regulator_register+0xa2c/0xc9c)
[<c05d3e20>] (regulator_register) from [<c05d5c70>]
(devm_regulator_register+0x40/0x70)
[<c05d5c70>] (devm_regulator_register) from [<c05dea58>]
(tps65090_regulator_probe+0x150/0x648)
[<c05dea58>] (tps65090_regulator_probe) from [<c06a3fe8>]
(platform_probe+0x80/0xc0)
[<c06a3fe8>] (platform_probe) from [<c06a1114>] (really_probe+0x1c4/0x4e4)
[<c06a1114>] (really_probe) from [<c06a14ac>]
(driver_probe_device+0x78/0x1d8)
[<c06a14ac>] (driver_probe_device) from [<c069f1a4>]
(bus_for_each_drv+0x78/0xbc)
[<c069f1a4>] (bus_for_each_drv) from [<c06a0eb0>]
(__device_attach+0xe8/0x180)
[<c06a0eb0>] (__device_attach) from [<c069ff50>]
(bus_probe_device+0x88/0x90)
[<c069ff50>] (bus_probe_device) from [<c069dbac>] (device_add+0x4c4/0x7e8)
[<c069dbac>] (device_add) from [<c06a3bac>]
(platform_device_add+0x120/0x25c)
[<c06a3bac>] (platform_device_add) from [<c06d5c7c>]
(mfd_add_devices+0x580/0x60c)
[<c06d5c7c>] (mfd_add_devices) from [<c06d80e8>]
(tps65090_i2c_probe+0xb8/0x184)
[<c06d80e8>] (tps65090_i2c_probe) from [<c0822520>]
(i2c_device_probe+0x234/0x2a4)
[<c0822520>] (i2c_device_probe) from [<c06a1114>] (really_probe+0x1c4/0x4e4)
[<c06a1114>] (really_probe) from [<c06a14ac>]
(driver_probe_device+0x78/0x1d8)
[<c06a14ac>] (driver_probe_device) from [<c069f1a4>]
(bus_for_each_drv+0x78/0xbc)
[<c069f1a4>] (bus_for_each_drv) from [<c06a0eb0>]
(__device_attach+0xe8/0x180)
[<c06a0eb0>] (__device_attach) from [<c069ff50>]
(bus_probe_device+0x88/0x90)
[<c069ff50>] (bus_probe_device) from [<c069dbac>] (device_add+0x4c4/0x7e8)
[<c069dbac>] (device_add) from [<c0824aec>]
(i2c_new_client_device+0x15c/0x27c)
[<c0824aec>] (i2c_new_client_device) from [<c08285e0>]
(of_i2c_register_devices+0x114/0x184)
[<c08285e0>] (of_i2c_register_devices) from [<c08254b8>]
(i2c_register_adapter+0x1d8/0x6dc)
[<c08254b8>] (i2c_register_adapter) from [<c082dd1c>]
(ec_i2c_probe+0xc8/0x124)
[<c082dd1c>] (ec_i2c_probe) from [<c06a3fe8>] (platform_probe+0x80/0xc0)
[<c06a3fe8>] (platform_probe) from [<c06a1114>] (really_probe+0x1c4/0x4e4)
[<c06a1114>] (really_probe) from [<c06a14ac>]
(driver_probe_device+0x78/0x1d8)
[<c06a14ac>] (driver_probe_device) from [<c069f1a4>]
(bus_for_each_drv+0x78/0xbc)
[<c069f1a4>] (bus_for_each_drv) from [<c06a0eb0>]
(__device_attach+0xe8/0x180)
[<c06a0eb0>] (__device_attach) from [<c069ff50>]
(bus_probe_device+0x88/0x90)
[<c069ff50>] (bus_probe_device) from [<c069dbac>] (device_add+0x4c4/0x7e8)
[<c069dbac>] (device_add) from [<c08b140c>]
(of_platform_device_create_pdata+0x90/0xc8)
[<c08b140c>] (of_platform_device_create_pdata) from [<c08b15f0>]
(of_platform_bus_create+0x1a0/0x4ec)
[<c08b15f0>] (of_platform_bus_create) from [<c08b1af0>]
(of_platform_populate+0x88/0x120)
[<c08b1af0>] (of_platform_populate) from [<c08b1bdc>]
(devm_of_platform_populate+0x40/0x80)
[<c08b1bdc>] (devm_of_platform_populate) from [<c08b72fc>]
(cros_ec_register+0x174/0x308)
[<c08b72fc>] (cros_ec_register) from [<c08b868c>]
(cros_ec_spi_probe+0x16c/0x1ec)
[<c08b868c>] (cros_ec_spi_probe) from [<c071b2f4>] (spi_probe+0x88/0xac)
[<c071b2f4>] (spi_probe) from [<c06a1114>] (really_probe+0x1c4/0x4e4)
[<c06a1114>] (really_probe) from [<c06a14ac>]
(driver_probe_device+0x78/0x1d8)
[<c06a14ac>] (driver_probe_device) from [<c06a19c4>]
(device_driver_attach+0x58/0x60)
[<c06a19c4>] (device_driver_attach) from [<c06a1ac8>]
(__driver_attach+0xfc/0x160)
[<c06a1ac8>] (__driver_attach) from [<c069f0cc>]
(bus_for_each_dev+0x6c/0xb8)
[<c069f0cc>] (bus_for_each_dev) from [<c06a0204>]
(bus_add_driver+0x170/0x20c)
[<c06a0204>] (bus_add_driver) from [<c06a2968>] (driver_register+0x78/0x10c)
[<c06a2968>] (driver_register) from [<c0102428>]
(do_one_initcall+0x88/0x438)
[<c0102428>] (do_one_initcall) from [<c1101104>]
(kernel_init_freeable+0x18c/0x1dc)
[<c1101104>] (kernel_init_freeable) from [<c0b3c65c>]
(kernel_init+0x8/0x118)
[<c0b3c65c>] (kernel_init) from [<c010011c>] (ret_from_fork+0x14/0x38)
Exception stack(0xc1ce3fb0 to 0xc1ce3ff8)
3fa0:                                     00000000 00000000 00000000
00000000
3fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000000
3fe0: 00000000 00000000 00000000 00000000 00000013 00000000

I didn't analyze it yet if this warning is really an issue or just a
false positive. If you have any hints or comments let me know.

> ---
> drivers/regulator/core.c | 39 ++++++++++++++++++++++++++++-----------
> 1 file changed, 28 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
> index fee9241..3ae5ccd 100644
> --- a/drivers/regulator/core.c
> +++ b/drivers/regulator/core.c
> @@ -1813,23 +1813,34 @@ static int regulator_resolve_supply(struct regulator_dev *rdev)
> {
> struct regulator_dev *r;
> struct device *dev = rdev->dev.parent;
> - int ret;
> + int ret = 0;
>
> /* No supply to resolve? */
> if (!rdev->supply_name)
> return 0;
>
> - /* Supply already resolved? */
> + /* Supply already resolved? (fast-path without locking contention) */
> if (rdev->supply)
> return 0;
>
> + /*
> + * Recheck rdev->supply with rdev->mutex lock held to avoid a race
> + * between rdev->supply null check and setting rdev->supply in
> + * set_supply() from concurrent tasks.
> + */
> + regulator_lock(rdev);
> +
> + /* Supply just resolved by a concurrent task? */
> + if (rdev->supply)
> + goto out;
> +
> r = regulator_dev_lookup(dev, rdev->supply_name);
> if (IS_ERR(r)) {
> ret = PTR_ERR(r);
>
> /* Did the lookup explicitly defer for us? */
> if (ret == -EPROBE_DEFER)
> - return ret;
> + goto out;
>
> if (have_full_constraints()) {
> r = dummy_regulator_rdev;
> @@ -1837,15 +1848,18 @@ static int regulator_resolve_supply(struct regulator_dev *rdev)
> } else {
> dev_err(dev, "Failed to resolve %s-supply for %s\n",
> rdev->supply_name, rdev->desc->name);
> - return -EPROBE_DEFER;
> + ret = -EPROBE_DEFER;
> + goto out;
> }
> }
>
> if (r == rdev) {
> dev_err(dev, "Supply for %s (%s) resolved to itself\n",
> rdev->desc->name, rdev->supply_name);
> - if (!have_full_constraints())
> - return -EINVAL;
> + if (!have_full_constraints()) {
> + ret = -EINVAL;
> + goto out;
> + }
> r = dummy_regulator_rdev;
> get_device(&r->dev);
> }
> @@ -1859,7 +1873,8 @@ static int regulator_resolve_supply(struct regulator_dev *rdev)
> if (r->dev.parent && r->dev.parent != rdev->dev.parent) {
> if (!device_is_bound(r->dev.parent)) {
> put_device(&r->dev);
> - return -EPROBE_DEFER;
> + ret = -EPROBE_DEFER;
> + goto out;
> }
> }
>
> @@ -1867,13 +1882,13 @@ static int regulator_resolve_supply(struct regulator_dev *rdev)
> ret = regulator_resolve_supply(r);
> if (ret < 0) {
> put_device(&r->dev);
> - return ret;
> + goto out;
> }
>
> ret = set_supply(rdev, r);
> if (ret < 0) {
> put_device(&r->dev);
> - return ret;
> + goto out;
> }
>
> /*
> @@ -1886,11 +1901,13 @@ static int regulator_resolve_supply(struct regulator_dev *rdev)
> if (ret < 0) {
> _regulator_put(rdev->supply);
> rdev->supply = NULL;
> - return ret;
> + goto out;
> }
> }
>
> - return 0;
> +out:
> + regulator_unlock(rdev);
> + return ret;
> }
>
> /* Internal regulator request function */

Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland