Re: [PATCHv4 net 1/3] bonding: move IPsec deletion to bond_ipsec_free_sa

From: Nikolay Aleksandrov
Date: Wed Mar 05 2025 - 03:38:55 EST


On 3/4/25 15:11, Hangbin Liu wrote:
> The fixed commit placed mutex_lock() inside spin_lock_bh(), which triggers
> a warning:
>
> BUG: sleeping function called from invalid context at...
>
> Fix this by moving the IPsec deletion operation to bond_ipsec_free_sa(),
> which is not called with spin_lock_bh() held.
>
> Additionally, delete the IPsec list entry in bond_ipsec_del_sa_all() when
> the XFRM state is DEAD to prevent xdo_dev_state_free() from being triggered
> again in bond_ipsec_free_sa().
>
> For bond_ipsec_free_sa(), there are now three conditions:
>
> 1. if (!slave): When no active device exists.
> 2. if (!xs->xso.real_dev): When xdo_dev_state_add() fails.
> 3. if (xs->xso.real_dev != real_dev): When an xs has already been freed
> by bond_ipsec_del_sa_all() due to migration, and the active slave has
> changed to a new device. At the same time, the xs is marked as DEAD
> because the XFRM entry was removed, triggering xfrm_state_gc_task() and
> bond_ipsec_free_sa().
>
> In all three cases, xdo_dev_state_free() should not be called; the xs
> should only be removed from bond->ipsec_list.
>
> At the same time, protect bond_ipsec_del_sa_all and bond_ipsec_add_sa_all
> with x->lock for each xs being processed. This prevents XFRM from
> concurrently initiating add/delete operations on the managed states.
>
> Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex")
> Reported-by: Jakub Kicinski <kuba@xxxxxxxxxx>
> Closes: https://lore.kernel.org/netdev/20241212062734.182a0164@xxxxxxxxxx
> Suggested-by: Cosmin Ratiu <cratiu@xxxxxxxxxx>
> Signed-off-by: Hangbin Liu <liuhangbin@xxxxxxxxx>
> ---
> drivers/net/bonding/bond_main.c | 53 +++++++++++++++++++++++----------
> 1 file changed, 37 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index e45bba240cbc..06b060d9b031 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -537,15 +537,22 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
> }
>
> list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> + spin_lock_bh(&ipsec->xs->lock);
> + /* Skip dead xfrm states, they'll be freed later. */
> + if (ipsec->xs->km.state == XFRM_STATE_DEAD)
> + goto next;
> +
> /* If new state is added before ipsec_lock acquired */
> if (ipsec->xs->xso.real_dev == real_dev)
> - continue;
> + goto next;
>
> ipsec->xs->xso.real_dev = real_dev;
> if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) {
> slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__);
> ipsec->xs->xso.real_dev = NULL;
> }
> +next:
> + spin_unlock_bh(&ipsec->xs->lock);
> }
> out:
> mutex_unlock(&bond->ipsec_lock);
> @@ -560,7 +567,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
> struct net_device *bond_dev = xs->xso.dev;
> struct net_device *real_dev;
> netdevice_tracker tracker;
> - struct bond_ipsec *ipsec;
> struct bonding *bond;
> struct slave *slave;
>
> @@ -592,15 +598,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
> real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
> out:
> netdev_put(real_dev, &tracker);
> - mutex_lock(&bond->ipsec_lock);
> - list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> - if (ipsec->xs == xs) {
> - list_del(&ipsec->list);
> - kfree(ipsec);
> - break;
> - }
> - }
> - mutex_unlock(&bond->ipsec_lock);
> }
>
> static void bond_ipsec_del_sa_all(struct bonding *bond)
> @@ -617,8 +614,18 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
>
> mutex_lock(&bond->ipsec_lock);
> list_for_each_entry(ipsec, &bond->ipsec_list, list) {

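Second time - you should use list_for_each_entry_safe() if you're walking the
list and deleting elements from it. With list_for_each_entry() the iterator
still dereferences the entry after it has been list_del()'d and kfree()'d.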

> + spin_lock_bh(&ipsec->xs->lock);
> if (!ipsec->xs->xso.real_dev)
> - continue;
> + goto next;
> +
> + if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
> + /* already dead no need to delete again */
> + if (real_dev->xfrmdev_ops->xdo_dev_state_free)
> + real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);

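Have you checked whether .xdo_dev_state_free can sleep? It is now called with
ipsec->xs->lock held (spin_lock_bh), so it must not.
I see at least one implementation that can sleep: mlx5e_xfrm_free_state().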

> + list_del(&ipsec->list);
> + kfree(ipsec);
> + goto next;
> + }
>
> if (!real_dev->xfrmdev_ops ||
> !real_dev->xfrmdev_ops->xdo_dev_state_delete ||
> @@ -631,6 +638,8 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
> if (real_dev->xfrmdev_ops->xdo_dev_state_free)
> real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
> }
> +next:
> + spin_unlock_bh(&ipsec->xs->lock);
> }
> mutex_unlock(&bond->ipsec_lock);
> }
> @@ -640,6 +649,7 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs)
> struct net_device *bond_dev = xs->xso.dev;
> struct net_device *real_dev;
> netdevice_tracker tracker;
> + struct bond_ipsec *ipsec;
> struct bonding *bond;
> struct slave *slave;
>
> @@ -659,11 +669,22 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs)
> if (!xs->xso.real_dev)
> goto out;
>
> - WARN_ON(xs->xso.real_dev != real_dev);
> + mutex_lock(&bond->ipsec_lock);
> + list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> + if (ipsec->xs == xs) {
> + /* do xdo_dev_state_free if real_dev matches,
> + * otherwise only remove the list
> + */
> + if (real_dev && real_dev->xfrmdev_ops &&
> + real_dev->xfrmdev_ops->xdo_dev_state_free)
> + real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
> + list_del(&ipsec->list);
> + kfree(ipsec);
> + break;
> + }
> + }
> + mutex_unlock(&bond->ipsec_lock);
>
> - if (real_dev && real_dev->xfrmdev_ops &&
> - real_dev->xfrmdev_ops->xdo_dev_state_free)
> - real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
> out:
> netdev_put(real_dev, &tracker);
> }