Re: [PATCH -tip 1/2] x86/alternative: Sync bp_patching update for avoiding NULL pointer exception

From: Masami Hiramatsu
Date: Tue Dec 10 2019 - 11:44:09 EST


Hi Peter,

On Mon, 9 Dec 2019 15:39:40 +0100
Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:

> On Wed, Nov 27, 2019 at 02:56:52PM +0900, Masami Hiramatsu wrote:
>
> > diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> > index 4552795a8df4..9505096e2cd1 100644
> > --- a/arch/x86/kernel/alternative.c
> > +++ b/arch/x86/kernel/alternative.c
> > @@ -1134,8 +1134,14 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
> > * sync_core() implies an smp_mb() and orders this store against
> > * the writing of the new instruction.
> > */
> > - bp_patching.vec = NULL;
> > bp_patching.nr_entries = 0;
> > + /*
> > + * This sync_core() ensures that all int3 handlers in progress
> > + * have finished. This allows poke_int3_handler() after this to
> > + * avoid touching bp_patching.vec by checking nr_entries == 0.
> > + */
> > + text_poke_sync();
> > + bp_patching.vec = NULL;
> > }
>
> How's something like this instead? Under the assumption that it is rare
> to actually hit the INT3 and even more rare to actually hit this race,
> the below should be a lot cheaper.

Ah, this reminds me of my atomic-refcounter method for kpatch idea
and module unloading.

This looks good, but I feel it is a bit complicated.

If we use atomic (and spin-wait) here, can we use atomic_inc_not_zero()
in the poke_int3_handler() at first for making sure the bp_patching is
under operation or not?
I think it makes things simpler, like below.

---------
atomic_t bp_refcnt;

poke_int3_handler()
{
smp_rmb();
if (!READ_ONCE(bp_patching.nr_entries))
return 0;
if (!atomic_inc_not_zero(&bp_refcnt))
return 0;
smp_mb__after_atomic();
[use bp_patching]
atomic_dec(&bp_refcnt);
}

text_poke_bp_batch()
{
bp_patching.vec = tp;
bp_patching.nr_entries = nr_entries;
smp_wmb();
atomic_inc(&bp_refcnt);
...
atomic_dec(&bp_refcnt);
/* wait for all running poke_int3_handler(). */
atomic_cond_read_acquire(&bp_refcnt, !VAL);
bp_patching.vec = NULL;
bp_patching.nr_entries = 0;
}
---------

Thank you,


>
> ---
> arch/x86/kernel/alternative.c | 69 +++++++++++++++++++++++++++++++++----------
> 1 file changed, 53 insertions(+), 16 deletions(-)
>
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index 30e86730655c..12f2d193109d 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -953,6 +953,8 @@ static struct bp_patching_desc {
> int nr_entries;
> } bp_patching;
>
> +static atomic_t bp_handlers;
> +
> static inline void *text_poke_addr(struct text_poke_loc *tp)
> {
> return _stext + tp->rel_addr;
> @@ -973,8 +975,8 @@ NOKPROBE_SYMBOL(patch_cmp);
> int notrace poke_int3_handler(struct pt_regs *regs)
> {
> struct text_poke_loc *tp;
> + int nr, len, ret = 0;
> void *ip;
> - int len;
>
> /*
> * Having observed our INT3 instruction, we now must observe
> @@ -987,12 +989,21 @@ int notrace poke_int3_handler(struct pt_regs *regs)
> * Idem for other elements in bp_patching.
> */
> smp_rmb();
> -
> - if (likely(!bp_patching.nr_entries))
> + if (!READ_ONCE(bp_patching.nr_entries))
> return 0;
>
> + atomic_inc(&bp_handlers);
> + /*
> + * 'ACQUIRE', everything happens after the increment.
> + */
> + smp_mb__after_atomic();
> +
> + nr = smp_load_acquire(&bp_patching.nr_entries);
> + if (likely(!nr))
> + goto out;
> +
> if (user_mode(regs))
> - return 0;
> + goto out;
>
> /*
> * Discount the INT3. See text_poke_bp_batch().
> @@ -1002,16 +1013,16 @@ int notrace poke_int3_handler(struct pt_regs *regs)
> /*
> * Skip the binary search if there is a single member in the vector.
> */
> - if (unlikely(bp_patching.nr_entries > 1)) {
> - tp = bsearch(ip, bp_patching.vec, bp_patching.nr_entries,
> + if (unlikely(nr > 1)) {
> + tp = bsearch(ip, bp_patching.vec, nr,
> sizeof(struct text_poke_loc),
> patch_cmp);
> if (!tp)
> - return 0;
> + goto out;
> } else {
> tp = bp_patching.vec;
> if (text_poke_addr(tp) != ip)
> - return 0;
> + goto out;
> }
>
> len = text_opcode_size(tp->opcode);
> @@ -1023,7 +1034,7 @@ int notrace poke_int3_handler(struct pt_regs *regs)
> * Someone poked an explicit INT3, they'll want to handle it,
> * do not consume.
> */
> - return 0;
> + goto out;
>
> case CALL_INSN_OPCODE:
> int3_emulate_call(regs, (long)ip + tp->rel32);
> @@ -1038,7 +1049,14 @@ int notrace poke_int3_handler(struct pt_regs *regs)
> BUG();
> }
>
> - return 1;
> + ret = 1;
> +out:
> + /*
> > + * 'RELEASE', everything happens before the decrement.
> + */
> + smp_mb__before_atomic();
> + atomic_dec(&bp_handlers);
> + return ret;
> }
> NOKPROBE_SYMBOL(poke_int3_handler);
>
> @@ -1076,7 +1094,12 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
> lockdep_assert_held(&text_mutex);
>
> bp_patching.vec = tp;
> - bp_patching.nr_entries = nr_entries;
> + /*
> + * bp_patching.vec = tp nr = bp_patching.nr_entries
> + * REL ACQ
> + * bp_patching.nr_entries = nr_entries tp = bp_patching.vec[]
> + */
> + smp_store_release(&bp_patching.nr_entries, nr_entries);
>
> /*
> * Corresponding read barrier in int3 notifier for making sure the
> @@ -1134,13 +1157,27 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
> * sync_core() implies an smp_mb() and orders this store against
> * the writing of the new instruction.
> */
> - bp_patching.nr_entries = 0;
> + WRITE_ONCE(bp_patching.nr_entries, 0);
> /*
> > - * This sync_core() call ensures that all INT3 handlers in progress
> > - * have finished. This allows poke_int3_handler() after this to
> > - * avoid touching bp_patching.vec by checking nr_entries == 0.
> + * nr_entries = 0 bp_handlers++
> + * MB MB
> + * VAL = bp_handlers nr = nr_entries
> + */
> + smp_mb();
> + /*
> + * Guarantee all poke_int3_handler()s that have observed
> > + * @bp_patching.nr_entries have completed before we clear
> + * bp_patching.vec.
> + *
> + * We can't do this before text_poke_sync() because then there
> + * might still be observable INT3 instructions.
> + */
> + atomic_cond_read_acquire(&bp_handlers, !VAL);
> + /*
> + * bp_handlers == 0 tp = bp_patching.vec[]
> + * ACQ MB
> + * bp_patching.vec = NULL bp_handlers--;
> */
> - text_poke_sync();
> bp_patching.vec = NULL;
> }
>


--
Masami Hiramatsu <mhiramat@xxxxxxxxxx>