Re: [RFC PATCH 02/41] perf: Support guest enter/exit interfaces

From: Sean Christopherson
Date: Thu Apr 11 2024 - 14:33:07 EST


On Fri, Jan 26, 2024, Xiong Zhang wrote:
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 683dc086ef10..59471eeec7e4 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -3803,6 +3803,8 @@ static inline void group_update_userpage(struct perf_event *group_event)
> event_update_userpage(event);
> }
>
> +static DEFINE_PER_CPU(bool, __perf_force_exclude_guest);
> +
> static int merge_sched_in(struct perf_event *event, void *data)
> {
> struct perf_event_context *ctx = event->ctx;
> @@ -3814,6 +3816,14 @@ static int merge_sched_in(struct perf_event *event, void *data)
> if (!event_filter_match(event))
> return 0;
>
> + /*
> + * The __perf_force_exclude_guest indicates entering the guest.
> + * No events of the passthrough PMU should be scheduled.
> + */
> + if (__this_cpu_read(__perf_force_exclude_guest) &&
> + has_vpmu_passthrough_cap(event->pmu))

As mentioned in the previous reply, I think perf should WARN and reject any attempt
to trigger a "passthrough" context switch if such a switch isn't supported by
perf, not silently let it go through and then skip things later.

> + return 0;
> +
> if (group_can_go_on(event, *can_add_hw)) {
> if (!group_sched_in(event, ctx))
> list_add_tail(&event->active_list, get_event_list(event));

..

> +/*
> + * When a guest enters, force all active events of the PMU, which supports
> + * the VPMU_PASSTHROUGH feature, to be scheduled out. The events of other
> + * PMUs, such as uncore PMU, should not be impacted. The guest can
> + * temporarily own all counters of the PMU.
> + * During the period, all the creation of the new event of the PMU with
> + * !exclude_guest are error out.
> + */
> +void perf_guest_enter(void)
> +{
> + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> +
> + lockdep_assert_irqs_disabled();
> +
> + if (__this_cpu_read(__perf_force_exclude_guest))

This should be a WARN_ON_ONCE, no?

> + return;
> +
> + perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> + perf_force_exclude_guest_enter(&cpuctx->ctx);
> + if (cpuctx->task_ctx)
> + perf_force_exclude_guest_enter(cpuctx->task_ctx);
> +
> + perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +
> + __this_cpu_write(__perf_force_exclude_guest, true);
> +}
> +EXPORT_SYMBOL_GPL(perf_guest_enter);
> +
> +static void perf_force_exclude_guest_exit(struct perf_event_context *ctx)
> +{
> + struct perf_event_pmu_context *pmu_ctx;
> + struct pmu *pmu;
> +
> + update_context_time(ctx);
> + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> + pmu = pmu_ctx->pmu;
> + if (!has_vpmu_passthrough_cap(pmu))
> + continue;

I don't see how we can sanely support a CPU that doesn't support writable
PERF_GLOBAL_STATUS across all PMUs.

> +
> + perf_pmu_disable(pmu);
> + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu);
> + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
> + perf_pmu_enable(pmu);
> + }
> +}
> +
> +void perf_guest_exit(void)
> +{
> + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> +
> + lockdep_assert_irqs_disabled();
> +
> + if (!__this_cpu_read(__perf_force_exclude_guest))

WARN_ON_ONCE here too?

> + return;
> +
> + __this_cpu_write(__perf_force_exclude_guest, false);
> +
> + perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> + perf_force_exclude_guest_exit(&cpuctx->ctx);
> + if (cpuctx->task_ctx)
> + perf_force_exclude_guest_exit(cpuctx->task_ctx);
> +
> + perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +}
> +EXPORT_SYMBOL_GPL(perf_guest_exit);
> +
> +static inline int perf_force_exclude_guest_check(struct perf_event *event,
> + int cpu, struct task_struct *task)
> +{
> + bool *force_exclude_guest = NULL;
> +
> + if (!has_vpmu_passthrough_cap(event->pmu))
> + return 0;
> +
> + if (event->attr.exclude_guest)
> + return 0;
> +
> + if (cpu != -1) {
> + force_exclude_guest = per_cpu_ptr(&__perf_force_exclude_guest, cpu);
> + } else if (task && (task->flags & PF_VCPU)) {
> + /*
> + * Just need to check the running CPU in the event creation. If the
> + * task is moved to another CPU which supports the force_exclude_guest.
> + * The event will filtered out and be moved to the error stage. See
> + * merge_sched_in().
> + */
> + force_exclude_guest = per_cpu_ptr(&__perf_force_exclude_guest, task_cpu(task));
> + }

These checks are extremely racy, I don't see how this can possibly do the
right thing. PF_VCPU isn't a "this is a vCPU task", it's a "this task is about
to do VM-Enter, or just took a VM-Exit" (the "I'm a virtual CPU" comment in
include/linux/sched.h is wildly misleading, as it's _only_ valid when accounting
time slices).

Digging deeper, I think __perf_force_exclude_guest has similar problems, e.g.
perf_event_create_kernel_counter() calls perf_event_alloc() before acquiring the
per-CPU context mutex.

> + if (force_exclude_guest && *force_exclude_guest)
> + return -EBUSY;
> + return 0;
> +}
> +
> /*
> * Holding the top-level event's child_mutex means that any
> * descendant process that has inherited this event will block
> @@ -11973,6 +12142,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
> goto err_ns;
> }
>
> + if (perf_force_exclude_guest_check(event, cpu, task)) {

This should be:

err = perf_force_exclude_guest_check(event, cpu, task);
if (err)
goto err_pmu;

i.e. shouldn't effectively ignore/override the return result.

> + err = -EBUSY;
> + goto err_pmu;
> + }
> +
> /*
> * Disallow uncore-task events. Similarly, disallow uncore-cgroup
> * events (they don't make sense as the cgroup will be different
> --
> 2.34.1
>