[PATCH v4 05/38] perf: Add generic exclude_guest support

From: Mingwei Zhang
Date: Mon Mar 24 2025 - 13:35:52 EST


From: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>

Only KVM knows exactly when a guest is entered or exited. Expose two
interfaces, perf_guest_enter() and perf_guest_exit(), to KVM so that it
can switch the ownership of the PMU resources.

All pinned events must be scheduled in first. Extend the
perf_event_sched_in() helper to take an extra event type flag, e.g.,
EVENT_GUEST, so that the guest-exit path can reuse the helper's
pinned-before-flexible ordering.

Signed-off-by: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>
Signed-off-by: Mingwei Zhang <mizhang@xxxxxxxxxx>
---
include/linux/perf_event.h | 4 ++
kernel/events/core.c | 80 ++++++++++++++++++++++++++++++++++----
2 files changed, 77 insertions(+), 7 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7bda1e20be12..37187ee8e226 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1822,6 +1822,8 @@ extern int perf_event_period(struct perf_event *event, u64 value);
extern u64 perf_event_pause(struct perf_event *event, bool reset);
int perf_get_mediated_pmu(void);
void perf_put_mediated_pmu(void);
+void perf_guest_enter(void);
+void perf_guest_exit(void);
#else /* !CONFIG_PERF_EVENTS: */
static inline void *
perf_aux_output_begin(struct perf_output_handle *handle,
@@ -1919,6 +1921,8 @@ static inline int perf_get_mediated_pmu(void)
}

static inline void perf_put_mediated_pmu(void) { }
+static inline void perf_guest_enter(void) { }
+static inline void perf_guest_exit(void) { }
#endif

#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7a2115b2c5c1..d05487d465c9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2827,14 +2827,15 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,

static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx,
- struct pmu *pmu)
+ struct pmu *pmu,
+ enum event_type_t event_type)
{
- ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED | event_type);
if (ctx)
- ctx_sched_in(ctx, pmu, EVENT_PINNED);
- ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_PINNED | event_type);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE | event_type);
if (ctx)
- ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE | event_type);
}

/*
@@ -2890,7 +2891,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
else if (event_type & EVENT_PINNED)
ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);

- perf_event_sched_in(cpuctx, task_ctx, pmu);
+ perf_event_sched_in(cpuctx, task_ctx, pmu, 0);

for_each_epc(epc, &cpuctx->ctx, pmu, 0)
perf_pmu_enable(epc->pmu);
@@ -4188,7 +4189,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
}

- perf_event_sched_in(cpuctx, ctx, NULL);
+ perf_event_sched_in(cpuctx, ctx, NULL, 0);

perf_ctx_sched_task_cb(cpuctx->task_ctx, true);

@@ -6040,6 +6041,71 @@ void perf_put_mediated_pmu(void)
}
EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);

+static inline void perf_host_exit(struct perf_cpu_context *cpuctx)
+{
+ perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST);
+ perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
+ if (cpuctx->task_ctx) {
+ perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
+ task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST);
+ perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
+ }
+}
+
+/* When entering a guest, schedule out all exclude_guest events. */
+void perf_guest_enter(void)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+ lockdep_assert_irqs_disabled();
+
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+ if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest)))
+ goto unlock;
+
+ perf_host_exit(cpuctx);
+
+ __this_cpu_write(perf_in_guest, true);
+
+unlock:
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
+EXPORT_SYMBOL_GPL(perf_guest_enter);
+
+static inline void perf_host_enter(struct perf_cpu_context *cpuctx)
+{
+ perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
+ if (cpuctx->task_ctx)
+ perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
+
+ perf_event_sched_in(cpuctx, cpuctx->task_ctx, NULL, EVENT_GUEST);
+
+ if (cpuctx->task_ctx)
+ perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
+ perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
+}
+
+void perf_guest_exit(void)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+ lockdep_assert_irqs_disabled();
+
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+ if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest)))
+ goto unlock;
+
+ perf_host_enter(cpuctx);
+
+ __this_cpu_write(perf_in_guest, false);
+unlock:
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
+EXPORT_SYMBOL_GPL(perf_guest_exit);
+
/*
* Holding the top-level event's child_mutex means that any
* descendant process that has inherited this event will block
--
2.49.0.395.g12beb8f557-goog