On Wed, Aug 05, 2020 at 10:15:26AM +0800, Jin, Yao wrote:
Hi Peter,
On 8/4/2020 7:49 PM, peterz@xxxxxxxxxxxxx wrote:
On Fri, Jul 31, 2020 at 10:56:17AM +0800, Jin Yao wrote:
@@ -6973,7 +6973,8 @@ static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
-	bool kernel = !event->attr.exclude_callchain_kernel;
+	bool kernel = !event->attr.exclude_callchain_kernel &&
+		      !event->attr.exclude_kernel;
This seems weird; how can we get there? Also it seems to me that if you
have !exclude_callchain_kernel you already have permission for kernel
bits, so who cares.
In the perf tool, exclude_callchain_kernel is set to 1 when perf-record
collects only user callchains, and exclude_kernel is set to 1 when events
are configured to run in user space.
So if an event is configured to run in user space, it makes sense that we
don't need its kernel callchains.
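For illustration, a rough sketch of how those two bits end up set for a
user-only perf-record session (hypothetical, simplified values; not
lifted from tools/perf):

	struct perf_event_attr attr = {
		.type        = PERF_TYPE_HARDWARE,
		.config      = PERF_COUNT_HW_CPU_CYCLES,
		.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_CALLCHAIN,
		.exclude_kernel           = 1,	/* count in user space only */
		.exclude_callchain_kernel = 1,	/* no kernel frames in callchains */
	};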
But it seems to me there is no logic in the perf tool that guarantees
!exclude_callchain_kernel -> !exclude_kernel.
Jiri, Arnaldo, is my understanding correct?
What the perf tool does or does not do is irrelevant. It is a valid
(albeit slightly silly) configuration to have:
exclude_kernel && !exclude_callchain_kernel
You're now saying that when you configure things like this you're not
allowed kernel IPs; I think that's wrong.
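Concretely, nothing stops user space from submitting that combination
(hypothetical sketch):

	struct perf_event_attr attr = { .size = sizeof(attr) };

	attr.exclude_kernel           = 1;	/* count in user space only... */
	attr.exclude_callchain_kernel = 0;	/* ...yet ask for kernel callchain frames */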
Also, !exclude_callchain_kernel should require privilege, which needs
fixing; see below.
So the new code looks like:
	if (event->attr.exclude_kernel && !user_mode(regs)) {
		if (!(current->flags & PF_KTHREAD)) {
			regs_fake = task_pt_regs(current);
			if (!regs_fake)
				instruction_pointer_set(regs, -1L);
		} else {
			instruction_pointer_set(regs, -1L);
		}
	}
Again:
	if (!(current->flags & PF_KTHREAD))
		regs_fake = task_pt_regs(current);

	if (!regs_fake)
		instruction_pointer_set(regs, -1L);
Is much simpler and more readable.
+	if ((header->misc & PERF_RECORD_MISC_CPUMODE_MASK) ==
+			PERF_RECORD_MISC_KERNEL) {
+		header->misc &= ~PERF_RECORD_MISC_CPUMODE_MASK;
+		header->misc |= PERF_RECORD_MISC_USER;
+	}
Why the conditional? At this point it had better be unconditionally
user, no?
	header->misc &= ~PERF_RECORD_MISC_CPUMODE_MASK;
	header->misc |= PERF_RECORD_MISC_USER;
#define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0)
#define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0)
#define PERF_RECORD_MISC_KERNEL (1 << 0)
#define PERF_RECORD_MISC_USER (2 << 0)
#define PERF_RECORD_MISC_HYPERVISOR (3 << 0)
#define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0)
#define PERF_RECORD_MISC_GUEST_USER (5 << 0)
If we unconditionally set user, it will also clobber the hypervisor,
guest-kernel and guest-user modes.
At the same time :u had better not get any of those either. Which seems
to suggest we're going about this wrong.
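For illustration only, a fixup keyed on the recorded cpumode would avoid
clobbering the guest/hv modes (a hypothetical helper, not from the patch
below):

	static u16 sanitize_cpumode(struct perf_event *event, u16 misc)
	{
		u16 mode = misc & PERF_RECORD_MISC_CPUMODE_MASK;

		/* only rewrite kernel samples of events that excluded the kernel */
		if (mode == PERF_RECORD_MISC_KERNEL && event->attr.exclude_kernel) {
			misc &= ~PERF_RECORD_MISC_CPUMODE_MASK;
			misc |= PERF_RECORD_MISC_USER;
		}

		return misc;
	}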
Also, if we call this before perf_misc_flags() we don't need to fix it
up.
How's this?
---
kernel/events/core.c | 38 +++++++++++++++++++++++++++++++++-----
1 file changed, 33 insertions(+), 5 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7c436d705fbd..3e4e328b521a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6988,23 +6988,49 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
	return callchain ?: &__empty_callchain;
}
+/*
+ * Due to interrupt latency (skid), we may enter the kernel before taking the
+ * PMI, even if the PMU is configured to only count user events. To avoid
+ * leaking kernel addresses, use task_pt_regs() when available.
+ */
+static struct pt_regs *sanitize_sample_regs(struct perf_event *event, struct pt_regs *regs)
+{
+	struct pt_regs *sample_regs = regs;
+
+	/* bail unless the event is user only, i.e. all privileged modes excluded */
+	if (!event->attr.exclude_kernel || !event->attr.exclude_hv ||
+	    !event->attr.exclude_host   || !event->attr.exclude_guest)
+		return sample_regs;
+
+	if (user_mode(regs))
+		return sample_regs;
+
+	if (!(current->flags & PF_KTHREAD))
+		sample_regs = task_pt_regs(current);
+	else
+		instruction_pointer_set(regs, -1L);
+
+	return sample_regs;
+}
+
void perf_prepare_sample(struct perf_event_header *header,
			 struct perf_sample_data *data,
			 struct perf_event *event,
			 struct pt_regs *regs)
{
+	struct pt_regs *sample_regs = sanitize_sample_regs(event, regs);
	u64 sample_type = event->attr.sample_type;
	header->type = PERF_RECORD_SAMPLE;
	header->size = sizeof(*header) + event->header_size;
	header->misc = 0;
-	header->misc |= perf_misc_flags(regs);
+	header->misc |= perf_misc_flags(sample_regs);
	__perf_event_header__init_id(header, data, event);
	if (sample_type & PERF_SAMPLE_IP)
-		data->ip = perf_instruction_pointer(regs);
+		data->ip = perf_instruction_pointer(sample_regs);
	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
		int size = 1;
@@ -7054,9 +7080,10 @@ void perf_prepare_sample(struct perf_event_header *header,
		header->size += size;
	}
-	if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
+	if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) {
		perf_sample_regs_user(&data->regs_user, regs,
				      &data->regs_user_copy);
+	}
	if (sample_type & PERF_SAMPLE_REGS_USER) {
		/* regs dump ABI info */
@@ -7099,7 +7126,7 @@ void perf_prepare_sample(struct perf_event_header *header,
		/* regs dump ABI info */
		int size = sizeof(u64);
-		perf_sample_regs_intr(&data->regs_intr, regs);
+		perf_sample_regs_intr(&data->regs_intr, sample_regs);
		if (data->regs_intr.regs) {
			u64 mask = event->attr.sample_regs_intr;
@@ -11609,7 +11636,8 @@ SYSCALL_DEFINE5(perf_event_open,
	if (err)
		return err;
-	if (!attr.exclude_kernel) {
+	if (!attr.exclude_kernel || !attr.exclude_callchain_kernel ||
+	    !attr.exclude_hv || !attr.exclude_host || !attr.exclude_guest) {
		err = perf_allow_kernel(&attr);
		if (err)
			return err;
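With this, a user-only event that still asks for kernel callchain frames
has to pass the paranoia check. A minimal user-space sketch of such an
open (hypothetical test program, error handling trimmed):

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		struct perf_event_attr attr = {
			.size        = sizeof(attr),
			.type        = PERF_TYPE_HARDWARE,
			.config      = PERF_COUNT_HW_CPU_CYCLES,
			.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_CALLCHAIN,
			.exclude_kernel           = 1,
			.exclude_callchain_kernel = 0,	/* kernel frames requested */
		};

		/* now fails with EACCES under perf_event_paranoid unless privileged */
		int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);

		return fd < 0;
	}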