[GIT PULL] perf fixes

From: Ingo Molnar
Date: Thu Apr 28 2016 - 13:56:34 EST


Linus,

Please pull the latest perf-urgent-for-linus git tree from:

git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git perf-urgent-for-linus

# HEAD: cf3beb7c90a8efa16a06b26634cddddc92bb819c perf/x86/intel: Fix incorrect lbr_sel_mask value

x86 PMU driver fixes, plus two perf core fixes: a throttling-sysctl corner case and a perf_event_open() vs. execve() race.

Thanks,

Ingo

------------------>
Adam Borowski (1):
      perf/x86/amd: Set the size of event map array to PERF_COUNT_HW_MAX

Alexander Shishkin (1):
      perf/x86/intel/pt: Don't die on VMXON

Andi Kleen (1):
      perf/x86/intel: Add model number for Skylake Server to perf

Kan Liang (1):
      perf/x86/intel: Fix incorrect lbr_sel_mask value

Peter Zijlstra (2):
      perf/core: Make sysctl_perf_cpu_time_max_percent conform to documentation
      perf/core: Fix perf_event_open() vs. execve() race

Srinivas Pandruvada (1):
      perf/x86/intel/rapl: Add missing Haswell model


 arch/x86/events/amd/core.c        |  2 +-
 arch/x86/events/intel/core.c      |  1 +
 arch/x86/events/intel/lbr.c       |  6 ++--
 arch/x86/events/intel/pt.c        | 75 +++++++++++++++++++++++++++++++++------
 arch/x86/events/intel/pt.h        |  3 ++
 arch/x86/events/intel/rapl.c      |  1 +
 arch/x86/include/asm/perf_event.h |  4 +++
 arch/x86/kvm/vmx.c                |  4 +++
 kernel/events/core.c              | 55 +++++++++++++++++++---------
 9 files changed, 120 insertions(+), 31 deletions(-)
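
The perf core race fix moves the ptrace_may_access() permission check under
the target task's cred_guard_mutex and holds that mutex until the new event
is installed, so an execve() that changes credentials can no longer slip in
between the check and the attach. For context, the affected path is the
cross-task form of the syscall; a minimal user-space sketch (illustrative
only, event choice and helper name are made up):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

/* Attach a hardware instruction counter to another task. */
static int open_counter_on_task(pid_t pid)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size   = sizeof(attr);
	attr.type   = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;

	/* cpu = -1: any CPU; group_fd = -1: no group; flags = 0 */
	return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}

It is this attach-to-another-task case that must be checked against the
credentials the target still has when the event is actually installed.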

diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 86a9bec18dab..bd3e8421b57c 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -115,7 +115,7 @@ static __initconst const u64 amd_hw_cache_event_ids
/*
* AMD Performance Monitor K7 and later.
*/
-static const u64 amd_perfmon_event_map[] =
+static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] =
{
[PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
[PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
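
The AMD fix works because a designated-initializer array with an explicit
size zero-fills every index that is not listed, so lookups for generic event
ids the table does not define see 0 instead of whatever happens to sit past
the end of a shorter array. A minimal stand-alone illustration, with made-up
EV_* names standing in for PERF_COUNT_HW_*:

#include <stdio.h>

enum { EV_CYCLES, EV_INSNS, EV_REF_CYCLES, EV_MAX };	/* stand-ins for PERF_COUNT_HW_* */

static const unsigned long long event_map[EV_MAX] = {
	[EV_CYCLES] = 0x0076,
	[EV_INSNS]  = 0x00c0,
	/* EV_REF_CYCLES deliberately not listed: it is implicitly 0 */
};

int main(void)
{
	printf("%llx\n", event_map[EV_REF_CYCLES]);	/* prints 0 */
	return 0;
}
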
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 68fa55b4d42e..aff79884e17d 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3639,6 +3639,7 @@ __init int intel_pmu_init(void)

case 78: /* 14nm Skylake Mobile */
case 94: /* 14nm Skylake Desktop */
+ case 85: /* 14nm Skylake Server */
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 6c3b7c1780c9..1ca5d1e7d4f2 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -63,7 +63,7 @@ static enum {

#define LBR_PLM (LBR_KERNEL | LBR_USER)

-#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */
+#define LBR_SEL_MASK 0x3ff /* valid bits in LBR_SELECT */
#define LBR_NOT_SUPP -1 /* LBR filter not supported */
#define LBR_IGN 0 /* ignored */

@@ -610,8 +610,10 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
* The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
* in suppress mode. So LBR_SELECT should be set to
* (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
+ * But the 10th bit LBR_CALL_STACK does not operate
+ * in suppress mode.
*/
- reg->config = mask ^ x86_pmu.lbr_sel_mask;
+ reg->config = mask ^ (x86_pmu.lbr_sel_mask & ~LBR_CALL_STACK);

if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
(br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
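
A quick worked illustration of the LBR fix (example numbers, not driver
code): the low nine bits of LBR_SELECT are suppress bits, while bit 9 is a
plain call-stack enable, so only the low nine bits get inverted when a
capture mask is translated into the MSR value:

#define LBR_SEL_MASK	0x3ff
#define LBR_CALL_STACK	(1U << 9)

/* Convert a "capture these" mask to LBR_SELECT suppress-mode encoding. */
static unsigned int lbr_select_from_mask(unsigned int capture_mask)
{
	/* invert the suppress bits only; pass the call-stack enable through */
	return capture_mask ^ (LBR_SEL_MASK & ~LBR_CALL_STACK);
}

For example, a mask of 0x203 becomes 0x203 ^ 0x1ff = 0x3fc: bits 0-8 are
inverted while bit 9 (call stack) stays set, which is exactly what the
reg->config change above computes.
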
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 6af7cf71d6b2..09a77dbc73c9 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -136,9 +136,21 @@ static int __init pt_pmu_hw_init(void)
struct dev_ext_attribute *de_attrs;
struct attribute **attrs;
size_t size;
+ u64 reg;
int ret;
long i;

+ if (boot_cpu_has(X86_FEATURE_VMX)) {
+ /*
+ * Intel SDM, 36.5 "Tracing post-VMXON" says that
+ * "IA32_VMX_MISC[bit 14]" being 1 means PT can trace
+ * post-VMXON.
+ */
+ rdmsrl(MSR_IA32_VMX_MISC, reg);
+ if (reg & BIT(14))
+ pt_pmu.vmx = true;
+ }
+
attrs = NULL;

for (i = 0; i < PT_CPUID_LEAVES; i++) {
@@ -269,20 +281,23 @@ static void pt_config(struct perf_event *event)

reg |= (event->attr.config & PT_CONFIG_MASK);

+ event->hw.config = reg;
wrmsrl(MSR_IA32_RTIT_CTL, reg);
}

-static void pt_config_start(bool start)
+static void pt_config_stop(struct perf_event *event)
{
- u64 ctl;
+ u64 ctl = READ_ONCE(event->hw.config);
+
+ /* may be already stopped by a PMI */
+ if (!(ctl & RTIT_CTL_TRACEEN))
+ return;

- rdmsrl(MSR_IA32_RTIT_CTL, ctl);
- if (start)
- ctl |= RTIT_CTL_TRACEEN;
- else
- ctl &= ~RTIT_CTL_TRACEEN;
+ ctl &= ~RTIT_CTL_TRACEEN;
wrmsrl(MSR_IA32_RTIT_CTL, ctl);

+ WRITE_ONCE(event->hw.config, ctl);
+
/*
* A wrmsr that disables trace generation serializes other PT
* registers and causes all data packets to be written to memory,
@@ -291,8 +306,7 @@ static void pt_config_start(bool start)
* The below WMB, separating data store and aux_head store matches
* the consumer's RMB that separates aux_head load and data load.
*/
- if (!start)
- wmb();
+ wmb();
}

static void pt_config_buffer(void *buf, unsigned int topa_idx,
@@ -942,11 +956,17 @@ void intel_pt_interrupt(void)
if (!ACCESS_ONCE(pt->handle_nmi))
return;

- pt_config_start(false);
+ /*
+ * If VMX is on and PT does not support it, don't touch anything.
+ */
+ if (READ_ONCE(pt->vmx_on))
+ return;

if (!event)
return;

+ pt_config_stop(event);
+
buf = perf_get_aux(&pt->handle);
if (!buf)
return;
@@ -983,6 +1003,35 @@ void intel_pt_interrupt(void)
}
}

+void intel_pt_handle_vmx(int on)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct perf_event *event;
+ unsigned long flags;
+
+ /* PT plays nice with VMX, do nothing */
+ if (pt_pmu.vmx)
+ return;
+
+ /*
+ * VMXON will clear RTIT_CTL.TraceEn; we need to make
+ * sure to not try to set it while VMX is on. Disable
+ * interrupts to avoid racing with pmu callbacks;
+ * concurrent PMI should be handled fine.
+ */
+ local_irq_save(flags);
+ WRITE_ONCE(pt->vmx_on, on);
+
+ if (on) {
+ /* prevent pt_config_stop() from writing RTIT_CTL */
+ event = pt->handle.event;
+ if (event)
+ event->hw.config = 0;
+ }
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
+
/*
* PMU callbacks
*/
@@ -992,6 +1041,9 @@ static void pt_event_start(struct perf_event *event, int mode)
struct pt *pt = this_cpu_ptr(&pt_ctx);
struct pt_buffer *buf = perf_get_aux(&pt->handle);

+ if (READ_ONCE(pt->vmx_on))
+ return;
+
if (!buf || pt_buffer_is_full(buf, pt)) {
event->hw.state = PERF_HES_STOPPED;
return;
@@ -1014,7 +1066,8 @@ static void pt_event_stop(struct perf_event *event, int mode)
* see comment in intel_pt_interrupt().
*/
ACCESS_ONCE(pt->handle_nmi) = 0;
- pt_config_start(false);
+
+ pt_config_stop(event);

if (event->hw.state == PERF_HES_STOPPED)
return;
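
For reference, the PT driver's new VMX awareness has two pieces: a boot-time
capability check (IA32_VMX_MISC bit 14 set means PT keeps tracing across
VMXON, so nothing special is needed), and, on parts without that capability,
intel_pt_handle_vmx() zeroing the cached RTIT_CTL value so that neither the
PMI handler nor pt_event_stop() writes the MSR while VMX is on. A
stand-alone restatement of the capability probe (illustrative sketch, not
part of the patch):

#include <linux/bitops.h>
#include <asm/cpufeature.h>
#include <asm/msr.h>

/* True if PT tracing survives VMXON, per Intel SDM 36.5 "Tracing post-VMXON". */
static bool pt_can_trace_post_vmxon(void)
{
	u64 misc;

	if (!boot_cpu_has(X86_FEATURE_VMX))
		return false;		/* no VMX at all, nothing to handle */

	rdmsrl(MSR_IA32_VMX_MISC, misc);
	return misc & BIT(14);
}
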
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 336878a5d205..3abb5f5cccc8 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -65,6 +65,7 @@ enum pt_capabilities {
struct pt_pmu {
struct pmu pmu;
u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
+ bool vmx;
};

/**
@@ -107,10 +108,12 @@ struct pt_buffer {
* struct pt - per-cpu pt context
* @handle: perf output handle
* @handle_nmi: do handle PT PMI on this cpu, there's an active event
+ * @vmx_on: 1 if VMX is ON on this cpu
*/
struct pt {
struct perf_output_handle handle;
int handle_nmi;
+ int vmx_on;
};

#endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index 70c93f9b03ac..1705c9d75e44 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -718,6 +718,7 @@ static int __init rapl_pmu_init(void)
break;
case 60: /* Haswell */
case 69: /* Haswell-Celeron */
+ case 70: /* Haswell GT3e */
case 61: /* Broadwell */
case 71: /* Broadwell-H */
rapl_cntr_mask = RAPL_IDX_HSW;
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 5a2ed3ed2f26..f353061bba1d 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -285,6 +285,10 @@ static inline void perf_events_lapic_init(void) { }
static inline void perf_check_microcode(void) { }
#endif

+#ifdef CONFIG_CPU_SUP_INTEL
+ extern void intel_pt_handle_vmx(int on);
+#endif
+
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
extern void amd_pmu_enable_virt(void);
extern void amd_pmu_disable_virt(void);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ee1c8a93871c..133679d520af 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3103,6 +3103,8 @@ static __init int vmx_disabled_by_bios(void)

static void kvm_cpu_vmxon(u64 addr)
{
+ intel_pt_handle_vmx(1);
+
asm volatile (ASM_VMX_VMXON_RAX
: : "a"(&addr), "m"(addr)
: "memory", "cc");
@@ -3172,6 +3174,8 @@ static void vmclear_local_loaded_vmcss(void)
static void kvm_cpu_vmxoff(void)
{
asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
+
+ intel_pt_handle_vmx(0);
}

static void hardware_disable(void)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 52bedc5a5aaa..4e2ebf6f2f1f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -412,7 +412,8 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
if (ret || !write)
return ret;

- if (sysctl_perf_cpu_time_max_percent == 100) {
+ if (sysctl_perf_cpu_time_max_percent == 100 ||
+ sysctl_perf_cpu_time_max_percent == 0) {
printk(KERN_WARNING
"perf: Dynamic interrupt throttling disabled, can hang your system!\n");
WRITE_ONCE(perf_sample_allowed_ns, 0);
@@ -1105,6 +1106,7 @@ static void put_ctx(struct perf_event_context *ctx)
* function.
*
* Lock order:
+ * cred_guard_mutex
* task_struct::perf_event_mutex
* perf_event_context::mutex
* perf_event::child_mutex;
@@ -3420,7 +3422,6 @@ static struct task_struct *
find_lively_task_by_vpid(pid_t vpid)
{
struct task_struct *task;
- int err;

rcu_read_lock();
if (!vpid)
@@ -3434,16 +3435,7 @@ find_lively_task_by_vpid(pid_t vpid)
if (!task)
return ERR_PTR(-ESRCH);

- /* Reuse ptrace permission checks for now. */
- err = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
- goto errout;
-
return task;
-errout:
- put_task_struct(task);
- return ERR_PTR(err);
-
}

/*
@@ -8413,6 +8405,24 @@ SYSCALL_DEFINE5(perf_event_open,

get_online_cpus();

+ if (task) {
+ err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
+ if (err)
+ goto err_cpus;
+
+ /*
+ * Reuse ptrace permission checks for now.
+ *
+ * We must hold cred_guard_mutex across this and any potential
+ * perf_install_in_context() call for this new event to
+ * serialize against exec() altering our credentials (and the
+ * perf_event_exit_task() that could imply).
+ */
+ err = -EACCES;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+ goto err_cred;
+ }
+
if (flags & PERF_FLAG_PID_CGROUP)
cgroup_fd = pid;

@@ -8420,7 +8430,7 @@ SYSCALL_DEFINE5(perf_event_open,
NULL, NULL, cgroup_fd);
if (IS_ERR(event)) {
err = PTR_ERR(event);
- goto err_cpus;
+ goto err_cred;
}

if (is_sampling_event(event)) {
@@ -8479,11 +8489,6 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
}

- if (task) {
- put_task_struct(task);
- task = NULL;
- }
-
/*
* Look up the group leader (we will attach this event to it):
*/
@@ -8581,6 +8586,11 @@ SYSCALL_DEFINE5(perf_event_open,

WARN_ON_ONCE(ctx->parent_ctx);

+ /*
+ * This is the point on no return; we cannot fail hereafter. This is
+ * where we start modifying current state.
+ */
+
if (move_group) {
/*
* See perf_event_ctx_lock() for comments on the details
@@ -8652,6 +8662,11 @@ SYSCALL_DEFINE5(perf_event_open,
mutex_unlock(&gctx->mutex);
mutex_unlock(&ctx->mutex);

+ if (task) {
+ mutex_unlock(&task->signal->cred_guard_mutex);
+ put_task_struct(task);
+ }
+
put_online_cpus();

mutex_lock(&current->perf_event_mutex);
@@ -8684,6 +8699,9 @@ SYSCALL_DEFINE5(perf_event_open,
*/
if (!event_file)
free_event(event);
+err_cred:
+ if (task)
+ mutex_unlock(&task->signal->cred_guard_mutex);
err_cpus:
put_online_cpus();
err_task:
@@ -8968,6 +8986,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)

/*
* When a child task exits, feed back event values to parent events.
+ *
+ * Can be called with cred_guard_mutex held when called from
+ * install_exec_creds().
*/
void perf_event_exit_task(struct task_struct *child)
{