RE: [PATCH V7 3/6] perf, x86: handle multiple records in PEBS buffer

From: Liang, Kan
Date: Tue May 05 2015 - 12:30:39 EST




>
> On Mon, Apr 20, 2015 at 04:07:47AM -0400, Kan Liang wrote:
> > +static inline void *
> > +get_next_pebs_record_by_bit(void *base, void *top, int bit) {
> > + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> > + void *at;
> > + u64 pebs_status;
> > +
> > + if (base == NULL)
> > + return NULL;
> > +
> > + for (at = base; at < top; at += x86_pmu.pebs_record_size) {
> > + struct pebs_record_nhm *p = at;
> > +
> > + if (test_bit(bit, (unsigned long *)&p->status)) {
>
> Just wondering, is that BT better than: p->state & (1 << bit) ?

Technically, I think they are the same here.
test_bit looks more common and is widely used in the drain_pebs functions,
so I changed it according to Andi's comment.

>
> > +
> > + if (p->status == (1 << bit))
> > + return at;
> > +
> > + /* clear non-PEBS bit and re-check */
> > + pebs_status = p->status & cpuc->pebs_enabled;
> > + pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
> > + if (pebs_status == (1 << bit))
> > + return at;
> > + }
> > + }
> > + return NULL;
> > +}
> > +
> > static void __intel_pmu_pebs_event(struct perf_event *event,
> > + struct pt_regs *iregs,
> > + void *base, void *top,
> > + int bit, int count)
> > {
> > struct perf_sample_data data;
> > struct pt_regs regs;
> > + int i;
> > + void *at = get_next_pebs_record_by_bit(base, top, bit);
> >
> > + if (!intel_pmu_save_and_restart(event) &&
> > + !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD))
> > return;
> >
> > + if (count > 1) {
> > + for (i = 0; i < count - 1; i++) {
> > + setup_pebs_sample_data(event, iregs, at, &data, &regs);
> > + perf_event_output(event, &data, &regs);
> > + at += x86_pmu.pebs_record_size;
> > + at = get_next_pebs_record_by_bit(at, top, bit);
> > + }
> > + }
> > +
> > + setup_pebs_sample_data(event, iregs, at, &data, &regs);
> >
> > + /* all records are processed, handle event overflow now */
>
> All but the last. There explicitly is one left to be able to call the overflow
> handler is there not?

Right, I will change the comments.

>
> > + if (perf_event_overflow(event, &data, &regs)) {
> > x86_pmu_stop(event, 0);
> > + return;
> > + }
> > +
> > }
> >
> > static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
> > @@ -1000,72 +1081,86 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
> > if (!event->attr.precise_ip)
> > return;
> >
> > + n = (top - at) / x86_pmu.pebs_record_size;
> > if (n <= 0)
> > return;
> >
> > + __intel_pmu_pebs_event(event, iregs, at,
> > + top, 0, n);
> > }
> >
> > static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) {
> > struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> > struct debug_store *ds = cpuc->ds;
> > + struct perf_event *event;
> > + void *base, *at, *top;
> > int bit;
> > + int counts[MAX_PEBS_EVENTS] = {};
> >
> > if (!x86_pmu.pebs_active)
> > return;
> >
> > + base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
> > top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
> >
> > ds->pebs_index = ds->pebs_buffer_base;
> >
> > + if (unlikely(base >= top))
> > return;
> >
> > + for (at = base; at < top; at += x86_pmu.pebs_record_size) {
> > struct pebs_record_nhm *p = at;
> >
> > for_each_set_bit(bit, (unsigned long *)&p->status,
> > x86_pmu.max_pebs_events) {
> > event = cpuc->events[bit];
> > WARN_ON_ONCE(!event);
> >
> > + if (event->attr.precise_ip)
> > + break;
> > + }
>
> Would it make sense to delay looking for the event until you've found
> there is a single bit set -- and already know which bit that is?
>

Yes, I think we can test cpuc->pebs_enabled here.
It should be better than checking attr.precise_ip.

- for (; at < top; at += x86_pmu.pebs_record_size) {
+ for (at = base; at < top; at += x86_pmu.pebs_record_size) {
struct pebs_record_nhm *p = at;

for_each_set_bit(bit, (unsigned long *)&p->status,
x86_pmu.max_pebs_events) {
- event = cpuc->events[bit];
- if (!test_bit(bit, cpuc->active_mask))
- continue;
-
- WARN_ON_ONCE(!event);

- if (!event->attr.precise_ip)
- continue;
+ if (test_bit(bit, cpuc->pebs_enabled))
+ break;
+ }

... ...

+ for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
+ if (counts[bit] == 0)
continue;
-
- __intel_pmu_pebs_event(event, iregs, at);
+ event = cpuc->events[bit];
+ WARN_ON_ONCE(!event);
+ WARN_ON_ONCE(!event->attr.precise_ip);
+ __intel_pmu_pebs_event(event, iregs, base,
+ top, bit, counts[bit]);
}

> >
> > + if (bit >= x86_pmu.max_pebs_events)
> > + continue;
> > + if (!test_bit(bit, cpuc->active_mask))
> > + continue;
> > + /*
> > + * The PEBS hardware does not deal well with the situation
> > + * when events happen near to each other and multiple bits
> > + * are set. But it should happen rarely.
> > + *
> > + * If these events include one PEBS and multiple non-PEBS
> > + * events, it doesn't impact PEBS record. The record will
> > + * be handled normally. (slow path)
> > + *
> > + * If these events include two or more PEBS events, the
> > + * records for the events can be collapsed into a single
> > + * one, and it's not possible to reconstruct all events
> > + * that caused the PEBS record. It's called collision.
> > + * If collision happened, the record will be dropped.
> > + *
> > + */
> > + if (p->status != (1 << bit)) {
> > + u64 pebs_status;
> > +
> > + /* slow path */
> > + pebs_status = p->status & cpuc->pebs_enabled;
> > + pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
> > + if (pebs_status != (1 << bit)) {
> > + perf_log_lost(event);
>
> Does it make sense to keep an error[bit] count and only log once with the
> actual number in? -- when !0 obviously.

Yes, will do it.

Thanks,
Kan
>
> > continue;
> > + }
> > }
> > + counts[bit]++;
> > + }
> >
> > + for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
> > + if (counts[bit] == 0)
> > continue;
> > + event = cpuc->events[bit];
> > + __intel_pmu_pebs_event(event, iregs, base,
> > + top, bit, counts[bit]);
> > }
> > }
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo
> info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/