Re: [RFC] perf_events: ctx_flexible_sched_in() not maximizing PMU utilization
From: Peter Zijlstra
Date: Fri May 14 2010 - 10:55:22 EST
On Mon, 2010-05-10 at 11:41 +0200, Stephane Eranian wrote:
> Looks like a good solution, at least better than what is there right now.
Something like the below I guess, still need to make it an actual patch
though ;-)
The interesting bit is the schedule() function which tries to schedule
two queues fairly (cpu context and task context), and selects between
the two based on lag -- I'm not quite sure that that works out as
expected though, still have to do the actual math.
Also, I guess we should come up with a saner solution for the
per-task-per-cpu events (the unservicable stuff).
---
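/*
 * Rough idea: each context gets a flexible_queue; event groups wait in
 * an rb-tree ordered by received service, ->running is what is currently
 * on the PMU, and ->unservicable holds per-task events bound to some
 * other CPU.
 */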
struct flexible_queue {
struct rb_root waiting;
struct rb_node *rb_leftmost;
u64 min_service;
u64 rel_avg;
u64 clock;
u64 running_stamp;
struct list_head running;
struct list_head unservicable;
int unservicable_cpu;
int nr_waiting;
};
enum flexible_type {
flexible_waiting,
flexible_running,
flexible_unservicable,
};
struct flexible_event {
union {
struct rb_node rb_entry;
struct list_head list_entry;
};
enum flexible_type type;
u64 service;
};
static inline
s64 flexible_event_rel(struct flexible_queue *fq, struct flexible_event *fe)
{
return fe->service - fq->min_service;
}
static void __enqueue_event(struct flexible_queue *fq, struct flexible_event *fe)
{
struct rb_node **link = &fq->waiting.rb_node;
struct rb_node *parent = NULL;
struct flexible_event *entry;
s64 rel = flexible_event_rel(fq, fe);
int leftmost = 1;
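/*
 * An event that is ahead of min_service pushes min_service up to its
 * own service, lowering the rel of everything already queued; rel_avg
 * tracks the summed rel over all waiting events.
 */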
if (rel > 0) {
fq->min_service += rel;
fq->rel_avg -= fq->nr_waiting * rel;
rel = 0;
}
fq->nr_waiting++;
fq->rel_avg += rel;
fe->type = flexible_waiting;
while (*link) {
parent = *link;
entry = rb_entry(parent, struct flexible_event, rb_entry);
/*
* We don't care about collisions. Nodes with
* the same rel stay together.
*/
if (rel < flexible_event_rel(fq, entry)) {
link = &parent->rb_left;
} else {
link = &parent->rb_right;
leftmost = 0;
}
}
if (leftmost)
fq->rb_leftmost = &fe->rb_entry;
rb_link_node(&fe->rb_entry, parent, link);
rb_insert_color(&fe->rb_entry, &fq->waiting);
}
static void __dequeue_event(struct flexible_queue *fq, struct flexible_event *fe)
{
if (fq->rb_leftmost == &fe->rb_entry) {
struct rb_node *next_node;
next_node = rb_next(&fe->rb_entry);
fq->rb_leftmost = next_node;
}
rb_erase(&fe->rb_entry, &fq->waiting);
fq->nr_waiting--;
fq->rel_avg -= flexible_event_rel(fq, fe);
}
static void
flexible_event_add(struct flexible_queue *fq, struct flexible_event *fe)
{
fe->service = fq->min_service + fq->rel_avg;
__enqueue_event(fq, fe);
}
static void
flexible_event_del(struct flexible_queue *fq, struct flexible_event *fe)
{
switch (fe->type) {
case flexible_waiting:
__dequeue_event(fq, fe);
break;
case flexible_running:
case flexible_unservicable:
list_del(&fe->list_entry);
break;
}
}
static void flexible_unschedule(struct flexible_queue *fq)
{
struct flexible_event *fe, *tmp;
s64 delta = (s64)(fq->clock - fq->running_stamp);
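/* charge the time spent on the PMU to each running group's service */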
list_for_each_entry_safe(fe, tmp, &fq->running, list_entry) {
list_del(&fe->list_entry);
fe->service += delta;
__enqueue_event(fq, fe);
}
}
static
s64 flexible_event_lag(struct flexible_queue *fq, struct flexible_event *fe)
{
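/* lag: distance to the average service point of the waiting events */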
return fq->min_service + (fq->rel_avg / fq->nr_waiting) - fe->service;
}
static struct flexible_event *__pick_event(struct flexible_queue *fq)
{
struct flexible_event *fe;
if (!fq->rb_leftmost)
return NULL;
fe = rb_entry(fq->rb_leftmost, struct flexible_event, rb_entry);
return fe;
}
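/*
 * Pick the waiting group with the largest lag from either queue and try
 * to put it on the PMU; per-task events bound to another CPU are parked
 * on ->unservicable.  Returns 0 when a group was scheduled, 1 when there
 * is nothing left to schedule and -EBUSY when the best candidate doesn't
 * fit.
 */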
static
int flexible_schedule(struct flexible_queue *fq1, struct flexible_queue *fq2)
{
struct flexible_event *tmp, *fe;
struct flexible_queue *fq;
s64 tmp_lag, max_lag;
if (fq1->unservicable_cpu != smp_processor_id()) {
list_for_each_entry_safe(fe, tmp, &fq1->unservicable, list_entry) {
list_del(&fe->list_entry);
flexible_event_add(fq1, fe);
}
fq1->unservicable_cpu = smp_processor_id();
}
if (fq2->unservicable_cpu != smp_processor_id()) {
list_for_each_entry_safe(fe, tmp, &fq2->unservicable, list_entry) {
list_del(&fe->list_entry);
flexible_event_add(fq2, fe);
}
fq2->unservicable_cpu = smp_processor_id();
}
again:
max_lag = 0;
fe = NULL;
tmp = __pick_event(fq1);
if (tmp) {
tmp_lag = flexible_event_lag(fq1, tmp);
if (tmp_lag > max_lag) {
fq = fq1;
fe = tmp;
max_lag = tmp_lag;
}
}
tmp = __pick_event(fq2);
if (tmp) {
tmp_lag = flexible_event_lag(fq2, tmp);
if (tmp_lag > max_lag) {
fq = fq2;
fe = tmp;
max_lag = tmp_lag;
}
}
if (!fe)
return 1; /* no more events to schedule */
fq->running_stamp = fq->clock;
if (event_of(fe)->cpu != -1 && event_of(fe)->cpu != smp_processor_id()) {
__dequeue_event(fq, fe);
fe->type = flexible_unservicable;
list_add(&fe->list_entry, &fq->unservicable);
goto again; /* can't run this event, try another */
}
if (group_sched_in(event_of(fe), ...) == 0) {
__dequeue_event(fq, fe);
fe->type = flexible_running;
list_add(&fe->list_entry, &fq->running);
return 0; /* success */
}
return -EBUSY; /* doesn't fit */
}