Re: [PATCH v2 09/32] perf/x86/intel/cqm: basic RMID hierarchy with per package RMIDs

From: Thomas Gleixner
Date: Wed May 18 2016 - 15:54:15 EST


On Wed, 11 May 2016, David Carrillo-Cisneros wrote:
> /*
> * A cache groups is a group of perf_events with the same target (thread,
> * cgroup, CPU or system-wide). Each cache group receives has one RMID.
> @@ -80,8 +85,68 @@ static inline int __cqm_prmid_update(struct prmid *prmid,
> static LIST_HEAD(cache_groups);
> static DEFINE_MUTEX(cqm_mutex);
>
> +struct monr *monr_hrchy_root;

Why is this global and not static? And what is a monr_hrchy? I assume it's a
monitoring hierarchy, right? So can you please find a more intuitive name?

> +
> struct pkg_data **cqm_pkgs_data;

Same question. Why global? Missed this when looking at the other patch.

>
> +static inline bool __pmonr__in_astate(struct pmonr *pmonr)
> +{
> + lockdep_assert_held(&__pkg_data(pmonr, pkg_data_lock));

__pkg_data() got introduced in an earlier patch, which does not even know
about pmonr.

+#define __pkg_data(pmonr, member) cqm_pkgs_data[pmonr->pkg_id]->member

And this is utter crap, really.

What the hell is wrong with:

lockdep_assert_held(&cqm_pkgs_data[pmonr->pkg_id]->lock);

Nothing, aside from making the code readable. You might have noticed that I
shortened the lock name, which is pointlessly complicated:

pkg_data->pkg_data_lock ????

What's wrong with pkg_data->lock? Again, nothing. It's obvious that the lock
protects the data structure. You only need a more complex name if there is a
lock in the data structure which protects only a part of it or something
different.

> + return pmonr->prmid;
> +}
> +
> +static inline bool __pmonr__in_ustate(struct pmonr *pmonr)
> +{
> + lockdep_assert_held(&__pkg_data(pmonr, pkg_data_lock));
> + return !pmonr->prmid;
> +}
> +
> +static inline bool monr__is_root(struct monr *monr)

Can you please get rid of these double underscores. That's perf user space
style and we really don't need that in proper kernel code.

> +{
> + return monr_hrchy_root == monr;
> +}
> +
> +static inline bool monr__is_mon_active(struct monr *monr)
> +{
> + return monr->flags & MONR_MON_ACTIVE;
> +}
> +
> +static inline void __monr__set_summary_read_rmid(struct monr *monr, u32 rmid)
> +{
> + int i;
> + struct pmonr *pmonr;
> + union prmid_summary summary;
> +
> + monr_hrchy_assert_held_raw_spin_locks();
> +
> + cqm_pkg_id_for_each_online(i) {
> + pmonr = monr->pmonrs[i];
> + WARN_ON_ONCE(!__pmonr__in_ustate(pmonr));
> + summary.value = atomic64_read(&pmonr->prmid_summary_atomic);
> + summary.read_rmid = rmid;
> + atomic64_set(&pmonr->prmid_summary_atomic, summary.value);
> + }
> +}
> +
> +static inline void __monr__set_mon_active(struct monr *monr)
> +{
> + monr_hrchy_assert_held_raw_spin_locks();

So you have the same check twice. Here and in the function above.

> + __monr__set_summary_read_rmid(monr, 0);

Thanks to missing comments I have to assume that setting a monitor active
always uses rmid 0, right? If that's correct, I'm failing to figure out why.

> + monr->flags |= MONR_MON_ACTIVE;
> +}
> +
> +/*
> + * All pmonrs must be in (U)state.
> + * clearing MONR_MON_ACTIVE prevents (U)state prmids from transitioning
> + * to another state.
> + */
> +static inline void __monr__clear_mon_active(struct monr *monr)
> +{
> + monr_hrchy_assert_held_raw_spin_locks();

Another double lockdep check.

> + __monr__set_summary_read_rmid(monr, INVALID_RMID);
> + monr->flags &= ~MONR_MON_ACTIVE;
> +}
> +
> static inline bool __valid_pkg_id(u16 pkg_id)
> {
> return pkg_id < topology_max_packages();
> @@ -125,6 +190,10 @@ static int pkg_data_init_cpu(int cpu)
> }
>
> INIT_LIST_HEAD(&pkg_data->free_prmids_pool);
> + INIT_LIST_HEAD(&pkg_data->active_prmids_pool);
> + INIT_LIST_HEAD(&pkg_data->nopmonr_limbo_prmids_pool);
> +

That extra new line has a special meaning? Focus on the important next line,
right?

> + INIT_LIST_HEAD(&pkg_data->astate_pmonrs_lru);
>
> mutex_init(&pkg_data->pkg_data_mutex);
> raw_spin_lock_init(&pkg_data->pkg_data_lock);
> @@ -136,12 +205,156 @@ static int pkg_data_init_cpu(int cpu)
> return 0;
> }
>
> +static inline bool __valid_rmid(u16 pkg_id, u32 rmid)

These underscores are annoying. We use them when we have something like this:

int __foo(int bar)
{
return something(bar);
}

int foo(int bar)
{
int ret;

lock(x);
ret = __foo(bar);
unlock(x);
return ret;
}

So we distinguish between inner and outer function with or without protection
or some other thing. I can't see that here.

> +{
> + return rmid <= cqm_pkgs_data[pkg_id]->max_rmid;
> +}
> +
> +static inline bool __valid_prmid(u16 pkg_id, struct prmid *prmid)
> +{
> + struct pkg_data *pkg_data = cqm_pkgs_data[pkg_id];
> + bool valid = __valid_rmid(pkg_id, prmid->rmid);
> +
> + WARN_ON_ONCE(valid && pkg_data->prmids_by_rmid[
> + prmid->rmid]->rmid != prmid->rmid);
> + return valid;
> +}
> +
> +static inline struct prmid *
> +__prmid_from_rmid(u16 pkg_id, u32 rmid)
> +{
> + struct prmid *prmid;
> +
> + if (!__valid_rmid(pkg_id, rmid))
> + return NULL;
> + prmid = cqm_pkgs_data[pkg_id]->prmids_by_rmid[rmid];
> + WARN_ON_ONCE(!__valid_prmid(pkg_id, prmid));
> + return prmid;
> +}
> +
> +static struct pmonr *pmonr_alloc(int cpu)
> +{
> + struct pmonr *pmonr;
> + union prmid_summary summary;
> +
> + pmonr = kmalloc_node(sizeof(struct pmonr),
> + GFP_KERNEL, cpu_to_node(cpu));
> + if (!pmonr)
> + return ERR_PTR(-ENOMEM);

Again. kzalloc_node(). So you can spare all that NULL initialization and if
you add a member it's guaranteed to be zero/NULL.

> +
> + pmonr->prmid = NULL;
> +

You've got an endless supply of stray newlines w/o value

> + pmonr->monr = NULL;
> + INIT_LIST_HEAD(&pmonr->rotation_entry);
> +
> + pmonr->pkg_id = topology_physical_package_id(cpu);
> + summary.sched_rmid = INVALID_RMID;
> + summary.read_rmid = INVALID_RMID;
> + atomic64_set(&pmonr->prmid_summary_atomic, summary.value);
> +
> + return pmonr;
> +}
> +
> +static void pmonr_dealloc(struct pmonr *pmonr)
> +{
> + kfree(pmonr);

What's the value add of this function? Nothing at all, except that seeing it at
the call site makes one wonder, whether this function does more than kfree().

> +}
> +
> +/*
> + * @root: Common ancestor.
> + * a bust be distinct to b.
> + * @true if a is ancestor of b.

This is not a proper kernel doc comment

> + */
> +static inline bool
> +__monr_hrchy_is_ancestor(struct monr *root,
> + struct monr *a, struct monr *b)

static inline bool monr_hrchy_is_ancestor(struct monr *root,
struct monr *a,
struct monr *b)

> +{
> + WARN_ON_ONCE(!root || !a || !b);

Another WARN_ON which continues to run into a NULL pointer dereference. You
can spare the WARN_ON as the Ooops will give you a backtrace anyway.

> + WARN_ON_ONCE(a == b);
> +
> + if (root == a)
> + return true;
> + if (root == b)
> + return false;
> +
> + b = b->parent;
> + /* Break at the root */
> + while (b != root) {
> + WARN_ON_ONCE(!b);

Another one. Sigh!

> + if (a == b)
> + return true;
> + b = b->parent;
> + }
> + return false;
> +}
> +
> +/* helper function to finish transition to astate. */

Great comment. It translates the function name into something which is
resembling a sentence, but it carefully avoids to explain what the function
does.

> +static inline void
> +__pmonr__finish_to_astate(struct pmonr *pmonr, struct prmid *prmid)
> +{
> + union prmid_summary summary;
> +
> + lockdep_assert_held(&__pkg_data(pmonr, pkg_data_lock));
> +
> + pmonr->prmid = prmid;
> +
> + list_move_tail(
> + &prmid->pool_entry, &__pkg_data(pmonr, active_prmids_pool));
> + list_move_tail(
> + &pmonr->rotation_entry, &__pkg_data(pmonr, astate_pmonrs_lru));
> +
> + summary.sched_rmid = pmonr->prmid->rmid;
> + summary.read_rmid = pmonr->prmid->rmid;
> + atomic64_set(&pmonr->prmid_summary_atomic, summary.value);
> +}
> +
> +static inline void
> +__pmonr__ustate_to_astate(struct pmonr *pmonr, struct prmid *prmid)
> +{
> + lockdep_assert_held(&__pkg_data(pmonr, pkg_data_lock));
> + __pmonr__finish_to_astate(pmonr, prmid);

So the value of this function is that it checks that the lock is held, and
then calls the other function which does that check again. Really useful.

> +}
> +
> +static inline void
> +__pmonr__to_ustate(struct pmonr *pmonr)

Marking functions inline which are used once is pointless. static is good
enough. The compiler can figure that out alone.

> +{
> + union prmid_summary summary;
> +
> + lockdep_assert_held(&__pkg_data(pmonr, pkg_data_lock));
> +
> + /* Do not warn on re-enter state for (U)state, to simplify cleanup
> + * of initialized states that were not scheduled.
> + */

Multiline comments start with
/*
* ....

> + if (__pmonr__in_ustate(pmonr))
> + return;
> +
> + if (__pmonr__in_astate(pmonr)) {
> + WARN_ON_ONCE(!pmonr->prmid);

Hell. Your WARN_ONs are just useless.

> + list_move_tail(&pmonr->prmid->pool_entry,

Here you dereference that very NULL pointer.

> + &__pkg_data(pmonr, nopmonr_limbo_prmids_pool));
> + pmonr->prmid = NULL;
> + } else {
> + WARN_ON_ONCE(true);
> + return;
> + }
> + list_del_init(&pmonr->rotation_entry);
> +
> + summary.sched_rmid = INVALID_RMID;
> + summary.read_rmid =
> + monr__is_mon_active(pmonr->monr) ? 0 : INVALID_RMID;
> +
> + atomic64_set(&pmonr->prmid_summary_atomic, summary.value);
> + WARN_ON_ONCE(!__pmonr__in_ustate(pmonr));
> +}
> +
> static int intel_cqm_setup_pkg_prmid_pools(u16 pkg_id)
> {
> int r;
> unsigned long flags;
> struct prmid *prmid;
> struct pkg_data *pkg_data = cqm_pkgs_data[pkg_id];
> + struct pmonr *root_pmonr;
>
> if (!__valid_pkg_id(pkg_id))
> return -EINVAL;
> @@ -163,8 +376,13 @@ static int intel_cqm_setup_pkg_prmid_pools(u16 pkg_id)
> &pkg_data->pkg_data_lock, flags, pkg_id);
> pkg_data->prmids_by_rmid[r] = prmid;
>
> + list_add_tail(&prmid->pool_entry, &pkg_data->free_prmids_pool);

Ah, here you add it to the list.

>
> /* RMID 0 is special and makes the root of rmid hierarchy. */
> + if (r == 0) {

And here you add content to the comment. Useful for review, really.

> + root_pmonr = monr_hrchy_root->pmonrs[pkg_id];
> + __pmonr__ustate_to_astate(root_pmonr, prmid);

This moves the prmid from pkg_data->free_pool to pkg_data->active_pool, right?
So why doing the add/move dance at all?

> + }
> raw_spin_unlock_irqrestore(&pkg_data->pkg_data_lock, flags);
> }
> return 0;
> @@ -180,6 +398,238 @@ fail:
> }
>
>
> +/* Alloc monr with all pmonrs in (U)state. */
> +static struct monr *monr_alloc(void)
> +{
> + int i;
> + struct pmonr *pmonr;
> + struct monr *monr;
> +
> + monr = kmalloc(sizeof(struct monr), GFP_KERNEL);
> +
> + if (!monr)
> + return ERR_PTR(-ENOMEM);
> +
> + monr->flags = 0;
> + monr->parent = NULL;
> + INIT_LIST_HEAD(&monr->children);
> + INIT_LIST_HEAD(&monr->parent_entry);
> + monr->mon_event_group = NULL;
> +
> + monr->pmonrs = kmalloc(
> + sizeof(struct pmonr *) * topology_max_packages(), GFP_KERNEL);

kzalloc once again. And you can avoid those annoying line breaks by
calculating the size before the alloc.

> +
> + if (!monr->pmonrs)
> + return ERR_PTR(-ENOMEM);
> +
> + /* Iterate over all pkgs, even unitialized ones. */
> + for (i = 0; i < topology_max_packages(); i++) {
> + /* Do not create pmonrs for unitialized packages. */
> + if (!cqm_pkgs_data[i]) {
> + monr->pmonrs[i] = NULL;
> + continue;
> + }
> + /* Rotation cpu is on pmonr's package. */
> + pmonr = pmonr_alloc(cqm_pkgs_data[i]->rotation_cpu);
> + if (IS_ERR(pmonr))
> + goto clean_pmonrs;
> + pmonr->monr = monr;
> + monr->pmonrs[i] = pmonr;
> + }
> + return monr;
> +
> +clean_pmonrs:
> + while (i--) {
> + if (cqm_pkgs_data[i])
> + kfree(monr->pmonrs[i]);

So here you do a simple kfree(). I would have expected that you use your
dealloc wrapper. Consistency is overrated, right?

> + }
> + kfree(monr);
> + return ERR_CAST(pmonr);
> +}
> +
> +/* Only can dealloc monrs with all pmonrs in (U)state. */

I can't figure out how that function checks this.

> +static void monr_dealloc(struct monr *monr)
> +{
> + int i;
> +
> + cqm_pkg_id_for_each_online(i)
> + pmonr_dealloc(monr->pmonrs[i]);
> +
> + kfree(monr);
> +}
> +
> +/*
> + * Always finds a rmid_entry to schedule. To be called during scheduler.
> + * A fast path that only uses read_lock for common case when rmid for current

read_lock? Can't find a read_lock in that code.

> + * package has been used before.
> + * On failure, verify that monr is active, if it is, try to obtain a free rmid
> + * and set pmonr to (A)state.
> + * On failure, transverse up monr_hrchy until finding one prmid for this
> + * pkg_id and set pmonr to (I)state.
> + * Called during task switch, it will set pmonr's prmid_summary to reflect the
> + * sched and read rmids that reflect pmonr's state.
> + */
> +static inline void
> +monr_hrchy_get_next_prmid_summary(struct pmonr *pmonr)
> +{
> + union prmid_summary summary;
> +
> + /*
> + * First, do lock-free fastpath.
> + */
> + summary.value = atomic64_read(&pmonr->prmid_summary_atomic);
> + if (summary.sched_rmid != INVALID_RMID)
> + return;
> +
> + if (!prmid_summary__is_mon_active(summary))
> + return;
> +
> + /*
> + * Lock-free path failed at first attempt. Now acquire lock and repeat
> + * in case the monr was modified in the mean time.
> + * This time try to obtain free rmid and update pmonr accordingly,
> + * instead of failing fast.
> + */
> + raw_spin_lock_nested(&__pkg_data(pmonr, pkg_data_lock), pmonr->pkg_id);

Which pkg_data->lock does this nest inside?

> +
> + summary.value = atomic64_read(&pmonr->prmid_summary_atomic);
> + if (summary.sched_rmid != INVALID_RMID) {
> + raw_spin_unlock(&__pkg_data(pmonr, pkg_data_lock));
> + return;
> + }
> +
> + /* Do not try to obtain RMID if monr is not active. */
> + if (!prmid_summary__is_mon_active(summary)) {
> + raw_spin_unlock(&__pkg_data(pmonr, pkg_data_lock));
> + return;
> + }
> +
> + /*
> + * Can only fail if it was in (U)state.

-ENOPARSE. What fails if it was in ustate? And what happens if it was not in
ustate and fails?

> + * Try to obtain a free prmid and go to (A)state, if not possible,
> + * it should go to (I)state.
> + */
> + WARN_ON_ONCE(!__pmonr__in_ustate(pmonr));
> +
> + if (list_empty(&__pkg_data(pmonr, free_prmids_pool))) {
> + /* Failed to obtain an valid rmid in this package for this
> + * monr. In next patches it will transition to (I)state.
> + * For now, stay in (U)state (do nothing).
> + */
> + } else {
> + /* Transition to (A)state using free prmid. */
> + __pmonr__ustate_to_astate(
> + pmonr,
> + list_first_entry(&__pkg_data(pmonr, free_prmids_pool),
> + struct prmid, pool_entry));

What's wrong with using variables?

struct pkg_data *pkgd;
struct prmid *prmid;

pkgd = cqm_pkgs_data[pmonr->pkg_id];
raw_spin_lock(&pkgd->lock);

prmid = list_first_entry_or_null(pkgd->free_pool, struct prmid, pool_entry);

if (!prmid) {
....
} else {
pmonr_ustate_to_astate(pmonr, prmid);
}
raw_spin_unlock(&pkgd->lock);

That's too obvious to read, right?

> + }
> + raw_spin_unlock(&__pkg_data(pmonr, pkg_data_lock));
> +}
> +
> +static inline void __assert_monr_is_leaf(struct monr *monr)
> +{
> + int i;
> +
> + monr_hrchy_assert_held_mutexes();
> + monr_hrchy_assert_held_raw_spin_locks();
> +
> + cqm_pkg_id_for_each_online(i)
> + WARN_ON_ONCE(!__pmonr__in_ustate(monr->pmonrs[i]));
> +
> + WARN_ON_ONCE(!list_empty(&monr->children));

Another gazillion of WARN_ONs which just do nothing to prevent the following
damage.

> +}
> +
> +static inline void
> +__monr_hrchy_insert_leaf(struct monr *monr, struct monr *parent)
> +{
> + monr_hrchy_assert_held_mutexes();
> + monr_hrchy_assert_held_raw_spin_locks();
> +
> + __assert_monr_is_leaf(monr);
> +
> + list_add_tail(&monr->parent_entry, &parent->children);
> + monr->parent = parent;
> +}
> +
> +static inline void
> +__monr_hrchy_remove_leaf(struct monr *monr)
> +{
> + /* Since root cannot be removed, monr must have a parent */
> + WARN_ON_ONCE(!monr->parent);

And if it does not have one, it's root and should be left alone, right?

> +
> + monr_hrchy_assert_held_mutexes();
> + monr_hrchy_assert_held_raw_spin_locks();
> +
> + __assert_monr_is_leaf(monr);
> +
> + list_del_init(&monr->parent_entry);
> + monr->parent = NULL;
> +}
> +
> +static int __monr_hrchy_attach_cpu_event(struct perf_event *event)
> +{
> + lockdep_assert_held(&cqm_mutex);
> + WARN_ON_ONCE(monr_from_event(event));

So yet once more. If that thing has a monr, then why are you just proceeding?

> +
> + event_set_monr(event, monr_hrchy_root);
> + return 0;
> +}
> +
> +/* task events are always leaves in the monr_hierarchy */
> +static int __monr_hrchy_attach_task_event(struct perf_event *event,
> + struct monr *parent_monr)
> +{
> + struct monr *monr;
> + unsigned long flags;
> + int i;
> +
> + lockdep_assert_held(&cqm_mutex);
> +
> + monr = monr_alloc();
> + if (IS_ERR(monr))
> + return PTR_ERR(monr);
> + event_set_monr(event, monr);
> + monr->mon_event_group = event;
> +
> + monr_hrchy_acquire_locks(flags, i);
> + __monr_hrchy_insert_leaf(monr, parent_monr);
> + __monr__set_mon_active(monr);
> + monr_hrchy_release_locks(flags, i);
> +
> + return 0;
> +}
> +
> +/*
> + * Find appropriate position in hierarchy and set monr. Create new
> + * monr if necessary.

I can understand that you want the names short. But in comments we really can
have monitor spelled out.

> + * Locks rmid hrchy.

Ditto.

> /* Read current package immediately and remote pkg (if any) from cache. */
> static void intel_cqm_event_read(struct perf_event *event)
> {
> + union prmid_summary summary;
> + struct prmid *prmid;
> + u16 pkg_id = topology_physical_package_id(smp_processor_id());
> + struct pmonr *pmonr = monr_from_event(event)->pmonrs[pkg_id];
> +
> + summary.value = atomic64_read(&pmonr->prmid_summary_atomic);
> + prmid = __prmid_from_rmid(pkg_id, summary.read_rmid);
> + cqm_prmid_update(prmid);
> + local64_set(&event->count, atomic64_read(&prmid->last_read_value));
> }
>
> -static void intel_cqm_event_start(struct perf_event *event, int mode)
> +static inline void __intel_cqm_event_start(
> + struct perf_event *event, union prmid_summary summary)

This is a horrible formatting. But at least the double underscores make sense
here.

> {
> if (!(event->hw.state & PERF_HES_STOPPED))
> return;
>
> event->hw.state &= ~PERF_HES_STOPPED;
> + pqr_update_rmid(summary.sched_rmid);
> +}
> +
> +static void intel_cqm_event_start(struct perf_event *event, int mode)
> +{
> + union prmid_summary summary;
> + u16 pkg_id = topology_physical_package_id(smp_processor_id());
> + struct pmonr *pmonr = monr_from_event(event)->pmonrs[pkg_id];
> +
> + /* Utilize most up to date pmonr summary. */
> + monr_hrchy_get_next_prmid_summary(pmonr);
> + summary.value = atomic64_read(&pmonr->prmid_summary_atomic);
> + __intel_cqm_event_start(event, summary);
> }
>
> static void intel_cqm_event_stop(struct perf_event *event, int mode)
> {
> + union prmid_summary summary;
> + u16 pkg_id = topology_physical_package_id(smp_processor_id());
> + struct pmonr *root_pmonr = monr_hrchy_root->pmonrs[pkg_id];
> +
> if (event->hw.state & PERF_HES_STOPPED)
> return;
>
> event->hw.state |= PERF_HES_STOPPED;
> +
> + summary.value = atomic64_read(&root_pmonr->prmid_summary_atomic);
> + /* Occupancy of CQM events is obtained at read. No need to read
> + * when event is stopped since read on inactive cpus succeed.

This sentence does not make sense.

> + */
> + pqr_update_rmid(summary.sched_rmid);
> }
>
> static int intel_cqm_event_add(struct perf_event *event, int mode)
> {
> + struct monr *monr;
> + struct pmonr *pmonr;
> + union prmid_summary summary;
> + u16 pkg_id = topology_physical_package_id(smp_processor_id());
> +
> + monr = monr_from_event(event);
> + pmonr = monr->pmonrs[pkg_id];
> +
> event->hw.state = PERF_HES_STOPPED;
>
> + /* Utilize most up to date pmonr summary. */
> + monr_hrchy_get_next_prmid_summary(pmonr);
> + summary.value = atomic64_read(&pmonr->prmid_summary_atomic);
> +
> + if (!prmid_summary__is_mon_active(summary))
> + return -1;
> +
> + if (mode & PERF_EF_START)
> + __intel_cqm_event_start(event, summary);
> +
> + /* (I)state pmonrs cannot report occupancy for themselves. */
> return 0;
> }
>
> @@ -275,6 +788,9 @@ static inline bool cqm_group_leader(struct perf_event *event)
> static void intel_cqm_event_destroy(struct perf_event *event)
> {
> struct perf_event *group_other = NULL;
> + struct monr *monr;
> + int i;
> + unsigned long flags;

Can you please find a consistent style to order your variables.

> @@ -562,6 +1097,12 @@ static int __init intel_cqm_init(void)
> goto error;
> }
>
> + monr_hrchy_root = monr_alloc();
> + if (IS_ERR(monr_hrchy_root)) {
> + ret = PTR_ERR(monr_hrchy_root);
> + goto error;
> + }
> +
> /* Select the minimum of the maximum rmids to use as limit for
> * threshold. XXX: per-package threshold.
> */
> @@ -570,6 +1111,7 @@ static int __init intel_cqm_init(void)
> min_max_rmid = cqm_pkgs_data[i]->max_rmid;
> intel_cqm_setup_pkg_prmid_pools(i);
> }
> + monr_hrchy_root->flags |= MONR_MON_ACTIVE;
>
> /*
> * A reasonable upper limit on the max threshold is the number


Why am I not surprised that monr_hrchy_root is leaked as well on error exit.

> diff --git a/arch/x86/events/intel/cqm.h b/arch/x86/events/intel/cqm.h
> /*
> + * Minimum time elapsed between reads of occupancy value for an RMID when
> + * transversing the monr hierarchy.
> + */
> +#define RMID_DEFAULT_MIN_UPDATE_TIME 20 /* ms */
> +static unsigned int __rmid_min_update_time = RMID_DEFAULT_MIN_UPDATE_TIME;

We do not define static variables in header files. And why does this variable
have double underscores? Just because.

> +
> +static inline int cqm_prmid_update(struct prmid *prmid);

Why do you need a forward declaration for this here?

> +/*

Kernel doc comments start with /**

> + * union prmid_summary: Machine-size summary of a pmonr's prmid state.

/**
* union prmid_summary - Machine-size summary .....

Can you spot the difference?

> +
> +/* A pmonr in (U)state has no sched_rmid, read_rmid can be 0 or INVALID_RMID
> + * depending on whether monitoring is active or not.
> + */
> +inline bool prmid_summary__is_ustate(union prmid_summary summ)
> +{
> + return summ.sched_rmid == INVALID_RMID;
> +}
> +
> +inline bool prmid_summary__is_mon_active(union prmid_summary summ)
> +{
> + /* If not in (U)state, then MONR_MON_ACTIVE must be set. */
> + return summ.sched_rmid != INVALID_RMID ||
> + summ.read_rmid == 0;
> +}

I really have a hard time figuring out the value of this (UAI)state stuff.

What's wrong with: prmid_summary_is_unused(), prmid_summary_is_active()?

> +
> +struct monr;
> +
> +/* struct pmonr: Node of per-package hierarchy of MONitored Resources.

Again. No valid kerneldoc.

> + * @prmid: The prmid of this pmonr -when in (A)state-.
> + * @rotation_entry: List entry to attach to pmonr rotation lists in
> + * pkg_data.
> + * @monr: The monr that contains this pmonr.
> + * @pkg_id: Auxiliar variable with pkg id for this pmonr.
> + * @prmid_summary_atomic: Atomic accesor to store a union prmid_summary
> + * that represent the state of this pmonr.
> + *
> + * A pmonr forms a per-package hierarchy of prmids. Each one represents a
> + * resource to be monitored and can hold a prmid. Due to rmid scarcity,
> + * rmids can be recycled and rotated. When a rmid is not available for this
> + * pmonr, the pmonr utilizes the rmid of its ancestor.
> + * A pmonr is always in one of the following states:
> + * - (A)ctive: Has @prmid assigned, @ancestor_pmonr must be NULL.
> + * - (U)nused: No @ancestor_pmonr and no @prmid, hence no available
> + * prmid and no inhering one either. Not in rotation list.
> + * This state is unschedulable and a prmid
> + * should be found (either o free one or ancestor's) before
> + * scheduling a thread with (U)state pmonr in
> + * a cpu in this package.
> + *
> + * The state transitions are:
> + * (U) : The initial state. Starts there after allocation.
> + * (U) -> (A): If on first sched (or initialization) pmonr receives a prmid.
> + * (A) -> (U): On destruction of monr.
> + *
> + * Each pmonr is contained by a monr.
> + */
> +struct pmonr {
> +
> + struct prmid *prmid;
> +
> + struct monr *monr;
> + struct list_head rotation_entry;
> +
> + u16 pkg_id;
> +
> + /* all writers are sync'ed by package's lock. */

Please add this information to the kernel doc above and not here.

> + atomic64_t prmid_summary_atomic;
> +};
> +
> +/*
> * struct pkg_data: Per-package CQM data.
> * @max_rmid: Max rmid valid for cpus in this package.
> * @prmids_by_rmid: Utility mapping between rmid values and prmids.
> * XXX: Make it an array of prmids.
> * @free_prmid_pool: Free prmids.
> + * @active_prmid_pool: prmids associated with a (A)state pmonr.
> + * @nopmonr_limbo_prmid_pool: prmids in limbo state that are not referenced
> + * by a pmonr.
> + * @astate_pmonrs_lru: pmonrs in (A)state. LRU in increasing order of
> + * pmonr.last_enter_astate.
> * @pkg_data_mutex: Hold for stability when modifying pmonrs
> * hierarchy.
> * @pkg_data_lock: Hold to protect variables that may be accessed
> @@ -64,6 +170,12 @@ struct pkg_data {
> * Pools of prmids used in rotation logic.
> */
> struct list_head free_prmids_pool;
> + /* Can be modified during task switch with (U)state -> (A)state. */

Ditto

> + struct list_head active_prmids_pool;
> + /* Only modified during rotation logic and deletion. */
> + struct list_head nopmonr_limbo_prmids_pool;
> +
> + struct list_head astate_pmonrs_lru;
>
> struct mutex pkg_data_mutex;
> raw_spinlock_t pkg_data_lock;
> @@ -71,6 +183,52 @@ struct pkg_data {
> int rotation_cpu;
> };
>
> +/*
> + * Root for system-wide hierarchy of monr.
> + * A per-package raw_spin_lock protects changes to the per-pkg elements of
> + * the monr hierarchy.
> + * To modify the monr hierarchy, must hold all locks in each package
> + * using packaged-id as nesting parameter.
> + */
> +extern struct monr *monr_hrchy_root;
> +
> extern struct pkg_data **cqm_pkgs_data;

extern? Which files are using this? AFAICT only cqm.c. So if you plan to use
this in some other file later, then this can be made extern when that other
stuff happens, not just in case.

Thanks,

tglx