Re: [PATCH V3 5/5] perf/amd/iommu: Enable support for multiple IOMMUs
From: Borislav Petkov
Date: Wed Feb 10 2016 - 12:14:15 EST
On Tue, Feb 09, 2016 at 04:53:55PM -0600, Suravee Suthikulpanit wrote:
> The current amd_iommu_pc_get_set_reg_val() does not support muli-IOMMU
> system. This patch replace amd_iommu_pc_get_set_reg_val() with
> amd_iommu_pc_set_reg_val() and amd_iommu_pc_[set|get]_cnt_vals().
>
> Also, the current struct hw_perf_event.prev_count can only store the
> previous counter value only from one IOMMU. So, this patch introduces
> a new data structure, perf_amd_iommu.prev_cnts, to track previous value
> of each performance counter of each bank of each IOMMU.
>
> Last, this patch introduce perf_iommu_cnts to temporary hold counter
> values from each IOMMU for internal use when manages counters among
> multiple IOMMUs.
>
> Note that this implementation makes an assumption that the counters
> on all IOMMUs will be programmed the same way (i.e with the same events).
>
> Signed-off-by: Suravee Suthikulpanit <Suravee.Suthikulpanit@xxxxxxx>
> ---
> arch/x86/kernel/cpu/perf_event_amd_iommu.c | 125 +++++++++++++++++++++--------
> drivers/iommu/amd_iommu_init.c | 87 +++++++++++++++++---
> include/linux/perf/perf_event_amd_iommu.h | 8 +-
> 3 files changed, 174 insertions(+), 46 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c
> index 791bbcf..ce6ba3f 100644
> --- a/arch/x86/kernel/cpu/perf_event_amd_iommu.c
> +++ b/arch/x86/kernel/cpu/perf_event_amd_iommu.c
> @@ -42,6 +42,12 @@ struct perf_amd_iommu {
> u64 cntr_assign_mask;
> raw_spinlock_t lock;
> const struct attribute_group *attr_groups[4];
> +
> + /* This is a 3D array used to store the previous count values
> + * from each performance counter of each bank of each IOMMU.
> + * I.E. size of array = (num iommus * num banks * num counters)
> + */
> + local64_t *prev_cnts;
> };
>
> #define format_group attr_groups[0]
> @@ -121,6 +127,11 @@ static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = {
> { /* end: all zeroes */ },
> };
>
> +/* This is an array used to temporary hold the current values
> + * read from a particular perf counter from each IOMMU.
> + */
> +static u64 *perf_iommu_cnts;
> +
> /*---------------------------------------------
> * sysfs cpumask attributes
> *---------------------------------------------*/
> @@ -256,44 +267,46 @@ static void perf_iommu_enable_event(struct perf_event *ev)
> u64 reg = 0ULL;
>
> reg = csource;
> - amd_iommu_pc_get_set_reg_val(devid,
> + amd_iommu_pc_set_reg_val(devid,
> _GET_BANK(ev), _GET_CNTR(ev) ,
> - IOMMU_PC_COUNTER_SRC_REG, ®, true);
> + IOMMU_PC_COUNTER_SRC_REG, ®);
Read-impairing indentation. It should be aligned at the opening brace:
function_name(param1, param2,
param3, ...
Ditto for the calls below.
>
> reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32);
0ULL?
Is this supposed to clear the old value from the previous read above?
What's wrong with
reg = devid | (_GET_DEVID_MASK(ev) << 32);
?
Same for the rest.
> if (reg)
> reg |= (1UL << 31);
All those can be turned into
reg |= BIT(31);
> - amd_iommu_pc_get_set_reg_val(devid,
> + amd_iommu_pc_set_reg_val(devid,
> _GET_BANK(ev), _GET_CNTR(ev) ,
> - IOMMU_PC_DEVID_MATCH_REG, ®, true);
> + IOMMU_PC_DEVID_MATCH_REG, ®);
>
> reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32);
> if (reg)
> reg |= (1UL << 31);
> - amd_iommu_pc_get_set_reg_val(devid,
> + amd_iommu_pc_set_reg_val(devid,
> _GET_BANK(ev), _GET_CNTR(ev) ,
> - IOMMU_PC_PASID_MATCH_REG, ®, true);
> + IOMMU_PC_PASID_MATCH_REG, ®);
>
> reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32);
> if (reg)
> reg |= (1UL << 31);
> - amd_iommu_pc_get_set_reg_val(devid,
> + amd_iommu_pc_set_reg_val(devid,
> _GET_BANK(ev), _GET_CNTR(ev) ,
> - IOMMU_PC_DOMID_MATCH_REG, ®, true);
> + IOMMU_PC_DOMID_MATCH_REG, ®);
> }
>
> static void perf_iommu_disable_event(struct perf_event *event)
> {
> u64 reg = 0ULL;
>
> - amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
> + amd_iommu_pc_set_reg_val(_GET_DEVID(event),
> _GET_BANK(event), _GET_CNTR(event),
> - IOMMU_PC_COUNTER_SRC_REG, ®, true);
> + IOMMU_PC_COUNTER_SRC_REG, ®);
> }
>
> static void perf_iommu_start(struct perf_event *event, int flags)
> {
> struct hw_perf_event *hwc = &event->hw;
> + struct perf_amd_iommu *perf_iommu =
> + container_of(event->pmu, struct perf_amd_iommu, pmu);
There's no need to make it less readable and still fit within the 80
cols rule. Feel free to relax it a little.
> pr_debug("perf: amd_iommu:perf_iommu_start\n");
> if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
> @@ -303,10 +316,19 @@ static void perf_iommu_start(struct perf_event *event, int flags)
> hwc->state = 0;
>
> if (flags & PERF_EF_RELOAD) {
> - u64 prev_raw_count = local64_read(&hwc->prev_count);
> - amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
> - _GET_BANK(event), _GET_CNTR(event),
> - IOMMU_PC_COUNTER_REG, &prev_raw_count, true);
> + int i;
> +
> + for (i = 0; i < amd_iommu_get_num_iommus(); i++) {
> + int index = get_iommu_bnk_cnt_evt_idx(perf_iommu, i,
> + _GET_BANK(event), _GET_CNTR(event));
No need for the line break.
> +
> + perf_iommu_cnts[i] = local64_read(
> + &perf_iommu->prev_cnts[index]);
Yuck. What's wrong with:
perf_iommu_cnts[i] = local64_read(&perf_iommu->prev_cnts[index]);
?
It looks much better to me.
> + }
> +
> + amd_iommu_pc_set_cnt_vals(_GET_BANK(event), _GET_CNTR(event),
> + amd_iommu_get_num_iommus(),
> + perf_iommu_cnts);
This one looks good.
> }
>
> perf_iommu_enable_event(event);
> @@ -316,29 +338,47 @@ static void perf_iommu_start(struct perf_event *event, int flags)
>
> static void perf_iommu_read(struct perf_event *event)
> {
> - u64 count = 0ULL;
> - u64 prev_raw_count = 0ULL;
> + int i;
> u64 delta = 0ULL;
> struct hw_perf_event *hwc = &event->hw;
> - pr_debug("perf: amd_iommu:perf_iommu_read\n");
> -
> - amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
> - _GET_BANK(event), _GET_CNTR(event),
> - IOMMU_PC_COUNTER_REG, &count, false);
> + struct perf_amd_iommu *perf_iommu =
> + container_of(event->pmu, struct perf_amd_iommu, pmu);
Indentation.
>
> - /* IOMMU pc counter register is only 48 bits */
> - count &= 0xFFFFFFFFFFFFULL;
> + pr_debug("perf: amd_iommu:perf_iommu_read\n");
What is that debug statement good for? Can it go?
>From looking at that file, it has a bunch more which are completely
useless and can be deleted. In a separate patch.
> - prev_raw_count = local64_read(&hwc->prev_count);
> - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
> - count) != prev_raw_count)
> + if (amd_iommu_pc_get_cnt_vals(_GET_BANK(event), _GET_CNTR(event),
> + amd_iommu_get_num_iommus(),
> + perf_iommu_cnts))
> return;
>
> - /* Handling 48-bit counter overflowing */
> - delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT);
> - delta >>= COUNTER_SHIFT;
> - local64_add(delta, &event->count);
> -
> + /* Now we re-aggregating event counts and prev-counts
> + * from all IOMMUs
> + */
Kernel comment style:
/*
* Start of first sentence...
* Second sentence...
* ...
*/
> + local64_set(&hwc->prev_count, 0);
> +
> + for (i = 0; i < amd_iommu_get_num_iommus(); i++) {
> + int indx = get_iommu_bnk_cnt_evt_idx(perf_iommu, i,
> + _GET_BANK(event), _GET_CNTR(event));
Indentation.
> + u64 prev_raw_count = local64_read(&perf_iommu->prev_cnts[indx]);
> +
> + /* IOMMU pc counter register is only 48 bits */
> + perf_iommu_cnts[i] &= 0xFFFFFFFFFFFFULL;
&= GENMASK_ULL(48, 0)
> +
> + /*
> + * Since we do not enable counter overflow interrupts,
> + * we do not have to worry about prev_count changing on us
> + */
> + local64_set(&perf_iommu->prev_cnts[indx],
> + perf_iommu_cnts[i]);
No need to break it.
> +
> + local64_add(prev_raw_count, &hwc->prev_count);
> +
> + /* Handling 48-bit counter overflowing */
/* Handle 48-bit counter overflow: */
reads better.
> + delta = (perf_iommu_cnts[i] << COUNTER_SHIFT) -
> + (prev_raw_count << COUNTER_SHIFT);
No need for the linebreak.
> + delta >>= COUNTER_SHIFT;
> + local64_add(delta, &event->count);
> + }
> }
>
> static void perf_iommu_stop(struct perf_event *event, int flags)
> @@ -428,10 +468,14 @@ static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu)
>
> static __init void amd_iommu_pc_exit(void)
> {
> - if (__perf_iommu.events_group != NULL) {
> - kfree(__perf_iommu.events_group);
> - __perf_iommu.events_group = NULL;
> - }
> + kfree(__perf_iommu.events_group);
> + __perf_iommu.events_group = NULL;
> +
> + kfree(__perf_iommu.prev_cnts);
> + __perf_iommu.prev_cnts = NULL;
> +
> + kfree(perf_iommu_cnts);
> + perf_iommu_cnts = NULL;
> }
>
> static __init int _init_perf_amd_iommu(
> @@ -461,6 +505,17 @@ static __init int _init_perf_amd_iommu(
> perf_iommu->null_group = NULL;
> perf_iommu->pmu.attr_groups = perf_iommu->attr_groups;
>
> + perf_iommu->prev_cnts = kzalloc(sizeof(*perf_iommu->prev_cnts) *
> + (amd_iommu_get_num_iommus() * perf_iommu->max_banks *
> + perf_iommu->max_counters), GFP_KERNEL);
Indentation.
perf_iommu->prev_cnts = kzalloc(sizeof(*perf_iommu->prev_cnts) *
amd_iommu_get_num_iommus() *
perf_iommu->max_banks *
perf_iommu->max_counters), GFP_KERNEL);
looks more readable to me.
> + if (!perf_iommu->prev_cnts)
> + return -ENOMEM;
> +
> + perf_iommu_cnts = kzalloc(sizeof(*perf_iommu_cnts) *
> + amd_iommu_get_num_iommus(), GFP_KERNEL);
No need for the linebreak.
> + if (!perf_iommu_cnts)
> + return -ENOMEM;
> +
> ret = perf_pmu_register(&perf_iommu->pmu, name, -1);
> if (ret) {
> pr_err("perf: amd_iommu: Failed to initialized.\n");
You can define pr_fmt to "perf/amd_iommu: " at the beginning of this
file and then you don't need to supply that prefix each time you call
pr_*
And that sentence above it wrong. It should be:
pr_err("Error initializing AMD IOMMU perf counters.\n");
or something like that.
> diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
> index 531b2e1..7b1b561 100644
> --- a/drivers/iommu/amd_iommu_init.c
> +++ b/drivers/iommu/amd_iommu_init.c
> @@ -1133,6 +1133,9 @@ static int __init init_iommu_all(struct acpi_table_header *table)
> return 0;
> }
>
> +static int _amd_iommu_pc_get_set_reg_val(struct amd_iommu *iommu,
> + u8 bank, u8 cntr, u8 fxn,
> + u64 *value, bool is_write);
static int _amd_iommu_this_func_name_is_def_too_long
>
> static void init_iommu_perf_ctr(struct amd_iommu *iommu)
> {
> @@ -1144,8 +1147,8 @@ static void init_iommu_perf_ctr(struct amd_iommu *iommu)
> amd_iommu_pc_present = true;
>
> /* Check if the performance counters can be written to */
> - if ((0 != amd_iommu_pc_get_set_reg_val(0, 0, 0, 0, &val, true)) ||
> - (0 != amd_iommu_pc_get_set_reg_val(0, 0, 0, 0, &val2, false)) ||
> + if ((_amd_iommu_pc_get_set_reg_val(iommu, 0, 0, 0, &val, true)) ||
> + (_amd_iommu_pc_get_set_reg_val(iommu, 0, 0, 0, &val2, false)) ||
> (val != val2)) {
> pr_err("AMD-Vi: Unable to write to IOMMU perf counter.\n");
> amd_iommu_pc_present = false;
> @@ -2294,10 +2297,10 @@ u8 amd_iommu_pc_get_max_counters(void)
> }
> EXPORT_SYMBOL(amd_iommu_pc_get_max_counters);
>
> -int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn,
> - u64 *value, bool is_write)
> +static int _amd_iommu_pc_get_set_reg_val(struct amd_iommu *iommu,
> + u8 bank, u8 cntr, u8 fxn,
> + u64 *value, bool is_write)
Ditto.
> {
> - struct amd_iommu *iommu;
> u32 offset;
> u32 max_offset_lim;
>
> @@ -2305,9 +2308,6 @@ int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn,
> if (!amd_iommu_pc_present)
> return -ENODEV;
>
> - /* Locate the iommu associated with the device ID */
> - iommu = amd_iommu_rlookup_table[devid];
> -
> /* Check for valid iommu and pc register indexing */
> if (WARN_ON((iommu == NULL) || (fxn > 0x28) || (fxn & 7)))
> return -ENODEV;
> @@ -2332,4 +2332,73 @@ int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn,
>
> return 0;
> }
> -EXPORT_SYMBOL(amd_iommu_pc_get_set_reg_val);
> +
> +int amd_iommu_pc_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn, u64 *value)
> +{
> + struct amd_iommu *iommu;
> +
> + for_each_iommu(iommu) {
> + int ret = _amd_iommu_pc_get_set_reg_val(iommu, bank, cntr,
> + fxn, value, true);
> + if (ret)
> + return ret;
> + }
> +
> + return 0;
> +}
> +EXPORT_SYMBOL(amd_iommu_pc_set_reg_val);
Why isn't this EXPORT_SYMBOL_GPL?
> +
> +int amd_iommu_pc_set_cnt_vals(u8 bank, u8 cntr, int num, u64 *value)
That whole naming is hard to read. What's wrong with
amd_iommu_set_counters()
?
> +{
> + struct amd_iommu *iommu;
> + int i = 0;
> +
> + if (num > amd_iommus_present)
> + return -EINVAL;
> +
> + for_each_iommu(iommu) {
> + int ret = _amd_iommu_pc_get_set_reg_val(iommu, bank, cntr,
> + IOMMU_PC_COUNTER_REG,
> + &value[i], true);
> + if (ret)
> + return ret;
> + if (i++ == amd_iommus_present)
> + break;
> + }
> +
> + return 0;
> +}
> +EXPORT_SYMBOL(amd_iommu_pc_set_cnt_vals);
Ditto, why not EXPORT_SYMBOL_GPL ?
> +
> +int amd_iommu_pc_get_cnt_vals(u8 bank, u8 cntr, int num, u64 *value)
> +{
> + struct amd_iommu *iommu;
> + int i = 0, ret;
> +
> + if (!num)
> + return -EINVAL;
> +
> + /*
> + * Here, we read the specified counters on all IOMMU,
Plural is IOMMUs ?
> + * which should have been programmed the same way.
> + * and aggregate the counter values.
That comment is a sentence with a fullstop in the middle.
> + */
> + for_each_iommu(iommu) {
> + u64 tmp;
> +
> + if (i >= num)
> + return -EINVAL;
> +
> + ret = _amd_iommu_pc_get_set_reg_val(iommu, bank, cntr,
> + IOMMU_PC_COUNTER_REG,
> + &tmp, false);
> + if (ret)
> + return ret;
So if one of those intermediary calls of _amd_iommu_pc_get_set_reg_val()
fails, we return here not having read all counters. Unwind and handle
that error properly maybe?
Same issue above.
> +
> + /* IOMMU pc counter register is only 48 bits */
> + value[i] = tmp & 0xFFFFFFFFFFFFULL;
> + }
> +
> + return 0;
> +}
> +EXPORT_SYMBOL(amd_iommu_pc_get_cnt_vals);
> diff --git a/include/linux/perf/perf_event_amd_iommu.h b/include/linux/perf/perf_event_amd_iommu.h
> index cb820c2..be1a17d 100644
> --- a/include/linux/perf/perf_event_amd_iommu.h
> +++ b/include/linux/perf/perf_event_amd_iommu.h
> @@ -33,7 +33,11 @@ extern u8 amd_iommu_pc_get_max_banks(void);
>
> extern u8 amd_iommu_pc_get_max_counters(void);
>
> -extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr,
> - u8 fxn, u64 *value, bool is_write);
> +extern int amd_iommu_pc_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn,
> + u64 *value);
> +
> +extern int amd_iommu_pc_set_cnt_vals(u8 bank, u8 cntr, int num, u64 *value);
> +
> +extern int amd_iommu_pc_get_cnt_vals(u8 bank, u8 cntr, int num, u64 *value);
No need for that "extern"
> #endif /*_PERF_EVENT_AMD_IOMMU_H_*/
> --
> 2.5.0
>
>
--
Regards/Gruss,
Boris.
ECO tip #101: Trim your mails when you reply.