Re: [PATCH v2] perf/x86/ibs: Set mem_lvl_num, mem_remote and mem_hops for data_src

From: Namhyung Kim
Date: Wed Mar 29 2023 - 03:05:42 EST


Hi Ravi,

On Mon, Mar 27, 2023 at 6:09 AM Ravi Bangoria <ravi.bangoria@xxxxxxx> wrote:
>
> From: Namhyung Kim <namhyung@xxxxxxxxxx>
>
> Kernel IBS driver wasn't using new PERF_MEM_* APIs due to few of its
> limitations. Mainly:
>
> - mem_lvl_num doesn't allow setting multiple sources whereas old API
> allows it. Setting multiple data sources is useful because IBS on
> pre-zen4 uarch doesn't provide fine granular DataSrc details (there
> is only one such DataSrc(2h) though).
> - perf mem sorting logic (sort__lvl_cmp()) ignores mem_lvl_num. perf
> c2c (c2c_decode_stats()) does not use mem_lvl_num at all. perf mem
> prints mem_lvl and mem_lvl_num both if both are set, which is ugly.
>
> 1st one can be handled by using ANY_CACHE with HOPS_0. 2nd one is
> purely perf tool specific issue and should be fixed separately.
>
> Signed-off-by: Namhyung Kim <namhyung@xxxxxxxxxx>
> Signed-off-by: Ravi Bangoria <ravi.bangoria@xxxxxxx>
> ---
> v1: https://lore.kernel.org/r/20230311000642.1270971-1-namhyung@xxxxxxxxxx
> v1->v2:
> - In addition to setting new API fields, convert all individual field
> assignments to compile time wrapper macros built using PERF_MEM_S().
> Also convert DataSrc conditional code to array lookups.
>
> arch/x86/events/amd/ibs.c | 155 +++++++++++++++++---------------------
> 1 file changed, 68 insertions(+), 87 deletions(-)
>
> diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
> index 64582954b5f6..b46e0b725fe5 100644
> --- a/arch/x86/events/amd/ibs.c
> +++ b/arch/x86/events/amd/ibs.c
> @@ -703,9 +703,41 @@ static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2)
> return op_data2->data_src_lo;
> }
>
> -static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
> - union ibs_op_data3 *op_data3,
> - struct perf_sample_data *data)
> +#define L(x) (PERF_MEM_S(LVL, x) | PERF_MEM_S(LVL, HIT))
> +#define LN(x) PERF_MEM_S(LVLNUM, x)
> +#define REM PERF_MEM_S(REMOTE, REMOTE)
> +#define HOPS(x) PERF_MEM_S(HOPS, x)
> +
> +static u64 g_data_src[8] = {
> + [IBS_DATA_SRC_LOC_CACHE] = L(L3) | L(REM_CCE1) | LN(ANY_CACHE) | HOPS(0),
> + [IBS_DATA_SRC_DRAM] = L(LOC_RAM) | LN(RAM),
> + [IBS_DATA_SRC_REM_CACHE] = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
> + [IBS_DATA_SRC_IO] = L(IO) | LN(IO),
> +};
> +
> +#define RMT_NODE_BITS ((1 << IBS_DATA_SRC_DRAM) | \
> + (1 << IBS_DATA_SRC_IO))
> +#define RMT_NODE_APPLICABLE(x) (RMT_NODE_BITS & (1 << x))
> +
> +static u64 g_zen4_data_src[32] = {
> + [IBS_DATA_SRC_EXT_LOC_CACHE] = L(L3) | LN(L3),
> + [IBS_DATA_SRC_EXT_NEAR_CCX_CACHE] = L(REM_CCE1) | LN(ANY_CACHE) | REM | HOPS(0),
> + [IBS_DATA_SRC_EXT_DRAM] = L(LOC_RAM) | LN(RAM),
> + [IBS_DATA_SRC_EXT_FAR_CCX_CACHE] = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
> + [IBS_DATA_SRC_EXT_PMEM] = LN(PMEM),
> + [IBS_DATA_SRC_EXT_IO] = L(IO) | LN(IO),
> + [IBS_DATA_SRC_EXT_EXT_MEM] = LN(CXL),
> +};
> +
> +#define ZEN4_RMT_NODE_BITS ((1 << IBS_DATA_SRC_EXT_DRAM) | \
> + (1 << IBS_DATA_SRC_EXT_PMEM) | \
> + (1 << IBS_DATA_SRC_EXT_IO) |\
> + (1 << IBS_DATA_SRC_EXT_EXT_MEM))
> +#define ZEN4_RMT_NODE_APPLICABLE(x) (RMT_NODE_BITS & (1 << x))
> +
> +static __u64 perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
> + union ibs_op_data3 *op_data3,
> + struct perf_sample_data *data)
> {
> union perf_mem_data_src *data_src = &data->data_src;
> u8 ibs_data_src = perf_ibs_data_src(op_data2);
> @@ -716,25 +748,19 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
> * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached
> * memory accesses. So, check DcUcMemAcc bit early.
> */
> - if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) {
> - data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT;
> - return;
> - }
> + if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO)
> + return L(UNC);

Hmm.. it seems we don't have PERF_MEM_LVLNUM_UNC.

>
> /* L1 Hit */
> - if (op_data3->dc_miss == 0) {
> - data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
> - return;
> - }
> + if (op_data3->dc_miss == 0)
> + return L(L1) | LN(L1);
>
> /* L2 Hit */
> if (op_data3->l2_miss == 0) {
> /* Erratum #1293 */
> if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF ||
> - !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) {
> - data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
> - return;
> - }
> + !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc))
> + return L(L2) | LN(L2);
> }
>
> /*
> @@ -744,82 +770,36 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
> if (data_src->mem_op != PERF_MEM_OP_LOAD)
> goto check_mab;
>
> - /* L3 Hit */
> if (ibs_caps & IBS_CAPS_ZEN4) {
> - if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE) {
> - data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
> - return;
> - }
> - } else {
> - if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) {
> - data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_REM_CCE1 |
> - PERF_MEM_LVL_HIT;
> - return;
> - }
> - }
> + u64 val = g_zen4_data_src[ibs_data_src];
>
> - /* A peer cache in a near CCX */
> - if (ibs_caps & IBS_CAPS_ZEN4 &&
> - ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE) {
> - data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
> - return;
> - }
> + if (!val)
> + goto check_mab;
>
> - /* A peer cache in a far CCX */
> - if (ibs_caps & IBS_CAPS_ZEN4) {
> - if (ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) {
> - data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT;
> - return;
> + /* HOPS_1 because IBS doesn't provide remote socket detail */
> + if (op_data2->rmt_node && ZEN4_RMT_NODE_APPLICABLE(ibs_data_src)) {
> + if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM)
> + val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
> + else
> + val |= REM | HOPS(1);
> }
> - } else {
> - if (ibs_data_src == IBS_DATA_SRC_REM_CACHE) {
> - data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT;
> - return;
> - }
> - }
>
> - /* DRAM */
> - if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM) {
> - if (op_data2->rmt_node == 0)
> - data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
> - else
> - data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT;
> - return;
> - }
> + return val;
> + } else {
> + u64 val = g_data_src[ibs_data_src];
>
> - /* PMEM */
> - if (ibs_caps & IBS_CAPS_ZEN4 && ibs_data_src == IBS_DATA_SRC_EXT_PMEM) {
> - data_src->mem_lvl_num = PERF_MEM_LVLNUM_PMEM;
> - if (op_data2->rmt_node) {
> - data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
> - /* IBS doesn't provide Remote socket detail */
> - data_src->mem_hops = PERF_MEM_HOPS_1;
> - }
> - return;
> - }
> + if (!val)
> + goto check_mab;
>
> - /* Extension Memory */
> - if (ibs_caps & IBS_CAPS_ZEN4 &&
> - ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) {
> - data_src->mem_lvl_num = PERF_MEM_LVLNUM_CXL;
> - if (op_data2->rmt_node) {
> - data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
> - /* IBS doesn't provide Remote socket detail */
> - data_src->mem_hops = PERF_MEM_HOPS_1;
> + /* HOPS_1 because IBS doesn't provide remote socket detail */
> + if (op_data2->rmt_node && RMT_NODE_APPLICABLE(ibs_data_src)) {
> + if (ibs_data_src == IBS_DATA_SRC_DRAM)
> + val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
> + else
> + val |= REM | HOPS(1);
> }
> - return;
> - }
>
> - /* IO */
> - if (ibs_data_src == IBS_DATA_SRC_EXT_IO) {
> - data_src->mem_lvl = PERF_MEM_LVL_IO;
> - data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO;
> - if (op_data2->rmt_node) {
> - data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
> - /* IBS doesn't provide Remote socket detail */
> - data_src->mem_hops = PERF_MEM_HOPS_1;
> - }
> - return;
> + return val;
> }
>
> check_mab:
> @@ -830,12 +810,11 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
> * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set
> * MAB only when IBS fails to provide DataSrc.
> */
> - if (op_data3->dc_miss_no_mab_alloc) {
> - data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
> - return;
> - }
> + if (op_data3->dc_miss_no_mab_alloc)
> + return L(LFB) | LN(LFB);
>
> data_src->mem_lvl = PERF_MEM_LVL_NA;
> + return 0;

Wouldn't it be 'return L(NA) | LN(NA);' ?

Thanks,
Namhyung


> }
>
> static bool perf_ibs_cache_hit_st_valid(void)
> @@ -925,7 +904,9 @@ static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data,
> union ibs_op_data2 *op_data2,
> union ibs_op_data3 *op_data3)
> {
> - perf_ibs_get_mem_lvl(op_data2, op_data3, data);
> + union perf_mem_data_src *data_src = &data->data_src;
> +
> + data_src->val |= perf_ibs_get_mem_lvl(op_data2, op_data3, data);
> perf_ibs_get_mem_snoop(op_data2, data);
> perf_ibs_get_tlb_lvl(op_data3, data);
> perf_ibs_get_mem_lock(op_data3, data);
> --
> 2.39.2
>