Re: [PATCH v2 10/11] perf c2c: Sort on all cache hit for load operations

From: Namhyung Kim
Date: Wed Jan 06 2021 - 02:53:45 EST


On Sun, Dec 13, 2020 at 10:39 PM Leo Yan <leo.yan@xxxxxxxxxx> wrote:
>
> Besides the existing three display options 'tot', 'rmt' and 'lcl', this
> patch adds the option 'all' so it can sort on all cache hits for load
> operations.  This newly introduced option can be a choice for profiling
> cache false sharing if the memory events don't contain HITM tags.
>
> When displaying with the option 'all', the "Shared Data Cache Line
> Table" and "Shared Cache Line Distribution Pareto" both differ from the
> output of the other three display options.
>
> For the "Shared Data Cache Line Table", instead of sorting on HITM
> metrics, it sorts on the metrics "tot_ld_hit" and "percent_tot_ld_hit".
> Without HITM metrics, users can still analyze the load hit statistics
> for all cache levels, so the total load hit dimensions are used to
> replace the HITM dimensions.
>
> For the Pareto, every single cache line shows the metrics
> "cl_tot_ld_hit" and "cl_tot_ld_miss" instead of "cl_rmt_hitm" and
> "percent_lcl_hitm", and the single cache line view is sorted by the
> metric "tot_ld_hit".
>
> As a result, we can get the 'all' display as follows:
>
> # perf c2c report -d all --coalesce tid,pid,iaddr,dso --stdio
>
> [...]
>
> =================================================
> Shared Data Cache Line Table
> =================================================
> #
> # ----------- Cacheline ---------- Load Hit Load Hit Total Total Total ---- Stores ---- ----- Core Load Hit ----- - LLC Load Hit -- - RMT Load Hit -- --- Load Dram ----
> # Index Address Node PA cnt Pct Total records Loads Stores L1Hit L1Miss FB L1 L2 LclHit LclHitm RmtHit RmtHitm Lcl Rmt
> # ..... .................. .... ...... ........ ........ ....... ....... ....... ....... ....... ....... ....... ....... ........ ....... ........ ....... ........ ........
> #
> 0 0x556f25dff100 0 1895 75.73% 4591 7840 4591 3249 2633 616 849 2734 67 58 883 0 0 0 0
> 1 0x556f25dff080 0 1 13.10% 794 794 794 0 0 0 164 486 28 20 96 0 0 0 0
> 2 0x556f25dff0c0 0 1 10.01% 607 607 607 0 0 0 107 5 5 488 2 0 0 0 0
>
> =================================================
> Shared Cache Line Distribution Pareto
> =================================================
> #
> # -- Load Refs -- -- Store Refs -- --------- Data address --------- ---------- cycles ---------- Total cpu Shared
> # Num Hit Miss L1 Hit L1 Miss Offset Node PA cnt Pid Tid Code address rmt hitm lcl hitm load records cnt Symbol Object Source:Line Node
> # ..... ....... ....... ....... ....... .................. .... ...... ....... .................. .................. ........ ........ ........ ....... ........ ................... ................. ........................... ....
> #
> -------------------------------------------------------------
> 0 4591 0 2633 616 0x556f25dff100
> -------------------------------------------------------------
> 20.52% 0.00% 0.00% 0.00% 0x0 0 1 28079 28082:lock_th 0x556f25bfdc1d 0 2200 1276 942 1 [.] read_write_func false_sharing.exe false_sharing_example.c:146 0
> 19.82% 0.00% 38.06% 0.00% 0x0 0 1 28079 28082:lock_th 0x556f25bfdc16 0 2190 1130 1912 1 [.] read_write_func false_sharing.exe false_sharing_example.c:145 0
> 18.25% 0.00% 56.63% 0.00% 0x0 0 1 28079 28081:lock_th 0x556f25bfdc16 0 2173 1074 2329 1 [.] read_write_func false_sharing.exe false_sharing_example.c:145 0
> 18.23% 0.00% 0.00% 0.00% 0x0 0 1 28079 28081:lock_th 0x556f25bfdc1d 0 2013 1220 837 1 [.] read_write_func false_sharing.exe false_sharing_example.c:146 0
> 0.00% 0.00% 3.11% 59.90% 0x0 0 1 28079 28081:lock_th 0x556f25bfdc28 0 0 0 451 1 [.] read_write_func false_sharing.exe false_sharing_example.c:146 0
> 0.00% 0.00% 2.20% 40.10% 0x0 0 1 28079 28082:lock_th 0x556f25bfdc28 0 0 0 305 1 [.] read_write_func false_sharing.exe false_sharing_example.c:146 0
> 12.00% 0.00% 0.00% 0.00% 0x20 0 1 28079 28083:reader_thd 0x556f25bfdc73 0 159 107 551 1 [.] read_write_func false_sharing.exe false_sharing_example.c:155 0
> 11.17% 0.00% 0.00% 0.00% 0x20 0 1 28079 28084:reader_thd 0x556f25bfdc73 0 148 108 513 1 [.] read_write_func false_sharing.exe false_sharing_example.c:155 0
>
> [...]
>
> Signed-off-by: Leo Yan <leo.yan@xxxxxxxxxx>
> ---
> tools/perf/builtin-c2c.c | 139 ++++++++++++++++++++++++++++-----------
> 1 file changed, 101 insertions(+), 38 deletions(-)
>
> diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
> index 9342c30d86ee..0df4a4a30f7a 100644
> --- a/tools/perf/builtin-c2c.c
> +++ b/tools/perf/builtin-c2c.c
[SNIP]
> @@ -2502,12 +2528,21 @@ static void print_pareto(FILE *out)
> int ret;
> const char *cl_output;
>
> - cl_output = "cl_num,"
> - "cl_rmt_hitm,"
> - "cl_lcl_hitm,"
> - "cl_stores_l1hit,"
> - "cl_stores_l1miss,"
> - "dcacheline";
> + if (c2c.display == DISPLAY_TOT || c2c.display == DISPLAY_LCL ||
> + c2c.display == DISPLAY_RMT)
> + cl_output = "cl_num,"
> + "cl_rmt_hitm,"
> + "cl_lcl_hitm,"
> + "cl_stores_l1hit,"
> + "cl_stores_l1miss,"
> + "dcacheline";
> + else /* c2c.display == DISPLAY_ALL */
> + cl_output = "cl_num,"
> + "cl_tot_ld_hit,"
> + "cl_tot_ld_miss,"
> + "cl_stores_l1hit,"
> + "cl_stores_l1miss,"
> + "dcacheline";

Nit: You can keep the default value as is, and add an if statement
just for the DISPLAY_ALL case.

>
> perf_hpp_list__init(&hpp_list);
> ret = hpp_list__parse(&hpp_list, cl_output, NULL);
> @@ -2543,7 +2578,7 @@ static void print_c2c_info(FILE *out, struct perf_session *session)
> fprintf(out, "%-36s: %s\n", first ? " Events" : "", evsel__name(evsel));
> first = false;
> }
> - fprintf(out, " Cachelines sort on : %s HITMs\n",
> + fprintf(out, " Cachelines sort on : %s\n",
> display_str[c2c.display]);
> fprintf(out, " Cacheline data grouping : %s\n", c2c.cl_sort);
> }
> @@ -2700,7 +2735,7 @@ static int perf_c2c_browser__title(struct hist_browser *browser,
> {
> scnprintf(bf, size,
> "Shared Data Cache Line Table "
> - "(%lu entries, sorted on %s HITMs)",
> + "(%lu entries, sorted on %s)",
> browser->nr_non_filtered_entries,
> display_str[c2c.display]);
> return 0;
> @@ -2906,6 +2941,8 @@ static int setup_display(const char *str)
> c2c.display = DISPLAY_RMT;
> else if (!strcmp(display, "lcl"))
> c2c.display = DISPLAY_LCL;
> + else if (!strcmp(display, "all"))
> + c2c.display = DISPLAY_ALL;
> else {
> pr_err("failed: unknown display type: %s\n", str);
> return -1;
> @@ -2952,10 +2989,12 @@ static int build_cl_output(char *cl_sort, bool no_source)
> }
>
> if (asprintf(&c2c.cl_output,
> - "%s%s%s%s%s%s%s%s%s%s",
> + "%s%s%s%s%s%s%s%s%s%s%s",
> c2c.use_stdio ? "cl_num_empty," : "",
> - "percent_rmt_hitm,"
> - "percent_lcl_hitm,"
> + c2c.display == DISPLAY_ALL ? "percent_ld_hit,"
> + "percent_ld_miss," :
> + "percent_rmt_hitm,"
> + "percent_lcl_hitm,",
> "percent_stores_l1hit,"
> "percent_stores_l1miss,"
> "offset,offset_node,dcacheline_count,",
> @@ -2984,6 +3023,7 @@ static int build_cl_output(char *cl_sort, bool no_source)
> static int setup_coalesce(const char *coalesce, bool no_source)
> {
> const char *c = coalesce ?: coalesce_default;
> + const char *sort_str = NULL;
>
> if (asprintf(&c2c.cl_sort, "offset,%s", c) < 0)
> return -ENOMEM;
> @@ -2991,12 +3031,16 @@ static int setup_coalesce(const char *coalesce, bool no_source)
> if (build_cl_output(c2c.cl_sort, no_source))
> return -1;
>
> - if (asprintf(&c2c.cl_resort, "offset,%s",
> - c2c.display == DISPLAY_TOT ?
> - "tot_hitm" :
> - c2c.display == DISPLAY_RMT ?
> - "rmt_hitm,lcl_hitm" :
> - "lcl_hitm,rmt_hitm") < 0)
> + if (c2c.display == DISPLAY_TOT)
> + sort_str = "tot_hitm";
> + else if (c2c.display == DISPLAY_RMT)
> + sort_str = "rmt_hitm,lcl_hitm";
> + else if (c2c.display == DISPLAY_LCL)
> + sort_str = "lcl_hitm,rmt_hitm";
> + else if (c2c.display == DISPLAY_ALL)
> + sort_str = "tot_ld_hit";
> +
> + if (asprintf(&c2c.cl_resort, "offset,%s", sort_str) < 0)
> return -ENOMEM;
>
> pr_debug("coalesce sort fields: %s\n", c2c.cl_sort);
> @@ -3131,20 +3175,37 @@ static int perf_c2c__report(int argc, const char **argv)
> goto out_mem2node;
> }
>
> - output_str = "cl_idx,"
> - "dcacheline,"
> - "dcacheline_node,"
> - "dcacheline_count,"
> - "percent_hitm,"
> - "tot_hitm,lcl_hitm,rmt_hitm,"
> - "tot_recs,"
> - "tot_loads,"
> - "tot_stores,"
> - "stores_l1hit,stores_l1miss,"
> - "ld_fbhit,ld_l1hit,ld_l2hit,"
> - "ld_lclhit,lcl_hitm,"
> - "ld_rmthit,rmt_hitm,"
> - "dram_lcl,dram_rmt";
> + if (c2c.display == DISPLAY_TOT || c2c.display == DISPLAY_LCL ||
> + c2c.display == DISPLAY_RMT)
> + output_str = "cl_idx,"
> + "dcacheline,"
> + "dcacheline_node,"
> + "dcacheline_count,"
> + "percent_hitm,"
> + "tot_hitm,lcl_hitm,rmt_hitm,"
> + "tot_recs,"
> + "tot_loads,"
> + "tot_stores,"
> + "stores_l1hit,stores_l1miss,"
> + "ld_fbhit,ld_l1hit,ld_l2hit,"
> + "ld_lclhit,lcl_hitm,"
> + "ld_rmthit,rmt_hitm,"
> + "dram_lcl,dram_rmt";
> + else /* c2c.display == DISPLAY_ALL */
> + output_str = "cl_idx,"
> + "dcacheline,"
> + "dcacheline_node,"
> + "dcacheline_count,"
> + "percent_tot_ld_hit,"
> + "tot_ld_hit,"
> + "tot_recs,"
> + "tot_loads,"
> + "tot_stores,"
> + "stores_l1hit,stores_l1miss,"
> + "ld_fbhit,ld_l1hit,ld_l2hit,"
> + "ld_lclhit,lcl_hitm,"
> + "ld_rmthit,rmt_hitm,"
> + "dram_lcl,dram_rmt";

Ditto.

Thanks,
Namhyung


>
> if (c2c.display == DISPLAY_TOT)
> sort_str = "tot_hitm";
> @@ -3152,6 +3213,8 @@ static int perf_c2c__report(int argc, const char **argv)
> sort_str = "rmt_hitm";
> else if (c2c.display == DISPLAY_LCL)
> sort_str = "lcl_hitm";
> + else if (c2c.display == DISPLAY_ALL)
> + sort_str = "tot_ld_hit";
>
> c2c_hists__reinit(&c2c.hists, output_str, sort_str);
>
> --
> 2.17.1
>