Re: [PATCH v1 1/6] perf report: Sort child tasks by tid

From: Ian Rogers
Date: Wed Feb 28 2024 - 02:06:11 EST


On Tue, Feb 27, 2024 at 10:11 PM Namhyung Kim <namhyung@xxxxxxxxxx> wrote:
>
> On Mon, Feb 26, 2024 at 11:12 PM Ian Rogers <irogers@xxxxxxxxxx> wrote:
> >
> > On Mon, Feb 26, 2024 at 10:39 PM Namhyung Kim <namhyung@xxxxxxxxxx> wrote:
> > >
> > > On Tue, Feb 13, 2024 at 10:37 PM Ian Rogers <irogers@xxxxxxxxxx> wrote:
> > > >
> > > > Commit 91e467bc568f ("perf machine: Use hashtable for machine
> > > > threads") made the iteration of thread tids unordered. The perf report
> > > > --tasks output now shows child threads in an order determined by the
> > > > hashing. For example, in this snippet tid 3 appears after tid 256 even
> > > > though they have the same ppid 2:
> > > >
> > > > ```
> > > > $ perf report --tasks
> > > > % pid tid ppid comm
> > > > 0 0 -1 |swapper
> > > > 2 2 0 | kthreadd
> > > > 256 256 2 | kworker/12:1H-k
> > > > 693761 693761 2 | kworker/10:1-mm
> > > > 1301762 1301762 2 | kworker/1:1-mm_
> > > > 1302530 1302530 2 | kworker/u32:0-k
> > > > 3 3 2 | rcu_gp
> > > > ...
> > > > ```
> > > >
> > > > The output is easier to read if threads appear numerically
> > > > increasing. To allow for this, read all threads into a list then sort
> > > > with a comparator that orders by the child task's of the first common
> > > > parent. The list creation and deletion are created as utilities on
> > > > machine. The indentation is possible by counting the number of
> > > > parents a child has.
> > > >
> > > > With this change the output for the same data file is now like:
> > > > ```
> > > > $ perf report --tasks
> > > > % pid tid ppid comm
> > > > 0 0 -1 |swapper
> > > > 1 1 0 | systemd
> > > > 823 823 1 | systemd-journal
> > > > 853 853 1 | systemd-udevd
> > > > 3230 3230 1 | systemd-timesyn
> > > > 3236 3236 1 | auditd
> > > > 3239 3239 3236 | audisp-syslog
> > > > 3321 3321 1 | accounts-daemon
> > > > ...
> > > > ```
> > > >
> > > > Signed-off-by: Ian Rogers <irogers@xxxxxxxxxx>
>
> I know you sent out v2 already, but let me continue the discussion
> here.
>
>
> > > > ---
> > > > tools/perf/builtin-report.c | 203 ++++++++++++++++++++----------------
> > > > tools/perf/util/machine.c | 30 ++++++
> > > > tools/perf/util/machine.h | 10 ++
> > > > 3 files changed, 155 insertions(+), 88 deletions(-)
> > > >
> > > > diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> > > > index 8e16fa261e6f..b48f1d5309e3 100644
> > > > --- a/tools/perf/builtin-report.c
> > > > +++ b/tools/perf/builtin-report.c
> > > > @@ -59,6 +59,7 @@
> > > > #include <linux/ctype.h>
> > > > #include <signal.h>
> > > > #include <linux/bitmap.h>
> > > > +#include <linux/list_sort.h>
> > > > #include <linux/string.h>
> > > > #include <linux/stringify.h>
> > > > #include <linux/time64.h>
> > > > @@ -828,35 +829,6 @@ static void tasks_setup(struct report *rep)
> > > > rep->tool.no_warn = true;
> > > > }
> > > >
> > > > -struct task {
> > > > - struct thread *thread;
> > > > - struct list_head list;
> > > > - struct list_head children;
> > > > -};
> > > > -
> > > > -static struct task *tasks_list(struct task *task, struct machine *machine)
> > > > -{
> > > > - struct thread *parent_thread, *thread = task->thread;
> > > > - struct task *parent_task;
> > > > -
> > > > - /* Already listed. */
> > > > - if (!list_empty(&task->list))
> > > > - return NULL;
> > > > -
> > > > - /* Last one in the chain. */
> > > > - if (thread__ppid(thread) == -1)
> > > > - return task;
> > > > -
> > > > - parent_thread = machine__find_thread(machine, -1, thread__ppid(thread));
> > > > - if (!parent_thread)
> > > > - return ERR_PTR(-ENOENT);
> > > > -
> > > > - parent_task = thread__priv(parent_thread);
> > > > - thread__put(parent_thread);
> > > > - list_add_tail(&task->list, &parent_task->children);
> > > > - return tasks_list(parent_task, machine);
> > > > -}
> > > > -
> > > > struct maps__fprintf_task_args {
> > > > int indent;
> > > > FILE *fp;
> > > > @@ -900,89 +872,144 @@ static size_t maps__fprintf_task(struct maps *maps, int indent, FILE *fp)
> > > > return args.printed;
> > > > }
> > > >
> > > > -static void task__print_level(struct task *task, FILE *fp, int level)
> > > > +static int thread_level(struct machine *machine, const struct thread *thread)
> > > > {
> > > > - struct thread *thread = task->thread;
> > > > - struct task *child;
> > > > - int comm_indent = fprintf(fp, " %8d %8d %8d |%*s",
> > > > - thread__pid(thread), thread__tid(thread),
> > > > - thread__ppid(thread), level, "");
> > > > + struct thread *parent_thread;
> > > > + int res;
> > > >
> > > > - fprintf(fp, "%s\n", thread__comm_str(thread));
> > > > + if (thread__tid(thread) <= 0)
> > > > + return 0;
> > > >
> > > > - maps__fprintf_task(thread__maps(thread), comm_indent, fp);
> > > > + if (thread__ppid(thread) <= 0)
> > > > + return 1;
> > > >
> > > > - if (!list_empty(&task->children)) {
> > > > - list_for_each_entry(child, &task->children, list)
> > > > - task__print_level(child, fp, level + 1);
> > > > + parent_thread = machine__find_thread(machine, -1, thread__ppid(thread));
> > > > + if (!parent_thread) {
> > > > + pr_err("Missing parent thread of %d\n", thread__tid(thread));
> > > > + return 0;
> > > > }
> > > > + res = 1 + thread_level(machine, parent_thread);
> > > > + thread__put(parent_thread);
> > > > + return res;
> > > > }
> > > >
> > > > -static int tasks_print(struct report *rep, FILE *fp)
> > > > +static void task__print_level(struct machine *machine, struct thread *thread, FILE *fp)
> > > > {
> > > > - struct perf_session *session = rep->session;
> > > > - struct machine *machine = &session->machines.host;
> > > > - struct task *tasks, *task;
> > > > - unsigned int nr = 0, itask = 0, i;
> > > > - struct rb_node *nd;
> > > > - LIST_HEAD(list);
> > > > + int level = thread_level(machine, thread);
> > > > + int comm_indent = fprintf(fp, " %8d %8d %8d |%*s",
> > > > + thread__pid(thread), thread__tid(thread),
> > > > + thread__ppid(thread), level, "");
> > > >
> > > > - /*
> > > > - * No locking needed while accessing machine->threads,
> > > > - * because --tasks is single threaded command.
> > > > - */
> > > > + fprintf(fp, "%s\n", thread__comm_str(thread));
> > > >
> > > > - /* Count all the threads. */
> > > > - for (i = 0; i < THREADS__TABLE_SIZE; i++)
> > > > - nr += machine->threads[i].nr;
> > > > + maps__fprintf_task(thread__maps(thread), comm_indent, fp);
> > > > +}
> > > >
> > > > - tasks = malloc(sizeof(*tasks) * nr);
> > > > - if (!tasks)
> > > > - return -ENOMEM;
> > > > +static int task_list_cmp(void *priv, const struct list_head *la, const struct list_head *lb)
> > >
> > > I'm a little afraid that this comparison logic becomes complex.
> > > But I think it's better than having a tree of thread relationship.
> > > Just a comment that explains why we need this would be nice.
> >
> > I can add something in v2.
> >
> > >
> > > > +{
> > > > + struct machine *machine = priv;
> > > > + struct thread_list *task_a = list_entry(la, struct thread_list, list);
> > > > + struct thread_list *task_b = list_entry(lb, struct thread_list, list);
> > > > + struct thread *a = task_a->thread;
> > > > + struct thread *b = task_b->thread;
> > > > + int level_a, level_b, res;
> > > > +
> > > > + /* Compare a and b to root. */
> > > > + if (thread__tid(a) == thread__tid(b))
> > > > + return 0;
> > > >
> > > > - for (i = 0; i < THREADS__TABLE_SIZE; i++) {
> > > > - struct threads *threads = &machine->threads[i];
> > > > + if (thread__tid(a) == 0)
> > > > + return -1;
> > > >
> > > > - for (nd = rb_first_cached(&threads->entries); nd;
> > > > - nd = rb_next(nd)) {
> > > > - task = tasks + itask++;
> > > > + if (thread__tid(b) == 0)
> > > > + return 1;
> > > >
> > > > - task->thread = rb_entry(nd, struct thread_rb_node, rb_node)->thread;
> > > > - INIT_LIST_HEAD(&task->children);
> > > > - INIT_LIST_HEAD(&task->list);
> > > > - thread__set_priv(task->thread, task);
> > > > - }
> > > > + /* If parents match sort by tid. */
> > > > + if (thread__ppid(a) == thread__ppid(b)) {
> > > > + return thread__tid(a) < thread__tid(b)
> > > > + ? -1
> > > > + : (thread__tid(a) > thread__tid(b) ? 1 : 0);
> > >
> > > Can it be simply like this? We know tid(a) != tid(b).
> > >
> > > return thread__tid(a) < thread__tid(b) ? -1 : 1;
> >
> > Yes, but the parent check is still required.
>
> Sure. I only meant the return statement.
>
> >
> > > > }
> > > >
> > > > /*
> > > > - * Iterate every task down to the unprocessed parent
> > > > - * and link all in task children list. Task with no
> > > > - * parent is added into 'list'.
> > > > + * Find a and b such that if they are a child of each other a and b's
> > > > + * tid's match, otherwise a and b have a common parent and distinct
> > > > + * tid's to sort by. First make the depths of the threads match.
> > > > */
> > > > - for (itask = 0; itask < nr; itask++) {
> > > > - task = tasks + itask;
> > > > -
> > > > - if (!list_empty(&task->list))
> > > > - continue;
> > > > -
> > > > - task = tasks_list(task, machine);
> > > > - if (IS_ERR(task)) {
> > > > - pr_err("Error: failed to process tasks\n");
> > > > - free(tasks);
> > > > - return PTR_ERR(task);
> > > > + level_a = thread_level(machine, a);
> > > > + level_b = thread_level(machine, b);
> > > > + a = thread__get(a);
> > > > + b = thread__get(b);
> > > > + for (int i = level_a; i > level_b; i--) {
> > > > + struct thread *parent = machine__find_thread(machine, -1, thread__ppid(a));
> > > > +
> > > > + thread__put(a);
> > > > + if (!parent) {
> > > > + pr_err("Missing parent thread of %d\n", thread__tid(a));
> > > > + thread__put(b);
> > > > + return -1;
> > > > }
> > > > + a = parent;
> > > > + }
> > > > + for (int i = level_b; i > level_a; i--) {
> > > > + struct thread *parent = machine__find_thread(machine, -1, thread__ppid(b));
> > > >
> > > > - if (task)
> > > > - list_add_tail(&task->list, &list);
> > > > + thread__put(b);
> > > > + if (!parent) {
> > > > + pr_err("Missing parent thread of %d\n", thread__tid(b));
> > > > + thread__put(a);
> > > > + return 1;
> > > > + }
> > > > + b = parent;
> > > > + }
> > > > + /* Search up to a common parent. */
> > > > + while (thread__ppid(a) != thread__ppid(b)) {
> > > > + struct thread *parent;
> > > > +
> > > > + parent = machine__find_thread(machine, -1, thread__ppid(a));
> > > > + thread__put(a);
> > > > + if (!parent)
> > > > + pr_err("Missing parent thread of %d\n", thread__tid(a));
> > > > + a = parent;
> > > > + parent = machine__find_thread(machine, -1, thread__ppid(b));
> > > > + thread__put(b);
> > > > + if (!parent)
> > > > + pr_err("Missing parent thread of %d\n", thread__tid(b));
> > > > + b = parent;
> > > > + if (!a || !b)
> > > > + return !a && !b ? 0 : (!a ? -1 : 1);
> > >
> > > Wouldn't it leak a refcount if either a or b is NULL (not both)?
> >
> > It would, but this would be an error condition anyway. I can add puts.
> >
> > >
> > > > + }
> > > > + if (thread__tid(a) == thread__tid(b)) {
> > > > + /* a is a child of b or vice-versa, deeper levels appear later. */
> > > > + res = level_a < level_b ? -1 : (level_a > level_b ? 1 : 0);
> > > > + } else {
> > > > + /* Sort by tid now the parent is the same. */
> > > > + res = thread__tid(a) < thread__tid(b) ? -1 : 1;
> > > > }
> > > > + thread__put(a);
> > > > + thread__put(b);
> > > > + return res;
> > > > +}
> > > > +
> > > > +static int tasks_print(struct report *rep, FILE *fp)
> > > > +{
> > > > + struct machine *machine = &rep->session->machines.host;
> > > > + LIST_HEAD(tasks);
> > > > + int ret;
> > > >
> > > > - fprintf(fp, "# %8s %8s %8s %s\n", "pid", "tid", "ppid", "comm");
> > > > + ret = machine__thread_list(machine, &tasks);
> > > > + if (!ret) {
> > > > + struct thread_list *task;
> > >
> > > Do we really need this thread_list? Why not use an
> > > array of threads directly?
> >
> > The code isn't particularly performance critical. I used a list as it
> > best approximated how the rbtree was being used. The code is reused in
> > subsequent patches, there's no initial pass to size an array and I
> > think the reallocarray/qsort logic is generally more problematic than
> > the list ones. If we were worried about performance then I think
> > arrays could make sense for optimization, but I think this is good
> > enough for now.
>
> Well, it's not about performance. It made me think why we need
> this thread_list but I couldn't find the reason. If you can move
> machine__threads_nr() here then you won't need realloc().

But then you can race between allocating an array and traversing to
fill it in. Using realloc in the iterator callback would avoid this
but with capacity tracking, etc. If this were C++ its a call between a
std::vector and a std::list, and std::vector would win that race there
(imo). Here we're moving from code that was working on sorted tree
nodes in code that tends to more heavily use lists. I wanted the
transition from the rbtree nodes to list nodes to be as small as
possible in the changes to the APIs that did strange things to the
threads tree (resorting it). Moving to an array with indices would
require more tracking and be a larger change in general. The array
could move because of a realloc, whilst nodes wouldn't, etc. Having
the code now work on a list its easier to see how it can migrate to an
array, but that can be follow on work. I'm not sure we're motivated to
do it given there's no code on a performance critical path.

Thanks,
Ian

> Thanks,
> Namhyung
>
> > > >
> > > > - list_for_each_entry(task, &list, list)
> > > > - task__print_level(task, fp, 0);
> > > > + list_sort(machine, &tasks, task_list_cmp);
> > > >
> > > > - free(tasks);
> > > > - return 0;
> > > > + fprintf(fp, "# %8s %8s %8s %s\n", "pid", "tid", "ppid", "comm");
> > > > +
> > > > + list_for_each_entry(task, &tasks, list)
> > > > + task__print_level(machine, task->thread, fp);
> > > > + }
> > > > + thread_list__delete(&tasks);
> > > > + return ret;
> > > > }
> > > >
> > > > static int __cmd_report(struct report *rep)
> > > > diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> > > > index 3da92f18814a..7872ce92c9fc 100644
> > > > --- a/tools/perf/util/machine.c
> > > > +++ b/tools/perf/util/machine.c
> > > > @@ -3261,6 +3261,36 @@ int machines__for_each_thread(struct machines *machines,
> > > > return rc;
> > > > }
> > > >
> > > > +
> > > > +static int thread_list_cb(struct thread *thread, void *data)
> > > > +{
> > > > + struct list_head *list = data;
> > > > + struct thread_list *entry = malloc(sizeof(*entry));
> > > > +
> > > > + if (!entry)
> > > > + return -ENOMEM;
> > > > +
> > > > + entry->thread = thread__get(thread);
> > > > + list_add_tail(&entry->list, list);
> > > > + return 0;
> > > > +}
> > > > +
> > > > +int machine__thread_list(struct machine *machine, struct list_head *list)
> > > > +{
> > > > + return machine__for_each_thread(machine, thread_list_cb, list);
> > > > +}
> > > > +
> > > > +void thread_list__delete(struct list_head *list)
> > > > +{
> > > > + struct thread_list *pos, *next;
> > > > +
> > > > + list_for_each_entry_safe(pos, next, list, list) {
> > > > + thread__zput(pos->thread);
> > > > + list_del(&pos->list);
> > > > + free(pos);
> > > > + }
> > > > +}
> > > > +
> > > > pid_t machine__get_current_tid(struct machine *machine, int cpu)
> > > > {
> > > > if (cpu < 0 || (size_t)cpu >= machine->current_tid_sz)
> > > > diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
> > > > index 1279acda6a8a..b738ce84817b 100644
> > > > --- a/tools/perf/util/machine.h
> > > > +++ b/tools/perf/util/machine.h
> > > > @@ -280,6 +280,16 @@ int machines__for_each_thread(struct machines *machines,
> > > > int (*fn)(struct thread *thread, void *p),
> > > > void *priv);
> > > >
> > > > +struct thread_list {
> > > > + struct list_head list;
> > > > + struct thread *thread;
> > > > +};
> > > > +
> > > > +/* Make a list of struct thread_list based on threads in the machine. */
> > > > +int machine__thread_list(struct machine *machine, struct list_head *list);
> > > > +/* Free up the nodes within the thread_list list. */
> > > > +void thread_list__delete(struct list_head *list);
> > > > +
> > > > pid_t machine__get_current_tid(struct machine *machine, int cpu);
> > > > int machine__set_current_tid(struct machine *machine, int cpu, pid_t pid,
> > > > pid_t tid);
> > > > --
> > > > 2.43.0.687.g38aa6559b0-goog
> > > >