Re: [PATCH v6 1/3] perf: add PERF_RECORD_NAMESPACES to include namespaces related info

From: Peter Zijlstra
Date: Thu Feb 16 2017 - 05:28:46 EST


On Wed, Feb 08, 2017 at 02:01:24PM +0530, Hari Bathini wrote:
> With the advert of container technologies like docker, that depend
> on namespaces for isolation, there is a need for tracing support for
> namespaces. This patch introduces new PERF_RECORD_NAMESPACES event
> for tracing based on namespaces related info. This event records
> the device and inode numbers for every namespace of all processes.
>
> While device number is same for all namespaces currently, that may

'While device numbers are the same ..." ?

> change in future, to avoid the need for a namespace of namespaces.

Unfinished sentence

> Recording device number along with inode number will take care of such
> scenario.

More words on why this is so? Because I've no clue.

> Also, recording device and inode numbers for every namespace
> lets the userspace take a call on the definition of a container and
> update perf tool accordingly.


> @@ -862,6 +886,18 @@ enum perf_event_type {
> */
> PERF_RECORD_SWITCH_CPU_WIDE = 15,
>
> + /*
> + * struct {
> + * struct perf_event_header header;
> + * u32 pid;
> + * u32 tid;
> + * u32 nr_namespaces;
> + * struct namespace_link_info link_info[NAMESPACES_MAX];
> + * struct sample_id sample_id;
> + * };
> + */
> + PERF_RECORD_NAMESPACES = 16,

This thing works by accident, there's a u32 hole in that structure.
Also, the array isn't NAMESPACES_MAX long, its nr_namespaces, that's
what its there for. The entries should be internally consistent.


> @@ -6584,6 +6590,135 @@ void perf_event_comm(struct task_struct *task, bool exec)
> }
>
> /*
> + * namespaces tracking
> + */
> +
> +struct namespaces_event_id {
> + struct perf_event_header header;
> +
> + u32 pid;
> + u32 tid;
> + u32 nr_namespaces;
> + struct perf_ns_link_info link_info[NAMESPACES_MAX];
> +};

Contains the same hole and makes the record depend on architecture
alignment requirements.

> +static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
> + struct task_struct *task,
> + const struct proc_ns_operations *ns_ops)
> +{
> + struct path ns_path;
> + struct inode *ns_inode;
> + void *error;
> +
> + error = ns_get_path(&ns_path, task, ns_ops);
> + if (!error) {
> + snprintf(ns_link_info->name, NS_NAME_SIZE,
> + "%s", ns_path.dentry->d_iname);
> +
> + ns_inode = ns_path.dentry->d_inode;
> + ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
> + ns_link_info->ino = ns_inode->i_ino;
> + }
> +}
> +
> +static void perf_event_namespaces_output(struct perf_event *event,
> + void *data)
> +{
> + struct perf_namespaces_event *namespaces_event = data;
> + struct perf_output_handle handle;
> + struct perf_sample_data sample;
> + struct namespaces_event_id *ei;
> + struct task_struct *task = namespaces_event->task;
> + int ret;
> +
> + if (!perf_event_namespaces_match(event))
> + return;
> +
> + ei = &namespaces_event->event_id;
> + perf_event_header__init_id(&ei->header, &sample, event);
> + ret = perf_output_begin(&handle, event, ei->header.size);
> + if (ret)
> + return;
> +
> + ei->pid = perf_event_pid(event, task);
> + ei->tid = perf_event_tid(event, task);
> +
> + ei->nr_namespaces = NAMESPACES_MAX;
> +
> + perf_fill_ns_link_info(&ei->link_info[MNT_NS_INDEX],
> + task, &mntns_operations);
> +
> +#ifdef CONFIG_USER_NS
> + perf_fill_ns_link_info(&ei->link_info[USER_NS_INDEX],
> + task, &userns_operations);
> +#endif
> +#ifdef CONFIG_NET_NS
> + perf_fill_ns_link_info(&ei->link_info[NET_NS_INDEX],
> + task, &netns_operations);
> +#endif
> +#ifdef CONFIG_UTS_NS
> + perf_fill_ns_link_info(&ei->link_info[UTS_NS_INDEX],
> + task, &utsns_operations);
> +#endif
> +#ifdef CONFIG_IPC_NS
> + perf_fill_ns_link_info(&ei->link_info[IPC_NS_INDEX],
> + task, &ipcns_operations);
> +#endif
> +#ifdef CONFIG_PID_NS
> + perf_fill_ns_link_info(&ei->link_info[PID_NS_INDEX],
> + task, &pidns_operations);
> +#endif
> +#ifdef CONFIG_CGROUPS
> + perf_fill_ns_link_info(&ei->link_info[CGROUP_NS_INDEX],
> + task, &cgroupns_operations);
> +#endif

Two points here, since you've given these thingies a string identifier,
do you still need to rely on their positional index? If not, you could
generate smaller events if we lack some of these CONFIG knobs.

Secondly, does anything in here depend on @event? Afaict you're
generating the exact same information for each event.

The reason we set {pid,tid} for each event is because of PID_NS, we
report the pid number as per the namespace associated with the event.

But from what I can tell, there is no such namespace of namespaces
dependency here and this should live in the function below.

> + perf_output_put(&handle, namespaces_event->event_id);

And if you do smaller events, this needs changing..

> + perf_event__output_id_sample(event, &handle, &sample);
> +
> + perf_output_end(&handle);
> +}
> +
> +void perf_event_namespaces(struct task_struct *task)
> +{
> + struct perf_namespaces_event namespaces_event;
> +
> + if (!atomic_read(&nr_namespaces_events))
> + return;
> +
> + namespaces_event = (struct perf_namespaces_event){
> + .task = task,
> + .event_id = {
> + .header = {
> + .type = PERF_RECORD_NAMESPACES,
> + .misc = 0,
> + .size = sizeof(namespaces_event.event_id),
> + },
> + /* .pid */
> + /* .tid */
> + /* .nr_namespaces */
> + /* .link_info[NAMESPACES_MAX] */
> + },
> + };

So this would be the right place to generate the NS information, since
it doesn't appear to depend on the namespace of the event.

> + perf_iterate_sb(perf_event_namespaces_output,
> + &namespaces_event,
> + NULL);
> +}



> @@ -9667,6 +9804,11 @@ SYSCALL_DEFINE5(perf_event_open,
> return -EACCES;
> }
>
> + if (attr.namespaces) {
> + if (!capable(CAP_SYS_ADMIN))
> + return -EACCES;
> + }

Many times we allow users access to such information; should this be
qualified with something like perf_paranoid_kernel() or somesuch?

> +
> if (attr.freq) {
> if (attr.sample_freq > sysctl_perf_event_sample_rate)
> return -EINVAL;
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 11c5c8a..fd77e67 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2289,6 +2289,9 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
> free_fs_struct(new_fs);
>
> bad_unshare_out:
> + if (!err)
> + perf_event_namespaces(current);
> +
> return err;
> }

That seems like a very odd place, why not put it right before the
bad_unshare_cleanup_cred: label?


> @@ -264,6 +265,10 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
> switch_task_namespaces(tsk, new_nsproxy);
> out:
> fput(file);
> +
> + if (!err)
> + perf_event_namespaces(tsk);
> +
> return err;
> }

Same again..