Re: I.1 - System calls - ioctl

From: stephane eranian
Date: Thu Jul 30 2009 - 09:58:58 EST


On Mon, Jul 13, 2009 at 12:53 PM, Peter Zijlstra<a.p.zijlstra@xxxxxxxxx> wrote:
> On Mon, 2009-06-22 at 08:58 -0400, Christoph Hellwig wrote:
>> But talking about syscalls the sys_perf_counter_open prototype is
>> really ugly - it uses either the pid or cpu argument which is a pretty
>> clear indicator it should actually be two sys calls.
>
> Would something like the below be any better?
>
> It would allow us to later add something like PERF_TARGET_SOCKET and
> things like that.

One thing for sure, you need to provision for future targets, SOCKET
is an obvious
one. But given that your goal is to have a generic API, not just for
CPU PMU, then you
need to be able to add new targets, e.g., socket, chipset, GPU. The
current model with
pid and cpu is too limited, relying on flags to add more parameters
for a target is not pretty.

Given that an event can only be attached to a single target at a time, it seems
you could either do:
- encapsulate target type + target into a struct and pass that.
This is your proposal here.
- add a generic int target parameter and pass the target type in flags
- provide one syscall per target type (seems to be Hellwig's)

Your scheme makes it possible to pass 64-bit target id. Not clear if
this is really needed.
It adds yet another data structure to your API.

>
> (utterly untested)
>
> ---
> Âinclude/linux/perf_counter.h | Â 12 ++++++++++++
> Âinclude/linux/syscalls.h   |  Â3 ++-
> Âkernel/perf_counter.c    Â|  24 +++++++++++++++++++++++-
> Âtools/perf/builtin-record.c Â| Â 11 ++++++++++-
> Âtools/perf/builtin-stat.c  Â|  12 ++++++++++--
> Âtools/perf/builtin-top.c   |  11 ++++++++++-
> Âtools/perf/perf.h      Â|  Â7 +++----
> Â7 files changed, 70 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
> index 5e970c7..f7dd22e 100644
> --- a/include/linux/perf_counter.h
> +++ b/include/linux/perf_counter.h
> @@ -189,6 +189,18 @@ struct perf_counter_attr {
> Â Â Â Â__u64 Â Â Â Â Â Â Â Â Â __reserved_3;
> Â};
>
> +enum perf_counter_target_ids {
> + Â Â Â PERF_TARGET_PID Â Â Â Â = 0,
> + Â Â Â PERF_TARGET_CPU Â Â Â Â = 1,
> +
> + Â Â Â PERF_TARGET_MAX Â Â Â Â /* non-ABI */
> +};
> +
> +struct perf_counter_target {
> + Â Â Â __u32 Â Â Â Â Â Â Â Â Â id;
> + Â Â Â __u64 Â Â Â Â Â Â Â Â Â val;
> +};
> +
> Â/*
> Â* Ioctls that can be done on a perf counter fd:
> Â*/
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 80de700..670edad 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -760,5 +760,6 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
>
> Âasmlinkage long sys_perf_counter_open(
> Â Â Â Â Â Â Â Âstruct perf_counter_attr __user *attr_uptr,
> - Â Â Â Â Â Â Â pid_t pid, int cpu, int group_fd, unsigned long flags);
> + Â Â Â Â Â Â Â struct perf_counter_target __user *target,
> + Â Â Â Â Â Â Â int group_fd, unsigned long flags);
> Â#endif
> diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
> index fa20a9d..205f0b6 100644
> --- a/kernel/perf_counter.c
> +++ b/kernel/perf_counter.c
> @@ -3969,15 +3969,18 @@ err_size:
> Â*/
> ÂSYSCALL_DEFINE5(perf_counter_open,
> Â Â Â Â Â Â Â Âstruct perf_counter_attr __user *, attr_uptr,
> - Â Â Â Â Â Â Â pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
> + Â Â Â Â Â Â Â struct perf_counter_target __user *, target_uptr,
> + Â Â Â Â Â Â Â int, group_fd, unsigned long, flags)
> Â{
> Â Â Â Âstruct perf_counter *counter, *group_leader;
> Â Â Â Âstruct perf_counter_attr attr;
> + Â Â Â struct perf_counter_target target;
> Â Â Â Âstruct perf_counter_context *ctx;
> Â Â Â Âstruct file *counter_file = NULL;
> Â Â Â Âstruct file *group_file = NULL;
> Â Â Â Âint fput_needed = 0;
> Â Â Â Âint fput_needed2 = 0;
> + Â Â Â int pid, cpu;
> Â Â Â Âint ret;
>
> Â Â Â Â/* for future expandability... */
> @@ -3998,6 +4001,25 @@ SYSCALL_DEFINE5(perf_counter_open,
> Â Â Â Â Â Â Â Â Â Â Â Âreturn -EINVAL;
> Â Â Â Â}
>
> + Â Â Â ret = copy_from_user(&target, target_uptr, sizeof(target));
> + Â Â Â if (ret)
> + Â Â Â Â Â Â Â return ret;
> +
> + Â Â Â switch (target.id) {
> + Â Â Â case PERF_TARGET_PID:
> + Â Â Â Â Â Â Â pid = target.val;
> + Â Â Â Â Â Â Â cpu = -1;
> + Â Â Â Â Â Â Â break;
> +
> + Â Â Â case PERF_TARGET_CPU:
> + Â Â Â Â Â Â Â cpu = target.val;
> + Â Â Â Â Â Â Â pid = -1;
> + Â Â Â Â Â Â Â break;
> +
> + Â Â Â default:
> + Â Â Â Â Â Â Â return -EINVAL;
> + Â Â Â }
> +
> Â Â Â Â/*
> Â Â Â Â * Get the target context (task or percpu):
> Â Â Â Â */
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index 4ef78a5..b172d30 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -374,6 +374,7 @@ static struct perf_header_attr *get_header_attr(struct perf_counter_attr *a, int
> Âstatic void create_counter(int counter, int cpu, pid_t pid)
> Â{
> Â Â Â Âstruct perf_counter_attr *attr = attrs + counter;
> + Â Â Â struct perf_counter_target target;
> Â Â Â Âstruct perf_header_attr *h_attr;
> Â Â Â Âint track = !counter; /* only the first counter needs these */
> Â Â Â Âstruct {
> @@ -409,8 +410,16 @@ static void create_counter(int counter, int cpu, pid_t pid)
>    Âattr->inherit      = (cpu < 0) && inherit;
>    Âattr->disabled     Â= 1;
>
> + Â Â Â if (cpu < 0) {
> + Â Â Â Â Â Â Â target.id = PERF_TARGET_PID;
> + Â Â Â Â Â Â Â target.val = pid;
> + Â Â Â } else {
> + Â Â Â Â Â Â Â target.id = PERF_TARGET_CPU;
> + Â Â Â Â Â Â Â target.val = cpu;
> + Â Â Â }
> +
> Âtry_again:
> - Â Â Â fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0);
> + Â Â Â fd[nr_cpu][counter] = sys_perf_counter_open(attr, &target, group_fd, 0);
>
> Â Â Â Âif (fd[nr_cpu][counter] < 0) {
> Â Â Â Â Â Â Â Âint err = errno;
> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> index 27921a8..342e642 100644
> --- a/tools/perf/builtin-stat.c
> +++ b/tools/perf/builtin-stat.c
> @@ -106,6 +106,7 @@ static u64 Â Â Â Â Â Â Â Â Âruntime_cycles_noise;
> Âstatic void create_perf_stat_counter(int counter, int pid)
> Â{
> Â Â Â Âstruct perf_counter_attr *attr = attrs + counter;
> + Â Â Â struct perf_counter_target target;
>
> Â Â Â Âif (scale)
> Â Â Â Â Â Â Â Âattr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
> @@ -114,8 +115,12 @@ static void create_perf_stat_counter(int counter, int pid)
> Â Â Â Âif (system_wide) {
> Â Â Â Â Â Â Â Âunsigned int cpu;
>
> + Â Â Â Â Â Â Â target.id = PERF_TARGET_CPU;
> +
> Â Â Â Â Â Â Â Âfor (cpu = 0; cpu < nr_cpus; cpu++) {
> - Â Â Â Â Â Â Â Â Â Â Â fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0);
> + Â Â Â Â Â Â Â Â Â Â Â target.val = cpu;
> +
> + Â Â Â Â Â Â Â Â Â Â Â fd[cpu][counter] = sys_perf_counter_open(attr, &target, -1, 0);
> Â Â Â Â Â Â Â Â Â Â Â Âif (fd[cpu][counter] < 0 && verbose)
> Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Âfprintf(stderr, ERR_PERF_OPEN, counter,
> Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Âfd[cpu][counter], strerror(errno));
> @@ -125,7 +130,10 @@ static void create_perf_stat_counter(int counter, int pid)
>        Âattr->disabled    = 1;
> Â Â Â Â Â Â Â Âattr->enable_on_exec = 1;
>
> - Â Â Â Â Â Â Â fd[0][counter] = sys_perf_counter_open(attr, pid, -1, -1, 0);
> + Â Â Â Â Â Â Â target.id = PERF_TARGET_PID;
> + Â Â Â Â Â Â Â target.val = pid;
> +
> + Â Â Â Â Â Â Â fd[0][counter] = sys_perf_counter_open(attr, &target, -1, 0);
> Â Â Â Â Â Â Â Âif (fd[0][counter] < 0 && verbose)
> Â Â Â Â Â Â Â Â Â Â Â Âfprintf(stderr, ERR_PERF_OPEN, counter,
> Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Âfd[0][counter], strerror(errno));
> diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
> index 95d5c0a..facc870 100644
> --- a/tools/perf/builtin-top.c
> +++ b/tools/perf/builtin-top.c
> @@ -548,6 +548,7 @@ int group_fd;
>
> Âstatic void start_counter(int i, int counter)
> Â{
> + Â Â Â struct perf_counter_target target;
> Â Â Â Âstruct perf_counter_attr *attr;
> Â Â Â Âunsigned int cpu;
>
> @@ -560,8 +561,16 @@ static void start_counter(int i, int counter)
>    Âattr->sample_type    = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
>    Âattr->freq       Â= freq;
>
> + Â Â Â if (cpu < 0) {
> + Â Â Â Â Â Â Â target.id = PERF_TARGET_PID;
> + Â Â Â Â Â Â Â target.val = target_pid;
> + Â Â Â } else {
> + Â Â Â Â Â Â Â target.id = PERF_TARGET_CPU;
> + Â Â Â Â Â Â Â target.val = cpu;
> + Â Â Â }
> +
> Âtry_again:
> - Â Â Â fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0);
> + Â Â Â fd[i][counter] = sys_perf_counter_open(attr, &target, group_fd, 0);
>
> Â Â Â Âif (fd[i][counter] < 0) {
> Â Â Â Â Â Â Â Âint err = errno;
> diff --git a/tools/perf/perf.h b/tools/perf/perf.h
> index e5148e2..3d663d7 100644
> --- a/tools/perf/perf.h
> +++ b/tools/perf/perf.h
> @@ -85,12 +85,11 @@ static inline unsigned long long rdclock(void)
>
> Âstatic inline int
> Âsys_perf_counter_open(struct perf_counter_attr *attr,
> - Â Â Â Â Â Â Â Â Â Â pid_t pid, int cpu, int group_fd,
> - Â Â Â Â Â Â Â Â Â Â unsigned long flags)
> + Â Â Â Â Â Â Â Â Â Â struct perf_counter_target *target,
> + Â Â Â Â Â Â Â Â Â Â int group_fd, unsigned long flags)
> Â{
> Â Â Â Âattr->size = sizeof(*attr);
> - Â Â Â return syscall(__NR_perf_counter_open, attr, pid, cpu,
> - Â Â Â Â Â Â Â Â Â Â Âgroup_fd, flags);
> + Â Â Â return syscall(__NR_perf_counter_open, attr, target, group_fd, flags);
> Â}
>
> Â#define MAX_COUNTERS Â Â Â Â Â Â Â Â Â 256
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/