Re: [PATCH v8 03/11] tracing/user_events: Use remote writes for event enablement

From: Steven Rostedt
Date: Tue Mar 28 2023 - 16:59:50 EST


On Tue, 21 Feb 2023 13:11:35 -0800
Beau Belgrave <beaub@xxxxxxxxxxxxxxxxxxx> wrote:

> include/linux/user_events.h | 53 ++-
> include/uapi/linux/user_events.h | 15 +-
> kernel/trace/Kconfig | 5 +-
> kernel/trace/trace_events_user.c | 586 ++++++++++++++++++++++++-------
> 4 files changed, 517 insertions(+), 142 deletions(-)
>
> diff --git a/include/linux/user_events.h b/include/linux/user_events.h
> index 3d747c45d2fa..0120b3dd5b03 100644
> --- a/include/linux/user_events.h
> +++ b/include/linux/user_events.h
> @@ -9,13 +9,63 @@
> #ifndef _LINUX_USER_EVENTS_H
> #define _LINUX_USER_EVENTS_H
>
> +#include <linux/list.h>
> +#include <linux/refcount.h>
> +#include <linux/mm_types.h>
> +#include <linux/workqueue.h>
> #include <uapi/linux/user_events.h>
>
> #ifdef CONFIG_USER_EVENTS
> struct user_event_mm {
> + struct list_head link;
> + struct list_head enablers;
> + struct mm_struct *mm;
> + struct user_event_mm *next;
> + refcount_t refcnt;
> + refcount_t tasks;
> + struct rcu_work put_rwork;
> };

This is more of a nit, and not something to change unless there's more real
content to change, but please when making structures tab out the field
names:

struct user_event_mm {
struct list_head link;
struct list_head enablers;
struct mm_struct *mm;
struct user_event_mm *next;
refcount_t refcnt;
refcount_t tasks;
struct rcu_work put_rwork;
};


See how much easier it is to read (and know what fields exist).



> -#endif
>

> #endif /* _LINUX_USER_EVENTS_H */
> diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h
> index 03f92366068d..22521bc622db 100644
> --- a/include/uapi/linux/user_events.h
> +++ b/include/uapi/linux/user_events.h
> @@ -27,12 +27,21 @@ struct user_reg {
> /* Input: Size of the user_reg structure being used */
> __u32 size;
>
> + /* Input: Bit in enable address to use */
> + __u8 enable_bit;
> +
> + /* Input: Enable size in bytes at address */
> + __u8 enable_size;
> +
> + /* Input: Flags for future use, set to 0 */
> + __u16 flags;
> +
> + /* Input: Address to update when enabled */
> + __u64 enable_addr;
> +
> /* Input: Pointer to string with event name, description and flags */
> __u64 name_args;
>
> - /* Output: Bitwise index of the event within the status page */
> - __u32 status_bit;
> -
> /* Output: Index of the event to use when writing data */
> __u32 write_index;
> } __attribute__((__packed__));

May want to tab the above too, but as each field is commented, it's not as
big of an issue.


> diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
> index d7043043f59c..b61a1bfbfc22 100644
> --- a/kernel/trace/Kconfig
> +++ b/kernel/trace/Kconfig
> @@ -791,9 +791,10 @@ config USER_EVENTS
> can be used like an existing kernel trace event. User trace
> events are generated by writing to a tracefs file. User
> processes can determine if their tracing events should be
> - generated by memory mapping a tracefs file and checking for
> - an associated byte being non-zero.
> + generated by registering a value and bit with the kernel
> + that reflects when it is enabled or not.
>
> + See Documentation/trace/user_events.rst.
> If in doubt, say N.
>
> config HIST_TRIGGERS
> diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
> index 070551480747..553a82ee7aeb 100644
> --- a/kernel/trace/trace_events_user.c
> +++ b/kernel/trace/trace_events_user.c
> @@ -19,6 +19,7 @@
> #include <linux/tracefs.h>
> #include <linux/types.h>
> #include <linux/uaccess.h>
> +#include <linux/highmem.h>
> #include <linux/user_events.h>
> #include "trace.h"
> #include "trace_dynevent.h"
> @@ -29,34 +30,11 @@
> #define FIELD_DEPTH_NAME 1
> #define FIELD_DEPTH_SIZE 2
>
> -/*
> - * Limits how many trace_event calls user processes can create:
> - * Must be a power of two of PAGE_SIZE.
> - */
> -#define MAX_PAGE_ORDER 0
> -#define MAX_PAGES (1 << MAX_PAGE_ORDER)
> -#define MAX_BYTES (MAX_PAGES * PAGE_SIZE)
> -#define MAX_EVENTS (MAX_BYTES * 8)
> -
> /* Limit how long of an event name plus args within the subsystem. */
> #define MAX_EVENT_DESC 512
> #define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
> #define MAX_FIELD_ARRAY_SIZE 1024
>
> -/*
> - * The MAP_STATUS_* macros are used for taking a index and determining the
> - * appropriate byte and the bit in the byte to set/reset for an event.
> - *
> - * The lower 3 bits of the index decide which bit to set.
> - * The remaining upper bits of the index decide which byte to use for the bit.
> - *
> - * This is used when an event has a probe attached/removed to reflect live
> - * status of the event wanting tracing or not to user-programs via shared
> - * memory maps.
> - */
> -#define MAP_STATUS_BYTE(index) ((index) >> 3)
> -#define MAP_STATUS_MASK(index) BIT((index) & 7)
> -
> /*
> * Internal bits (kernel side only) to keep track of connected probes:
> * These are used when status is requested in text form about an event. These
> @@ -70,20 +48,14 @@
> #define EVENT_STATUS_OTHER BIT(7)
>
> /*
> - * Stores the pages, tables, and locks for a group of events.
> - * Each logical grouping of events has its own group, with a
> - * matching page for status checks within user programs. This
> - * allows for isolation of events to user programs by various
> - * means.
> + * Stores the system name, tables, and locks for a group of events. This
> + * allows isolation for events by various means.
> */
> struct user_event_group {
> - struct page *pages;
> - char *register_page_data;
> char *system_name;
> struct hlist_node node;
> struct mutex reg_mutex;
> DECLARE_HASHTABLE(register_table, 8);
> - DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
> };
>
> /* Group for init_user_ns mapping, top-most group */
> @@ -106,12 +78,34 @@ struct user_event {
> struct list_head fields;
> struct list_head validators;
> refcount_t refcnt;
> - int index;
> - int flags;
> int min_size;
> char status;
> };

But these should be tabbed out.

>
> +/*
> + * Stores per-mm/event properties that enable an address to be
> + * updated properly for each task. As tasks are forked, we use
> + * these to track enablement sites that are tied to an event.
> + */
> +struct user_event_enabler {
> + struct list_head link;
> + struct user_event *event;
> + unsigned long addr;
> +
> + /* Track enable bit, flags, etc. Aligned for bitops. */
> + unsigned int values;
> +};
> +

And the above.

> +/* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */
> +#define ENABLE_VAL_BIT_MASK 0x3F
> +

This is something that can be added later as a clean up, but if there's a
real issue found with these patches, then make the next version have the
updates.

If you do another version, update the tabs of existing structures in a
separate patch from any content change.

-- Steve