Re: [PATCH for 4.14] membarrier: Provide register expedited private command

From: Paul E. McKenney
Date: Thu Oct 19 2017 - 13:48:58 EST


On Thu, Oct 19, 2017 at 01:30:15PM -0400, Mathieu Desnoyers wrote:
> [ This patch is sent directly to Linus, because it needs to be merged
> before the end of the 4.14 rc cycle. It introduces a "register private
> expedited" membarrier command which allows eventual removal of
> important memory barrier constraints on the scheduler fast-paths. It
> changes how the "private expedited" membarrier command (new to 4.14)
> is used from user-space. Sorry to send this late in the cycle. ]
>
> Provide a command allowing processes to register their intent to use
> the private expedited command. This affects how the expedited private
> command introduced in 4.14-rc is meant to be used, and should be merged
> before 4.14 final.
>
> Processes are now required to register before using
> MEMBARRIER_CMD_PRIVATE_EXPEDITED, otherwise that command returns EPERM.
>
> This fixes a problem that arose when designing requested extensions to
> sys_membarrier() to allow JITs to efficiently flush old code from
> instruction caches. Several potential algorithms are much less painful
> if the user registers intent to use this functionality early on, for
> example, before the process spawns the second thread. Registering at
> this time removes the need to interrupt each and every thread in that
> process at the first expedited sys_membarrier() system call.
>
> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
> CC: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>

This looks much less intrusive than the earlier series!

Acked-by: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>

> CC: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> CC: Ingo Molnar <mingo@xxxxxxxxxx>
> CC: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx>
> CC: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
> ---
> fs/exec.c | 1 +
> include/linux/mm_types.h | 3 +++
> include/linux/sched/mm.h | 16 ++++++++++++++++
> include/uapi/linux/membarrier.h | 23 ++++++++++++++++-------
> kernel/sched/membarrier.c | 34 ++++++++++++++++++++++++++++++----
> 5 files changed, 66 insertions(+), 11 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 5470d3c1892a..3e14ba25f678 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1802,6 +1802,7 @@ static int do_execveat_common(int fd, struct filename *filename,
> /* execve succeeded */
> current->fs->in_exec = 0;
> current->in_execve = 0;
> + membarrier_execve(current);
> acct_update_integrals(current);
> task_numa_free(current);
> free_bprm(bprm);
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 46f4ecf5479a..1861ea8dba77 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -445,6 +445,9 @@ struct mm_struct {
> unsigned long flags; /* Must use atomic bitops to access the bits */
>
> struct core_state *core_state; /* coredumping support */
> +#ifdef CONFIG_MEMBARRIER
> + atomic_t membarrier_state;
> +#endif
> #ifdef CONFIG_AIO
> spinlock_t ioctx_lock;
> struct kioctx_table __rcu *ioctx_table;
> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> index ae53e413fb13..ab9bf7b73954 100644
> --- a/include/linux/sched/mm.h
> +++ b/include/linux/sched/mm.h
> @@ -211,4 +211,20 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
> current->flags = (current->flags & ~PF_MEMALLOC) | flags;
> }
>
> +#ifdef CONFIG_MEMBARRIER
> +enum {
> + MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0),
> + MEMBARRIER_STATE_SWITCH_MM = (1U << 1),
> +};
> +
> +static inline void membarrier_execve(struct task_struct *t)
> +{
> + atomic_set(&t->mm->membarrier_state, 0);
> +}
> +#else
> +static inline void membarrier_execve(struct task_struct *t)
> +{
> +}
> +#endif
> +
> #endif /* _LINUX_SCHED_MM_H */
> diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
> index 6d47b3249d8a..4e01ad7ffe98 100644
> --- a/include/uapi/linux/membarrier.h
> +++ b/include/uapi/linux/membarrier.h
> @@ -52,21 +52,30 @@
> * (non-running threads are de facto in such a
> * state). This only covers threads from the
> * same processes as the caller thread. This
> - * command returns 0. The "expedited" commands
> - * complete faster than the non-expedited ones,
> - * they never block, but have the downside of
> - * causing extra overhead.
> + * command returns 0 on success. The
> + * "expedited" commands complete faster than
> + * the non-expedited ones, they never block,
> + * but have the downside of causing extra
> + * overhead. A process needs to register its
> + * intent to use the private expedited command
> + * prior to using it, otherwise this command
> + * returns -EPERM.
> + * @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
> + * Register the process intent to use
> + * MEMBARRIER_CMD_PRIVATE_EXPEDITED. Always
> + * returns 0.
> *
> * Command to be passed to the membarrier system call. The commands need to
> * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
> * the value 0.
> */
> enum membarrier_cmd {
> - MEMBARRIER_CMD_QUERY = 0,
> - MEMBARRIER_CMD_SHARED = (1 << 0),
> + MEMBARRIER_CMD_QUERY = 0,
> + MEMBARRIER_CMD_SHARED = (1 << 0),
> /* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
> /* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
> - MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3),
> + MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3),
> + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4),
> };
>
> #endif /* _UAPI_LINUX_MEMBARRIER_H */
> diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
> index a92fddc22747..dd7908743dab 100644
> --- a/kernel/sched/membarrier.c
> +++ b/kernel/sched/membarrier.c
> @@ -18,6 +18,7 @@
> #include <linux/membarrier.h>
> #include <linux/tick.h>
> #include <linux/cpumask.h>
> +#include <linux/atomic.h>
>
> #include "sched.h" /* for cpu_rq(). */
>
> @@ -26,21 +27,26 @@
> * except MEMBARRIER_CMD_QUERY.
> */
> #define MEMBARRIER_CMD_BITMASK \
> - (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
> + (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
> + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
>
> static void ipi_mb(void *info)
> {
> smp_mb(); /* IPIs should be serializing but paranoid. */
> }
>
> -static void membarrier_private_expedited(void)
> +static int membarrier_private_expedited(void)
> {
> int cpu;
> bool fallback = false;
> cpumask_var_t tmpmask;
>
> + if (!(atomic_read(&current->mm->membarrier_state)
> + & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
> + return -EPERM;
> +
> if (num_online_cpus() == 1)
> - return;
> + return 0;
>
> /*
> * Matches memory barriers around rq->curr modification in
> @@ -94,6 +100,24 @@ static void membarrier_private_expedited(void)
> * rq->curr modification in scheduler.
> */
> smp_mb(); /* exit from system call is not a mb */
> + return 0;
> +}
> +
> +static void membarrier_register_private_expedited(void)
> +{
> + struct task_struct *p = current;
> + struct mm_struct *mm = p->mm;
> +
> + /*
> + * We need to consider threads belonging to different thread
> + * groups, which use the same mm. (CLONE_VM but not
> + * CLONE_THREAD).
> + */
> + if (atomic_read(&mm->membarrier_state)
> + & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
> + return;
> + atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
> + &mm->membarrier_state);
> }
>
> /**
> @@ -144,7 +168,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
> synchronize_sched();
> return 0;
> case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
> - membarrier_private_expedited();
> + return membarrier_private_expedited();
> + case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
> + membarrier_register_private_expedited();
> return 0;
> default:
> return -EINVAL;
> --
> 2.11.0
>