Re: [RFC PATCH v2] membarrier: expedited private command

From: Peter Zijlstra
Date: Fri Jul 28 2017 - 07:10:31 EST


On Fri, Jul 28, 2017 at 10:55:32AM +0200, Peter Zijlstra wrote:
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e9785f7aed75..33f34a201255 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2641,8 +2641,18 @@ static struct rq *finish_task_switch(struct task_struct *prev)
> finish_arch_post_lock_switch();
>
> fire_sched_in_preempt_notifiers(current);
> +
> + /*
> + * For CONFIG_MEMBARRIER we need a full memory barrier after the
> + * rq->curr assignment. Not all architectures have one in either
> + * switch_to() or switch_mm() so we use (and complement) the one
> + * implied by mmdrop()'s atomic_dec_and_test().
> + */
> if (mm)
> mmdrop(mm);
> + else if (IS_ENABLED(CONFIG_MEMBARRIER))
> + smp_mb();
> +
> if (unlikely(prev_state == TASK_DEAD)) {
> if (prev->sched_class->task_dead)
> prev->sched_class->task_dead(prev);

Hmm... going over this again, I'm not sure this is as good as we
thought it was.


context_switch()

mm = next->mm
old_mm = prev->active_mm;

if (!mm) {
next->active_mm = old_mm;
mmgrab(oldmm);
enter_lazy_tlb(oldmm, next);
} else
switch_mm(oldmm, mm, next);

if (!prev->mm) {
prev->active_mm = NULL;
rq->prev_mm = old_mm;
}

/* ... */

finish_task_switch()

mm = rq->prev_mm; // oldmm when !prev->mm
rq->prev_mm = NULL;

if (mm)
mmdrop(mm);



That mmdrop() is there to balance the mmgrab() we do when switching
between kthreads. Also, it looks to me that on kthread->kthread
switches we perform a superfluous mmgrab()/mmdrop() pair, since the
active_mm is simply handed from one kernel thread to the next.

Something like the below perhaps would optimize and clarify things.

---
kernel/sched/core.c | 35 ++++++++++++++++++++++-------------
1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e9785f7aed75..7924b4cc2bff 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2733,12 +2733,8 @@ static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
- struct mm_struct *mm, *oldmm;
-
prepare_task_switch(rq, prev, next);

- mm = next->mm;
- oldmm = prev->active_mm;
/*
* For paravirt, this is coupled with an exit in switch_to to
* combine the page table reload and the switch backend into
@@ -2746,16 +2742,29 @@ context_switch(struct rq *rq, struct task_struct *prev,
*/
arch_start_context_switch(prev);

- if (!mm) {
- next->active_mm = oldmm;
- mmgrab(oldmm);
- enter_lazy_tlb(oldmm, next);
- } else
- switch_mm_irqs_off(oldmm, mm, next);
+ /*
+ * kernel -> kernel transfer active
+ * user -> kernel mmgrab()
+ *
+ * kernel -> user mmdrop()
+ * user -> user switch_mm()
+ */
+ if (!next->mm) { // to kernel
+ next->active_mm = prev->active_mm;
+ enter_lazy_tlb(prev->active_mm, next);
+
+ if (prev->mm) // from user
+ mmgrab(prev->active_mm);
+
+ } else { // to user
+ switch_mm_irqs_off(prev->active_mm, next->mm, next);

- if (!prev->mm) {
- prev->active_mm = NULL;
- rq->prev_mm = oldmm;
+ if (!prev->mm) { // from kernel
+ rq->prev_mm = prev->active_mm;
+ prev->active_mm = NULL;
+
+ /* will mmdrop() in finish_task_switch(). */
+ }
}

rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);