Re: [PATCH RFC tip/core/rcu 11/11] rcu: move TREE_RCU from softirqto kthread

From: Lai Jiangshan
Date: Fri Feb 25 2011 - 03:29:46 EST


On 02/23/2011 09:39 AM, Paul E. McKenney wrote:
> From: Paul E. McKenney <paul.mckenney@xxxxxxxxxx>
>
> If RCU priority boosting is to be meaningful, callback invocation must
> be boosted in addition to preempted RCU readers. Otherwise, in presence
> of CPU real-time threads, the grace period ends, but the callbacks don't
> get invoked. If the callbacks don't get invoked, the associated memory
> doesn't get freed, so the system is still subject to OOM.
>
> But it is not reasonable to priority-boost RCU_SOFTIRQ, so this commit
> moves the callback invocations to a kthread, which can be boosted easily.
>
> Signed-off-by: Paul E. McKenney <paul.mckenney@xxxxxxxxxx>
> Signed-off-by: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
> ---
> include/linux/interrupt.h | 1 -
> include/trace/events/irq.h | 3 +-
> kernel/rcutree.c | 324 ++++++++++++++++++++++++++++++++++-
> kernel/rcutree.h | 8 +
> kernel/rcutree_plugin.h | 4 +-
> tools/perf/util/trace-event-parse.c | 1 -
> 6 files changed, 331 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index 79d0c4f..ed47deb 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -385,7 +385,6 @@ enum
> TASKLET_SOFTIRQ,
> SCHED_SOFTIRQ,
> HRTIMER_SOFTIRQ,
> - RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */
>
> NR_SOFTIRQS
> };
> diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
> index 1c09820..ae045ca 100644
> --- a/include/trace/events/irq.h
> +++ b/include/trace/events/irq.h
> @@ -20,8 +20,7 @@ struct softirq_action;
> softirq_name(BLOCK_IOPOLL), \
> softirq_name(TASKLET), \
> softirq_name(SCHED), \
> - softirq_name(HRTIMER), \
> - softirq_name(RCU))
> + softirq_name(HRTIMER))
>
> /**
> * irq_handler_entry - called immediately before the irq action handler
> diff --git a/kernel/rcutree.c b/kernel/rcutree.c
> index 0ac1cc0..2241f28 100644
> --- a/kernel/rcutree.c
> +++ b/kernel/rcutree.c
> @@ -47,6 +47,8 @@
> #include <linux/mutex.h>
> #include <linux/time.h>
> #include <linux/kernel_stat.h>
> +#include <linux/wait.h>
> +#include <linux/kthread.h>
>
> #include "rcutree.h"
>
> @@ -82,6 +84,18 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
> int rcu_scheduler_active __read_mostly;
> EXPORT_SYMBOL_GPL(rcu_scheduler_active);
>
> +/* Control variables for per-CPU and per-rcu_node kthreads. */

I think "per-leaf-rcu_node" is better. It seems that only the leaf rcu_node
of rcu_sched are used for rcu_node kthreads and they also serve for
other rcu domains(rcu_bh, rcu_preempt)? I think we need to add some
comments for it.

> +/*
> + * Timer handler to initiate the waking up of per-CPU kthreads that
> + * have yielded the CPU due to excess numbers of RCU callbacks.
> + */
> +static void rcu_cpu_kthread_timer(unsigned long arg)
> +{
> + unsigned long flags;
> + struct rcu_data *rdp = (struct rcu_data *)arg;
> + struct rcu_node *rnp = rdp->mynode;
> + struct task_struct *t;
> +
> + raw_spin_lock_irqsave(&rnp->lock, flags);
> + rnp->wakemask |= rdp->grpmask;

I think there is no reason that the rnp->lock also protects the
rnp->node_kthread_task. "raw_spin_unlock_irqrestore(&rnp->lock, flags);"
can be moved up here.

> + t = rnp->node_kthread_task;
> + if (t == NULL) {
> + raw_spin_unlock_irqrestore(&rnp->lock, flags);
> + return;
> + }
> + wake_up_process(t);
> + raw_spin_unlock_irqrestore(&rnp->lock, flags);
> +}
> +
> +/*
> + * Drop to non-real-time priority and yield, but only after posting a
> + * timer that will cause us to regain our real-time priority if we
> + * remain preempted. Either way, we restore our real-time priority
> + * before returning.
> + */
> +static void rcu_yield(int cpu)
> +{
> + struct rcu_data *rdp = per_cpu_ptr(rcu_sched_state.rda, cpu);
> + struct sched_param sp;
> + struct timer_list yield_timer;
> +
> + setup_timer(&yield_timer, rcu_cpu_kthread_timer, (unsigned long)rdp);
> + mod_timer(&yield_timer, jiffies + 2);
> + sp.sched_priority = 0;
> + sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
> + schedule();
> + sp.sched_priority = RCU_KTHREAD_PRIO;
> + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
> + del_timer(&yield_timer);
> +}
> +
> +/*
> + * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
> + * This can happen while the corresponding CPU is either coming online
> + * or going offline. We cannot wait until the CPU is fully online
> + * before starting the kthread, because the various notifier functions
> + * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
> + * the corresponding CPU is online.
> + *
> + * Return 1 if the kthread needs to stop, 0 otherwise.
> + *
> + * Caller must disable bh. This function can momentarily enable it.
> + */
> +static int rcu_cpu_kthread_should_stop(int cpu)
> +{
> + while (cpu_is_offline(cpu) || smp_processor_id() != cpu) {
> + if (kthread_should_stop())
> + return 1;
> + local_bh_enable();
> + schedule_timeout_uninterruptible(1);
> + if (smp_processor_id() != cpu)
> + set_cpus_allowed_ptr(current, cpumask_of(cpu));

The current task is PF_THREAD_BOUND,
Why do "set_cpus_allowed_ptr(current, cpumask_of(cpu));" ?

> + local_bh_disable();
> + }
> + return 0;
> +}
> +
> +/*
> + * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
> + * earlier RCU softirq.
> + */
> +static int rcu_cpu_kthread(void *arg)
> +{
> + int cpu = (int)(long)arg;
> + unsigned long flags;
> + int spincnt = 0;
> + wait_queue_head_t *wqp = &per_cpu(rcu_cpu_wq, cpu);
> + char work;
> + char *workp = &per_cpu(rcu_cpu_has_work, cpu);
> +
> + for (;;) {
> + wait_event_interruptible(*wqp,
> + *workp != 0 || kthread_should_stop());
> + local_bh_disable();
> + if (rcu_cpu_kthread_should_stop(cpu)) {
> + local_bh_enable();
> + break;
> + }
> + local_irq_save(flags);
> + work = *workp;
> + *workp = 0;
> + local_irq_restore(flags);
> + if (work)
> + rcu_process_callbacks();
> + local_bh_enable();
> + if (*workp != 0)
> + spincnt++;
> + else
> + spincnt = 0;
> + if (spincnt > 10) {

"10" is a magic number here.

> + rcu_yield(cpu);
> + spincnt = 0;
> + }
> + }
> + return 0;
> +}
> +


> +/*
> + * Per-rcu_node kthread, which is in charge of waking up the per-CPU
> + * kthreads when needed.
> + */
> +static int rcu_node_kthread(void *arg)
> +{
> + int cpu;
> + unsigned long flags;
> + unsigned long mask;
> + struct rcu_node *rnp = (struct rcu_node *)arg;
> + struct sched_param sp;
> + struct task_struct *t;
> +
> + for (;;) {
> + wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0 ||
> + kthread_should_stop());
> + if (kthread_should_stop())
> + break;
> + raw_spin_lock_irqsave(&rnp->lock, flags);
> + mask = rnp->wakemask;
> + rnp->wakemask = 0;
> + raw_spin_unlock_irqrestore(&rnp->lock, flags);
> + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
> + if ((mask & 0x1) == 0)
> + continue;
> + preempt_disable();
> + per_cpu(rcu_cpu_has_work, cpu) = 1;
> + t = per_cpu(rcu_cpu_kthread_task, cpu);
> + if (t == NULL) {
> + preempt_enable();
> + continue;
> + }

Obviously preempt_disable() is not for protecting remote percpu data.
Is it for disabling cpu hotplug? I am afraid the @t may leave
and become invalid.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/