[PATCH] RCU for low latency (experimental)

From: Dipankar Sarma
Date: Tue Mar 23 2004 - 05:19:48 EST


Here is the RCU patch for low scheduling latency that Andrew was talking
about in the other thread. I did some measurements with amlat on a
2.4 GHz P4 Xeon box with 256MB of memory running dbench, and the patch
reduced worst-case scheduling latencies from 800 microseconds to about
400 microseconds.

It uses per-cpu kernel threads to execute excess callbacks and
pretty much relies on preemption. I added a CONFIG_LOW_LATENCY
option to make this conditional. The number of callbacks to
invoke in softirq context before punting to krcud can be set at
boot time using the rcupdate.bhlimit parameter. The whole thing is
meant for experimentation only. The downside of doing RCU this way
is that the RCU kernel thread may further delay the grace period,
which can lead to OOM situations.
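
For example (a hypothetical value; with CONFIG_LOW_LATENCY=y the patch
defaults bhlimit to 256, and 0 disables the krcud path entirely), the
limit could be lowered from the kernel command line:

	rcupdate.bhlimit=64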

I would be interested in any issues seen with this patch, including
latencies and OOM situations.

Dipankar



Reduce bh processing time of RCU callbacks by using tunable per-cpu
krcud daemons.
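
To make the overload path concrete, here is a minimal sketch of the kind
of update stream that gets throttled (hypothetical names, not part of
this patch). It uses the 2.6.4-era call_rcu() interface with a separate
callback function and argument, matching the head->func(head->arg)
invocation in rcu_do_batch() below:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_obj {
	struct rcu_head rcu;
	int data;
};

/* Invoked after a grace period, either from the RCU softirq or,
 * with this patch, from krcud once bhlimit is exceeded. */
static void my_obj_free(void *arg)
{
	kfree(arg);
}

static void my_obj_release(struct my_obj *p)
{
	call_rcu(&p->rcu, my_obj_free, p);
}

Each call_rcu() queues an entry on the per-cpu nxtlist; after the grace
period, rcu_do_batch() drains the completed list and, with this patch,
hands anything past bhlimit to krcud when a runnable RT task is waiting
on the runqueue.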


 include/linux/rcupdate.h |    4 ++
 include/linux/sched.h    |    1
 init/Kconfig             |    9 ++++
 kernel/rcupdate.c        |   91 +++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched.c           |    6 +++
 5 files changed, 108 insertions(+), 3 deletions(-)

diff -puN include/linux/rcupdate.h~rcu-low-lat include/linux/rcupdate.h
--- linux-2.6.4-rcu/include/linux/rcupdate.h~rcu-low-lat 2004-03-23 15:20:11.000000000 +0530
+++ linux-2.6.4-rcu-dipankar/include/linux/rcupdate.h 2004-03-23 15:20:11.000000000 +0530
@@ -93,9 +93,11 @@ struct rcu_data {
long qsctr; /* User-mode/idle loop etc. */
long last_qsctr; /* value of qsctr at beginning */
/* of rcu grace period */
+ struct task_struct *krcud;
long batch; /* Batch # for current RCU batch */
struct list_head nxtlist;
struct list_head curlist;
+ struct list_head rcudlist;
};

DECLARE_PER_CPU(struct rcu_data, rcu_data);
@@ -103,9 +105,11 @@ extern struct rcu_ctrlblk rcu_ctrlblk;

#define RCU_qsctr(cpu) (per_cpu(rcu_data, (cpu)).qsctr)
#define RCU_last_qsctr(cpu) (per_cpu(rcu_data, (cpu)).last_qsctr)
+#define RCU_krcud(cpu) (per_cpu(rcu_data, (cpu)).krcud)
#define RCU_batch(cpu) (per_cpu(rcu_data, (cpu)).batch)
#define RCU_nxtlist(cpu) (per_cpu(rcu_data, (cpu)).nxtlist)
#define RCU_curlist(cpu) (per_cpu(rcu_data, (cpu)).curlist)
+#define RCU_rcudlist(cpu) (per_cpu(rcu_data, (cpu)).rcudlist)

#define RCU_QSCTR_INVALID 0

diff -puN include/linux/sched.h~rcu-low-lat include/linux/sched.h
--- linux-2.6.4-rcu/include/linux/sched.h~rcu-low-lat 2004-03-23 15:20:11.000000000 +0530
+++ linux-2.6.4-rcu-dipankar/include/linux/sched.h 2004-03-23 15:20:12.000000000 +0530
@@ -552,6 +552,7 @@ extern int task_prio(task_t *p);
extern int task_nice(task_t *p);
extern int task_curr(task_t *p);
extern int idle_cpu(int cpu);
+extern int rq_has_rt_task(int cpu);

void yield(void);

diff -puN init/Kconfig~rcu-low-lat init/Kconfig
--- linux-2.6.4-rcu/init/Kconfig~rcu-low-lat 2004-03-23 15:20:11.000000000 +0530
+++ linux-2.6.4-rcu-dipankar/init/Kconfig 2004-03-23 15:20:12.000000000 +0530
@@ -156,6 +156,14 @@ config HOTPLUG
agent" (/sbin/hotplug) to load modules and set up software needed
to use devices as you hotplug them.

+config LOW_LATENCY
+ bool "Enable kernel features for low scheduling latency" if EXPERIMENTAL
+ default n
+ ---help---
+ This option enables various features in the kernel that
+ help reduce scheduling latency while potentially sacrificing
+ throughput.
+
config IKCONFIG
bool "Kernel .config support"
---help---
@@ -181,7 +189,6 @@ config IKCONFIG_PROC
This option enables access to kernel configuration file and build
information through /proc/config.gz.

-
menuconfig EMBEDDED
bool "Remove kernel features (for embedded systems)"
help
diff -puN kernel/rcupdate.c~rcu-low-lat kernel/rcupdate.c
--- linux-2.6.4-rcu/kernel/rcupdate.c~rcu-low-lat 2004-03-23 15:20:11.000000000 +0530
+++ linux-2.6.4-rcu-dipankar/kernel/rcupdate.c 2004-03-23 15:21:12.000000000 +0530
@@ -39,6 +39,7 @@
#include <asm/atomic.h>
#include <asm/bitops.h>
#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/completion.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
@@ -54,6 +55,11 @@ DEFINE_PER_CPU(struct rcu_data, rcu_data
/* Fake initialization required by compiler */
static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
#define RCU_tasklet(cpu) (per_cpu(rcu_tasklet, cpu))
+#ifdef CONFIG_LOW_LATENCY
+static int bhlimit = 256;
+#else
+static int bhlimit = 0;
+#endif

/**
* call_rcu - Queue an RCU update request.
@@ -79,6 +85,13 @@ void fastcall call_rcu(struct rcu_head *
local_irq_restore(flags);
}

+static inline unsigned int rcu_bh_callback_limit(int cpu)
+{
+ if (in_softirq() && RCU_krcud(cpu))
+ return bhlimit;
+ return (unsigned int)-1;
+}
+
/*
* Invoke the completed RCU callbacks. They are expected to be in
* a per-cpu list.
@@ -87,13 +100,22 @@ static void rcu_do_batch(struct list_hea
{
struct list_head *entry;
struct rcu_head *head;
+ unsigned int count = 0;
+ int cpu = smp_processor_id();
+ unsigned int limit = rcu_bh_callback_limit(cpu);

while (!list_empty(list)) {
entry = list->next;
list_del(entry);
head = list_entry(entry, struct rcu_head, list);
head->func(head->arg);
+ if (++count > limit && rq_has_rt_task(cpu)) {
+ list_splice(list, &RCU_rcudlist(cpu));
+ wake_up_process(RCU_krcud(cpu));
+ break;
+ }
}
+
}

/*
@@ -198,12 +220,67 @@ void rcu_check_callbacks(int cpu, int us
tasklet_schedule(&RCU_tasklet(cpu));
}

+static int krcud(void * __bind_cpu)
+{
+ int cpu = (int) (long) __bind_cpu;
+
+ daemonize("krcud/%d", cpu);
+ set_user_nice(current, -19);
+ current->flags |= PF_IOTHREAD;
+
+ /* Migrate to the right CPU */
+ set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ BUG_ON(smp_processor_id() != cpu);
+
+ __set_current_state(TASK_INTERRUPTIBLE);
+ mb();
+
+ RCU_krcud(cpu) = current;
+
+ for (;;) {
+ LIST_HEAD(list);
+
+ if (list_empty(&RCU_rcudlist(cpu)))
+ schedule();
+
+ __set_current_state(TASK_RUNNING);
+
+ local_bh_disable();
+ while (!list_empty(&RCU_rcudlist(cpu))) {
+ list_splice(&RCU_rcudlist(cpu), &list);
+ INIT_LIST_HEAD(&RCU_rcudlist(cpu));
+ local_bh_enable();
+ rcu_do_batch(&list);
+ cond_resched();
+ local_bh_disable();
+ }
+ local_bh_enable();
+
+ __set_current_state(TASK_INTERRUPTIBLE);
+ }
+}
+
+static int start_krcud(int cpu)
+{
+ if (bhlimit) {
+ if (kernel_thread(krcud, (void *)(long)cpu, CLONE_KERNEL) < 0) {
+ printk("krcud for %i failed\n", cpu);
+ return -1;
+ }
+
+ while (!RCU_krcud(cpu))
+ yield();
+ }
+ return 0;
+}
+
static void __devinit rcu_online_cpu(int cpu)
{
memset(&per_cpu(rcu_data, cpu), 0, sizeof(struct rcu_data));
tasklet_init(&RCU_tasklet(cpu), rcu_process_callbacks, 0UL);
INIT_LIST_HEAD(&RCU_nxtlist(cpu));
INIT_LIST_HEAD(&RCU_curlist(cpu));
+ INIT_LIST_HEAD(&RCU_rcudlist(cpu));
}

static int __devinit rcu_cpu_notify(struct notifier_block *self,
@@ -214,6 +291,10 @@ static int __devinit rcu_cpu_notify(stru
case CPU_UP_PREPARE:
rcu_online_cpu(cpu);
break;
+ case CPU_ONLINE:
+ if (start_krcud(cpu) != 0)
+ return NOTIFY_BAD;
+ break;
/* Space reserved for CPU_OFFLINE :) */
default:
break;
@@ -233,12 +314,17 @@ static struct notifier_block __devinitda
*/
void __init rcu_init(void)
{
- rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
- (void *)(long)smp_processor_id());
+ rcu_online_cpu(smp_processor_id());
/* Register notifier for non-boot CPUs */
register_cpu_notifier(&rcu_nb);
}

+static int __init rcu_late_init(void)
+{
+ return start_krcud(smp_processor_id());
+}
+
+__initcall(rcu_late_init);

/* Because of FASTCALL declaration of complete, we use this wrapper */
static void wakeme_after_rcu(void *completion)
@@ -262,6 +348,7 @@ void synchronize_kernel(void)
wait_for_completion(&completion);
}

+module_param(bhlimit, int, 0);

EXPORT_SYMBOL(call_rcu);
EXPORT_SYMBOL(synchronize_kernel);
diff -puN kernel/sched.c~rcu-low-lat kernel/sched.c
--- linux-2.6.4-rcu/kernel/sched.c~rcu-low-lat 2004-03-23 15:20:11.000000000 +0530
+++ linux-2.6.4-rcu-dipankar/kernel/sched.c 2004-03-23 15:20:12.000000000 +0530
@@ -341,6 +341,12 @@ static inline void enqueue_task(struct t
p->array = array;
}

+int rq_has_rt_task(int cpu)
+{
+ runqueue_t *rq = cpu_rq(cpu);
+ return (sched_find_first_bit(rq->active->bitmap) < MAX_RT_PRIO);
+}
+
/*
* effective_prio - return the priority that is based on the static
* priority but is modified by bonuses/penalties.

_