[RFC][PATCH] remove rq->lock from cpuacct cgroup (Was Re: [PATCH]cpuacct: add a branch prediction

From: KAMEZAWA Hiroyuki
Date: Thu Feb 26 2009 - 22:24:18 EST


On Thu, 26 Feb 2009 17:29:15 -0800
"Paul E. McKenney" <paulmck@xxxxxxxxxxxxxxxxxx> wrote:
> > Can't we use seq_counter in include/linux/seqlock.h ?
> > There is only one writer and we don't need write-side lock.
>
> Yes, seqlock should work fine, good point!
>

This is a trial version (hunks may apply with offsets against the latest mmotm).
Does this seem like overkill?

==
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>

The cgroup/cpuacct subsystem counts cpu usage with a 64bit counter in a
per-cpu object. On the read side (via the cpuacct.usage file), it takes the
rq->lock of (other) cpus in order to read the 64bit value safely.

In general, taking the rq->lock of other cpus from non-scheduler code
is not good. This patch tries to remove the rq->lock use on the read side.

To read the 64bit value safely, this patch uses a seqcounter.

Pros.
- rq->lock is not necessary.
Cons.
- When updating the counter, the sequence number must be updated as well.
(I hope this per-cpu sequence number stays in cache...)
- not simple.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
---
kernel/sched.c | 141 ++++++++++++++++++++++++++++++++++++++++++---------------
1 file changed, 105 insertions(+), 36 deletions(-)

Index: mmotm-2.6.29-Feb24/kernel/sched.c
===================================================================
--- mmotm-2.6.29-Feb24.orig/kernel/sched.c
+++ mmotm-2.6.29-Feb24/kernel/sched.c
@@ -9581,6 +9581,67 @@ struct cgroup_subsys cpu_cgroup_subsys =

#ifdef CONFIG_CGROUP_CPUACCT

+#ifndef CONFIG_64BIT
+DEFINE_PER_CPU(struct seqcount, cpuacct_cgroup_seq);
+
+static inline void cpuacct_start_counter_update(void)
+{
+ /* This is called under rq->lock and IRQ is off */
+ struct seqcount *s = &get_cpu_var(cpuacct_cgroup_seq);
+
+ write_seqcount_begin(s);
+ put_cpu_var(cpuacct_cgroup_seq);
+}
+
+static inline void cpuacct_end_counter_update(void)
+{
+ struct seqcount *s = &get_cpu_var(cpuacct_cgroup_seq);
+
+ write_seqcount_end(s);
+ put_cpu_var(cpuacct_cgroup_seq);
+}
+
+static inline u64
+cpuacct_read_counter(u64 *val, int cpu)
+{
+ struct seqcount *s = &per_cpu(cpuacct_cgroup_seq, cpu);
+ unsigned int seq;
+ u64 data;
+
+ do {
+ seq = read_seqcount_begin(s);
+ data = *val;
+ } while (read_seqcount_retry(s, seq));
+ return data;
+}
+/* This is a special function called against "offline" cpus. */
+static inline void cpuacct_reset_offline_counter(u64 *val, int cpu)
+{
+ struct seqcount *s = &per_cpu(cpuacct_cgroup_seq, cpu);
+
+ preempt_disable();
+ write_seqcount_begin(s);
+ *val = 0;
+ write_seqcount_end(s);
+ preempt_enable();
+}
+#else
+static inline void cpuacct_start_counter_update(void)
+{
+}
+static inline void cpuacct_end_counter_update(void)
+{
+}
+static inline u64 cpuacct_read_counter(u64 *val, int cpu)
+{
+ return *val;
+}
+static inline void cpuacct_reset_offline_counter(u64 *val, int cpu)
+{
+ *val = 0;
+}
+#endif
+
/*
* CPU accounting code for task groups.
*
@@ -9596,6 +9657,11 @@ struct cpuacct {
struct cpuacct *parent;
};

+struct cpuacct_work {
+ struct work_struct work;
+ struct cpuacct *cpuacct;
+};
+
struct cgroup_subsys cpuacct_subsys;

/* return cpu accounting group corresponding to this container */
@@ -9643,39 +9709,29 @@ cpuacct_destroy(struct cgroup_subsys *ss
kfree(ca);
}

+/* In a 32bit environment, a seqcounter is used to read the 64bit value safely */
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
{
u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
u64 data;

-#ifndef CONFIG_64BIT
- /*
- * Take rq->lock to make 64-bit read safe on 32-bit platforms.
- */
- spin_lock_irq(&cpu_rq(cpu)->lock);
- data = *cpuusage;
- spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
- data = *cpuusage;
-#endif
+ data = cpuacct_read_counter(cpuusage, cpu);

return data;
}

-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+/* called by per-cpu workqueue */
+static void cpuacct_cpuusage_reset_cpu(struct work_struct *work)
{
+ struct cpuacct_work *cw = container_of(work, struct cpuacct_work, work);
+ struct cpuacct *ca = cw->cpuacct;
+ int cpu = get_cpu();
u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);

-#ifndef CONFIG_64BIT
- /*
- * Take rq->lock to make 64-bit write safe on 32-bit platforms.
- */
- spin_lock_irq(&cpu_rq(cpu)->lock);
- *cpuusage = val;
- spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
- *cpuusage = val;
-#endif
+ cpuacct_start_counter_update();
+ *cpuusage = 0;
+ cpuacct_end_counter_update();
+ put_cpu();
}

/* return total cpu usage (in nanoseconds) of a group */
@@ -9691,23 +9747,34 @@ static u64 cpuusage_read(struct cgroup *
return totalcpuusage;
}

-static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
- u64 reset)
+static int cpuacct_cpuusage_reset(struct cgroup *cgrp, unsigned int event)
{
struct cpuacct *ca = cgroup_ca(cgrp);
- int err = 0;
- int i;
-
- if (reset) {
- err = -EINVAL;
- goto out;
+ int cpu;
+ /*
+ * Reset All counters....doesn't need to be fast.
+ * "ca" will be stable while doing this. We are in write() syscall.
+ */
+ get_online_cpus();
+ /*
+ * Because we use alloc_percpu() for allocating counter, we have
+ * a counter per a possible cpu. Reset all online's by workqueue and
+ * reset offline cpu's directly.
+ */
+ for_each_possible_cpu(cpu) {
+ if (cpu_online(cpu)) {
+ struct cpuacct_work cw;
+ INIT_WORK(&cw.work, cpuacct_cpuusage_reset_cpu);
+ cw.cpuacct = ca;
+ schedule_work_on(cpu, &cw.work);
+ flush_work(&cw.work);
+ } else {
+ u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+ cpuacct_reset_offline_counter(cpuusage, cpu);
+ }
}
-
- for_each_present_cpu(i)
- cpuacct_cpuusage_write(ca, i, 0);
-
-out:
- return err;
+ put_online_cpus();
+ return 0;
}

static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
@@ -9729,7 +9796,7 @@ static struct cftype files[] = {
{
.name = "usage",
.read_u64 = cpuusage_read,
- .write_u64 = cpuusage_write,
+ .trigger = cpuacct_cpuusage_reset,
},
{
.name = "usage_percpu",
@@ -9756,10 +9823,12 @@ static void cpuacct_charge(struct task_s
cpu = task_cpu(tsk);
ca = task_ca(tsk);

+ cpuacct_start_counter_update();
for (; ca; ca = ca->parent) {
u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
*cpuusage += cputime;
}
+ cpuacct_end_counter_update();
}

struct cgroup_subsys cpuacct_subsys = {

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/