Re: [patch V3] percpu_counter: scalability works

From: Shaohua Li
Date: Mon May 16 2011 - 02:37:22 EST

Next message: Weiping Pan: "[PATCH] platform/x86:delete two unused variables"
Previous message: Ingo Molnar: "Re: Possible sandybridge livelock issue"
In reply to: Eric Dumazet: "Re: [patch V3] percpu_counter: scalability works"
Next in thread: Eric Dumazet: "Re: [patch V3] percpu_counter: scalability works"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

On Mon, 2011-05-16 at 14:11 +0800, Eric Dumazet wrote:
> Le lundi 16 mai 2011 à 08:58 +0800, Shaohua Li a écrit :
>
> > so if _sum starts and ends here, _sum can still get deviation.
>
> This makes no sense at all. If you have so many cpus 'here' right before
> you increment fbc->sum_cnt, then no matter how precise and super
> cautious you are in your _sum() implementation, as soon as you exit from
> sum(), other cpus already changed the percpu counter global value.
I don't agree here. The original implementation also has only a very
small window in which a deviation can occur; that window exists only
between these two lines:
atomic64_add(count, &fbc->count);
__this_cpu_write(*fbc->counters, 0);
If you think we should ignore it, then we had better not use any
protection here at all.

> > @@ -76,10 +74,20 @@ void __percpu_counter_add(struct percpu_
> > preempt_disable();
> > count = __this_cpu_read(*fbc->counters) + amount;
> > if (count >= batch || count <= -batch) {
> > - spin_lock(&fbc->lock);
> > - fbc->count += count;
> > + while (1) {
> > + atomic_inc_return(&fbc->add_start);
> > + if (atomic_read(&fbc->sum_start) != 0)
> > + atomic_dec(&fbc->add_start);
> > + else
> > + break;
> > + while (atomic_read(&fbc->sum_start) != 0)
> > + cpu_relax();
> > + }
> > +
> > + atomic64_add(count, &fbc->count);
> > __this_cpu_write(*fbc->counters, 0);
> > - spin_unlock(&fbc->lock);
> > +
> > + atomic_dec(&fbc->add_start);
> > } else {
> > __this_cpu_write(*fbc->counters, count);
> > }
> >
>
> This is way too heavy. You have 3 atomic ops here and a very slow
> atomic_inc_return() in fast path [ not all machines are x86].
>
> Not all percpu_counters are used in degenerated way. Most of them hit
> the global count not very often.
>
> Your version slows down a very common case (one cpu only calling _add()
> several times, for example network stack in input path)
>
> fbc->counters being in same cache line than fbc->add_start/sum_start and
> all, I bet everything will be very slow during a _sum() on a 4096 cpu
> machine, especially if this _sum() is interrupted by some long lasting
> interrupt.
As I wrote in the earlier email, the atomic and cache line issues can be
resolved with per-CPU data; I just hadn't posted the patch. I am posting it
this time, please see below. There is no cache line bouncing anymore.

> I believe the 'deviation' risk is almost null with my patch.
> Remember percpu_counter is not an exact counter but a very lazy one.
> (Only requirement is to not have drift)
>
> The risk is small especially if we move the :
> __this_cpu_write(*fbc->counters, 0);
> before the :
> atomic64_add(count, &fbc->count);
>
> and then do the sequence increment _after_ this.
>
>
>
> Here is my V4 : We dont need the second fbc->slowcount, given sum() get
> fbc->count after the folding, not before : If some cpus enter _add()
> while _sum() is running they'll seem sum_cnt signal and change
> fbc->count immediately.
>
> I also make following sequence in _add() :
>
> __this_cpu_write(*fbc->counters, 0);
We still have the deviation issue if _sum starts and ends here. This
doesn't change anything.

> atomic64_add(count, &pcrw->count);
> pcrw->sequence++;
>
>
> include/linux/percpu_counter.h | 25 +++++++--
> lib/percpu_counter.c | 78 ++++++++++++++++++++-----------
> 2 files changed, 70 insertions(+), 33 deletions(-)
>
> diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
> index 46f6ba5..e3e62b1 100644
> --- a/include/linux/percpu_counter.h
> +++ b/include/linux/percpu_counter.h
> @@ -15,13 +15,24 @@
>
> #ifdef CONFIG_SMP
>
> -struct percpu_counter {
> - spinlock_t lock;
> - s64 count;
> +/*
> + * For performance reasons, we keep this part in a separate cache line
> + */
> +struct percpu_counter_rw {
> + atomic64_t count;
> + unsigned int sequence;
> +
> + /* since we have plenty room, store list here, even if never used */
> #ifdef CONFIG_HOTPLUG_CPU
> struct list_head list; /* All percpu_counters are on a list */
> + struct percpu_counter *fbc;
> #endif
> - s32 __percpu *counters;
> +} ____cacheline_aligned_in_smp;
> +
> +struct percpu_counter {
> + atomic_t sum_cnt; /* count of in flight sum() */
> + struct percpu_counter_rw *pcrw;
> + s32 __percpu *counters;
> };
>
> extern int percpu_counter_batch;
> @@ -60,7 +71,9 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
>
> static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> {
> - return fbc->count;
> + struct percpu_counter_rw *pcrw = fbc->pcrw;
> +
> + return atomic64_read(&pcrw->count);
> }
>
> /*
> @@ -70,7 +83,7 @@ static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> */
> static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
> {
> - s64 ret = fbc->count;
> + s64 ret = percpu_counter_read(fbc);
>
> barrier(); /* Prevent reloads of fbc->count */
> if (ret >= 0)
> diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
> index 28f2c33..27292ba 100644
> --- a/lib/percpu_counter.c
> +++ b/lib/percpu_counter.c
> @@ -9,6 +9,7 @@
> #include <linux/cpu.h>
> #include <linux/module.h>
> #include <linux/debugobjects.h>
> +#include <linux/slab.h>
>
> static LIST_HEAD(percpu_counters);
> static DEFINE_MUTEX(percpu_counters_lock);
> @@ -58,28 +59,32 @@ static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
> void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
> {
> int cpu;
> + struct percpu_counter_rw *pcrw = fbc->pcrw;
>
> - spin_lock(&fbc->lock);
> for_each_possible_cpu(cpu) {
> s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
> *pcount = 0;
> }
> - fbc->count = amount;
> - spin_unlock(&fbc->lock);
> + atomic64_set(&pcrw->count, amount);
> }
> EXPORT_SYMBOL(percpu_counter_set);
>
> void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
> {
> s64 count;
> + struct percpu_counter_rw *pcrw = fbc->pcrw;
> +
> + if (atomic_read(&fbc->sum_cnt)) {
> + atomic64_add(amount, &pcrw->count);
> + return;
> + }
>
> preempt_disable();
> count = __this_cpu_read(*fbc->counters) + amount;
> if (count >= batch || count <= -batch) {
> - spin_lock(&fbc->lock);
> - fbc->count += count;
> __this_cpu_write(*fbc->counters, 0);
> - spin_unlock(&fbc->lock);
> + atomic64_add(count, &pcrw->count);
An smp_wmb() or atomic64_add_return() is needed here to guarantee that
the changes are seen before the sequence++.

> + pcrw->sequence++;
sequence++ can introduce cache line bouncing.

add_start causes a lot of cache line bouncing because it is updated by all
CPUs. We can make it a per-CPU variable instead, which completely
eliminates the cache line bouncing.
With this patch plus the previous patch, the workload described in the
previous patch runs about 7x faster. With only the previous patch, it
is only about 4x faster.
This doesn't slow down _sum, because we removed the lock for _sum. I did
a stress test: 23 CPUs run _add and one CPU runs _sum. When _add takes the
fast path (no lock held), _sum runs a little slower (about 20% slower).
When _add takes the slow path (lock held), _sum runs much faster (about
9x faster).

Signed-off-by: Shaohua Li <shaohua.li@xxxxxxxxx>
---
include/linux/percpu_counter.h | 3 ++-
lib/percpu_counter.c | 22 ++++++++++++++++------
2 files changed, 18 insertions(+), 7 deletions(-)

Index: linux/include/linux/percpu_counter.h
===================================================================
--- linux.orig/include/linux/percpu_counter.h 2011-05-16 10:26:05.000000000 +0800
+++ linux/include/linux/percpu_counter.h 2011-05-16 10:27:48.000000000 +0800
@@ -16,12 +16,13 @@
#ifdef CONFIG_SMP

struct percpu_counter {
- atomic_t sum_start, add_start;
+ atomic_t sum_start;
atomic64_t count;
#ifdef CONFIG_HOTPLUG_CPU
struct list_head list; /* All percpu_counters are on a list */
#endif
s32 __percpu *counters;
+ char __percpu *add_starts;
};

extern int percpu_counter_batch;
Index: linux/lib/percpu_counter.c
===================================================================
--- linux.orig/lib/percpu_counter.c 2011-05-16 10:26:58.000000000 +0800
+++ linux/lib/percpu_counter.c 2011-05-16 10:46:12.000000000 +0800
@@ -75,10 +75,12 @@ void __percpu_counter_add(struct percpu_
count = __this_cpu_read(*fbc->counters) + amount;
if (count >= batch || count <= -batch) {
while (1) {
- atomic_inc_return(&fbc->add_start);
+ __this_cpu_write(*fbc->add_starts, 1);
+ /* Guarantee add_starts is seen by _sum */
+ smp_wmb();
if (atomic_read(&fbc->sum_start) == 0)
break;
- atomic_dec(&fbc->add_start);
+ __this_cpu_write(*fbc->add_starts, 0);
while (atomic_read(&fbc->sum_start) != 0)
cpu_relax();
}
@@ -86,7 +88,7 @@ void __percpu_counter_add(struct percpu_
atomic64_add(count, &fbc->count);
__this_cpu_write(*fbc->counters, 0);

- atomic_dec(&fbc->add_start);
+ __this_cpu_write(*fbc->add_starts, 0);
} else {
__this_cpu_write(*fbc->counters, count);
}
@@ -104,8 +106,10 @@ s64 __percpu_counter_sum(struct percpu_c
int cpu;

atomic_inc_return(&fbc->sum_start);
- while (atomic_read(&fbc->add_start) != 0)
- cpu_relax();
+ for_each_online_cpu(cpu) {
+ while (*per_cpu_ptr(fbc->add_starts, cpu) != 0)
+ cpu_relax();
+ }

ret = atomic64_read(&fbc->count);
for_each_online_cpu(cpu) {
@@ -122,10 +126,15 @@ int percpu_counter_init(struct percpu_co
{
atomic64_set(&fbc->count, amount);
atomic_set(&fbc->sum_start, 0);
- atomic_set(&fbc->add_start, 0);
fbc->counters = alloc_percpu(s32);
if (!fbc->counters)
return -ENOMEM;
+ fbc->add_starts = alloc_percpu(char);
+ if (!fbc->add_starts) {
+ free_percpu(fbc->counters);
+ return -ENOMEM;
+ }
+

debug_percpu_counter_activate(fbc);

@@ -152,6 +161,7 @@ void percpu_counter_destroy(struct percp
mutex_unlock(&percpu_counters_lock);
#endif
free_percpu(fbc->counters);
+ free_percpu(fbc->add_starts);
fbc->counters = NULL;
}
EXPORT_SYMBOL(percpu_counter_destroy);

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: Weiping Pan: "[PATCH] platform/x86:delete two unused variables"
Previous message: Ingo Molnar: "Re: Possible sandybridge livelock issue"
In reply to: Eric Dumazet: "Re: [patch V3] percpu_counter: scalability works"
Next in thread: Eric Dumazet: "Re: [patch V3] percpu_counter: scalability works"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]