Re: Severe performance regression w/ 4.4+ on Android due to cgroup locking changes

From: Peter Zijlstra
Date: Thu Jul 14 2016 - 15:35:49 EST


On Thu, Jul 14, 2016 at 08:36:09PM +0200, Peter Zijlstra wrote:
> On Thu, Jul 14, 2016 at 08:09:52PM +0200, Oleg Nesterov wrote:
> > On 07/14, Peter Zijlstra wrote:
> > >
> > > The below is a compile tested only first draft so far. I'll go give it
> > > some runtime next.
> >
> > So I will wait for the new version, but at first glance this matches the
> > code I already reviewed in the past (at least, tried hard to review ;)
> > and it looks correct.
> >
> > Just a couple of almost cosmetic nits, feel free to ignore.
>
> Drat, I just mailed out the patches.. I can do a second version later.

How about so, on top of what I sent?
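
That is: fold __percpu_down_read_trylock() into __percpu_down_read() with a
@try argument, replace the readers_slow/readers_block enum with a plain
->readers_block flag, and do rcu_sync_enter() before down_write(). Condensed,
and only to spell out the try flow (the hunks below are what counts), the
combined slow path ends up as:

	int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
	{
		smp_mb();				/* A matches D */

		if (likely(!smp_load_acquire(&sem->readers_block)))
			return 1;			/* no writer pending */

		__percpu_up_read(sem);			/* undo our __this_cpu_inc() */

		if (try)
			return 0;			/* trylock fails instead of blocking */

		/* !try: wait for the writer on rw_sem, re-account the reader, return 1 */
		...
	}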


--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -13,11 +13,10 @@ struct percpu_rw_semaphore {
unsigned int __percpu *read_count;
struct rw_semaphore rw_sem;
wait_queue_head_t writer;
- int state;
+ int readers_block;
};

-extern void __percpu_down_read(struct percpu_rw_semaphore *);
-extern int __percpu_down_read_trylock(struct percpu_rw_semaphore *);
+extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
extern void __percpu_up_read(struct percpu_rw_semaphore *);

static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
@@ -37,7 +36,7 @@ static inline void percpu_down_read(stru
*/
__this_cpu_inc(*sem->read_count);
if (unlikely(!rcu_sync_is_idle(&sem->rss)))
- __percpu_down_read(sem); /* Unconditional memory barrier */
+ __percpu_down_read(sem, false); /* Unconditional memory barrier */
preempt_enable();
/*
* The barrier() from preempt_enable() prevents the compiler from
@@ -55,7 +54,7 @@ static inline int percpu_down_read_trylo
*/
__this_cpu_inc(*sem->read_count);
if (unlikely(!rcu_sync_is_idle(&sem->rss)))
- ret = __percpu_down_read_trylock(sem);
+ ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
preempt_enable();
/*
* The barrier() from preempt_enable() prevents the compiler from
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -8,8 +8,6 @@
#include <linux/sched.h>
#include <linux/errno.h>

-enum { readers_slow, readers_block };
-
int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
const char *name, struct lock_class_key *rwsem_key)
{
@@ -21,7 +19,7 @@ int __percpu_init_rwsem(struct percpu_rw
rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
__init_rwsem(&sem->rw_sem, name, rwsem_key);
init_waitqueue_head(&sem->writer);
- sem->state = readers_slow;
+ sem->readers_block = 0;
return 0;
}
EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
@@ -41,21 +39,21 @@ void percpu_free_rwsem(struct percpu_rw_
}
EXPORT_SYMBOL_GPL(percpu_free_rwsem);

-void __percpu_down_read(struct percpu_rw_semaphore *sem)
+int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
{
/*
* Due to having preemption disabled the decrement happens on
* the same CPU as the increment, avoiding the
* increment-on-one-CPU-and-decrement-on-another problem.
*
- * And yes, if the reader misses the writer's assignment of
- * readers_block to sem->state, then the writer is
- * guaranteed to see the reader's increment. Conversely, any
- * readers that increment their sem->read_count after the
- * writer looks are guaranteed to see the readers_block value,
+ * If the reader misses the writer's assignment of readers_block, then
+ * the writer is guaranteed to see the reader's increment.
+ *
+ * Conversely, any readers that increment their sem->read_count after
+ * the writer looks are guaranteed to see the readers_block value,
* which in turn means that they are guaranteed to immediately
- * decrement their sem->read_count, so that it doesn't matter
- * that the writer missed them.
+ * decrement their sem->read_count, so that it doesn't matter that the
+ * writer missed them.
*/

smp_mb(); /* A matches D */
@@ -64,8 +62,8 @@ void __percpu_down_read(struct percpu_rw
* If !readers_block the critical section starts here, matched by the
* release in percpu_up_write().
*/
- if (likely(smp_load_acquire(&sem->state) != readers_block))
- return;
+ if (likely(!smp_load_acquire(&sem->readers_block)))
+ return 1;

/*
* Per the above comment; we still have preemption disabled and
@@ -73,6 +71,9 @@ void __percpu_down_read(struct percpu_rw
*/
__percpu_up_read(sem);

+ if (try)
+ return 0;
+
/*
* We either call schedule() in the wait, or we'll fall through
* and reschedule on the preempt_enable() in percpu_down_read().
@@ -87,21 +88,10 @@ void __percpu_down_read(struct percpu_rw
__up_read(&sem->rw_sem);

preempt_disable();
+ return 1;
}
EXPORT_SYMBOL_GPL(__percpu_down_read);

-int __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
-{
- smp_mb(); /* A matches D */
-
- if (likely(smp_load_acquire(&sem->state) != readers_block))
- return 1;
-
- __percpu_up_read(sem);
- return 0;
-}
-EXPORT_SYMBOL_GPL(__percpu_down_read_trylock);
-
void __percpu_up_read(struct percpu_rw_semaphore *sem)
{
smp_mb(); /* B matches C */
@@ -150,23 +140,23 @@ static bool readers_active_check(struct

void percpu_down_write(struct percpu_rw_semaphore *sem)
{
- down_write(&sem->rw_sem);
-
/* Notify readers to take the slow path. */
rcu_sync_enter(&sem->rss);

+ down_write(&sem->rw_sem);
+
/*
* Notify new readers to block; up until now, and thus throughout the
* longish rcu_sync_enter() above, new readers could still come in.
*/
- WRITE_ONCE(sem->state, readers_block);
+ WRITE_ONCE(sem->readers_block, 1);

smp_mb(); /* D matches A */

/*
- * If they don't see our writer of readers_block to sem->state,
- * then we are guaranteed to see their sem->read_count increment, and
- * therefore will wait for them.
+ * If they don't see our write of readers_block, then we are
+ * guaranteed to see their sem->read_count increment, and therefore
+ * will wait for them.
*/

/* Wait for all now active readers to complete. */
@@ -186,7 +176,7 @@ void percpu_up_write(struct percpu_rw_se
* Therefore we force it through the slow path which guarantees an
* acquire and thereby guarantees the critical section's consistency.
*/
- smp_store_release(&sem->state, readers_slow);
+ smp_store_release(&sem->readers_block, 0);

/*
* Release the write lock, this will allow readers back in the game.
@@ -194,9 +184,9 @@ void percpu_up_write(struct percpu_rw_se
up_write(&sem->rw_sem);

/*
- * Once this completes (at least one RCU grace period hence) the reader
- * fast path will be available again. Safe to use outside the exclusive
- * write lock because its counting.
+ * Once this completes (at least one RCU-sched grace period hence) the
+ * reader fast path will be available again. Safe to use outside the
+ * exclusive write lock because it's counting.
*/
rcu_sync_exit(&sem->rss);
}
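
FWIW, the A/D pairing is just the usual store-buffering shape; reduced to a
litmus sketch (illustrative only, with per_cpu_sum() standing in for the
writer's readers_active_check()):

	reader (__percpu_down_read):
		__this_cpu_inc(*sem->read_count);
		smp_mb();					/* A */
		r0 = smp_load_acquire(&sem->readers_block);

	writer (percpu_down_write):
		WRITE_ONCE(sem->readers_block, 1);
		smp_mb();					/* D */
		r1 = per_cpu_sum(*sem->read_count);

	forbidden outcome: r0 == 0 && r1 == 0

Either the reader observes readers_block set and backs out onto the slow
path, or the writer observes the reader's increment and waits for it.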