Re: question about RCU dynticks_nesting

From: Paul E. McKenney
Date: Wed May 06 2015 - 02:06:45 EST


On Wed, May 06, 2015 at 05:44:54AM +0200, Mike Galbraith wrote:
> On Wed, 2015-05-06 at 03:49 +0200, Mike Galbraith wrote:
> > On Mon, 2015-05-04 at 22:54 -0700, Paul E. McKenney wrote:
> >
> > > You have RCU_FAST_NO_HZ=y, correct? Could you please try measuring with
> > > RCU_FAST_NO_HZ=n?
> >
> > FWIW, the syscall numbers I posted were RCU_FAST_NO_HZ=n. (I didn't
> > profile to see where costs lie though)
>
> (did that)

Nice, thank you!!!

> 100000000 * stat() on isolated cpu
>
> NO_HZ_FULL off inactive housekeeper nohz_full
> real 0m14.266s 0m14.367s 0m20.427s 0m27.921s
> user 0m1.756s 0m1.553s 0m1.976s 0m10.447s
> sys 0m12.508s 0m12.769s 0m18.400s 0m17.464s
> (real) 1.000 1.007 1.431 1.957
>
> inactive housekeeper nohz_full
> ----------------------------------------------------------------------------------------------------------------------------------------------
> 7.61% [.] __xstat64 11.12% [k] context_tracking_exit 7.41% [k] context_tracking_exit
> 7.04% [k] system_call 6.18% [k] context_tracking_enter 6.02% [k] native_sched_clock
> 6.96% [k] copy_user_enhanced_fast_string 5.18% [.] __xstat64 4.69% [k] rcu_eqs_enter_common.isra.37
> 6.57% [k] path_init 4.89% [k] system_call 4.35% [k] _raw_spin_lock
> 5.92% [k] system_call_after_swapgs 4.84% [k] copy_user_enhanced_fast_string 4.30% [k] context_tracking_enter
> 5.44% [k] lockref_put_return 4.46% [k] path_init 4.25% [k] kmem_cache_alloc
> 4.69% [k] link_path_walk 4.30% [k] system_call_after_swapgs 4.14% [.] __xstat64
> 4.47% [k] lockref_get_not_dead 4.12% [k] kmem_cache_free 3.89% [k] rcu_eqs_exit_common.isra.38
> 4.46% [k] kmem_cache_free 3.78% [k] link_path_walk 3.50% [k] system_call
> 4.20% [k] kmem_cache_alloc 3.62% [k] lockref_put_return 3.48% [k] copy_user_enhanced_fast_string
> 4.09% [k] cp_new_stat 3.43% [k] kmem_cache_alloc 3.02% [k] system_call_after_swapgs
> 3.38% [k] vfs_getattr_nosec 2.95% [k] lockref_get_not_dead 2.97% [k] kmem_cache_free
> 2.82% [k] vfs_fstatat 2.87% [k] cp_new_stat 2.88% [k] lockref_put_return
> 2.60% [k] user_path_at_empty 2.62% [k] syscall_trace_leave 2.61% [k] link_path_walk
> 2.47% [k] path_lookupat 1.91% [k] vfs_getattr_nosec 2.58% [k] path_init
> 2.14% [k] strncpy_from_user 1.89% [k] syscall_trace_enter_phase1 2.15% [k] lockref_get_not_dead
> 2.11% [k] getname_flags 1.77% [k] path_lookupat 2.04% [k] cp_new_stat
> 2.10% [k] generic_fillattr 1.67% [k] complete_walk 1.89% [k] generic_fillattr
> 2.05% [.] main 1.65% [k] vfs_fstatat 1.67% [k] syscall_trace_leave
> 1.89% [k] complete_walk 1.56% [k] generic_fillattr 1.59% [k] vfs_getattr_nosec
> 1.73% [k] generic_permission 1.55% [k] user_path_at_empty 1.49% [k] get_vtime_delta
> 1.50% [k] system_call_fastpath 1.54% [k] strncpy_from_user 1.32% [k] user_path_at_empty
> 1.37% [k] legitimize_mnt 1.53% [k] getname_flags 1.30% [k] syscall_trace_enter_phase1
> 1.30% [k] dput 1.46% [k] legitimize_mnt 1.21% [k] rcu_eqs_exit
> 1.26% [k] putname 1.34% [.] main 1.21% [k] vfs_fstatat
> 1.19% [k] path_put 1.32% [k] int_with_check 1.18% [k] path_lookupat
> 1.18% [k] filename_lookup 1.28% [k] generic_permission 1.15% [k] getname_flags
> 1.01% [k] SYSC_newstat 1.16% [k] int_very_careful 1.03% [k] strncpy_from_user
> 0.96% [k] mntput_no_expire 1.04% [k] putname 1.01% [k] account_system_time
> 0.79% [k] path_cleanup 0.94% [k] dput 1.00% [k] complete_walk
> 0.79% [k] mntput 0.91% [k] context_tracking_user_exit 0.99% [k] vtime_account_user
>

So we have rcu_eqs_enter_common(4.69%), rcu_eqs_exit_common(3.89%),
and rcu_eqs_exit(1.21%). Interesting that rcu_eqs_exit() appears at all,
given that it just does very simple operations, and rcu_eqs_exit_common()
is apparently not inlined (OK, perhaps it is partially inlined?).
This suggests that there are useful gains to be had via simple changes,
for exmaple, placing the warnings behind CONFIG_NO_HZ_DEBUG or some such.
Or not, as this overhead might instead be due to cache misses on first
access to the relevant data structures. But worth a try, perhaps.

Does the attached patch help at all? If so, there might be some similar
gains to be had.

Thanx, Paul

------------------------------------------------------------------------

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 4e6902005228..3f09e5abb7b0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -616,7 +616,8 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);

trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
- if (!user && !is_idle_task(current)) {
+ if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ !user && !is_idle_task(current)) {
struct task_struct *idle __maybe_unused =
idle_task(smp_processor_id());

@@ -635,7 +636,8 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
smp_mb__before_atomic(); /* See above. */
atomic_inc(&rdtp->dynticks);
smp_mb__after_atomic(); /* Force ordering with next sojourn. */
- WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ atomic_read(&rdtp->dynticks) & 0x1);
rcu_dynticks_task_enter();

/*
@@ -661,7 +663,8 @@ static void rcu_eqs_enter(bool user)

rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
- WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ (oldval & DYNTICK_TASK_NEST_MASK) == 0);
if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
rdtp->dynticks_nesting = 0;
rcu_eqs_enter_common(oldval, user);
@@ -734,7 +737,8 @@ void rcu_irq_exit(void)
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting--;
- WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ rdtp->dynticks_nesting < 0);
if (rdtp->dynticks_nesting)
trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
else
@@ -759,10 +763,12 @@ static void rcu_eqs_exit_common(long long oldval, int user)
atomic_inc(&rdtp->dynticks);
/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
smp_mb__after_atomic(); /* See above. */
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ !(atomic_read(&rdtp->dynticks) & 0x1));
rcu_cleanup_after_idle();
trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
- if (!user && !is_idle_task(current)) {
+ if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ !user && !is_idle_task(current)) {
struct task_struct *idle __maybe_unused =
idle_task(smp_processor_id());

@@ -786,7 +792,7 @@ static void rcu_eqs_exit(bool user)

rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
- WARN_ON_ONCE(oldval < 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
if (oldval & DYNTICK_TASK_NEST_MASK) {
rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
} else {
@@ -859,7 +865,8 @@ void rcu_irq_enter(void)
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting++;
- WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ rdtp->dynticks_nesting == 0);
if (oldval)
trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
else
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c4e1cf04cf57..b908048f8d6a 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1373,6 +1373,17 @@ config RCU_TRACE
Say Y here if you want to enable RCU tracing
Say N if you are unsure.

+config RCU_EQS_DEBUG
+ bool "Use this when adding any sort of NO_HZ support to your arch"
+ depends on DEBUG_KERNEL
+ help
+ This option provides consistency checks in RCU's handling of
+ NO_HZ. These checks have proven quite helpful in detecting
+ bugs in arch-specific NO_HZ code.
+
+ Say N here if you need ultimate kernel/user switch latencies
+ Say Y if you are unsure
+
endmenu # "RCU Debugging"

config DEBUG_BLOCK_EXT_DEVT

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/