[git pull] scheduler and futex updates

From: Ingo Molnar
Date: Fri Feb 01 2008 - 11:58:56 EST



Linus, please pull the latest scheduler git tree from:

git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched.git

Fixes, plus the futex "bitset filter" conditional futex wakeup API
variant that Ulrich asked for to reduce the number of false wakeups in
user-space rwlocks. Thanks,

Ingo

------------------>
Heiko Carstens (1):
latencytop: Change Kconfig dependency.

Peter Zijlstra (1):
hrtimer: fix hrtimer_init_sleeper() users

Thomas Gleixner (5):
timekeeping: update xtime_cache when time(zone) changes
tick-sched: add more debug information
x86: replace LOCK_PREFIX in futex.h
futex: Remove warn on in return fixup path
futex: Add bitset conditional wait/wakeup functionality

arch/x86/Kconfig | 3 +++
include/asm-x86/futex.h | 6 +++---
include/linux/futex.h | 10 ++++++++++
include/linux/thread_info.h | 1 +
include/linux/tick.h | 4 ++++
include/linux/time.h | 1 +
kernel/futex.c | 42 ++++++++++++++++++++++++++++++++----------
kernel/futex_compat.c | 3 ++-
kernel/hrtimer.c | 2 ++
kernel/time.c | 1 +
kernel/time/tick-sched.c | 2 ++
kernel/time/timekeeping.c | 6 ++++--
kernel/time/timer_list.c | 2 ++
lib/Kconfig.debug | 2 +-
14 files changed, 68 insertions(+), 17 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 65b4491..93e6667 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -44,6 +44,9 @@ config LOCKDEP_SUPPORT
config STACKTRACE_SUPPORT
def_bool y

+config HAVE_LATENCYTOP_SUPPORT
+ def_bool y
+
config SEMAPHORE_SLEEPERS
def_bool y

diff --git a/include/asm-x86/futex.h b/include/asm-x86/futex.h
index 62828d6..9d91926 100644
--- a/include/asm-x86/futex.h
+++ b/include/asm-x86/futex.h
@@ -30,7 +30,7 @@
"1: movl %2, %0\n \
movl %0, %3\n" \
insn "\n" \
-"2: " LOCK_PREFIX "cmpxchgl %3, %2\n \
+"2: lock; cmpxchgl %3, %2\n \
jnz 1b\n \
3: .section .fixup,\"ax\"\n \
4: mov %5, %1\n \
@@ -72,7 +72,7 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
__futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
break;
case FUTEX_OP_ADD:
- __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, oldval,
+ __futex_atomic_op1("lock; xaddl %0, %2", ret, oldval,
uaddr, oparg);
break;
case FUTEX_OP_OR:
@@ -111,8 +111,8 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
return -EFAULT;

__asm__ __volatile__(
- "1: " LOCK_PREFIX "cmpxchgl %3, %1 \n"

+ "1: lock; cmpxchgl %3, %1 \n"
"2: .section .fixup, \"ax\" \n"
"3: mov %2, %0 \n"
" jmp 2b \n"
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 1a15f8e..90048fb 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -21,6 +21,8 @@ union ktime;
#define FUTEX_LOCK_PI 6
#define FUTEX_UNLOCK_PI 7
#define FUTEX_TRYLOCK_PI 8
+#define FUTEX_WAIT_BITSET 9
+#define FUTEX_WAKE_BITSET 10

#define FUTEX_PRIVATE_FLAG 128
#define FUTEX_CMD_MASK ~FUTEX_PRIVATE_FLAG
@@ -33,6 +35,8 @@ union ktime;
#define FUTEX_LOCK_PI_PRIVATE (FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG)
#define FUTEX_UNLOCK_PI_PRIVATE (FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG)
#define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG)
+#define FUTEX_WAIT_BITSET_PRIVATE (FUTEX_WAIT_BITS | FUTEX_PRIVATE_FLAG)
+#define FUTEX_WAKE_BITSET_PRIVATE (FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG)

/*
* Support for robust futexes: the kernel cleans up held futexes at
@@ -111,6 +115,12 @@ struct robust_list_head {
*/
#define ROBUST_LIST_LIMIT 2048

+/*
+ * bitset with all bits set for the FUTEX_xxx_BITSET OPs to request a
+ * match of any bit.
+ */
+#define FUTEX_BITSET_MATCH_ANY 0xffffffff
+
#ifdef __KERNEL__
long do_futex(u32 __user *uaddr, int op, u32 val, union ktime *timeout,
u32 __user *uaddr2, u32 val2, u32 val3);
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index dfbdfb9..421323e 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -23,6 +23,7 @@ struct restart_block {
u32 *uaddr;
u32 val;
u32 flags;
+ u32 bitset;
u64 time;
} futex;
};
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 0fadf95..a881c65 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -39,6 +39,8 @@ enum tick_nohz_mode {
* @idle_calls: Total number of idle calls
* @idle_sleeps: Number of idle calls, where the sched tick was stopped
* @idle_entrytime: Time when the idle call was entered
+ * @idle_waketime: Time when the idle was interrupted
+ * @idle_exittime: Time when the idle state was left
* @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
* @sleep_length: Duration of the current idle sleep
*/
@@ -53,6 +55,8 @@ struct tick_sched {
unsigned long idle_sleeps;
int idle_active;
ktime_t idle_entrytime;
+ ktime_t idle_waketime;
+ ktime_t idle_exittime;
ktime_t idle_sleeptime;
ktime_t idle_lastupdate;
ktime_t sleep_length;
diff --git a/include/linux/time.h b/include/linux/time.h
index b04136d..ceaab9f 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -122,6 +122,7 @@ extern void monotonic_to_bootbased(struct timespec *ts);
extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
extern int timekeeping_is_continuous(void);
extern void update_wall_time(void);
+extern void update_xtime_cache(u64 nsec);

/**
* timespec_to_ns - Convert timespec to nanoseconds
diff --git a/kernel/futex.c b/kernel/futex.c
index db9824d..a6baaec 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -109,6 +109,9 @@ struct futex_q {
/* Optional priority inheritance state: */
struct futex_pi_state *pi_state;
struct task_struct *task;
+
+ /* Bitset for the optional bitmasked wakeup */
+ u32 bitset;
};

/*
@@ -722,7 +725,7 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
* to this virtual address:
*/
static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
- int nr_wake)
+ int nr_wake, u32 bitset)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
@@ -730,6 +733,9 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
union futex_key key;
int ret;

+ if (!bitset)
+ return -EINVAL;
+
futex_lock_mm(fshared);

ret = get_futex_key(uaddr, fshared, &key);
@@ -746,6 +752,11 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
ret = -EINVAL;
break;
}
+
+ /* Check if one of the bits is set in both bitsets */
+ if (!(this->bitset & bitset))
+ continue;
+
wake_futex(this);
if (++ret >= nr_wake)
break;
@@ -1156,7 +1167,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
static long futex_wait_restart(struct restart_block *restart);

static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
- u32 val, ktime_t *abs_time)
+ u32 val, ktime_t *abs_time, u32 bitset)
{
struct task_struct *curr = current;
DECLARE_WAITQUEUE(wait, curr);
@@ -1167,7 +1178,11 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
struct hrtimer_sleeper t;
int rem = 0;

+ if (!bitset)
+ return -EINVAL;
+
q.pi_state = NULL;
+ q.bitset = bitset;
retry:
futex_lock_mm(fshared);

@@ -1252,6 +1267,8 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
t.timer.expires = *abs_time;

hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);
+ if (!hrtimer_active(&t.timer))
+ t.task = NULL;

/*
* the timer could have already expired, in which
@@ -1293,6 +1310,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
restart->futex.uaddr = (u32 *)uaddr;
restart->futex.val = val;
restart->futex.time = abs_time->tv64;
+ restart->futex.bitset = bitset;
restart->futex.flags = 0;

if (fshared)
@@ -1319,7 +1337,8 @@ static long futex_wait_restart(struct restart_block *restart)
restart->fn = do_no_restart_syscall;
if (restart->futex.flags & FLAGS_SHARED)
fshared = &current->mm->mmap_sem;
- return (long)futex_wait(uaddr, fshared, restart->futex.val, &t);
+ return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
+ restart->futex.bitset);
}


@@ -1535,9 +1554,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
owner = rt_mutex_owner(&q.pi_state->pi_mutex);
res = fixup_pi_state_owner(uaddr, &q, owner);

- WARN_ON(rt_mutex_owner(&q.pi_state->pi_mutex) !=
- owner);
-
/* propagate -EFAULT, if the fixup failed */
if (res)
ret = res;
@@ -1943,7 +1959,8 @@ retry:
* PI futexes happens in exit_pi_state():
*/
if (!pi && (uval & FUTEX_WAITERS))
- futex_wake(uaddr, &curr->mm->mmap_sem, 1);
+ futex_wake(uaddr, &curr->mm->mmap_sem, 1,
+ FUTEX_BITSET_MATCH_ANY);
}
return 0;
}
@@ -2043,10 +2060,14 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,

switch (cmd) {
case FUTEX_WAIT:
- ret = futex_wait(uaddr, fshared, val, timeout);
+ val3 = FUTEX_BITSET_MATCH_ANY;
+ case FUTEX_WAIT_BITSET:
+ ret = futex_wait(uaddr, fshared, val, timeout, val3);
break;
case FUTEX_WAKE:
- ret = futex_wake(uaddr, fshared, val);
+ val3 = FUTEX_BITSET_MATCH_ANY;
+ case FUTEX_WAKE_BITSET:
+ ret = futex_wake(uaddr, fshared, val, val3);
break;
case FUTEX_FD:
/* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
@@ -2086,7 +2107,8 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
u32 val2 = 0;
int cmd = op & FUTEX_CMD_MASK;

- if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
+ if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
+ cmd == FUTEX_WAIT_BITSET)) {
if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
return -EFAULT;
if (!timespec_valid(&ts))
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 0a43def..133d558 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -167,7 +167,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
int val2 = 0;
int cmd = op & FUTEX_CMD_MASK;

- if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
+ if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
+ cmd == FUTEX_WAIT_BITSET)) {
if (get_compat_timespec(&ts, utime))
return -EFAULT;
if (!timespec_valid(&ts))
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index bd5d6b5..1069998 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1315,6 +1315,8 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod

} while (t->task && !signal_pending(current));

+ __set_current_state(TASK_RUNNING);
+
return t->task == NULL;
}

diff --git a/kernel/time.c b/kernel/time.c
index 09d3c45..4064c05 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -129,6 +129,7 @@ static inline void warp_clock(void)
write_seqlock_irq(&xtime_lock);
wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
xtime.tv_sec += sys_tz.tz_minuteswest * 60;
+ update_xtime_cache(0);
write_sequnlock_irq(&xtime_lock);
clock_was_set();
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 63f24b5..88267f0 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -137,6 +137,7 @@ void tick_nohz_update_jiffies(void)

cpu_clear(cpu, nohz_cpu_mask);
now = ktime_get();
+ ts->idle_waketime = now;

local_irq_save(flags);
tick_do_update_jiffies64(now);
@@ -400,6 +401,7 @@ void tick_nohz_restart_sched_tick(void)
* Cancel the scheduled timer and restore the tick
*/
ts->tick_stopped = 0;
+ ts->idle_exittime = now;
hrtimer_cancel(&ts->sched_timer);
ts->sched_timer.expires = ts->idle_tick;

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 092a236..cd5dbc4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -47,7 +47,7 @@ struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
static unsigned long total_sleep_time; /* seconds */

static struct timespec xtime_cache __attribute__ ((aligned (16)));
-static inline void update_xtime_cache(u64 nsec)
+void update_xtime_cache(u64 nsec)
{
xtime_cache = xtime;
timespec_add_ns(&xtime_cache, nsec);
@@ -145,6 +145,7 @@ int do_settimeofday(struct timespec *tv)

set_normalized_timespec(&xtime, sec, nsec);
set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+ update_xtime_cache(0);

clock->error = 0;
ntp_clear();
@@ -252,8 +253,8 @@ void __init timekeeping_init(void)
xtime.tv_nsec = 0;
set_normalized_timespec(&wall_to_monotonic,
-xtime.tv_sec, -xtime.tv_nsec);
+ update_xtime_cache(0);
total_sleep_time = 0;
-
write_sequnlock_irqrestore(&xtime_lock, flags);
}

@@ -290,6 +291,7 @@ static int timekeeping_resume(struct sys_device *dev)
}
/* Make sure that we have the correct xtime reference */
timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
+ update_xtime_cache(0);
/* re-base the last cycle value */
clock->cycle_last = clocksource_read(clock);
clock->error = 0;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 12c5f4c..d3d94c1 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -166,6 +166,8 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
P(idle_calls);
P(idle_sleeps);
P_ns(idle_entrytime);
+ P_ns(idle_waketime);
+ P_ns(idle_exittime);
P_ns(idle_sleeptime);
P(last_jiffies);
P(next_jiffies);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 89f4035..0d8a5a4 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -581,7 +581,7 @@ config LATENCYTOP
select STACKTRACE
select SCHEDSTATS
select SCHED_DEBUG
- depends on X86 || X86_64
+ depends on HAVE_LATENCYTOP_SUPPORT
help
Enable this option if you want to use the LatencyTOP tool
to find out which userspace is blocking on what kernel operations.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/