[PATCH] sched: Add SCHED_BGND (background) scheduling policy
From: Peter Williams
Date: Tue Jul 04 2006 - 19:38:15 EST
Problem:

There is a genuine need for the ability to put tasks in the background
(a la the SCHED_IDLEPRIO policy in Con Kolivas's -sc kernels), as
evidenced by comments on LKML expressing a desire for SCHED_BATCH tasks
to run completely in the background.

Solution:

One option would have been to modify SCHED_BATCH so that tasks with that
policy run completely in the background, but there is also a genuine need
for a non-background batch policy, so the solution adopted is to
implement a new policy, SCHED_BGND.
SCHED_BATCH means that a task is a normal process and should get a fair
share of the CPU in accordance with its "nice" setting, but it is NOT an
interactive task and should NOT receive any of the special treatment
given to a task that is judged to be interactive. In particular, it
should always be moved to the expired array at the end of its time
slice, as doing otherwise might starve other tasks of CPU.

SCHED_BGND means that a task is totally unimportant and should only be
given the CPU if no one else wants it, OR if withholding the CPU could
lead to priority inversion or starvation of other tasks because this
task is holding system resources.
Signed-off-by: Peter Williams <pwil3058@xxxxxxxxxxxxxx>
---
 include/linux/init_task.h |    6 -
 include/linux/sched.h     |   11 ++
 kernel/fork.c             |    1
 kernel/mutex.c            |   28 ++++++-
 kernel/sched.c            |  183 ++++++++++++++++++++++++++++++++++++++--------
 5 files changed, 192 insertions(+), 37 deletions(-)
Index: MM-2.6.17-mm6/include/linux/init_task.h
===================================================================
--- MM-2.6.17-mm6.orig/include/linux/init_task.h 2006-07-04 14:37:42.000000000 +1000
+++ MM-2.6.17-mm6/include/linux/init_task.h 2006-07-04 14:38:12.000000000 +1000
@@ -99,9 +99,9 @@ extern struct group_info init_groups;
.usage = ATOMIC_INIT(2), \
.flags = 0, \
.lock_depth = -1, \
- .prio = MAX_PRIO-20, \
- .static_prio = MAX_PRIO-20, \
- .normal_prio = MAX_PRIO-20, \
+ .prio = MAX_RT_PRIO+20, \
+ .static_prio = MAX_RT_PRIO+20, \
+ .normal_prio = MAX_RT_PRIO+20, \
.policy = SCHED_NORMAL, \
.cpus_allowed = CPU_MASK_ALL, \
.mm = NULL, \
Index: MM-2.6.17-mm6/include/linux/sched.h
===================================================================
--- MM-2.6.17-mm6.orig/include/linux/sched.h 2006-07-04 14:37:43.000000000 +1000
+++ MM-2.6.17-mm6/include/linux/sched.h 2006-07-04 14:38:12.000000000 +1000
@@ -34,6 +34,8 @@
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
+/* Scheduler class for background tasks */
+#define SCHED_BGND 4
#ifdef __KERNEL__
@@ -503,13 +505,16 @@ struct signal_struct {
#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO MAX_USER_RT_PRIO
-#define MAX_PRIO (MAX_RT_PRIO + 40)
+#define BGND_PRIO (MAX_RT_PRIO + 40)
+/* add another slot for SCHED_BGND tasks */
+#define MAX_PRIO (BGND_PRIO + 1)
#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
#define rt_task(p) rt_prio((p)->prio)
#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
#define has_rt_policy(p) \
- unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH)
+ unlikely((p)->policy != SCHED_NORMAL && (p)->policy < SCHED_BATCH)
+#define bgnd_task(p) (unlikely((p)->policy == SCHED_BGND))
/*
* Some day this will be a full-fledged user tracking system..
@@ -810,6 +815,7 @@ struct task_struct {
unsigned long sleep_avg;
unsigned long long timestamp, last_ran;
unsigned long long sched_time; /* sched_clock time spent running */
+ unsigned int mutexes_held; /* for knowing when it's safe to repress SCHED_BGND tasks */
enum sleep_type sleep_type;
unsigned long policy;
@@ -1090,6 +1096,7 @@ static inline void put_task_struct(struc
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
+#define PF_UIWAKE 0x08000000 /* Just woken from uninterruptible sleep */
#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
Index: MM-2.6.17-mm6/kernel/sched.c
===================================================================
--- MM-2.6.17-mm6.orig/kernel/sched.c 2006-07-04 14:37:43.000000000 +1000
+++ MM-2.6.17-mm6/kernel/sched.c 2006-07-04 14:38:12.000000000 +1000
@@ -59,7 +59,7 @@
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
- * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+ * to static priority [ MAX_RT_PRIO..BGND_PRIO-1 ],
* and back.
*/
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
@@ -73,7 +73,7 @@
*/
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
-#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
+#define MAX_USER_PRIO (USER_PRIO(BGND_PRIO))
/*
* Some helpers for converting nanosecond timing to jiffy resolution
@@ -171,7 +171,7 @@
*/
#define SCALE_PRIO(x, prio) \
- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
+ max(x * (BGND_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
static unsigned int static_prio_timeslice(int static_prio)
{
@@ -186,6 +186,11 @@ static inline unsigned int task_timeslic
return static_prio_timeslice(p->static_prio);
}
+#define task_in_background(p) unlikely((p)->prio == BGND_PRIO)
+#define safe_to_background(p) \
+ (!((p)->mutexes_held || \
+ (p)->flags & (PF_FREEZE | PF_UIWAKE | PF_EXITING)))
+
/*
* These are the runqueue data structures:
*/
@@ -715,13 +720,17 @@ static inline int __normal_prio(struct t
{
int bonus, prio;
+ /* Ensure that background tasks stay at BGND_PRIO */
+ if (bgnd_task(p) && safe_to_background(p))
+ return BGND_PRIO;
+
bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
prio = p->static_prio - bonus;
if (prio < MAX_RT_PRIO)
prio = MAX_RT_PRIO;
- if (prio > MAX_PRIO-1)
- prio = MAX_PRIO-1;
+ if (prio > BGND_PRIO-1)
+ prio = BGND_PRIO-1;
return prio;
}
@@ -761,8 +770,18 @@ static void set_load_weight(struct task_
else
#endif
p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
- } else
- p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+ } else {
+ /*
+ * Reduce the probability of a task escaping the background
+ * due to load balancing leaving it on a lightly used CPU.
+ * Can't use zero as that would kill load balancing when only
+ * background tasks are running.
+ */
+ if (bgnd_task(p))
+ p->load_weight = LOAD_WEIGHT(MIN_TIMESLICE / 2 ? : 1);
+ else
+ p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+ }
}
static inline void
@@ -834,7 +853,10 @@ static void __activate_task(struct task_
{
struct prio_array *target = rq->active;
- if (batch_task(p))
+ /* Don't punish batch tasks, just tasks actually in the background,
+ * as anything else is counterproductive from a system-wide viewpoint.
+ */
+ if (task_in_background(p))
target = rq->expired;
enqueue_task(p, target);
inc_nr_running(p, rq);
@@ -942,6 +964,8 @@ static void activate_task(struct task_st
if (!rt_task(p))
p->prio = recalc_task_prio(p, now);
+ p->flags &= ~PF_UIWAKE;
+
/*
* This checks to make sure it's not an uninterruptible task
* that is now waking up.
@@ -1484,6 +1508,7 @@ out_activate:
* sleep_avg beyond just interactive state.
*/
p->sleep_type = SLEEP_NONINTERACTIVE;
+ p->flags |= PF_UIWAKE;
} else
/*
@@ -3024,6 +3049,48 @@ void scheduler_tick(void)
}
goto out_unlock;
}
+
+ if (bgnd_task(p)) {
+ /*
+ * Do this even if there's only one task on the queue as
+ * we want to set the priority low so that any waking tasks
+ * can preempt.
+ */
+ if (task_in_background(p)) {
+ /*
+ * Tasks currently in the background will be
+ * at BGND_PRIO priority and preemption
+ * should be enough to keep them in check provided we
+ * don't let them adversely effect tasks on the expired
+ * array.
+ */
+ if (!safe_to_background(p)) {
+ dequeue_task(p, rq->active);
+ p->prio = effective_prio(p);
+ enqueue_task(p, rq->active);
+ } else if (rq->expired->nr_active &&
+ rq->best_expired_prio < p->prio) {
+ dequeue_task(p, rq->active);
+ enqueue_task(p, rq->expired);
+ set_tsk_need_resched(p);
+ goto out_unlock;
+ }
+ }
+ else if (safe_to_background(p)) {
+ dequeue_task(p, rq->active);
+ p->normal_prio = BGND_PRIO;
+ /* this should be safe for PI purposes */
+ p->prio = p->normal_prio;
+ enqueue_task(p, rq->expired);
+ /*
+ * think about making this conditional to reduce
+ * context switch rate
+ */
+ set_tsk_need_resched(p);
+ goto out_unlock;
+ }
+ }
+
if (!--p->time_slice) {
dequeue_task(p, rq->active);
set_tsk_need_resched(p);
@@ -3033,6 +3100,11 @@ void scheduler_tick(void)
if (!rq->expired_timestamp)
rq->expired_timestamp = jiffies;
+ /*
+ * No need to do anything special for background tasks here
+ * as TASK_INTERACTIVE() should fail when they're in the
+ * background.
+ */
if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
enqueue_task(p, rq->expired);
if (p->static_prio < rq->best_expired_prio)
@@ -3122,6 +3194,33 @@ smt_slice(struct task_struct *p, struct
}
/*
+ * task time slice for SMT dependent idle purposes
+ */
+static unsigned int smt_timeslice(struct task_struct *p)
+{
+ if (task_in_background(p))
+ return 0;
+
+ return task_timeslice(p);
+}
+
+/*
+ * Is thisp a higher priority task than thatp for SMT dependent idle
+ * purposes?
+ */
+static int task_priority_gt(const struct task_struct *thisp,
+ const struct task_struct *thatp)
+{
+ if (task_in_background(thisp))
+ return !task_in_background(thatp);
+
+ if (task_in_background(thatp))
+ return 1;
+
+ return thisp->static_prio < thatp->static_prio;
+}
+
+/*
* To minimise lock contention and not have to drop this_rq's runlock we only
* trylock the sibling runqueues and bypass those runqueues if we fail to
* acquire their lock. As we only trylock the normal locking order does not
@@ -3180,9 +3279,9 @@ dependent_sleeper(int this_cpu, struct r
(sd->per_cpu_gain * DEF_TIMESLICE / 100))
ret = 1;
} else {
- if (smt_curr->static_prio < p->static_prio &&
+ if (task_priority_gt(smt_curr, p) &&
!TASK_PREEMPTS_CURR(p, smt_rq) &&
- smt_slice(smt_curr, sd) > task_timeslice(p))
+ smt_slice(smt_curr, sd) > smt_timeslice(p))
ret = 1;
}
unlock:
@@ -3245,6 +3344,22 @@ static inline int interactive_sleep(enum
}
/*
+ * Switch the active and expired arrays.
+ */
+static struct prio_array *switch_arrays(struct rq *rq, int best_active_prio)
+{
+ struct prio_array *array = rq->active;
+
+ schedstat_inc(rq, sched_switch);
+ rq->active = rq->expired;
+ rq->expired = array;
+ rq->expired_timestamp = 0;
+ rq->best_expired_prio = best_active_prio;
+
+ return rq->active;
+}
+
+/*
* schedule() is the main scheduler function.
*/
asmlinkage void __sched schedule(void)
@@ -3332,23 +3447,25 @@ need_resched_nonpreemptible:
}
array = rq->active;
- if (unlikely(!array->nr_active)) {
- /*
- * Switch the active and expired arrays.
- */
- schedstat_inc(rq, sched_switch);
- rq->active = rq->expired;
- rq->expired = array;
- array = rq->active;
- rq->expired_timestamp = 0;
- rq->best_expired_prio = MAX_PRIO;
- }
+ if (unlikely(!array->nr_active))
+ array = switch_arrays(rq, MAX_PRIO);
idx = sched_find_first_bit(array->bitmap);
+get_next:
queue = array->queue + idx;
next = list_entry(queue->next, struct task_struct, run_list);
+ /* very strict backgrounding */
+ if (unlikely(task_in_background(next) && rq->expired->nr_active)) {
+ int tmp = sched_find_first_bit(rq->expired->bitmap);
+
+ if (likely(tmp < idx)) {
+ array = switch_arrays(rq, idx);
+ idx = tmp;
+ goto get_next;
+ }
+ }
- if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
+ if (!rt_task(next) && interactive_sleep(next->sleep_type) && !bgnd_task(next)) {
unsigned long long delta = now - next->timestamp;
if (unlikely((long long)(now - next->timestamp) < 0))
delta = 0;
@@ -4052,7 +4169,8 @@ recheck:
if (policy < 0)
policy = oldpolicy = p->policy;
else if (policy != SCHED_FIFO && policy != SCHED_RR &&
- policy != SCHED_NORMAL && policy != SCHED_BATCH)
+ policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+ policy != SCHED_BGND)
return -EINVAL;
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
@@ -4063,8 +4181,8 @@ recheck:
(p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
(!p->mm && param->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
- if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
- != (param->sched_priority == 0))
+ if ((policy == SCHED_NORMAL || policy == SCHED_BATCH ||
+ policy == SCHED_BGND) != (param->sched_priority == 0))
return -EINVAL;
/*
@@ -4072,15 +4190,20 @@ recheck:
*/
if (!capable(CAP_SYS_NICE)) {
/*
- * can't change policy, except between SCHED_NORMAL
- * and SCHED_BATCH:
+ * can't change policy, except between SCHED_NORMAL,
+ * SCHED_BATCH or SCHED_BGND:
*/
- if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
- (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
+ if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH &&
+ p->policy != SCHED_BGND) &&
+ (policy != SCHED_BATCH && p->policy != SCHED_NORMAL &&
+ p->policy != SCHED_BGND) &&
+ (policy != SCHED_BGND && p->policy != SCHED_NORMAL &&
+ p->policy != SCHED_BATCH)) &&
!p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
return -EPERM;
/* can't increase priority */
- if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
+ if ((policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+ policy != SCHED_BGND) &&
param->sched_priority > p->rt_priority &&
param->sched_priority >
p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
Index: MM-2.6.17-mm6/kernel/mutex.c
===================================================================
--- MM-2.6.17-mm6.orig/kernel/mutex.c 2006-07-04 14:37:43.000000000 +1000
+++ MM-2.6.17-mm6/kernel/mutex.c 2006-07-04 14:38:12.000000000 +1000
@@ -51,6 +51,16 @@ __mutex_init(struct mutex *lock, const c
EXPORT_SYMBOL(__mutex_init);
+static inline void inc_mutex_count(void)
+{
+ current->mutexes_held++;
+}
+
+static inline void dec_mutex_count(void)
+{
+ current->mutexes_held--;
+}
+
/*
* We split the mutex lock/unlock logic into separate fastpath and
* slowpath functions, to reduce the register pressure on the fastpath.
@@ -89,6 +99,7 @@ void inline fastcall __sched mutex_lock(
* 'unlocked' into 'locked' state.
*/
__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
+ inc_mutex_count();
}
EXPORT_SYMBOL(mutex_lock);
@@ -114,6 +125,7 @@ void fastcall __sched mutex_unlock(struc
* into 'unlocked' state:
*/
__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
+ dec_mutex_count();
}
EXPORT_SYMBOL(mutex_unlock);
@@ -274,9 +286,16 @@ __mutex_lock_interruptible_slowpath(atom
*/
int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
{
+ int ret;
+
might_sleep();
- return __mutex_fastpath_lock_retval
+ ret = __mutex_fastpath_lock_retval
(&lock->count, __mutex_lock_interruptible_slowpath);
+
+ if (likely(!ret))
+ inc_mutex_count();
+
+ return ret;
}
EXPORT_SYMBOL(mutex_lock_interruptible);
@@ -331,8 +350,13 @@ static inline int __mutex_trylock_slowpa
*/
int fastcall __sched mutex_trylock(struct mutex *lock)
{
- return __mutex_fastpath_trylock(&lock->count,
+ int ret = __mutex_fastpath_trylock(&lock->count,
__mutex_trylock_slowpath);
+
+ if (likely(ret))
+ inc_mutex_count();
+
+ return ret;
}
EXPORT_SYMBOL(mutex_trylock);
Index: MM-2.6.17-mm6/kernel/fork.c
===================================================================
--- MM-2.6.17-mm6.orig/kernel/fork.c 2006-07-04 14:37:43.000000000 +1000
+++ MM-2.6.17-mm6/kernel/fork.c 2006-07-04 14:38:12.000000000 +1000
@@ -1029,6 +1029,7 @@ static struct task_struct *copy_process(
p->wchar = 0; /* I/O counter: bytes written */
p->syscr = 0; /* I/O counter: read syscalls */
p->syscw = 0; /* I/O counter: write syscalls */
+ p->mutexes_held = 0;
acct_clear_integrals(p);
p->it_virt_expires = cputime_zero;
--
Peter Williams pwil3058@xxxxxxxxxxxxxx
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce