modifying CFS failure
From: cs044024
Date: Fri Apr 11 2008 - 23:56:58 EST
We tried to replace the RB-trees in CFS with AVL trees in order to compare
performance benchmarks. For this we wrote AVL tree code and modified
sched-cfs-v2.6.22.13-v24.patch, replacing each occurrence of an RB-tree
function with the corresponding AVL tree function. We applied the patch to
the required kernel version and compiled it successfully, but we are not
able to boot the kernel. The important portions of the boot error are:
Call trace:
enqueue_entity
enqueue_task_fair
enqueue_task
activate_task
wake_up_new_task
do_fork
kernel_init
kernel_thread
kernel_init
kernel_thread_helper
rest_init
start_kernel
unknown boot option
EIP: [<c01e3a76>] insert_avl_node
Kernel panic: attempting to kill the idle task!
insert_avl_node is the function that inserts a node into the AVL tree.
We have tested the AVL tree code thoroughly and it is working fine. After
trying to trace the error for a few days, we feel that we may be missing
something beyond replacing all occurrences of the RB-tree calls with AVL
calls in the patch and adding the AVL tree code to the kernel library.
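To make the substitution concrete: after the replacement, a call site such as
CFS's __enqueue_entity() looks roughly like the sketch below. This is only a
hand-written illustration of the kind of change, not a hunk from the attached
patch, and the tasks_timeline, run_node and fair_key names are assumptions:

/*
 * Illustrative sketch only: rb_entry()/rb_link_node()/rb_insert_color()
 * become avl_entry()/avl_link_node()/insert_avl_node(), and the rb_node
 * pointers become avl_node pointers. Field names are assumed.
 */
static void __enqueue_entity_avl(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct avl_node **link = &cfs_rq->tasks_timeline.avl_node;
	struct avl_node *parent = NULL;
	struct sched_entity *entry;
	s64 key = se->fair_key;	/* assumed key field, as in the rbtree version */

	/* Same ordered descent as the rbtree version; only the node type changes. */
	while (*link) {
		parent = *link;
		entry = avl_entry(parent, struct sched_entity, run_node);
		if (key < entry->fair_key)
			link = &parent->avl_left;
		else
			link = &parent->avl_right;
	}

	avl_link_node(&se->run_node, parent, link);
	insert_avl_node(&se->run_node, &cfs_rq->tasks_timeline);
}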
Attached to this mail are the AVL tree code and the modified patch. The
patch is the same as sched-cfs-v2.6.22.13-v24.patch, except that every
occurrence of an RB-tree function has been replaced by the corresponding
AVL tree function.
At present we are out of ideas for how to proceed with the task. We would be
thankful if someone could guide us further and help us rectify the errors.
Rohit Gupta
NIT Allahabad India
/*
@author Rohit Gupta
*/
#include <linux/avltree.h>
#include <linux/module.h>
/* Left rotation: nd's right child takes nd's place; nd becomes its left child. */
static void left_avl_rotate(struct avl_node *nd, struct avl_root *root)
{
	struct avl_node *node = nd;
	struct avl_node *A = node->avl_right;
	struct avl_node *B = A->avl_left;

	A->avl_left = node;
	A->avl_parent = node->avl_parent;
	if (A->avl_parent == NULL)
		root->avl_node = A;
	else if (node == node->avl_parent->avl_left)
		A->avl_parent->avl_left = A;
	else
		A->avl_parent->avl_right = A;
	node->avl_parent = A;
	node->avl_right = B;
	if (B != NULL)
		B->avl_parent = node;
}
/* Right rotation: nd's left child takes nd's place; nd becomes its right child. */
static void right_avl_rotate(struct avl_node *nd, struct avl_root *root)
{
	struct avl_node *node = nd;
	struct avl_node *A = node->avl_left;
	struct avl_node *B = A->avl_right;

	A->avl_right = node;
	A->avl_parent = node->avl_parent;
	if (A->avl_parent == NULL)
		root->avl_node = A;
	else if (node == A->avl_parent->avl_left)
		A->avl_parent->avl_left = A;
	else
		A->avl_parent->avl_right = A;
	node->avl_parent = A;
	node->avl_left = B;
	if (B != NULL)
		B->avl_parent = node;
}
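/*
 * insert_avl_node() rebalances the tree after the new node has already been
 * placed in the tree and linked with avl_link_node() (children NULL, balf 0).
 * The balance-factor walk below starts from node->avl_parent without a NULL
 * check, so it expects the freshly linked node to have a parent.
 */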
void insert_avl_node(struct avl_node*node, struct avl_root *root)
{
struct avl_node* B;
/*Adjust balance factor that are affected after the insertion*/
B=node;
do
{
if((B->avl_parent)->avl_left==B)
{
B=B->avl_parent;
B->balf-=1;
}
else
{
B=B->avl_parent;
B->balf++;
}
}while(B->balf!=0 && B->balf<=1 && B->balf>=-1 && B!=root->avl_node);
/*Balance factor adjustment ends here*/
/*Balance the nodes if avl rule violated*/
if(B->balf > 1) //Case 1 (++)
{
if((B->avl_right)->balf == 1) //case 1.1 (++ -> +)
{
/*
Adjust balance factors so that the new balance factors for the nodes
are the one generated after rotation. Perform rotation after this step.
*/
B->balf = 0;
(B->avl_right)->balf=0;
/*rotate*/
left_avl_rotate(B,root);
}
else if((B->avl_right)->balf==-1) //case 1.1 (++ -> -)
{
struct avl_node * temp = (B->avl_right)->avl_left;
/*
Adjust balance factors so that the new balance factors for the nodes
are the one generated after rotation. Perform rotation after this step.
*/
if(temp->balf ==1)
{
(B->avl_right)->balf=0;
B->balf = -1;
}
else if(temp->balf == -1)
{
B->balf = 0;
(B->avl_right)->balf = 1;
}
else if(temp->balf == 0)
{
B->balf = 0;
(B->avl_right)->balf = 0;
}
temp->balf = 0;
/*rotate*/
right_avl_rotate((B->avl_right),root);
left_avl_rotate(B,root);
}
}
else if(B->balf < -1) //Case 2 (--)
{
if((B->avl_left)->balf == -1) //case 2.1 (-- -> -)
{
/*
Adjust balance factors so that the new balance factors for the nodes
are the one generated after rotation. Perform rotation after this step.
*/
B->balf = 0;
(B->avl_left)->balf = 0;
/*Rotate*/
right_avl_rotate(B,root);
}
else if((B->avl_left)->balf==1) //case 2.2 (-- -> +)
{
struct avl_node * temp = (B->avl_left)->avl_right;
/*
Adjust balance factors so that the new balance factors for the nodes
are the one generated after rotation. Perform rotation after this step.
*/
if(temp->balf ==1)
{
B->balf=0;
(B->avl_left)->balf = -1;
}
else if(temp->balf == -1)
{
B->balf = -1;
(B->avl_left)->balf = 0;
}
else if(temp->balf == 0)
{
B->balf = 0;
(B->avl_left)->balf = 0;
}
temp->balf = 0;
/*Rotate*/
left_avl_rotate((B->avl_left),root);
right_avl_rotate(B,root);
}
}
}
EXPORT_SYMBOL(insert_avl_node);
void delete_avl_node(struct avl_node *node, struct avl_root *root)
{
struct avl_node *temp = node, *temp1 = NULL, *temp2 = NULL, *B = NULL, *A = NULL;
register int tbalf;
if (temp->avl_left != NULL && temp->avl_right != NULL)
/* The node has two valid children: swap it with its in-order successor,
   so that the node actually unlinked below has at most one child. */
{
temp = temp->avl_right;
while(temp->avl_left!=NULL) temp = temp->avl_left;
if(temp == node->avl_right)
{
temp1 = node->avl_parent;
temp2 = node->avl_left;
node->avl_right = temp->avl_right;
if(node->avl_right) node->avl_right->avl_parent = node;
node->avl_left = temp->avl_left;
if(node->avl_left) node->avl_left->avl_parent = node;
node->avl_parent = temp;
temp->avl_left = temp2;
if(temp2)temp2->avl_parent = temp;
temp->avl_right = node;
temp->avl_parent = temp1;
if(temp->avl_parent && temp->avl_parent->avl_right == node)
temp->avl_parent->avl_right = temp;
else if(temp->avl_parent)
temp->avl_parent->avl_left = temp;
else root->avl_node = temp;
tbalf = temp->balf;
temp->balf = node->balf;
node->balf = tbalf;
temp = node; //
}
else
{
temp1 = node->avl_left;
temp2 = node->avl_right;
B=node->avl_parent;
node->avl_right = temp->avl_right;
if(node->avl_right) node->avl_right->avl_parent = node;
node->avl_left = temp->avl_left;
if(node->avl_left) node->avl_left->avl_parent = node;
node->avl_parent = temp->avl_parent;
node->avl_parent->avl_left = node;
temp->avl_left = temp1;
if(temp1) temp1->avl_parent = temp;
temp->avl_right = temp2;
if(temp2) temp2->avl_parent = temp;
temp->avl_parent = B;
if(B && B->avl_right == node) B->avl_right = temp;
else if(B) B->avl_left = temp;
else root->avl_node = temp;
tbalf = temp->balf;
temp->balf = node->balf;
node->balf = tbalf;
temp = node; //
}
}
temp1=NULL;temp2=NULL;B=NULL;
if(temp->avl_left != NULL)
temp1 = temp->avl_left;
else
temp1 = temp->avl_right;
if(temp1!=NULL)
temp1->avl_parent = temp->avl_parent;
if(temp->avl_parent == NULL) //case: Deletion of root.. No change required
root->avl_node = temp1; //Entire avl_node's height decreases.. Terminate here. CH
else
{
temp2=temp;
while(temp2->avl_parent)
{
if((temp2->avl_parent)->balf == 0) //case: No balancing required. CH
{
if(temp2 == (temp2->avl_parent)->avl_left) (temp2->avl_parent)->balf++;
else (temp2->avl_parent)->balf--;
break;
}
else if((temp2->avl_parent)->balf == 1 && temp2==(temp2->avl_parent)->avl_right)//CH
{
(temp2->avl_parent)->balf = 0; //Case: No rotation.. Only change in bal factor
temp2 = temp2->avl_parent;
}
else if((temp2->avl_parent)->balf == -1 && temp2==(temp2->avl_parent)->avl_left)//CH
{
(temp2->avl_parent)->balf = 0; //Case: No rotation.. Only change in bal factor
temp2 = temp2->avl_parent;
}
else if((temp2->avl_parent)->balf == 1 && temp2==(temp2->avl_parent)->avl_left)
{
B=temp2->avl_parent;
B->balf++;
if((B->avl_right)->balf == 1) //case 1.1 (++ -> +) //CH
{
//Adjust balance factors so that the new balance factors for the nodes
//are the one generated after rotation. Perform rotation after this step.
B->balf = 0;
(B->avl_right)->balf=0;
//rotate
temp2=B->avl_right;//In case of left rotate.. The node which takes position of B
left_avl_rotate(B,root);
}
else if((B->avl_right)->balf==-1) //case 1.2 (++ -> -) //CH
{
A = (B->avl_right)->avl_left;
//Adjust balance factors so that the new balance factors for the nodes
//are the one generated after rotation. Perform rotation after this step.
if(A->balf ==1)
{
(B->avl_right)->balf=0;
B->balf = -1;
}
else if(A->balf == -1)
{
B->balf = 0;
(B->avl_right)->balf = 1;
}
else if(A->balf == 0)
{
B->balf = 0;
(B->avl_right)->balf = 0;
}
A->balf = 0;
//rotate
right_avl_rotate((B->avl_right),root);
temp2 = B->avl_right;// In case of left rotate.. The node which takes position of B
left_avl_rotate(B,root);
}
else //CH
{
//Adjustment of Balance Factor required here.
B->balf -= 1;
B->avl_right->balf -= 1;
left_avl_rotate(B,root);
break;
}
}
else if((temp2->avl_parent)->balf == -1 && temp2==(temp2->avl_parent)->avl_right)
{
B=temp2->avl_parent;
B->balf--;
if((B->avl_left)->balf == -1) //case 2.1 (-- -> -) //CH
{
// Adjust balance factors so that the new balance factors for the nodes
// are the one generated after rotation. Perform rotation after this step.
B->balf = 0;
(B->avl_left)->balf = 0;
//Rotate
temp2 = B->avl_left; //In the case of right rotate: the node which takes the position of B
right_avl_rotate(B,root);
}
else if((B->avl_left)->balf==1) //case 2.2 (-- -> +) //CH
{
A = (B->avl_left)->avl_right;
//Adjust balance factors so that the new balance factors for the nodes
//are the one generated after rotation. Perform rotation after this step.
if(A->balf ==1)
{
B->balf=0;
(B->avl_left)->balf = -1;
}
else if(A->balf == -1)
{
B->balf = -1;
(B->avl_left)->balf = 0;
}
else if(A->balf == 0)
{
B->balf = 0;
(B->avl_left)->balf = 0;
}
A->balf = 0;
//Rotate
left_avl_rotate((B->avl_left),root);
temp2 = B->avl_left; //In the case of right rotate: the node which takes the position of B
right_avl_rotate(B,root);
}
else //CH
{
//Adjustment of balance factor required here.
B->balf += 1;
B->avl_left->balf += 1;
right_avl_rotate(B,root);
break;
}
}
}
//Delete the node...
if(temp == (temp->avl_parent)->avl_left)
(temp->avl_parent)->avl_left = temp1;
else
(temp->avl_parent)->avl_right = temp1;
}
}
EXPORT_SYMBOL(delete_avl_node);
struct avl_node* avl_next(struct avl_node *node)
{
if(node->avl_right!=NULL)
{
node = node->avl_right;
while(node->avl_left !=NULL) node = node->avl_left;
return node;
}
else
{
struct avl_node*y = node->avl_parent;
while(y!=NULL && node == y->avl_right)
{
node = y;
y = y->avl_parent;
}
return y;
}
}
EXPORT_SYMBOL(avl_next);
struct avl_node* avl_prev(struct avl_node *node)
{
if(node->avl_left!=NULL)
{
node = node->avl_left;
while(node->avl_right !=NULL) node = node->avl_right;
return node;
}
else
{
struct avl_node*y = node->avl_parent;
while(y!=NULL && node == y->avl_left)
{
node = y;
y = y->avl_parent;
}
return y;
}
}
EXPORT_SYMBOL(avl_prev);
struct avl_node *avl_first(struct avl_root *root)
{
struct avl_node *n;
n = root->avl_node;
if (!n)
return NULL;
while (n->avl_left)
n = n->avl_left;
return n;
}
EXPORT_SYMBOL(avl_first);
struct avl_node *avl_last(struct avl_root *root)
{
struct avl_node *n;
n = root->avl_node;
if (!n)
return NULL;
while (n->avl_right)
n = n->avl_right;
return n;
}
EXPORT_SYMBOL(avl_last);
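The lookup and removal call sites map the same way: rb_first() on the CFS
pick-next path becomes avl_first(), and rb_erase() on the dequeue path becomes
delete_avl_node(). The sketch below is again only illustrative, with the same
assumed field names as above, and is not taken from the attached patch:

static struct sched_entity *__pick_next_entity_avl(struct cfs_rq *cfs_rq)
{
	/* The leftmost node of the timeline is the entity with the smallest key. */
	struct avl_node *left = avl_first(&cfs_rq->tasks_timeline);

	if (!left)
		return NULL;
	return avl_entry(left, struct sched_entity, run_node);
}

static void __dequeue_entity_avl(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* rb_erase(&se->run_node, &cfs_rq->tasks_timeline) becomes: */
	delete_avl_node(&se->run_node, &cfs_rq->tasks_timeline);
}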
/*
@author Rohit Gupta
*/
#ifndef _LINUX_AVLTREE_H
#define _LINUX_AVLTREE_H
#include <linux/kernel.h>
#include <linux/stddef.h>
struct avl_node {
	int balf;	/* balance factor: height(right subtree) - height(left subtree) */
	struct avl_node *avl_left;
	struct avl_node *avl_right;
	struct avl_node *avl_parent;
};
struct avl_root {
	struct avl_node *avl_node;
};
#define AVL_ROOT (struct avl_root) {NULL, }
#define avl_entry(ptr, type, member) container_of(ptr, type, member)
#define AVL_EMPTY_ROOT(root) ((root)->avl_node == NULL)
#define AVL_EMPTY_NODE(node) (((node)->avl_parent) == node)
/*Inserts node in avl tree*/
extern void insert_avl_node(struct avl_node*,struct avl_root *);
/*Deletes node form avl tree*/
extern void delete_avl_node(struct avl_node*,struct avl_root*);
/* Find logical next and previous nodes in a tree */
extern struct avl_node *avl_next(struct avl_node *);
extern struct avl_node *avl_prev(struct avl_node *);
extern struct avl_node *avl_first(struct avl_root *);
extern struct avl_node *avl_last(struct avl_root *);
static inline void avl_link_node(struct avl_node *node, struct avl_node *parent,
				 struct avl_node **avl_link)
{
	node->avl_parent = parent;
	node->avl_left = node->avl_right = NULL;
	node->balf = 0;
	*avl_link = node;
}
#endif /* _LINUX_AVLTREE_H */

CFS modified by rohit, v2.6.22.13, v24
---
Documentation/kernel-parameters.txt | 43
Documentation/sched-design-CFS.txt | 186 +
Makefile | 2
arch/i386/Kconfig | 11
arch/i386/kernel/smpboot.c | 12
arch/i386/kernel/tsc.c | 14
arch/ia64/kernel/setup.c | 6
arch/mips/kernel/smp.c | 11
arch/sparc/kernel/smp.c | 10
arch/sparc64/kernel/smp.c | 27
block/cfq-iosched.c | 3
drivers/acpi/processor_idle.c | 32
drivers/kvm/kvm.h | 11
fs/pipe.c | 9
fs/proc/array.c | 144 -
fs/proc/base.c | 73
fs/proc/proc_misc.c | 15
include/asm-generic/bitops/sched.h | 21
include/linux/cgroup.h | 12
include/linux/cpu.h | 2
include/linux/cpuset.h | 5
include/linux/hardirq.h | 13
include/linux/init_task.h | 2
include/linux/kernel.h | 7
include/linux/kernel_stat.h | 3
include/linux/nodemask.h | 94
include/linux/nsproxy.h | 3
include/linux/sched.h | 408 ++
include/linux/taskstats.h | 10
include/linux/topology.h | 10
include/linux/user_namespace.h | 61
init/Kconfig | 26
init/main.c | 8
kernel/Makefile | 2
kernel/delayacct.c | 18
kernel/exit.c | 11
kernel/fork.c | 15
kernel/ksysfs.c | 8
kernel/nsproxy.c | 73
kernel/posix-cpu-timers.c | 34
kernel/sched.c | 5142 +++++++++++++++++++-----------------
kernel/sched_debug.c | 394 ++
kernel/sched_fair.c | 1164 ++++++++
kernel/sched_idletask.c | 89
kernel/sched_rt.c | 259 +
kernel/sched_stats.h | 236 +
kernel/softirq.c | 1
kernel/sys.c | 6
kernel/sysctl.c | 128
kernel/timer.c | 7
kernel/tsacct.c | 4
kernel/user.c | 304 +-
kernel/user_namespace.c | 88
lib/Kconfig.debug | 9
mm/memory_hotplug.c | 7
mm/page_alloc.c | 50
mm/vmscan.c | 4
net/unix/af_unix.c | 4
58 files changed, 6523 insertions(+), 2828 deletions(-)
Index: linux-cfs-2.6.22.13.q/Documentation/kernel-parameters.txt
===================================================================
--- linux-cfs-2.6.22.13.q.orig/Documentation/kernel-parameters.txt
+++ linux-cfs-2.6.22.13.q/Documentation/kernel-parameters.txt
@@ -1009,49 +1009,6 @@ and is between 256 and 4096 characters.
mga= [HW,DRM]
- migration_cost=
- [KNL,SMP] debug: override scheduler migration costs
- Format: <level-1-usecs>,<level-2-usecs>,...
- This debugging option can be used to override the
- default scheduler migration cost matrix. The numbers
- are indexed by 'CPU domain distance'.
- E.g. migration_cost=1000,2000,3000 on an SMT NUMA
- box will set up an intra-core migration cost of
- 1 msec, an inter-core migration cost of 2 msecs,
- and an inter-node migration cost of 3 msecs.
-
- WARNING: using the wrong values here can break
- scheduler performance, so it's only for scheduler
- development purposes, not production environments.
-
- migration_debug=
- [KNL,SMP] migration cost auto-detect verbosity
- Format=<0|1|2>
- If a system's migration matrix reported at bootup
- seems erroneous then this option can be used to
- increase verbosity of the detection process.
- We default to 0 (no extra messages), 1 will print
- some more information, and 2 will be really
- verbose (probably only useful if you also have a
- serial console attached to the system).
-
- migration_factor=
- [KNL,SMP] multiply/divide migration costs by a factor
- Format=<percent>
- This debug option can be used to proportionally
- increase or decrease the auto-detected migration
- costs for all entries of the migration matrix.
- E.g. migration_factor=150 will increase migration
- costs by 50%. (and thus the scheduler will be less
- eager migrating cache-hot tasks)
- migration_factor=80 will decrease migration costs
- by 20%. (thus the scheduler will be more eager to
- migrate tasks)
-
- WARNING: using the wrong values here can break
- scheduler performance, so it's only for scheduler
- development purposes, not production environments.
-
mousedev.tap_time=
[MOUSE] Maximum time between finger touching and
leaving touchpad surface for touch to be considered
Index: linux-cfs-2.6.22.13.q/Documentation/sched-design-CFS.txt
===================================================================
--- /dev/null
+++ linux-cfs-2.6.22.13.q/Documentation/sched-design-CFS.txt
@@ -0,0 +1,186 @@
+
+This is the CFS scheduler.
+
+80% of CFS's design can be summed up in a single sentence: CFS basically
+models an "ideal, precise multi-tasking CPU" on real hardware.
+
+"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100%
+physical power and which can run each task at precise equal speed, in
+parallel, each at 1/nr_running speed. For example: if there are 2 tasks
+running then it runs each at 50% physical power - totally in parallel.
+
+On real hardware, we can run only a single task at once, so while that
+one task runs, the other tasks that are waiting for the CPU are at a
+disadvantage - the current task gets an unfair amount of CPU time. In
+CFS this fairness imbalance is expressed and tracked via the per-task
+p->wait_runtime (nanosec-unit) value. "wait_runtime" is the amount of
+time the task should now run on the CPU for it to become completely fair
+and balanced.
+
+( small detail: on 'ideal' hardware, the p->wait_runtime value would
+ always be zero - no task would ever get 'out of balance' from the
+ 'ideal' share of CPU time. )
+
+CFS's task picking logic is based on this p->wait_runtime value and it
+is thus very simple: it always tries to run the task with the largest
+p->wait_runtime value. In other words, CFS tries to run the task with
+the 'gravest need' for more CPU time. So CFS always tries to split up
+CPU time between runnable tasks as close to 'ideal multitasking
+hardware' as possible.
+
+Most of the rest of CFS's design just falls out of this really simple
+concept, with a few add-on embellishments like nice levels,
+multiprocessing and various algorithm variants to recognize sleepers.
+
+In practice it works like this: the system runs a task a bit, and when
+the task schedules (or a scheduler tick happens) the task's CPU usage is
+'accounted for': the (small) time it just spent using the physical CPU
+is deducted from p->wait_runtime. [minus the 'fair share' it would have
+gotten anyway]. Once p->wait_runtime gets low enough so that another
+task becomes the 'leftmost task' of the time-ordered rbtree it maintains
+(plus a small amount of 'granularity' distance relative to the leftmost
+task so that we do not over-schedule tasks and trash the cache) then the
+new leftmost task is picked and the current task is preempted.
+
+The rq->fair_clock value tracks the 'CPU time a runnable task would have
+fairly gotten, had it been runnable during that time'. So by using
+rq->fair_clock values we can accurately timestamp and measure the
+'expected CPU time' a task should have gotten. All runnable tasks are
+sorted in the rbtree by the "rq->fair_clock - p->wait_runtime" key, and
+CFS picks the 'leftmost' task and sticks to it. As the system progresses
+forwards, newly woken tasks are put into the tree more and more to the
+right - slowly but surely giving a chance for every task to become the
+'leftmost task' and thus get on the CPU within a deterministic amount of
+time.
+
+Some implementation details:
+
+ - the introduction of Scheduling Classes: an extensible hierarchy of
+ scheduler modules. These modules encapsulate scheduling policy
+ details and are handled by the scheduler core without the core
+ code assuming about them too much.
+
+ - sched_fair.c implements the 'CFS desktop scheduler': it is a
+ replacement for the vanilla scheduler's SCHED_OTHER interactivity
+ code.
+
+ I'd like to give credit to Con Kolivas for the general approach here:
+ he has proven via RSDL/SD that 'fair scheduling' is possible and that
+ it results in better desktop scheduling. Kudos Con!
+
+ The CFS patch uses a completely different approach and implementation
+ from RSDL/SD. My goal was to make CFS's interactivity quality exceed
+ that of RSDL/SD, which is a high standard to meet :-) Testing
+ feedback is welcome to decide this one way or another. [ and, in any
+ case, all of SD's logic could be added via a kernel/sched_sd.c module
+ as well, if Con is interested in such an approach. ]
+
+ CFS's design is quite radical: it does not use runqueues, it uses a
+ time-ordered rbtree to build a 'timeline' of future task execution,
+ and thus has no 'array switch' artifacts (by which both the vanilla
+ scheduler and RSDL/SD are affected).
+
+ CFS uses nanosecond granularity accounting and does not rely on any
+ jiffies or other HZ detail. Thus the CFS scheduler has no notion of
+ 'timeslices' and has no heuristics whatsoever. There is only one
+ central tunable:
+
+ /proc/sys/kernel/sched_granularity_ns
+
+ which can be used to tune the scheduler from 'desktop' (low
+ latencies) to 'server' (good batching) workloads. It defaults to a
+ setting suitable for desktop workloads. SCHED_BATCH is handled by the
+ CFS scheduler module too.
+
+ Due to its design, the CFS scheduler is not prone to any of the
+ 'attacks' that exist today against the heuristics of the stock
+ scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all
+ work fine and do not impact interactivity and produce the expected
+ behavior.
+
+ the CFS scheduler has a much stronger handling of nice levels and
+ SCHED_BATCH: both types of workloads should be isolated much more
+ agressively than under the vanilla scheduler.
+
+ ( another detail: due to nanosec accounting and timeline sorting,
+ sched_yield() support is very simple under CFS, and in fact under
+ CFS sched_yield() behaves much better than under any other
+ scheduler i have tested so far. )
+
+ - sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler
+ way than the vanilla scheduler does. It uses 100 runqueues (for all
+ 100 RT priority levels, instead of 140 in the vanilla scheduler)
+ and it needs no expired array.
+
+ - reworked/sanitized SMP load-balancing: the runqueue-walking
+ assumptions are gone from the load-balancing code now, and
+ iterators of the scheduling modules are used. The balancing code got
+ quite a bit simpler as a result.
+
+
+Group scheduler extension to CFS
+================================
+
+Normally the scheduler operates on individual tasks and strives to provide
+fair CPU time to each task. Sometimes, it may be desirable to group tasks
+and provide fair CPU time to each such task group. For example, it may
+be desirable to first provide fair CPU time to each user on the system
+and then to each task belonging to a user.
+
+CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
+SCHED_NORMAL/BATCH tasks be be grouped and divides CPU time fairly among such
+groups. At present, there are two (mutually exclusive) mechanisms to group
+tasks for CPU bandwidth control purpose:
+
+ - Based on user id (CONFIG_FAIR_USER_SCHED)
+ In this option, tasks are grouped according to their user id.
+ - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
+ This options lets the administrator create arbitrary groups
+ of tasks, using the "cgroup" pseudo filesystem. See
+ Documentation/cgroups.txt for more information about this
+ filesystem.
+
+Only one of these options to group tasks can be chosen and not both.
+
+Group scheduler tunables:
+
+When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
+each new user and a "cpu_share" file is added in that directory.
+
+ # cd /sys/kernel/uids
+ # cat 512/cpu_share # Display user 512's CPU share
+ 1024
+ # echo 2048 > 512/cpu_share # Modify user 512's CPU share
+ # cat 512/cpu_share # Display user 512's CPU share
+ 2048
+ #
+
+CPU bandwidth between two users are divided in the ratio of their CPU shares.
+For ex: if you would like user "root" to get twice the bandwidth of user
+"guest", then set the cpu_share for both the users such that "root"'s
+cpu_share is twice "guest"'s cpu_share
+
+
+When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
+for each group created using the pseudo filesystem. See example steps
+below to create task groups and modify their CPU share using the "cgroups"
+pseudo filesystem
+
+ # mkdir /dev/cpuctl
+ # mount -t cgroup -ocpu none /dev/cpuctl
+ # cd /dev/cpuctl
+
+ # mkdir multimedia # create "multimedia" group of tasks
+ # mkdir browser # create "browser" group of tasks
+
+ # #Configure the multimedia group to receive twice the CPU bandwidth
+ # #that of browser group
+
+ # echo 2048 > multimedia/cpu.shares
+ # echo 1024 > browser/cpu.shares
+
+ # firefox & # Launch firefox and move it to "browser" group
+ # echo <firefox_pid> > browser/tasks
+
+ # #Launch gmplayer (or your favourite movie player)
+ # echo <movie_player_pid> > multimedia/tasks
Index: linux-cfs-2.6.22.13.q/Makefile
===================================================================
--- linux-cfs-2.6.22.13.q.orig/Makefile
+++ linux-cfs-2.6.22.13.q/Makefile
@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 22
-EXTRAVERSION = .13
+EXTRAVERSION = .13-cfs-v24
NAME = Holy Dancing Manatees, Batman!
# *DOCUMENTATION*
Index: linux-cfs-2.6.22.13.q/arch/i386/Kconfig
===================================================================
--- linux-cfs-2.6.22.13.q.orig/arch/i386/Kconfig
+++ linux-cfs-2.6.22.13.q/arch/i386/Kconfig
@@ -210,6 +210,17 @@ config X86_ES7000
endchoice
+config SCHED_NO_NO_OMIT_FRAME_POINTER
+ bool "Single-depth WCHAN output"
+ default y
+ help
+ Calculate simpler /proc/<PID>/wchan values. If this option
+ is disabled then wchan values will recurse back to the
+ caller function. This provides more accurate wchan values,
+ at the expense of slightly more scheduling overhead.
+
+ If in doubt, say "Y".
+
config PARAVIRT
bool "Paravirtualization support (EXPERIMENTAL)"
depends on EXPERIMENTAL
Index: linux-cfs-2.6.22.13.q/arch/i386/kernel/smpboot.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/arch/i386/kernel/smpboot.c
+++ linux-cfs-2.6.22.13.q/arch/i386/kernel/smpboot.c
@@ -941,17 +941,6 @@ exit:
}
#endif
-static void smp_tune_scheduling(void)
-{
- if (cpu_khz) {
- /* cache size in kB */
- long cachesize = boot_cpu_data.x86_cache_size;
-
- if (cachesize > 0)
- max_cache_size = cachesize * 1024;
- }
-}
-
/*
* Cycle through the processors sending APIC IPIs to boot each.
*/
@@ -980,7 +969,6 @@ static void __init smp_boot_cpus(unsigne
x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
current_thread_info()->cpu = 0;
- smp_tune_scheduling();
set_cpu_sibling_map(0);
Index: linux-cfs-2.6.22.13.q/arch/i386/kernel/tsc.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/arch/i386/kernel/tsc.c
+++ linux-cfs-2.6.22.13.q/arch/i386/kernel/tsc.c
@@ -4,6 +4,7 @@
* See comments there for proper credits.
*/
+#include <linux/sched.h>
#include <linux/clocksource.h>
#include <linux/workqueue.h>
#include <linux/cpufreq.h>
@@ -26,6 +27,7 @@ static int tsc_enabled;
* an extra value to store the TSC freq
*/
unsigned int tsc_khz;
+EXPORT_SYMBOL_GPL(tsc_khz);
int tsc_disable;
@@ -57,10 +59,11 @@ __setup("notsc", tsc_setup);
*/
static int tsc_unstable;
-static inline int check_tsc_unstable(void)
+int check_tsc_unstable(void)
{
return tsc_unstable;
}
+EXPORT_SYMBOL_GPL(check_tsc_unstable);
/* Accellerators for sched_clock()
* convert from cycles(64bits) => nanoseconds (64bits)
@@ -83,7 +86,7 @@ static inline int check_tsc_unstable(voi
*
* -johnstul@xxxxxxxxxx "math is hard, lets go shopping!"
*/
-static unsigned long cyc2ns_scale __read_mostly;
+unsigned long cyc2ns_scale __read_mostly;
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
@@ -106,8 +109,13 @@ unsigned long long sched_clock(void)
/*
* Fall back to jiffies if there's no TSC available:
+ * ( But note that we still use it if the TSC is marked
+ * unstable. We do this because unlike Time Of Day,
+ * the scheduler clock tolerates small errors and it's
+ * very important for it to be as fast as the platform
+ * can achive it. )
*/
- if (unlikely(!tsc_enabled))
+ if (unlikely(!tsc_enabled && !tsc_unstable))
/* No locking but a rare wrong value is not a big deal: */
return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
Index: linux-cfs-2.6.22.13.q/arch/ia64/kernel/setup.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/arch/ia64/kernel/setup.c
+++ linux-cfs-2.6.22.13.q/arch/ia64/kernel/setup.c
@@ -805,7 +805,6 @@ static void __cpuinit
get_max_cacheline_size (void)
{
unsigned long line_size, max = 1;
- unsigned int cache_size = 0;
u64 l, levels, unique_caches;
pal_cache_config_info_t cci;
s64 status;
@@ -835,8 +834,6 @@ get_max_cacheline_size (void)
line_size = 1 << cci.pcci_line_size;
if (line_size > max)
max = line_size;
- if (cache_size < cci.pcci_cache_size)
- cache_size = cci.pcci_cache_size;
if (!cci.pcci_unified) {
status = ia64_pal_cache_config_info(l,
/* cache_type (instruction)= */ 1,
@@ -853,9 +850,6 @@ get_max_cacheline_size (void)
ia64_i_cache_stride_shift = cci.pcci_stride;
}
out:
-#ifdef CONFIG_SMP
- max_cache_size = max(max_cache_size, cache_size);
-#endif
if (max > ia64_max_cacheline_size)
ia64_max_cacheline_size = max;
}
Index: linux-cfs-2.6.22.13.q/arch/mips/kernel/smp.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/arch/mips/kernel/smp.c
+++ linux-cfs-2.6.22.13.q/arch/mips/kernel/smp.c
@@ -51,16 +51,6 @@ int __cpu_logical_map[NR_CPUS]; /* Map
EXPORT_SYMBOL(phys_cpu_present_map);
EXPORT_SYMBOL(cpu_online_map);
-/* This happens early in bootup, can't really do it better */
-static void smp_tune_scheduling (void)
-{
- struct cache_desc *cd = &current_cpu_data.scache;
- unsigned long cachesize = cd->linesz * cd->sets * cd->ways;
-
- if (cachesize > max_cache_size)
- max_cache_size = cachesize;
-}
-
extern void __init calibrate_delay(void);
extern ATTRIB_NORET void cpu_idle(void);
@@ -228,7 +218,6 @@ void __init smp_prepare_cpus(unsigned in
{
init_new_context(current, &init_mm);
current_thread_info()->cpu = 0;
- smp_tune_scheduling();
plat_prepare_cpus(max_cpus);
#ifndef CONFIG_HOTPLUG_CPU
cpu_present_map = cpu_possible_map;
Index: linux-cfs-2.6.22.13.q/arch/sparc/kernel/smp.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/arch/sparc/kernel/smp.c
+++ linux-cfs-2.6.22.13.q/arch/sparc/kernel/smp.c
@@ -68,16 +68,6 @@ void __cpuinit smp_store_cpu_info(int id
cpu_data(id).prom_node = cpu_node;
cpu_data(id).mid = cpu_get_hwmid(cpu_node);
- /* this is required to tune the scheduler correctly */
- /* is it possible to have CPUs with different cache sizes? */
- if (id == boot_cpu_id) {
- int cache_line,cache_nlines;
- cache_line = 0x20;
- cache_line = prom_getintdefault(cpu_node, "ecache-line-size", cache_line);
- cache_nlines = 0x8000;
- cache_nlines = prom_getintdefault(cpu_node, "ecache-nlines", cache_nlines);
- max_cache_size = cache_line * cache_nlines;
- }
if (cpu_data(id).mid < 0)
panic("No MID found for CPU%d at node 0x%08d", id, cpu_node);
}
Index: linux-cfs-2.6.22.13.q/arch/sparc64/kernel/smp.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/arch/sparc64/kernel/smp.c
+++ linux-cfs-2.6.22.13.q/arch/sparc64/kernel/smp.c
@@ -1163,32 +1163,6 @@ int setup_profiling_timer(unsigned int m
return -EINVAL;
}
-static void __init smp_tune_scheduling(void)
-{
- unsigned int smallest = ~0U;
- int i;
-
- for (i = 0; i < NR_CPUS; i++) {
- unsigned int val = cpu_data(i).ecache_size;
-
- if (val && val < smallest)
- smallest = val;
- }
-
- /* Any value less than 256K is nonsense. */
- if (smallest < (256U * 1024U))
- smallest = 256 * 1024;
-
- max_cache_size = smallest;
-
- if (smallest < 1U * 1024U * 1024U)
- printk(KERN_INFO "Using max_cache_size of %uKB\n",
- smallest / 1024U);
- else
- printk(KERN_INFO "Using max_cache_size of %uMB\n",
- smallest / 1024U / 1024U);
-}
-
/* Constrain the number of cpus to max_cpus. */
void __init smp_prepare_cpus(unsigned int max_cpus)
{
@@ -1206,7 +1180,6 @@ void __init smp_prepare_cpus(unsigned in
}
cpu_data(boot_cpu_id).udelay_val = loops_per_jiffy;
- smp_tune_scheduling();
}
void __devinit smp_prepare_boot_cpu(void)
Index: linux-cfs-2.6.22.13.q/block/cfq-iosched.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/block/cfq-iosched.c
+++ linux-cfs-2.6.22.13.q/block/cfq-iosched.c
@@ -1280,6 +1280,8 @@ static void cfq_init_prio_data(struct cf
/*
* no prio set, place us in the middle of the BE classes
*/
+ if (tsk->policy == SCHED_IDLE)
+ goto set_class_idle;
cfqq->ioprio = task_nice_ioprio(tsk);
cfqq->ioprio_class = IOPRIO_CLASS_BE;
break;
@@ -1292,6 +1294,7 @@ static void cfq_init_prio_data(struct cf
cfqq->ioprio_class = IOPRIO_CLASS_BE;
break;
case IOPRIO_CLASS_IDLE:
+ set_class_idle:
cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
cfqq->ioprio = 7;
cfq_clear_cfqq_idle_window(cfqq);
Index: linux-cfs-2.6.22.13.q/drivers/acpi/processor_idle.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/drivers/acpi/processor_idle.c
+++ linux-cfs-2.6.22.13.q/drivers/acpi/processor_idle.c
@@ -63,6 +63,7 @@
ACPI_MODULE_NAME("processor_idle");
#define ACPI_PROCESSOR_FILE_POWER "power"
#define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
+#define PM_TIMER_TICK_NS (1000000000ULL/PM_TIMER_FREQUENCY)
#define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */
#define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */
static void (*pm_idle_save) (void) __read_mostly;
@@ -479,6 +480,9 @@ static void acpi_processor_idle(void)
* TBD: Can't get time duration while in C1, as resumes
* go to an ISR rather than here. Need to instrument
* base interrupt handler.
+ *
+ * Note: the TSC better not stop in C1, sched_clock() will
+ * skew otherwise.
*/
sleep_ticks = 0xFFFFFFFF;
break;
@@ -486,6 +490,8 @@ static void acpi_processor_idle(void)
case ACPI_STATE_C2:
/* Get start time (ticks) */
t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+ /* Tell the scheduler that we are going deep-idle: */
+ sched_clock_idle_sleep_event();
/* Invoke C2 */
acpi_state_timer_broadcast(pr, cx, 1);
acpi_cstate_enter(cx);
@@ -496,17 +502,22 @@ static void acpi_processor_idle(void)
/* TSC halts in C2, so notify users */
mark_tsc_unstable("possible TSC halt in C2");
#endif
+ /* Compute time (ticks) that we were actually asleep */
+ sleep_ticks = ticks_elapsed(t1, t2);
+
+ /* Tell the scheduler how much we idled: */
+ sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
+
/* Re-enable interrupts */
local_irq_enable();
+ /* Do not account our idle-switching overhead: */
+ sleep_ticks -= cx->latency_ticks + C2_OVERHEAD;
+
current_thread_info()->status |= TS_POLLING;
- /* Compute time (ticks) that we were actually asleep */
- sleep_ticks =
- ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
acpi_state_timer_broadcast(pr, cx, 0);
break;
case ACPI_STATE_C3:
-
if (pr->flags.bm_check) {
if (atomic_inc_return(&c3_cpu_count) ==
num_online_cpus()) {
@@ -525,6 +536,8 @@ static void acpi_processor_idle(void)
t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
/* Invoke C3 */
acpi_state_timer_broadcast(pr, cx, 1);
+ /* Tell the scheduler that we are going deep-idle: */
+ sched_clock_idle_sleep_event();
acpi_cstate_enter(cx);
/* Get end time (ticks) */
t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
@@ -538,12 +551,17 @@ static void acpi_processor_idle(void)
/* TSC halts in C3, so notify users */
mark_tsc_unstable("TSC halts in C3");
#endif
+ /* Compute time (ticks) that we were actually asleep */
+ sleep_ticks = ticks_elapsed(t1, t2);
+ /* Tell the scheduler how much we idled: */
+ sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
+
/* Re-enable interrupts */
local_irq_enable();
+ /* Do not account our idle-switching overhead: */
+ sleep_ticks -= cx->latency_ticks + C3_OVERHEAD;
+
current_thread_info()->status |= TS_POLLING;
- /* Compute time (ticks) that we were actually asleep */
- sleep_ticks =
- ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD;
acpi_state_timer_broadcast(pr, cx, 0);
break;
Index: linux-cfs-2.6.22.13.q/drivers/kvm/kvm.h
===================================================================
--- linux-cfs-2.6.22.13.q.orig/drivers/kvm/kvm.h
+++ linux-cfs-2.6.22.13.q/drivers/kvm/kvm.h
@@ -11,6 +11,7 @@
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
+#include <linux/sched.h>
#include <asm/signal.h>
#include "vmx.h"
@@ -531,6 +532,16 @@ void kvm_mmu_free_some_pages(struct kvm_
int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
+static inline void kvm_guest_enter(void)
+{
+ current->flags |= PF_VCPU;
+}
+
+static inline void kvm_guest_exit(void)
+{
+ current->flags &= ~PF_VCPU;
+}
+
static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code)
{
Index: linux-cfs-2.6.22.13.q/fs/pipe.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/fs/pipe.c
+++ linux-cfs-2.6.22.13.q/fs/pipe.c
@@ -45,8 +45,7 @@ void pipe_wait(struct pipe_inode_info *p
* Pipes are system-local resources, so sleeping on them
* is considered a noninteractive wait:
*/
- prepare_to_wait(&pipe->wait, &wait,
- TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE);
+ prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
if (pipe->inode)
mutex_unlock(&pipe->inode->i_mutex);
schedule();
@@ -323,7 +322,7 @@ redo:
/* Signal writers asynchronously that there is more room. */
if (do_wakeup) {
- wake_up_interruptible(&pipe->wait);
+ wake_up_interruptible_sync(&pipe->wait);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
if (ret > 0)
@@ -496,7 +495,7 @@ redo2:
out:
mutex_unlock(&inode->i_mutex);
if (do_wakeup) {
- wake_up_interruptible(&pipe->wait);
+ wake_up_interruptible_sync(&pipe->wait);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
if (ret > 0)
@@ -590,7 +589,7 @@ pipe_release(struct inode *inode, int de
if (!pipe->readers && !pipe->writers) {
free_pipe_info(inode);
} else {
- wake_up_interruptible(&pipe->wait);
+ wake_up_interruptible_sync(&pipe->wait);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
Index: linux-cfs-2.6.22.13.q/fs/proc/array.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/fs/proc/array.c
+++ linux-cfs-2.6.22.13.q/fs/proc/array.c
@@ -62,6 +62,8 @@
#include <linux/mman.h>
#include <linux/proc_fs.h>
#include <linux/ioport.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
@@ -76,9 +78,7 @@
#include <linux/rcupdate.h>
#include <linux/delayacct.h>
-#include <asm/uaccess.h>
#include <asm/pgtable.h>
-#include <asm/io.h>
#include <asm/processor.h>
#include "internal.h"
@@ -87,10 +87,10 @@
do { memcpy(buffer, string, strlen(string)); \
buffer += strlen(string); } while (0)
-static inline char * task_name(struct task_struct *p, char * buf)
+static inline char *task_name(struct task_struct *p, char *buf)
{
int i;
- char * name;
+ char *name;
char tcomm[sizeof(p->comm)];
get_task_comm(tcomm, p);
@@ -138,7 +138,7 @@ static const char *task_state_array[] =
"X (dead)" /* 32 */
};
-static inline const char * get_task_state(struct task_struct *tsk)
+static inline const char *get_task_state(struct task_struct *tsk)
{
unsigned int state = (tsk->state & (TASK_RUNNING |
TASK_INTERRUPTIBLE |
@@ -156,7 +156,7 @@ static inline const char * get_task_stat
return *p;
}
-static inline char * task_state(struct task_struct *p, char *buffer)
+static inline char *task_state(struct task_struct *p, char *buffer)
{
struct group_info *group_info;
int g;
@@ -165,7 +165,6 @@ static inline char * task_state(struct t
rcu_read_lock();
buffer += sprintf(buffer,
"State:\t%s\n"
- "SleepAVG:\t%lu%%\n"
"Tgid:\t%d\n"
"Pid:\t%d\n"
"PPid:\t%d\n"
@@ -173,9 +172,8 @@ static inline char * task_state(struct t
"Uid:\t%d\t%d\t%d\t%d\n"
"Gid:\t%d\t%d\t%d\t%d\n",
get_task_state(p),
- (p->sleep_avg/1024)*100/(1020000000/1024),
- p->tgid, p->pid,
- pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
+ p->tgid, p->pid,
+ pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
p->uid, p->euid, p->suid, p->fsuid,
p->gid, p->egid, p->sgid, p->fsgid);
@@ -193,15 +191,15 @@ static inline char * task_state(struct t
get_group_info(group_info);
task_unlock(p);
- for (g = 0; g < min(group_info->ngroups,NGROUPS_SMALL); g++)
- buffer += sprintf(buffer, "%d ", GROUP_AT(group_info,g));
+ for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++)
+ buffer += sprintf(buffer, "%d ", GROUP_AT(group_info, g));
put_group_info(group_info);
buffer += sprintf(buffer, "\n");
return buffer;
}
-static char * render_sigset_t(const char *header, sigset_t *set, char *buffer)
+static char *render_sigset_t(const char *header, sigset_t *set, char *buffer)
{
int i, len;
@@ -241,7 +239,7 @@ static void collect_sigign_sigcatch(stru
}
}
-static inline char * task_sig(struct task_struct *p, char *buffer)
+static inline char *task_sig(struct task_struct *p, char *buffer)
{
unsigned long flags;
sigset_t pending, shpending, blocked, ignored, caught;
@@ -291,14 +289,23 @@ static inline char *task_cap(struct task
cap_t(p->cap_effective));
}
-int proc_pid_status(struct task_struct *task, char * buffer)
+static inline char *task_context_switch_counts(struct task_struct *p,
+ char *buffer)
{
- char * orig = buffer;
+ return buffer + sprintf(buffer, "voluntary_ctxt_switches:\t%lu\n"
+ "nonvoluntary_ctxt_switches:\t%lu\n",
+ p->nvcsw,
+ p->nivcsw);
+}
+
+int proc_pid_status(struct task_struct *task, char *buffer)
+{
+ char *orig = buffer;
struct mm_struct *mm = get_task_mm(task);
buffer = task_name(task, buffer);
buffer = task_state(task, buffer);
-
+
if (mm) {
buffer = task_mem(mm, buffer);
mmput(mm);
@@ -309,10 +316,70 @@ int proc_pid_status(struct task_struct *
#if defined(CONFIG_S390)
buffer = task_show_regs(task, buffer);
#endif
+ buffer = task_context_switch_counts(task, buffer);
return buffer - orig;
}
-static int do_task_stat(struct task_struct *task, char * buffer, int whole)
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+static cputime_t task_utime(struct task_struct *p)
+{
+ return p->utime;
+}
+
+static cputime_t task_stime(struct task_struct *p)
+{
+ return p->stime;
+}
+#else
+static cputime_t task_utime(struct task_struct *p)
+{
+ clock_t utime = cputime_to_clock_t(p->utime),
+ total = utime + cputime_to_clock_t(p->stime);
+ u64 temp;
+
+ /*
+ * Use CFS's precise accounting:
+ */
+ temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
+
+ if (total) {
+ temp *= utime;
+ do_div(temp, total);
+ }
+ utime = (clock_t)temp;
+
+ p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
+ return p->prev_utime;
+}
+
+static cputime_t task_stime(struct task_struct *p)
+{
+ clock_t stime;
+
+ /*
+ * Use CFS's precise accounting. (we subtract utime from
+ * the total, to make sure the total observed by userspace
+ * grows monotonically - apps rely on that):
+ */
+ stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
+ cputime_to_clock_t(task_utime(p));
+
+ if (stime >= 0)
+ p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+
+ return p->prev_stime;
+}
+#endif
+
+static cputime_t task_gtime(struct task_struct *p)
+{
+ return p->gtime;
+}
+
+static int do_task_stat(struct task_struct *task, char *buffer, int whole)
{
unsigned long vsize, eip, esp, wchan = ~0UL;
long priority, nice;
@@ -320,13 +387,14 @@ static int do_task_stat(struct task_stru
sigset_t sigign, sigcatch;
char state;
int res;
- pid_t ppid = 0, pgid = -1, sid = -1;
+ pid_t ppid = 0, pgid = -1, sid = -1;
int num_threads = 0;
struct mm_struct *mm;
unsigned long long start_time;
unsigned long cmin_flt = 0, cmaj_flt = 0;
unsigned long min_flt = 0, maj_flt = 0;
cputime_t cutime, cstime, utime, stime;
+ cputime_t cgtime, gtime;
unsigned long rsslim = 0;
char tcomm[sizeof(task->comm)];
unsigned long flags;
@@ -345,6 +413,7 @@ static int do_task_stat(struct task_stru
sigemptyset(&sigign);
sigemptyset(&sigcatch);
cutime = cstime = utime = stime = cputime_zero;
+ cgtime = gtime = cputime_zero;
rcu_read_lock();
if (lock_task_sighand(task, &flags)) {
@@ -362,6 +431,7 @@ static int do_task_stat(struct task_stru
cmaj_flt = sig->cmaj_flt;
cutime = sig->cutime;
cstime = sig->cstime;
+ cgtime = sig->cgtime;
rsslim = sig->rlim[RLIMIT_RSS].rlim_cur;
/* add up live thread stats at the group level */
@@ -370,8 +440,9 @@ static int do_task_stat(struct task_stru
do {
min_flt += t->min_flt;
maj_flt += t->maj_flt;
- utime = cputime_add(utime, t->utime);
- stime = cputime_add(stime, t->stime);
+ utime = cputime_add(utime, task_utime(t));
+ stime = cputime_add(stime, task_stime(t));
+ gtime = cputime_add(gtime, task_gtime(t));
t = next_thread(t);
} while (t != task);
@@ -379,6 +450,7 @@ static int do_task_stat(struct task_stru
maj_flt += sig->maj_flt;
utime = cputime_add(utime, sig->utime);
stime = cputime_add(stime, sig->stime);
+ gtime = cputime_add(gtime, sig->gtime);
}
sid = signal_session(sig);
@@ -389,13 +461,14 @@ static int do_task_stat(struct task_stru
}
rcu_read_unlock();
- if (!whole || num_threads<2)
+ if (!whole || num_threads < 2)
wchan = get_wchan(task);
if (!whole) {
min_flt = task->min_flt;
maj_flt = task->maj_flt;
- utime = task->utime;
- stime = task->stime;
+ utime = task_utime(task);
+ stime = task_stime(task);
+ gtime = task_gtime(task);
}
/* scale priority and nice values from timeslices to -20..20 */
@@ -405,14 +478,15 @@ static int do_task_stat(struct task_stru
/* Temporary variable needed for gcc-2.96 */
/* convert timespec -> nsec*/
- start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC
+ start_time =
+ (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC
+ task->start_time.tv_nsec;
/* convert nsec -> ticks */
start_time = nsec_to_clock_t(start_time);
- res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %u %lu \
+ res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \
%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n",
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
task->pid,
tcomm,
state,
@@ -436,7 +510,7 @@ static int do_task_stat(struct task_stru
start_time,
vsize,
mm ? get_mm_rss(mm) : 0,
- rsslim,
+ rsslim,
mm ? mm->start_code : 0,
mm ? mm->end_code : 0,
mm ? mm->start_stack : 0,
@@ -457,18 +531,20 @@ static int do_task_stat(struct task_stru
task_cpu(task),
task->rt_priority,
task->policy,
- (unsigned long long)delayacct_blkio_ticks(task));
- if(mm)
+ (unsigned long long)delayacct_blkio_ticks(task),
+ cputime_to_clock_t(gtime),
+ cputime_to_clock_t(cgtime));
+ if (mm)
mmput(mm);
return res;
}
-int proc_tid_stat(struct task_struct *task, char * buffer)
+int proc_tid_stat(struct task_struct *task, char *buffer)
{
return do_task_stat(task, buffer, 0);
}
-int proc_tgid_stat(struct task_struct *task, char * buffer)
+int proc_tgid_stat(struct task_struct *task, char *buffer)
{
return do_task_stat(task, buffer, 1);
}
@@ -477,12 +553,12 @@ int proc_pid_statm(struct task_struct *t
{
int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0;
struct mm_struct *mm = get_task_mm(task);
-
+
if (mm) {
size = task_statm(mm, &shared, &text, &data, &resident);
mmput(mm);
}
- return sprintf(buffer,"%d %d %d %d %d %d %d\n",
+ return sprintf(buffer, "%d %d %d %d %d %d %d\n",
size, resident, shared, text, lib, data, 0);
}
Index: linux-cfs-2.6.22.13.q/fs/proc/base.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/fs/proc/base.c
+++ linux-cfs-2.6.22.13.q/fs/proc/base.c
@@ -296,10 +296,10 @@ static int proc_pid_wchan(struct task_st
*/
static int proc_pid_schedstat(struct task_struct *task, char *buffer)
{
- return sprintf(buffer, "%lu %lu %lu\n",
+ return sprintf(buffer, "%llu %llu %lu\n",
task->sched_info.cpu_time,
task->sched_info.run_delay,
- task->sched_info.pcnt);
+ task->sched_info.pcount);
}
#endif
@@ -929,6 +929,69 @@ static const struct file_operations proc
};
#endif
+#ifdef CONFIG_SCHED_DEBUG
+/*
+ * Print out various scheduling related per-task fields:
+ */
+static int sched_show(struct seq_file *m, void *v)
+{
+ struct inode *inode = m->private;
+ struct task_struct *p;
+
+ WARN_ON(!inode);
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+ proc_sched_show_task(p, m);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static ssize_t
+sched_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offset)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct task_struct *p;
+
+ WARN_ON(!inode);
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+ proc_sched_set_task(p);
+
+ put_task_struct(p);
+
+ return count;
+}
+
+static int sched_open(struct inode *inode, struct file *filp)
+{
+ int ret;
+
+ ret = single_open(filp, sched_show, NULL);
+ if (!ret) {
+ struct seq_file *m = filp->private_data;
+
+ m->private = inode;
+ }
+ return ret;
+}
+
+static const struct file_operations proc_pid_sched_operations = {
+ .open = sched_open,
+ .read = seq_read,
+ .write = sched_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#endif
+
static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct inode *inode = dentry->d_inode;
@@ -1963,6 +2026,9 @@ static const struct pid_entry tgid_base_
INF("environ", S_IRUSR, pid_environ),
INF("auxv", S_IRUSR, pid_auxv),
INF("status", S_IRUGO, pid_status),
+#ifdef CONFIG_SCHED_DEBUG
+ REG("sched", S_IRUGO|S_IWUSR, pid_sched),
+#endif
INF("cmdline", S_IRUGO, pid_cmdline),
INF("stat", S_IRUGO, tgid_stat),
INF("statm", S_IRUGO, pid_statm),
@@ -2247,6 +2313,9 @@ static const struct pid_entry tid_base_s
INF("environ", S_IRUSR, pid_environ),
INF("auxv", S_IRUSR, pid_auxv),
INF("status", S_IRUGO, pid_status),
+#ifdef CONFIG_SCHED_DEBUG
+ REG("sched", S_IRUGO|S_IWUSR, pid_sched),
+#endif
INF("cmdline", S_IRUGO, pid_cmdline),
INF("stat", S_IRUGO, tid_stat),
INF("statm", S_IRUGO, pid_statm),
Index: linux-cfs-2.6.22.13.q/fs/proc/proc_misc.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/fs/proc/proc_misc.c
+++ linux-cfs-2.6.22.13.q/fs/proc/proc_misc.c
@@ -442,10 +442,12 @@ static int show_stat(struct seq_file *p,
int i;
unsigned long jif;
cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
+ cputime64_t guest;
u64 sum = 0;
user = nice = system = idle = iowait =
irq = softirq = steal = cputime64_zero;
+ guest = cputime64_zero;
jif = - wall_to_monotonic.tv_sec;
if (wall_to_monotonic.tv_nsec)
--jif;
@@ -461,11 +463,12 @@ static int show_stat(struct seq_file *p,
irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
+ guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
for (j = 0 ; j < NR_IRQS ; j++)
sum += kstat_cpu(i).irqs[j];
}
- seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+ seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
(unsigned long long)cputime64_to_clock_t(user),
(unsigned long long)cputime64_to_clock_t(nice),
(unsigned long long)cputime64_to_clock_t(system),
@@ -473,7 +476,8 @@ static int show_stat(struct seq_file *p,
(unsigned long long)cputime64_to_clock_t(iowait),
(unsigned long long)cputime64_to_clock_t(irq),
(unsigned long long)cputime64_to_clock_t(softirq),
- (unsigned long long)cputime64_to_clock_t(steal));
+ (unsigned long long)cputime64_to_clock_t(steal),
+ (unsigned long long)cputime64_to_clock_t(guest));
for_each_online_cpu(i) {
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
@@ -485,7 +489,9 @@ static int show_stat(struct seq_file *p,
irq = kstat_cpu(i).cpustat.irq;
softirq = kstat_cpu(i).cpustat.softirq;
steal = kstat_cpu(i).cpustat.steal;
- seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu\n",
+ guest = kstat_cpu(i).cpustat.guest;
+ seq_printf(p,
+ "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
i,
(unsigned long long)cputime64_to_clock_t(user),
(unsigned long long)cputime64_to_clock_t(nice),
@@ -494,7 +500,8 @@ static int show_stat(struct seq_file *p,
(unsigned long long)cputime64_to_clock_t(iowait),
(unsigned long long)cputime64_to_clock_t(irq),
(unsigned long long)cputime64_to_clock_t(softirq),
- (unsigned long long)cputime64_to_clock_t(steal));
+ (unsigned long long)cputime64_to_clock_t(steal),
+ (unsigned long long)cputime64_to_clock_t(guest));
}
seq_printf(p, "intr %llu", (unsigned long long)sum);
Index: linux-cfs-2.6.22.13.q/include/asm-generic/bitops/sched.h
===================================================================
--- linux-cfs-2.6.22.13.q.orig/include/asm-generic/bitops/sched.h
+++ linux-cfs-2.6.22.13.q/include/asm-generic/bitops/sched.h
@@ -6,28 +6,23 @@
/*
* Every architecture must define this function. It's the fastest
- * way of searching a 140-bit bitmap where the first 100 bits are
- * unlikely to be set. It's guaranteed that at least one of the 140
- * bits is cleared.
+ * way of searching a 100-bit bitmap. It's guaranteed that at least
+ * one of the 100 bits is cleared.
*/
static inline int sched_find_first_bit(const unsigned long *b)
{
#if BITS_PER_LONG == 64
- if (unlikely(b[0]))
+ if (b[0])
return __ffs(b[0]);
- if (likely(b[1]))
- return __ffs(b[1]) + 64;
- return __ffs(b[2]) + 128;
+ return __ffs(b[1]) + 64;
#elif BITS_PER_LONG == 32
- if (unlikely(b[0]))
+ if (b[0])
return __ffs(b[0]);
- if (unlikely(b[1]))
+ if (b[1])
return __ffs(b[1]) + 32;
- if (unlikely(b[2]))
+ if (b[2])
return __ffs(b[2]) + 64;
- if (b[3])
- return __ffs(b[3]) + 96;
- return __ffs(b[4]) + 128;
+ return __ffs(b[3]) + 96;
#else
#error BITS_PER_LONG not defined
#endif
Index: linux-cfs-2.6.22.13.q/include/linux/cgroup.h
===================================================================
--- /dev/null
+++ linux-cfs-2.6.22.13.q/include/linux/cgroup.h
@@ -0,0 +1,12 @@
+#ifndef _LINUX_CGROUP_H
+#define _LINUX_CGROUP_H
+
+/*
+ * Control groups are not backported - we use a few compatibility
+ * defines to be able to use the upstream sched.c as-is:
+ */
+#define task_pid_nr(task) (task)->pid
+#define task_pid_vnr(task) (task)->pid
+#define find_task_by_vpid(pid) find_task_by_pid(pid)
+
+#endif
Index: linux-cfs-2.6.22.13.q/include/linux/cpu.h
===================================================================
--- linux-cfs-2.6.22.13.q.orig/include/linux/cpu.h
+++ linux-cfs-2.6.22.13.q/include/linux/cpu.h
@@ -41,8 +41,6 @@ extern void cpu_remove_sysdev_attr(struc
extern int cpu_add_sysdev_attr_group(struct attribute_group *attrs);
extern void cpu_remove_sysdev_attr_group(struct attribute_group *attrs);
-extern struct sysdev_attribute attr_sched_mc_power_savings;
-extern struct sysdev_attribute attr_sched_smt_power_savings;
extern int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls);
#ifdef CONFIG_HOTPLUG_CPU
Index: linux-cfs-2.6.22.13.q/include/linux/cpuset.h
===================================================================
--- linux-cfs-2.6.22.13.q.orig/include/linux/cpuset.h
+++ linux-cfs-2.6.22.13.q/include/linux/cpuset.h
@@ -146,6 +146,11 @@ static inline int cpuset_do_slab_mem_spr
static inline void cpuset_track_online_nodes(void) {}
+static inline cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p)
+{
+ return cpu_possible_map;
+}
+
#endif /* !CONFIG_CPUSETS */
#endif /* _LINUX_CPUSET_H */
Index: linux-cfs-2.6.22.13.q/include/linux/hardirq.h
===================================================================
--- linux-cfs-2.6.22.13.q.orig/include/linux/hardirq.h
+++ linux-cfs-2.6.22.13.q/include/linux/hardirq.h
@@ -79,6 +79,19 @@
#endif
#ifdef CONFIG_PREEMPT
+# define PREEMPT_CHECK_OFFSET 1
+#else
+# define PREEMPT_CHECK_OFFSET 0
+#endif
+
+/*
+ * Check whether we were atomic before we did preempt_disable():
+ * (used by the scheduler)
+ */
+#define in_atomic_preempt_off() \
+ ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
+
+#ifdef CONFIG_PREEMPT
# define preemptible() (preempt_count() == 0 && !irqs_disabled())
# define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1)
#else
Index: linux-cfs-2.6.22.13.q/include/linux/init_task.h
===================================================================
--- linux-cfs-2.6.22.13.q.orig/include/linux/init_task.h
+++ linux-cfs-2.6.22.13.q/include/linux/init_task.h
@@ -8,6 +8,7 @@
#include <linux/lockdep.h>
#include <linux/ipc.h>
#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
#define INIT_FDTABLE \
{ \
@@ -78,6 +79,7 @@ extern struct nsproxy init_nsproxy;
.uts_ns = &init_uts_ns, \
.mnt_ns = NULL, \
INIT_IPC_NS(ipc_ns) \
+ .user_ns = &init_user_ns, \
}
#define INIT_SIGHAND(sighand) { \
Index: linux-cfs-2.6.22.13.q/include/linux/kernel.h
===================================================================
--- linux-cfs-2.6.22.13.q.orig/include/linux/kernel.h
+++ linux-cfs-2.6.22.13.q/include/linux/kernel.h
@@ -60,6 +60,13 @@ extern const char linux_proc_banner[];
#define KERN_INFO "<6>" /* informational */
#define KERN_DEBUG "<7>" /* debug-level messages */
+/*
+ * Annotation for a "continued" line of log printout (only done after a
+ * line that had no enclosing \n). Only to be used by core/arch code
+ * during early bootup (a continued line is not SMP-safe otherwise).
+ */
+#define KERN_CONT ""
+
extern int console_printk[];
#define console_loglevel (console_printk[0])
Index: linux-cfs-2.6.22.13.q/include/linux/kernel_stat.h
===================================================================
--- linux-cfs-2.6.22.13.q.orig/include/linux/kernel_stat.h
+++ linux-cfs-2.6.22.13.q/include/linux/kernel_stat.h
@@ -23,6 +23,7 @@ struct cpu_usage_stat {
cputime64_t idle;
cputime64_t iowait;
cputime64_t steal;
+ cputime64_t guest;
};
struct kernel_stat {
@@ -52,7 +53,9 @@ static inline int kstat_irqs(int irq)
}
extern void account_user_time(struct task_struct *, cputime_t);
+extern void account_user_time_scaled(struct task_struct *, cputime_t);
extern void account_system_time(struct task_struct *, int, cputime_t);
+extern void account_system_time_scaled(struct task_struct *, cputime_t);
extern void account_steal_time(struct task_struct *, cputime_t);
#endif /* _LINUX_KERNEL_STAT_H */
Index: linux-cfs-2.6.22.13.q/include/linux/nodemask.h
===================================================================
--- linux-cfs-2.6.22.13.q.orig/include/linux/nodemask.h
+++ linux-cfs-2.6.22.13.q/include/linux/nodemask.h
@@ -338,31 +338,88 @@ static inline void __nodes_remap(nodemas
#endif /* MAX_NUMNODES */
/*
+ * Bitmasks that are kept for all the nodes.
+ */
+enum node_states {
+ N_POSSIBLE, /* The node could become online at some point */
+ N_ONLINE, /* The node is online */
+ N_NORMAL_MEMORY, /* The node has regular memory */
+#ifdef CONFIG_HIGHMEM
+ N_HIGH_MEMORY, /* The node has regular or high memory */
+#else
+ N_HIGH_MEMORY = N_NORMAL_MEMORY,
+#endif
+ N_CPU, /* The node has one or more cpus */
+ NR_NODE_STATES
+};
+
+/*
* The following particular system nodemasks and operations
* on them manage all possible and online nodes.
*/
-extern nodemask_t node_online_map;
-extern nodemask_t node_possible_map;
+extern nodemask_t node_states[NR_NODE_STATES];
#if MAX_NUMNODES > 1
-#define num_online_nodes() nodes_weight(node_online_map)
-#define num_possible_nodes() nodes_weight(node_possible_map)
-#define node_online(node) node_isset((node), node_online_map)
-#define node_possible(node) node_isset((node), node_possible_map)
-#define first_online_node first_node(node_online_map)
-#define next_online_node(nid) next_node((nid), node_online_map)
+static inline int node_state(int node, enum node_states state)
+{
+ return node_isset(node, node_states[state]);
+}
+
+static inline void node_set_state(int node, enum node_states state)
+{
+ __node_set(node, &node_states[state]);
+}
+
+static inline void node_clear_state(int node, enum node_states state)
+{
+ __node_clear(node, &node_states[state]);
+}
+
+static inline int num_node_state(enum node_states state)
+{
+ return nodes_weight(node_states[state]);
+}
+
+#define for_each_node_state(__node, __state) \
+ for_each_node_mask((__node), node_states[__state])
+
+#define first_online_node first_node(node_states[N_ONLINE])
+#define next_online_node(nid) next_node((nid), node_states[N_ONLINE])
+
extern int nr_node_ids;
#else
-#define num_online_nodes() 1
-#define num_possible_nodes() 1
-#define node_online(node) ((node) == 0)
-#define node_possible(node) ((node) == 0)
+
+static inline int node_state(int node, enum node_states state)
+{
+ return node == 0;
+}
+
+static inline void node_set_state(int node, enum node_states state)
+{
+}
+
+static inline void node_clear_state(int node, enum node_states state)
+{
+}
+
+static inline int num_node_state(enum node_states state)
+{
+ return 1;
+}
+
+#define for_each_node_state(node, __state) \
+ for ( (node) = 0; (node) == 0; (node) = 1)
+
#define first_online_node 0
#define next_online_node(nid) (MAX_NUMNODES)
#define nr_node_ids 1
+
#endif
+#define node_online_map node_states[N_ONLINE]
+#define node_possible_map node_states[N_POSSIBLE]
+
#define any_online_node(mask) \
({ \
int node; \
@@ -372,10 +429,15 @@ extern int nr_node_ids;
node; \
})
-#define node_set_online(node) set_bit((node), node_online_map.bits)
-#define node_set_offline(node) clear_bit((node), node_online_map.bits)
+#define num_online_nodes() num_node_state(N_ONLINE)
+#define num_possible_nodes() num_node_state(N_POSSIBLE)
+#define node_online(node) node_state((node), N_ONLINE)
+#define node_possible(node) node_state((node), N_POSSIBLE)
+
+#define node_set_online(node) node_set_state((node), N_ONLINE)
+#define node_set_offline(node) node_clear_state((node), N_ONLINE)
-#define for_each_node(node) for_each_node_mask((node), node_possible_map)
-#define for_each_online_node(node) for_each_node_mask((node), node_online_map)
+#define for_each_node(node) for_each_node_state(node, N_POSSIBLE)
+#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)
#endif /* __LINUX_NODEMASK_H */
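The separate node_online_map/node_possible_map masks above become slots in a single node_states[] array, with the old macros kept as wrappers. A compact userspace model of that indexing, using one unsigned long per state instead of nodemask_t, illustration only:

#include <stdio.h>

enum node_states { N_POSSIBLE, N_ONLINE, NR_NODE_STATES };

static unsigned long node_masks[NR_NODE_STATES];   /* one mask per state */

static void node_set_state(int node, enum node_states state)
{
        node_masks[state] |= 1UL << node;
}

static int node_state(int node, enum node_states state)
{
        return !!(node_masks[state] & (1UL << node));
}

int main(void)
{
        node_set_state(0, N_POSSIBLE);
        node_set_state(0, N_ONLINE);       /* was: node_set_online(0)  */
        node_set_state(1, N_POSSIBLE);     /* node 1 possible, offline */

        printf("node 1 online: %d\n", node_state(1, N_ONLINE));          /* 0 */
        printf("online nodes:  %d\n",
               __builtin_popcountl(node_masks[N_ONLINE]));               /* 1 */
        return 0;
}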
Index: linux-cfs-2.6.22.13.q/include/linux/nsproxy.h
===================================================================
--- linux-cfs-2.6.22.13.q.orig/include/linux/nsproxy.h
+++ linux-cfs-2.6.22.13.q/include/linux/nsproxy.h
@@ -28,10 +28,11 @@ struct nsproxy {
struct ipc_namespace *ipc_ns;
struct mnt_namespace *mnt_ns;
struct pid_namespace *pid_ns;
+ struct user_namespace *user_ns;
};
extern struct nsproxy init_nsproxy;
-int copy_namespaces(int flags, struct task_struct *tsk);
+int copy_namespaces(unsigned long flags, struct task_struct *tsk);
void get_task_namespaces(struct task_struct *tsk);
void free_nsproxy(struct nsproxy *ns);
int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
Index: linux-cfs-2.6.22.13.q/include/linux/sched.h
===================================================================
--- linux-cfs-2.6.22.13.q.orig/include/linux/sched.h
+++ linux-cfs-2.6.22.13.q/include/linux/sched.h
@@ -2,6 +2,15 @@
#define _LINUX_SCHED_H
#include <linux/auxvec.h> /* For AT_VECTOR_SIZE */
+#include <linux/cgroup.h> /* CFS backport details */
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED(x,y) \
+ DEFINE_PER_CPU(x,y) ____cacheline_aligned_in_smp
+
+#define COMPAT_REGISTER_SYSCTL
+
+/* backporting helper macro: */
+#define cpu_sibling_map(cpu) cpu_sibling_map[cpu]
/*
* cloning flags:
@@ -26,6 +35,7 @@
#define CLONE_STOPPED 0x02000000 /* Start in stopped state */
#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
#define CLONE_NEWIPC 0x08000000 /* New ipcs */
+#define CLONE_NEWUSER 0x10000000 /* New user namespace */
/*
* Scheduling policies
@@ -34,6 +44,8 @@
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
+/* SCHED_ISO: reserved but not implemented yet */
+#define SCHED_IDLE 5
#ifdef __KERNEL__
@@ -83,6 +95,7 @@ struct sched_param {
#include <linux/timer.h>
#include <linux/hrtimer.h>
#include <linux/task_io_accounting.h>
+#include <linux/kobject.h>
#include <asm/processor.h>
@@ -110,7 +123,7 @@ extern unsigned long avenrun[]; /* Load
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
-#define LOAD_FREQ (5*HZ) /* 5 sec intervals */
+#define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */
#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5 2014 /* 1/exp(5sec/5min) */
#define EXP_15 2037 /* 1/exp(5sec/15min) */
@@ -130,6 +143,27 @@ extern unsigned long nr_active(void);
extern unsigned long nr_iowait(void);
extern unsigned long weighted_cpuload(const int cpu);
+struct seq_file;
+struct cfs_rq;
+struct task_group;
+#ifdef CONFIG_SCHED_DEBUG
+extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
+extern void proc_sched_set_task(struct task_struct *p);
+extern void
+print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+#else
+static inline void
+proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+{
+}
+static inline void proc_sched_set_task(struct task_struct *p)
+{
+}
+static inline void
+print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
+{
+}
+#endif
/*
* Task state bitmask. NOTE! These bits are also
@@ -150,8 +184,7 @@ extern unsigned long weighted_cpuload(co
#define EXIT_ZOMBIE 16
#define EXIT_DEAD 32
/* in tsk->state again */
-#define TASK_NONINTERACTIVE 64
-#define TASK_DEAD 128
+#define TASK_DEAD 64
#define __set_task_state(tsk, state_value) \
do { (tsk)->state = (state_value); } while (0)
@@ -193,6 +226,7 @@ struct task_struct;
extern void sched_init(void);
extern void sched_init_smp(void);
extern void init_idle(struct task_struct *idle, int cpu);
+extern void init_idle_bootup_task(struct task_struct *idle);
extern cpumask_t nohz_cpu_mask;
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
@@ -264,6 +298,7 @@ extern signed long schedule_timeout_unin
asmlinkage void schedule(void);
struct nsproxy;
+struct user_namespace;
/* Maximum number of active map areas.. This is a random (large) number */
#define DEFAULT_MAX_MAP_COUNT 65536
@@ -469,6 +504,8 @@ struct signal_struct {
* in __exit_signal, except for the group leader.
*/
cputime_t utime, stime, cutime, cstime;
+ cputime_t gtime;
+ cputime_t cgtime;
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
unsigned long inblock, oublock, cinblock, coublock;
@@ -479,7 +516,7 @@ struct signal_struct {
* from jiffies_to_ns(utime + stime) if sched_clock uses something
* other than jiffies.)
*/
- unsigned long long sched_time;
+ unsigned long long sum_sched_runtime;
/*
* We don't bother to synchronize most readers of this at all,
@@ -521,31 +558,6 @@ struct signal_struct {
#define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */
#define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */
-
-/*
- * Priority of a process goes from 0..MAX_PRIO-1, valid RT
- * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
- * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
- * values are inverted: lower p->prio value means higher priority.
- *
- * The MAX_USER_RT_PRIO value allows the actual maximum
- * RT priority to be separate from the value exported to
- * user-space. This allows kernel threads to set their
- * priority to a value higher than any user task. Note:
- * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
- */
-
-#define MAX_USER_RT_PRIO 100
-#define MAX_RT_PRIO MAX_USER_RT_PRIO
-
-#define MAX_PRIO (MAX_RT_PRIO + 40)
-
-#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
-#define rt_task(p) rt_prio((p)->prio)
-#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
-#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH)
-#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))
-
/*
* Some day this will be a full-fledged user tracking system..
*/
@@ -568,10 +580,25 @@ struct user_struct {
#endif
/* Hash table maintenance information */
- struct list_head uidhash_list;
+ struct hlist_node uidhash_node;
uid_t uid;
+
+#ifdef CONFIG_FAIR_USER_SCHED
+ struct task_group *tg;
+#ifdef CONFIG_SYSFS
+ struct kset kset;
+ struct subsys_attribute user_attr;
+ struct work_struct work;
+#endif
+#endif
};
+#ifdef CONFIG_FAIR_USER_SCHED
+extern int uids_kobject_init(void);
+#else
+static inline int uids_kobject_init(void) { return 0; }
+#endif
+
extern struct user_struct *find_user(uid_t);
extern struct user_struct root_user;
@@ -583,13 +610,17 @@ struct reclaim_state;
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info {
/* cumulative counters */
- unsigned long cpu_time, /* time spent on the cpu */
- run_delay, /* time spent waiting on a runqueue */
- pcnt; /* # of timeslices run on this cpu */
+ unsigned long pcount; /* # of times run on this cpu */
+ unsigned long long cpu_time, /* time spent on the cpu */
+ run_delay; /* time spent waiting on a runqueue */
/* timestamps */
- unsigned long last_arrival, /* when we last ran on a cpu */
- last_queued; /* when we were last queued to run */
+ unsigned long long last_arrival,/* when we last ran on a cpu */
+ last_queued; /* when we were last queued to run */
+#ifdef CONFIG_SCHEDSTATS
+ /* BKL stats */
+ unsigned int bkl_count;
+#endif
};
#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
@@ -639,18 +670,24 @@ static inline int sched_info_on(void)
#endif
}
-enum idle_type
-{
- SCHED_IDLE,
- NOT_IDLE,
- NEWLY_IDLE,
- MAX_IDLE_TYPES
+enum cpu_idle_type {
+ CPU_IDLE,
+ CPU_NOT_IDLE,
+ CPU_NEWLY_IDLE,
+ CPU_MAX_IDLE_TYPES
};
/*
* sched-domains (multiprocessor balancing) declarations:
*/
-#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */
+
+/*
+ * Increase resolution of nice-level calculations:
+ */
+#define SCHED_LOAD_SHIFT 10
+#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
+
+#define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE
#ifdef CONFIG_SMP
#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */
@@ -703,7 +740,6 @@ struct sched_domain {
unsigned long max_interval; /* Maximum balance interval ms */
unsigned int busy_factor; /* less balancing by factor if busy */
unsigned int imbalance_pct; /* No balance until over watermark */
- unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
unsigned int busy_idx;
unsigned int idle_idx;
@@ -719,48 +755,57 @@ struct sched_domain {
#ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
- unsigned long lb_cnt[MAX_IDLE_TYPES];
- unsigned long lb_failed[MAX_IDLE_TYPES];
- unsigned long lb_balanced[MAX_IDLE_TYPES];
- unsigned long lb_imbalance[MAX_IDLE_TYPES];
- unsigned long lb_gained[MAX_IDLE_TYPES];
- unsigned long lb_hot_gained[MAX_IDLE_TYPES];
- unsigned long lb_nobusyg[MAX_IDLE_TYPES];
- unsigned long lb_nobusyq[MAX_IDLE_TYPES];
+ unsigned int lb_count[CPU_MAX_IDLE_TYPES];
+ unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
+ unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
+ unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
+ unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
+ unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
+ unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
+ unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
/* Active load balancing */
- unsigned long alb_cnt;
- unsigned long alb_failed;
- unsigned long alb_pushed;
+ unsigned int alb_count;
+ unsigned int alb_failed;
+ unsigned int alb_pushed;
/* SD_BALANCE_EXEC stats */
- unsigned long sbe_cnt;
- unsigned long sbe_balanced;
- unsigned long sbe_pushed;
+ unsigned int sbe_count;
+ unsigned int sbe_balanced;
+ unsigned int sbe_pushed;
/* SD_BALANCE_FORK stats */
- unsigned long sbf_cnt;
- unsigned long sbf_balanced;
- unsigned long sbf_pushed;
+ unsigned int sbf_count;
+ unsigned int sbf_balanced;
+ unsigned int sbf_pushed;
/* try_to_wake_up() stats */
- unsigned long ttwu_wake_remote;
- unsigned long ttwu_move_affine;
- unsigned long ttwu_move_balance;
+ unsigned int ttwu_wake_remote;
+ unsigned int ttwu_move_affine;
+ unsigned int ttwu_move_balance;
#endif
};
-extern int partition_sched_domains(cpumask_t *partition1,
- cpumask_t *partition2);
-
-/*
- * Maximum cache size the migration-costs auto-tuning code will
- * search from:
- */
-extern unsigned int max_cache_size;
+extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new);
#endif /* CONFIG_SMP */
+/*
+ * A runqueue laden with a single nice 0 task scores a weighted_cpuload of
+ * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a
+ * task of nice 0 or enough lower priority tasks to bring up the
+ * weighted_cpuload to that level.
+ */
+static inline int above_background_load(void)
+{
+ unsigned long cpu;
+
+ for_each_online_cpu(cpu) {
+ if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE)
+ return 1;
+ }
+ return 0;
+}
struct io_context; /* See blkdev.h */
struct cpuset;
@@ -809,14 +854,101 @@ struct mempolicy;
struct pipe_inode_info;
struct uts_namespace;
-enum sleep_type {
- SLEEP_NORMAL,
- SLEEP_NONINTERACTIVE,
- SLEEP_INTERACTIVE,
- SLEEP_INTERRUPTED,
+struct rq;
+struct sched_domain;
+
+struct sched_class {
+ const struct sched_class *next;
+
+ void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
+ void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
+ void (*yield_task) (struct rq *rq);
+
+ void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
+
+ struct task_struct * (*pick_next_task) (struct rq *rq);
+ void (*put_prev_task) (struct rq *rq, struct task_struct *p);
+
+#ifdef CONFIG_SMP
+ unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
+ struct rq *busiest, unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned, int *this_best_prio);
+
+ int (*move_one_task) (struct rq *this_rq, int this_cpu,
+ struct rq *busiest, struct sched_domain *sd,
+ enum cpu_idle_type idle);
+#endif
+
+ void (*set_curr_task) (struct rq *rq);
+ void (*task_tick) (struct rq *rq, struct task_struct *p);
+ void (*task_new) (struct rq *rq, struct task_struct *p);
};
-struct prio_array;
+struct load_weight {
+ unsigned long weight, inv_weight;
+};
+
+/*
+ * CFS stats for a schedulable entity (task, task-group etc)
+ *
+ * Current field usage histogram:
+ *
+ * 4 se->block_start
+ * 4 se->run_node
+ * 4 se->sleep_start
+ * 6 se->load.weight
+ */
+struct sched_entity {
+ struct load_weight load; /* for load-balancing */
+ struct avl_node run_node;
+ unsigned int on_rq;
+
+ u64 exec_start;
+ u64 sum_exec_runtime;
+ u64 vruntime;
+ u64 prev_sum_exec_runtime;
+
+#ifdef CONFIG_SCHEDSTATS
+ u64 wait_start;
+ u64 wait_max;
+
+ u64 sleep_start;
+ u64 sleep_max;
+ s64 sum_sleep_runtime;
+
+ u64 block_start;
+ u64 block_max;
+ u64 exec_max;
+ u64 slice_max;
+
+ u64 nr_migrations;
+ u64 nr_migrations_cold;
+ u64 nr_failed_migrations_affine;
+ u64 nr_failed_migrations_running;
+ u64 nr_failed_migrations_hot;
+ u64 nr_forced_migrations;
+ u64 nr_forced2_migrations;
+
+ u64 nr_wakeups;
+ u64 nr_wakeups_sync;
+ u64 nr_wakeups_migrate;
+ u64 nr_wakeups_local;
+ u64 nr_wakeups_remote;
+ u64 nr_wakeups_affine;
+ u64 nr_wakeups_affine_attempts;
+ u64 nr_wakeups_passive;
+ u64 nr_wakeups_idle;
+#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct sched_entity *parent;
+ /* rq on which this entity is (to be) queued: */
+ struct cfs_rq *cfs_rq;
+ /* rq "owned" by this entity/group: */
+ struct cfs_rq *my_q;
+#endif
+};
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
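The run_node member of the sched_entity above is the one place where our avl_node stands in for the upstream rb_node: the runqueue tree only ever stores that embedded node, and the scheduler recovers the entity with container_of(). A standalone model of that round trip, with simplified types and not part of the patch:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct avl_node_model {                   /* stand-in for our struct avl_node */
        struct avl_node_model *left, *right, *parent;
        int balf;
};

struct sched_entity_model {
        unsigned long long vruntime;
        struct avl_node_model run_node;   /* embedded, like se->run_node above */
};

int main(void)
{
        struct sched_entity_model se = { .vruntime = 42 };
        struct avl_node_model *node = &se.run_node;   /* what the tree stores */

        /* what a pick-next or dequeue path gets back out of the tree */
        struct sched_entity_model *back =
                container_of(node, struct sched_entity_model, run_node);

        printf("%llu\n", back->vruntime);             /* 42 */
        return 0;
}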
@@ -832,23 +964,25 @@ struct task_struct {
int oncpu;
#endif
#endif
- int load_weight; /* for niceness load balancing purposes */
+
int prio, static_prio, normal_prio;
struct list_head run_list;
- struct prio_array *array;
+ const struct sched_class *sched_class;
+ struct sched_entity se;
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ /* list of struct preempt_notifier: */
+ struct hlist_head preempt_notifiers;
+#endif
unsigned short ioprio;
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif
- unsigned long sleep_avg;
- unsigned long long timestamp, last_ran;
- unsigned long long sched_time; /* sched_clock time spent running */
- enum sleep_type sleep_type;
unsigned int policy;
cpumask_t cpus_allowed;
- unsigned int time_slice, first_time_slice;
+ unsigned int time_slice;
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
@@ -903,9 +1037,12 @@ struct task_struct {
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
unsigned int rt_priority;
- cputime_t utime, stime;
+ cputime_t utime, stime, utimescaled, stimescaled;
+ cputime_t gtime;
+ cputime_t prev_utime, prev_stime;
unsigned long nvcsw, nivcsw; /* context switch counts */
- struct timespec start_time;
+ struct timespec start_time; /* monotonic time */
+ struct timespec real_start_time; /* boot based time */
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt;
@@ -1078,6 +1215,37 @@ struct task_struct {
#endif
};
+/*
+ * Priority of a process goes from 0..MAX_PRIO-1, valid RT
+ * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
+ * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
+ * values are inverted: lower p->prio value means higher priority.
+ *
+ * The MAX_USER_RT_PRIO value allows the actual maximum
+ * RT priority to be separate from the value exported to
+ * user-space. This allows kernel threads to set their
+ * priority to a value higher than any user task. Note:
+ * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
+ */
+
+#define MAX_USER_RT_PRIO 100
+#define MAX_RT_PRIO MAX_USER_RT_PRIO
+
+#define MAX_PRIO (MAX_RT_PRIO + 40)
+#define DEFAULT_PRIO (MAX_RT_PRIO + 20)
+
+static inline int rt_prio(int prio)
+{
+ if (unlikely(prio < MAX_RT_PRIO))
+ return 1;
+ return 0;
+}
+
+static inline int rt_task(struct task_struct *p)
+{
+ return rt_prio(p->prio);
+}
+
static inline pid_t process_group(struct task_struct *tsk)
{
return tsk->signal->pgrp;
@@ -1163,6 +1331,7 @@ static inline void put_task_struct(struc
#define PF_STARTING 0x00000002 /* being created */
#define PF_EXITING 0x00000004 /* getting shut down */
#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
+#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
#define PF_DUMPCORE 0x00000200 /* dumped core */
@@ -1222,8 +1391,15 @@ static inline int set_cpus_allowed(struc
#endif
extern unsigned long long sched_clock(void);
+
+/*
+ * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
+ * clock constructed from sched_clock():
+ */
+extern unsigned long long cpu_clock(int cpu);
+
extern unsigned long long
-current_sched_time(const struct task_struct *current_task);
+task_sched_runtime(struct task_struct *task);
/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
@@ -1232,6 +1408,9 @@ extern void sched_exec(void);
#define sched_exec() {}
#endif
+extern void sched_clock_idle_sleep_event(void);
+extern void sched_clock_idle_wakeup_event(u64 delta_ns);
+
#ifdef CONFIG_HOTPLUG_CPU
extern void idle_task_exit(void);
#else
@@ -1240,6 +1419,27 @@ static inline void idle_task_exit(void)
extern void sched_idle_next(void);
+#ifdef CONFIG_SCHED_DEBUG
+extern unsigned int sysctl_sched_latency;
+extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_wakeup_granularity;
+extern unsigned int sysctl_sched_batch_wakeup_granularity;
+extern unsigned int sysctl_sched_child_runs_first;
+extern unsigned int sysctl_sched_features;
+extern unsigned int sysctl_sched_migration_cost;
+extern unsigned int sysctl_sched_nr_migrate;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+extern unsigned int sysctl_sched_min_bal_int_shares;
+extern unsigned int sysctl_sched_max_bal_int_shares;
+#endif
+
+int sched_nr_latency_handler(struct ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *length,
+ loff_t *ppos);
+#endif
+
+extern unsigned int sysctl_sched_compat_yield;
+
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
extern void rt_mutex_setprio(struct task_struct *p, int prio);
@@ -1295,7 +1495,7 @@ extern struct task_struct *find_task_by_
extern void __set_special_pids(pid_t session, pid_t pgrp);
/* per-UID process charging. */
-extern struct user_struct * alloc_uid(uid_t);
+extern struct user_struct * alloc_uid(struct user_namespace *, uid_t);
static inline struct user_struct *get_uid(struct user_struct *u)
{
atomic_inc(&u->__count);
@@ -1303,6 +1503,7 @@ static inline struct user_struct *get_ui
}
extern void free_uid(struct user_struct *);
extern void switch_uid(struct user_struct *);
+extern void release_uids(struct user_namespace *ns);
#include <asm/current.h>
@@ -1317,8 +1518,8 @@ extern void FASTCALL(wake_up_new_task(st
#else
static inline void kick_process(struct task_struct *tsk) { }
#endif
-extern void FASTCALL(sched_fork(struct task_struct * p, int clone_flags));
-extern void FASTCALL(sched_exit(struct task_struct * p));
+extern void sched_fork(struct task_struct *p, int clone_flags);
+extern void sched_dead(struct task_struct *p);
extern int in_group_p(gid_t);
extern int in_egroup_p(gid_t);
@@ -1406,7 +1607,7 @@ extern struct mm_struct * mm_alloc(void)
extern void FASTCALL(__mmdrop(struct mm_struct *));
static inline void mmdrop(struct mm_struct * mm)
{
- if (atomic_dec_and_test(&mm->mm_count))
+ if (unlikely(atomic_dec_and_test(&mm->mm_count)))
__mmdrop(mm);
}
@@ -1638,10 +1839,7 @@ static inline unsigned int task_cpu(cons
return task_thread_info(p)->cpu;
}
-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
- task_thread_info(p)->cpu = cpu;
-}
+extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
#else
@@ -1674,6 +1872,18 @@ extern int sched_mc_power_savings, sched
extern void normalize_rt_tasks(void);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+extern struct task_group init_task_group;
+
+extern struct task_group *sched_create_group(void);
+extern void sched_destroy_group(struct task_group *tg);
+extern void sched_move_task(struct task_struct *tsk);
+extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+extern unsigned long sched_group_shares(struct task_group *tg);
+
+#endif
+
#ifdef CONFIG_TASK_XACCT
static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
{
@@ -1712,6 +1922,14 @@ static inline void inc_syscw(struct task
}
#endif
+#ifdef CONFIG_SMP
+void migration_init(void);
+#else
+static inline void migration_init(void)
+{
+}
+#endif
+
#endif /* __KERNEL__ */
#endif
Index: linux-cfs-2.6.22.13.q/include/linux/taskstats.h
===================================================================
--- linux-cfs-2.6.22.13.q.orig/include/linux/taskstats.h
+++ linux-cfs-2.6.22.13.q/include/linux/taskstats.h
@@ -31,7 +31,7 @@
*/
-#define TASKSTATS_VERSION 4
+#define TASKSTATS_VERSION 6
#define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN
* in linux/sched.h */
@@ -149,6 +149,14 @@ struct taskstats {
__u64 read_bytes; /* bytes of read I/O */
__u64 write_bytes; /* bytes of write I/O */
__u64 cancelled_write_bytes; /* bytes of cancelled write I/O */
+
+ __u64 nvcsw; /* voluntary_ctxt_switches */
+ __u64 nivcsw; /* nonvoluntary_ctxt_switches */
+
+ /* time accounting for SMT machines */
+ __u64 ac_utimescaled; /* utime scaled on frequency etc */
+ __u64 ac_stimescaled; /* stime scaled on frequency etc */
+ __u64 cpu_scaled_run_real_total; /* scaled cpu_run_real_total */
};
Index: linux-cfs-2.6.22.13.q/include/linux/topology.h
===================================================================
--- linux-cfs-2.6.22.13.q.orig/include/linux/topology.h
+++ linux-cfs-2.6.22.13.q/include/linux/topology.h
@@ -50,10 +50,10 @@
for_each_online_node(node) \
if (nr_cpus_node(node))
-#ifndef node_distance
/* Conform to ACPI 2.0 SLIT distance definitions */
#define LOCAL_DISTANCE 10
#define REMOTE_DISTANCE 20
+#ifndef node_distance
#define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
#endif
#ifndef RECLAIM_DISTANCE
@@ -98,7 +98,7 @@
.cache_nice_tries = 0, \
.busy_idx = 0, \
.idle_idx = 0, \
- .newidle_idx = 1, \
+ .newidle_idx = 0, \
.wake_idx = 0, \
.forkexec_idx = 0, \
.flags = SD_LOAD_BALANCE \
@@ -128,14 +128,15 @@
.imbalance_pct = 125, \
.cache_nice_tries = 1, \
.busy_idx = 2, \
- .idle_idx = 1, \
- .newidle_idx = 2, \
+ .idle_idx = 0, \
+ .newidle_idx = 0, \
.wake_idx = 1, \
.forkexec_idx = 1, \
.flags = SD_LOAD_BALANCE \
| SD_BALANCE_NEWIDLE \
| SD_BALANCE_EXEC \
| SD_WAKE_AFFINE \
+ | SD_WAKE_IDLE \
| SD_SHARE_PKG_RESOURCES\
| BALANCE_FOR_MC_POWER, \
.last_balance = jiffies, \
@@ -183,7 +184,6 @@
.max_interval = 64*num_online_cpus(), \
.busy_factor = 128, \
.imbalance_pct = 133, \
- .cache_hot_time = (10*1000000), \
.cache_nice_tries = 1, \
.busy_idx = 3, \
.idle_idx = 3, \
Index: linux-cfs-2.6.22.13.q/include/linux/user_namespace.h
===================================================================
--- /dev/null
+++ linux-cfs-2.6.22.13.q/include/linux/user_namespace.h
@@ -0,0 +1,61 @@
+#ifndef _LINUX_USER_NAMESPACE_H
+#define _LINUX_USER_NAMESPACE_H
+
+#include <linux/kref.h>
+#include <linux/nsproxy.h>
+#include <linux/sched.h>
+#include <linux/err.h>
+
+#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 8)
+#define UIDHASH_SZ (1 << UIDHASH_BITS)
+
+struct user_namespace {
+ struct kref kref;
+ struct hlist_head uidhash_table[UIDHASH_SZ];
+ struct user_struct *root_user;
+};
+
+extern struct user_namespace init_user_ns;
+
+#ifdef CONFIG_USER_NS
+
+static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
+{
+ if (ns)
+ kref_get(&ns->kref);
+ return ns;
+}
+
+extern struct user_namespace *copy_user_ns(int flags,
+ struct user_namespace *old_ns);
+extern void free_user_ns(struct kref *kref);
+
+static inline void put_user_ns(struct user_namespace *ns)
+{
+ if (ns)
+ kref_put(&ns->kref, free_user_ns);
+}
+
+#else
+
+static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
+{
+ return &init_user_ns;
+}
+
+static inline struct user_namespace *copy_user_ns(int flags,
+ struct user_namespace *old_ns)
+{
+ if (flags & CLONE_NEWUSER)
+ return ERR_PTR(-EINVAL);
+
+ return old_ns;
+}
+
+static inline void put_user_ns(struct user_namespace *ns)
+{
+}
+
+#endif
+
+#endif /* _LINUX_USER_NAMESPACE_H */
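get_user_ns()/put_user_ns() above follow the usual kref get/put lifetime rule. A userspace model of that rule, illustration only:

#include <stdio.h>
#include <stdlib.h>

struct user_ns_model {
        int refcount;                     /* stand-in for struct kref */
};

static struct user_ns_model *get_ns(struct user_ns_model *ns)
{
        if (ns)
                ns->refcount++;
        return ns;
}

static void put_ns(struct user_ns_model *ns)
{
        if (ns && --ns->refcount == 0) {  /* last reference frees the namespace */
                printf("freeing namespace\n");
                free(ns);
        }
}

int main(void)
{
        struct user_ns_model *ns = calloc(1, sizeof(*ns));

        ns->refcount = 1;                 /* creator's reference              */
        get_ns(ns);                       /* another task attaches            */
        put_ns(ns);                       /* that task exits                  */
        put_ns(ns);                       /* creator exits -> namespace freed */
        return 0;
}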
Index: linux-cfs-2.6.22.13.q/init/Kconfig
===================================================================
--- linux-cfs-2.6.22.13.q.orig/init/Kconfig
+++ linux-cfs-2.6.22.13.q/init/Kconfig
@@ -284,6 +284,11 @@ config LOG_BUF_SHIFT
config CPUSETS
bool "Cpuset support"
depends on SMP
+ #
+ # disabled for now - depends on control groups, which
+ # are hard to backport:
+ #
+ depends on 0
help
This option will let you create and manage CPUSETs which
allow dynamically partitioning a system into sets of CPUs and
@@ -292,6 +297,27 @@ config CPUSETS
Say N if unsure.
+config FAIR_GROUP_SCHED
+ bool "Fair group CPU scheduler"
+ default y
+ depends on EXPERIMENTAL
+ help
+ This feature lets the CPU scheduler recognize task groups and control CPU
+ bandwidth allocation to such task groups.
+
+choice
+ depends on FAIR_GROUP_SCHED
+ prompt "Basis for grouping tasks"
+ default FAIR_USER_SCHED
+
+config FAIR_USER_SCHED
+ bool "user id"
+ help
+ This option will choose userid as the basis for grouping
+ tasks, thus providing equal CPU bandwidth to each user.
+
+endchoice
+
config SYSFS_DEPRECATED
bool "Create deprecated sysfs files"
default y
Index: linux-cfs-2.6.22.13.q/init/main.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/init/main.c
+++ linux-cfs-2.6.22.13.q/init/main.c
@@ -436,15 +436,16 @@ static void noinline __init_refok rest_i
/*
* The boot idle thread must execute schedule()
- * at least one to get things moving:
+ * at least once to get things moving:
*/
+ init_idle_bootup_task(current);
preempt_enable_no_resched();
schedule();
preempt_disable();
/* Call into cpu_idle with preempt disabled */
cpu_idle();
-}
+}
/* Check for early params. */
static int __init do_early_param(char *param, char *val)
@@ -727,11 +728,8 @@ static void __init do_basic_setup(void)
static void __init do_pre_smp_initcalls(void)
{
extern int spawn_ksoftirqd(void);
-#ifdef CONFIG_SMP
- extern int migration_init(void);
migration_init();
-#endif
spawn_ksoftirqd();
spawn_softlockup_task();
}
Index: linux-cfs-2.6.22.13.q/kernel/Makefile
===================================================================
--- linux-cfs-2.6.22.13.q.orig/kernel/Makefile
+++ linux-cfs-2.6.22.13.q/kernel/Makefile
@@ -4,7 +4,7 @@
obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
exit.o itimer.o time.o softirq.o resource.o \
- sysctl.o capability.o ptrace.o timer.o user.o \
+ sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
Index: linux-cfs-2.6.22.13.q/kernel/delayacct.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/kernel/delayacct.c
+++ linux-cfs-2.6.22.13.q/kernel/delayacct.c
@@ -99,9 +99,10 @@ void __delayacct_blkio_end(void)
int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
{
s64 tmp;
- struct timespec ts;
- unsigned long t1,t2,t3;
+ unsigned long t1;
+ unsigned long long t2,t3;
unsigned long flags;
+ struct timespec ts;
/* Though tsk->delays accessed later, early exit avoids
* unnecessary returning of other data
@@ -114,21 +115,26 @@ int __delayacct_add_tsk(struct taskstats
tmp += timespec_to_ns(&ts);
d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
+ tmp = (s64)d->cpu_scaled_run_real_total;
+ cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts);
+ tmp += timespec_to_ns(&ts);
+ d->cpu_scaled_run_real_total =
+ (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
+
/*
* No locking available for sched_info (and too expensive to add one)
* Mitigate by taking snapshot of values
*/
- t1 = tsk->sched_info.pcnt;
+ t1 = tsk->sched_info.pcount;
t2 = tsk->sched_info.run_delay;
t3 = tsk->sched_info.cpu_time;
d->cpu_count += t1;
- jiffies_to_timespec(t2, &ts);
- tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts);
+ tmp = (s64)d->cpu_delay_total + t2;
d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
- tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000;
+ tmp = (s64)d->cpu_run_virtual_total + t3;
d->cpu_run_virtual_total =
(tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
Index: linux-cfs-2.6.22.13.q/kernel/exit.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/kernel/exit.c
+++ linux-cfs-2.6.22.13.q/kernel/exit.c
@@ -118,13 +118,14 @@ static void __exit_signal(struct task_st
*/
sig->utime = cputime_add(sig->utime, tsk->utime);
sig->stime = cputime_add(sig->stime, tsk->stime);
+ sig->gtime = cputime_add(sig->gtime, tsk->gtime);
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
sig->nivcsw += tsk->nivcsw;
- sig->sched_time += tsk->sched_time;
sig->inblock += task_io_get_inblock(tsk);
sig->oublock += task_io_get_oublock(tsk);
+ sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig = NULL; /* Marker for below. */
}
@@ -182,7 +183,6 @@ repeat:
zap_leader = (leader->exit_signal == -1);
}
- sched_exit(p);
write_unlock_irq(&tasklist_lock);
proc_flush_task(p);
release_thread(p);
@@ -291,7 +291,7 @@ static void reparent_to_kthreadd(void)
/* Set the exit signal to SIGCHLD so we signal init on exit */
current->exit_signal = SIGCHLD;
- if (!has_rt_policy(current) && (task_nice(current) < 0))
+ if (task_nice(current) < 0)
set_user_nice(current, 0);
/* cpus_allowed? */
/* rt_priority? */
@@ -1216,6 +1216,11 @@ static int wait_task_zombie(struct task_
cputime_add(p->stime,
cputime_add(sig->stime,
sig->cstime)));
+ psig->cgtime =
+ cputime_add(psig->cgtime,
+ cputime_add(p->gtime,
+ cputime_add(sig->gtime,
+ sig->cgtime)));
psig->cmin_flt +=
p->min_flt + sig->min_flt + sig->cmin_flt;
psig->cmaj_flt +=
Index: linux-cfs-2.6.22.13.q/kernel/fork.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/kernel/fork.c
+++ linux-cfs-2.6.22.13.q/kernel/fork.c
@@ -874,10 +874,12 @@ static inline int copy_signal(unsigned l
sig->tty_old_pgrp = NULL;
sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
+ sig->gtime = cputime_zero;
+ sig->cgtime = cputime_zero;
sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
- sig->sched_time = 0;
+ sig->sum_sched_runtime = 0;
INIT_LIST_HEAD(&sig->cpu_timers[0]);
INIT_LIST_HEAD(&sig->cpu_timers[1]);
INIT_LIST_HEAD(&sig->cpu_timers[2]);
@@ -999,7 +1001,7 @@ static struct task_struct *copy_process(
if (atomic_read(&p->user->processes) >=
p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
- p->user != &root_user)
+ p->user != current->nsproxy->user_ns->root_user)
goto bad_fork_free;
}
@@ -1040,7 +1042,12 @@ static struct task_struct *copy_process(
p->utime = cputime_zero;
p->stime = cputime_zero;
- p->sched_time = 0;
+ p->prev_utime = cputime_zero;
+ p->prev_stime = cputime_zero;
+ p->gtime = cputime_zero;
+ p->utimescaled = cputime_zero;
+ p->stimescaled = cputime_zero;
+
#ifdef CONFIG_TASK_XACCT
p->rchar = 0; /* I/O counter: bytes read */
p->wchar = 0; /* I/O counter: bytes written */
@@ -1601,7 +1608,7 @@ asmlinkage long sys_unshare(unsigned lon
err = -EINVAL;
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
- CLONE_NEWUTS|CLONE_NEWIPC))
+ CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER))
goto bad_unshare_out;
if ((err = unshare_thread(unshare_flags)))
Index: linux-cfs-2.6.22.13.q/kernel/ksysfs.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/kernel/ksysfs.c
+++ linux-cfs-2.6.22.13.q/kernel/ksysfs.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kexec.h>
+#include <linux/sched.h>
#define KERNEL_ATTR_RO(_name) \
static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -88,6 +89,13 @@ static int __init ksysfs_init(void)
error = sysfs_create_group(&kernel_subsys.kobj,
&kernel_attr_group);
+ /*
+ * Create "/sys/kernel/uids" directory and corresponding root user's
+ * directory under it.
+ */
+ if (!error)
+ error = uids_kobject_init();
+
return error;
}
Index: linux-cfs-2.6.22.13.q/kernel/nsproxy.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/kernel/nsproxy.c
+++ linux-cfs-2.6.22.13.q/kernel/nsproxy.c
@@ -20,6 +20,9 @@
#include <linux/mnt_namespace.h>
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
+
+static struct kmem_cache *nsproxy_cachep;
struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
@@ -43,9 +46,11 @@ static inline struct nsproxy *clone_nspr
{
struct nsproxy *ns;
- ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL);
- if (ns)
+ ns = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
+ if (ns) {
+ memcpy(ns, orig, sizeof(struct nsproxy));
atomic_set(&ns->count, 1);
+ }
return ns;
}
@@ -54,33 +59,51 @@ static inline struct nsproxy *clone_nspr
* Return the newly created nsproxy. Do not attach this to the task,
* leave it to the caller to do proper locking and attach it to task.
*/
-static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk,
- struct fs_struct *new_fs)
+static struct nsproxy *create_new_namespaces(unsigned long flags,
+ struct task_struct *tsk, struct fs_struct *new_fs)
{
struct nsproxy *new_nsp;
+ int err;
new_nsp = clone_nsproxy(tsk->nsproxy);
if (!new_nsp)
return ERR_PTR(-ENOMEM);
new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs);
- if (IS_ERR(new_nsp->mnt_ns))
+ if (IS_ERR(new_nsp->mnt_ns)) {
+ err = PTR_ERR(new_nsp->mnt_ns);
goto out_ns;
+ }
new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns);
- if (IS_ERR(new_nsp->uts_ns))
+ if (IS_ERR(new_nsp->uts_ns)) {
+ err = PTR_ERR(new_nsp->uts_ns);
goto out_uts;
+ }
new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns);
- if (IS_ERR(new_nsp->ipc_ns))
+ if (IS_ERR(new_nsp->ipc_ns)) {
+ err = PTR_ERR(new_nsp->ipc_ns);
goto out_ipc;
+ }
new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns);
- if (IS_ERR(new_nsp->pid_ns))
+ if (IS_ERR(new_nsp->pid_ns)) {
+ err = PTR_ERR(new_nsp->pid_ns);
goto out_pid;
+ }
+
+ new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns);
+ if (IS_ERR(new_nsp->user_ns)) {
+ err = PTR_ERR(new_nsp->user_ns);
+ goto out_user;
+ }
return new_nsp;
+out_user:
+ if (new_nsp->pid_ns)
+ put_pid_ns(new_nsp->pid_ns);
out_pid:
if (new_nsp->ipc_ns)
put_ipc_ns(new_nsp->ipc_ns);
@@ -91,15 +114,15 @@ out_uts:
if (new_nsp->mnt_ns)
put_mnt_ns(new_nsp->mnt_ns);
out_ns:
- kfree(new_nsp);
- return ERR_PTR(-ENOMEM);
+ kmem_cache_free(nsproxy_cachep, new_nsp);
+ return ERR_PTR(err);
}
/*
* called from clone. This now handles copy for nsproxy and all
* namespaces therein.
*/
-int copy_namespaces(int flags, struct task_struct *tsk)
+int copy_namespaces(unsigned long flags, struct task_struct *tsk)
{
struct nsproxy *old_ns = tsk->nsproxy;
struct nsproxy *new_ns;
@@ -110,7 +133,7 @@ int copy_namespaces(int flags, struct ta
get_nsproxy(old_ns);
- if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
+ if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER)))
return 0;
if (!capable(CAP_SYS_ADMIN)) {
@@ -140,7 +163,9 @@ void free_nsproxy(struct nsproxy *ns)
put_ipc_ns(ns->ipc_ns);
if (ns->pid_ns)
put_pid_ns(ns->pid_ns);
- kfree(ns);
+ if (ns->user_ns)
+ put_user_ns(ns->user_ns);
+ kmem_cache_free(nsproxy_cachep, ns);
}
/*
@@ -152,19 +177,10 @@ int unshare_nsproxy_namespaces(unsigned
{
int err = 0;
- if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
+ if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+ CLONE_NEWUSER)))
return 0;
-#ifndef CONFIG_IPC_NS
- if (unshare_flags & CLONE_NEWIPC)
- return -EINVAL;
-#endif
-
-#ifndef CONFIG_UTS_NS
- if (unshare_flags & CLONE_NEWUTS)
- return -EINVAL;
-#endif
-
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -174,3 +190,12 @@ int unshare_nsproxy_namespaces(unsigned
err = PTR_ERR(*new_nsp);
return err;
}
+
+static int __init nsproxy_cache_init(void)
+{
+ nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy),
+ 0, SLAB_PANIC, NULL, NULL);
+ return 0;
+}
+
+module_init(nsproxy_cache_init);
Index: linux-cfs-2.6.22.13.q/kernel/posix-cpu-timers.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/kernel/posix-cpu-timers.c
+++ linux-cfs-2.6.22.13.q/kernel/posix-cpu-timers.c
@@ -161,7 +161,7 @@ static inline cputime_t virt_ticks(struc
}
static inline unsigned long long sched_ns(struct task_struct *p)
{
- return (p == current) ? current_sched_time(p) : p->sched_time;
+ return task_sched_runtime(p);
}
int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
@@ -246,10 +246,10 @@ static int cpu_clock_sample_group_locked
} while (t != p);
break;
case CPUCLOCK_SCHED:
- cpu->sched = p->signal->sched_time;
+ cpu->sched = p->signal->sum_sched_runtime;
/* Add in each other live thread. */
while ((t = next_thread(t)) != p) {
- cpu->sched += t->sched_time;
+ cpu->sched += t->se.sum_exec_runtime;
}
cpu->sched += sched_ns(p);
break;
@@ -422,7 +422,7 @@ int posix_cpu_timer_del(struct k_itimer
*/
static void cleanup_timers(struct list_head *head,
cputime_t utime, cputime_t stime,
- unsigned long long sched_time)
+ unsigned long long sum_exec_runtime)
{
struct cpu_timer_list *timer, *next;
cputime_t ptime = cputime_add(utime, stime);
@@ -451,10 +451,10 @@ static void cleanup_timers(struct list_h
++head;
list_for_each_entry_safe(timer, next, head, entry) {
list_del_init(&timer->entry);
- if (timer->expires.sched < sched_time) {
+ if (timer->expires.sched < sum_exec_runtime) {
timer->expires.sched = 0;
} else {
- timer->expires.sched -= sched_time;
+ timer->expires.sched -= sum_exec_runtime;
}
}
}
@@ -467,7 +467,7 @@ static void cleanup_timers(struct list_h
void posix_cpu_timers_exit(struct task_struct *tsk)
{
cleanup_timers(tsk->cpu_timers,
- tsk->utime, tsk->stime, tsk->sched_time);
+ tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
@@ -475,7 +475,7 @@ void posix_cpu_timers_exit_group(struct
cleanup_timers(tsk->signal->cpu_timers,
cputime_add(tsk->utime, tsk->signal->utime),
cputime_add(tsk->stime, tsk->signal->stime),
- tsk->sched_time + tsk->signal->sched_time);
+ tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime);
}
@@ -536,7 +536,7 @@ static void process_timer_rebalance(stru
nsleft = max_t(unsigned long long, nsleft, 1);
do {
if (likely(!(t->flags & PF_EXITING))) {
- ns = t->sched_time + nsleft;
+ ns = t->se.sum_exec_runtime + nsleft;
if (t->it_sched_expires == 0 ||
t->it_sched_expires > ns) {
t->it_sched_expires = ns;
@@ -1004,7 +1004,7 @@ static void check_thread_timers(struct t
struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
- if (!--maxfire || tsk->sched_time < t->expires.sched) {
+ if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
tsk->it_sched_expires = t->expires.sched;
break;
}
@@ -1024,7 +1024,7 @@ static void check_process_timers(struct
int maxfire;
struct signal_struct *const sig = tsk->signal;
cputime_t utime, stime, ptime, virt_expires, prof_expires;
- unsigned long long sched_time, sched_expires;
+ unsigned long long sum_sched_runtime, sched_expires;
struct task_struct *t;
struct list_head *timers = sig->cpu_timers;
@@ -1044,12 +1044,12 @@ static void check_process_timers(struct
*/
utime = sig->utime;
stime = sig->stime;
- sched_time = sig->sched_time;
+ sum_sched_runtime = sig->sum_sched_runtime;
t = tsk;
do {
utime = cputime_add(utime, t->utime);
stime = cputime_add(stime, t->stime);
- sched_time += t->sched_time;
+ sum_sched_runtime += t->se.sum_exec_runtime;
t = next_thread(t);
} while (t != tsk);
ptime = cputime_add(utime, stime);
@@ -1090,7 +1090,7 @@ static void check_process_timers(struct
struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
- if (!--maxfire || sched_time < t->expires.sched) {
+ if (!--maxfire || sum_sched_runtime < t->expires.sched) {
sched_expires = t->expires.sched;
break;
}
@@ -1182,7 +1182,7 @@ static void check_process_timers(struct
virt_left = cputime_sub(virt_expires, utime);
virt_left = cputime_div_non_zero(virt_left, nthreads);
if (sched_expires) {
- sched_left = sched_expires - sched_time;
+ sched_left = sched_expires - sum_sched_runtime;
do_div(sched_left, nthreads);
sched_left = max_t(unsigned long long, sched_left, 1);
} else {
@@ -1208,7 +1208,7 @@ static void check_process_timers(struct
t->it_virt_expires = ticks;
}
- sched = t->sched_time + sched_left;
+ sched = t->se.sum_exec_runtime + sched_left;
if (sched_expires && (t->it_sched_expires == 0 ||
t->it_sched_expires > sched)) {
t->it_sched_expires = sched;
@@ -1300,7 +1300,7 @@ void run_posix_cpu_timers(struct task_st
if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
(tsk->it_sched_expires == 0 ||
- tsk->sched_time < tsk->it_sched_expires))
+ tsk->se.sum_exec_runtime < tsk->it_sched_expires))
return;
#undef UNEXPIRED
Index: linux-cfs-2.6.22.13.q/kernel/sched.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/kernel/sched.c
+++ linux-cfs-2.6.22.13.q/kernel/sched.c
@@ -16,13 +16,19 @@
* by Davide Libenzi, preemptible kernel bits by Robert Love.
* 2003-09-03 Interactivity tuning by Con Kolivas.
* 2004-04-02 Scheduler domains code by Nick Piggin
+ * 2007-04-15 Work begun on replacing all interactivity tuning with a
+ * fair scheduling design by Con Kolivas.
+ * 2007-05-05 Load balancing (smp-nice) and other improvements
+ * by Peter Williams
+ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
+ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
*/
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
@@ -38,6 +44,7 @@
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
+#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
@@ -47,15 +54,18 @@
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/seq_file.h>
+#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/reciprocal_div.h>
+#include <linux/unistd.h>
+#include <linux/pagemap.h>
#include <asm/tlb.h>
-#include <asm/unistd.h>
+#include <asm/irq_regs.h>
/*
* Scheduler clock - returns current time in nanosec units.
@@ -64,7 +74,7 @@
*/
unsigned long long __attribute__((weak)) sched_clock(void)
{
- return (unsigned long long)jiffies * (1000000000 / HZ);
+ return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
}
/*
@@ -88,99 +98,19 @@ unsigned long long __attribute__((weak))
/*
* Some helpers for converting nanosecond timing to jiffy resolution
*/
-#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
-#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
+#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
+#define JIFFIES_TO_NS(TIME) ((TIME) * (NSEC_PER_SEC / HZ))
+
+#define NICE_0_LOAD SCHED_LOAD_SCALE
+#define NICE_0_SHIFT SCHED_LOAD_SHIFT
/*
* These are the 'tuning knobs' of the scheduler:
*
- * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
- * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
+ * default timeslice is 100 msecs (used only for SCHED_RR tasks).
* Timeslices get refilled after they expire.
*/
-#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
#define DEF_TIMESLICE (100 * HZ / 1000)
-#define ON_RUNQUEUE_WEIGHT 30
-#define CHILD_PENALTY 95
-#define PARENT_PENALTY 100
-#define EXIT_WEIGHT 3
-#define PRIO_BONUS_RATIO 25
-#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
-#define INTERACTIVE_DELTA 2
-#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
-#define STARVATION_LIMIT (MAX_SLEEP_AVG)
-#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
-
-/*
- * If a task is 'interactive' then we reinsert it in the active
- * array after it has expired its current timeslice. (it will not
- * continue to run immediately, it will still roundrobin with
- * other interactive tasks.)
- *
- * This part scales the interactivity limit depending on niceness.
- *
- * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
- * Here are a few examples of different nice levels:
- *
- * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
- * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
- * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
- * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
- * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
- *
- * (the X axis represents the possible -5 ... 0 ... +5 dynamic
- * priority range a task can explore, a value of '1' means the
- * task is rated interactive.)
- *
- * Ie. nice +19 tasks can never get 'interactive' enough to be
- * reinserted into the active array. And only heavily CPU-hog nice -20
- * tasks will be expired. Default nice 0 tasks are somewhere between,
- * it takes some effort for them to get interactive, but it's not
- * too hard.
- */
-
-#define CURRENT_BONUS(p) \
- (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
- MAX_SLEEP_AVG)
-
-#define GRANULARITY (10 * HZ / 1000 ? : 1)
-
-#ifdef CONFIG_SMP
-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
- num_online_cpus())
-#else
-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
-#endif
-
-#define SCALE(v1,v1_max,v2_max) \
- (v1) * (v2_max) / (v1_max)
-
-#define DELTA(p) \
- (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
- INTERACTIVE_DELTA)
-
-#define TASK_INTERACTIVE(p) \
- ((p)->prio <= (p)->static_prio - DELTA(p))
-
-#define INTERACTIVE_SLEEP(p) \
- (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
- (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
-
-#define TASK_PREEMPTS_CURR(p, rq) \
- ((p)->prio < (rq)->curr->prio)
-
-#define SCALE_PRIO(x, prio) \
- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
-
-static unsigned int static_prio_timeslice(int static_prio)
-{
- if (static_prio < NICE_TO_PRIO(0))
- return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
- else
- return SCALE_PRIO(DEF_TIMESLICE, static_prio);
-}
#ifdef CONFIG_SMP
/*
@@ -203,28 +133,201 @@ static inline void sg_inc_cpu_power(stru
}
#endif
-/*
- * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
- * to time slice values: [800ms ... 100ms ... 5ms]
- *
- * The higher a thread's priority, the bigger timeslices
- * it gets during one round of execution. But even the lowest
- * priority thread gets MIN_TIMESLICE worth of execution time.
- */
+static inline int rt_policy(int policy)
+{
+ if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
+ return 1;
+ return 0;
+}
-static inline unsigned int task_timeslice(struct task_struct *p)
+static inline int task_has_rt_policy(struct task_struct *p)
{
- return static_prio_timeslice(p->static_prio);
+ return rt_policy(p->policy);
}
/*
- * These are the runqueue data structures:
+ * This is the priority-queue data structure of the RT scheduling class:
+ */
+struct rt_prio_array {
+ DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
+ struct list_head queue[MAX_RT_PRIO];
+};
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+#include <linux/cgroup.h>
+
+struct cfs_rq;
+
+/* task group related information */
+struct task_group {
+#ifdef CONFIG_FAIR_CGROUP_SCHED
+ struct cgroup_subsys_state css;
+#endif
+ /* schedulable entities of this group on each cpu */
+ struct sched_entity **se;
+ /* runqueue "owned" by this group on each cpu */
+ struct cfs_rq **cfs_rq;
+
+ /* Shares assigned to a task group govern how much of the cpu bandwidth
+ * is allocated to the group. The more shares a group has, the more
+ * cpu bandwidth is allocated to it.
+ *
+ * For example, let's say there are three task groups, A, B and C, which
+ * have been assigned shares 1000, 2000 and 3000 respectively. Then,
+ * cpu bandwidth allocated by the scheduler to task groups A, B and C
+ * should be:
+ *
+ * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
+ * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
+ * Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
+ *
+ * The weight assigned to a task group's schedulable entities on every
+ * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
+ * group's shares. For example, let's say task group A has been
+ * assigned shares of 1000 and there are two CPUs in a system. Then,
+ *
+ * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
+ *
+ * Note: It's not necessary that each of a task group's schedulable
+ * entities has the same weight on all CPUs. If the group
+ * has 2 of its tasks on CPU0 and 1 task on CPU1, then a
+ * better distribution of weight could be:
+ *
+ * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
+ * tg_A->se[1]->load.weight = 1/3 * 2000 = 667
+ *
+ * rebalance_shares() is responsible for distributing the shares of a
+ * task group like this among the group's schedulable entities across
+ * cpus.
+ *
+ */
+ unsigned long shares;
+
+ /* lock to serialize modification to shares */
+ struct mutex lock;
+
+ unsigned long last_total_load;
+ struct rcu_head rcu;
+};
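(Illustration, not part of the patch: the Bw() split in the comment above can be checked with a few lines of C.)

#include <stdio.h>

int main(void)
{
        unsigned long shares[] = { 1000, 2000, 3000 };   /* groups A, B, C */
        unsigned long total = 0;
        int i;

        for (i = 0; i < 3; i++)
                total += shares[i];

        for (i = 0; i < 3; i++)                          /* 16.67%, 33.33%, 50.00% */
                printf("Bw(%c) = %.2f%%\n", 'A' + i, 100.0 * shares[i] / total);
        return 0;
}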
+
+/* Default task group's sched entity on each cpu */
+static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
+/* Default task group's cfs_rq on each cpu */
+static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+
+static struct sched_entity *init_sched_entity_p[NR_CPUS];
+static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
+
+static DEFINE_MUTEX(doms_cur_mutex); /* serialize access to doms_curr[] array */
+
+#ifdef CONFIG_SMP
+/* kernel thread that runs rebalance_shares() periodically */
+static struct task_struct *lb_monitor_task;
+
+static int load_balance_monitor(void *unused);
+#endif
+
+static void set_se_shares(struct sched_entity *se, unsigned long shares);
+
+/* Default task group.
+ * Every task in the system belongs to this group at bootup.
*/
+struct task_group init_task_group = {
+ .se = init_sched_entity_p,
+ .cfs_rq = init_cfs_rq_p,
+};
+
+#ifdef CONFIG_FAIR_USER_SCHED
+# define INIT_TASK_GROUP_LOAD 2*NICE_0_LOAD /* root user's cpu share */
+#else
+# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
+#endif
+
+#define MIN_GROUP_SHARES 100
+
+static int init_task_group_load = INIT_TASK_GROUP_LOAD;
+
+/* return group to which a task belongs */
+static inline struct task_group *task_group(struct task_struct *p)
+{
+ struct task_group *tg;
+
+#ifdef CONFIG_FAIR_USER_SCHED
+ tg = p->user->tg;
+#elif defined(CONFIG_FAIR_CGROUP_SCHED)
+ tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
+ struct task_group, css);
+#else
+ tg = &init_task_group;
+#endif
+
+ return tg;
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu)
+{
+ p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
+ p->se.parent = task_group(p)->se[cpu];
+}
+
+static inline void lock_doms_cur(void)
+{
+ mutex_lock(&doms_cur_mutex);
+}
+
+static inline void unlock_doms_cur(void)
+{
+ mutex_unlock(&doms_cur_mutex);
+}
+
+#else
+
+static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { }
+static inline void lock_doms_cur(void) { }
+static inline void unlock_doms_cur(void) { }
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+/* CFS-related fields in a runqueue */
+struct cfs_rq {
+ struct load_weight load;
+ unsigned long nr_running;
+
+ u64 exec_clock;
+ u64 min_vruntime;
+
+ struct avl_root tasks_timeline;
+ struct avl_node *avl_leftmost;
+ struct avl_node *avl_load_balance_curr;
+ /* 'curr' points to the currently running entity on this cfs_rq.
+ * It is set to NULL otherwise (i.e. when no entity is currently running).
+ */
+ struct sched_entity *curr;
+
+ unsigned long nr_spread_over;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
+
+ /* leaf cfs_rqs are those that hold tasks (the lowest schedulable entity
+ * in a hierarchy). Non-leaf cfs_rqs hold other, higher-level schedulable
+ * entities (like users, containers etc.).
+ *
+ * leaf_cfs_rq_list ties together the list of leaf cfs_rqs on a cpu. This
+ * list is used during load balancing.
+ */
+ struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
+ struct task_group *tg; /* group that "owns" this runqueue */
+#endif
+};
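
The avl_leftmost field above is the cache CFS uses to pick the next entity in O(1), so whichever function replaces the rb-tree enqueue has to keep it valid on every insert. A toy sketch of that bookkeeping (stand-alone C with made-up toy_* structures, the tree rebalancing itself left out):

#include <stddef.h>

/* Toy structures only; not the kernel's avl_node/avl_root. */
struct toy_node {
        unsigned long long key;          /* stands in for se->vruntime */
        struct toy_node *left, *right;
};

struct toy_root {
        struct toy_node *node;
        struct toy_node *leftmost;       /* cached smallest key */
};

static void toy_insert(struct toy_root *root, struct toy_node *new)
{
        struct toy_node **link = &root->node;
        int leftmost = 1;

        new->left = new->right = NULL;
        while (*link) {
                if (new->key < (*link)->key) {
                        link = &(*link)->left;
                } else {
                        link = &(*link)->right;
                        leftmost = 0;    /* took a right turn at least once */
                }
        }
        *link = new;
        if (leftmost)                    /* new node has the smallest key */
                root->leftmost = new;
        /* rebalancing of the tree is deliberately omitted in this sketch */
}
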
-struct prio_array {
- unsigned int nr_active;
- DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
- struct list_head queue[MAX_PRIO];
+/* Real-Time classes' related field in a runqueue: */
+struct rt_rq {
+ struct rt_prio_array active;
+ int rt_load_balance_idx;
+ struct list_head *rt_load_balance_head, *rt_load_balance_curr;
};
/*
@@ -235,6 +338,7 @@ struct prio_array {
* acquire operations must be ordered by ascending &runqueue.
*/
struct rq {
+ /* runqueue lock: */
spinlock_t lock;
/*
@@ -242,15 +346,23 @@ struct rq {
* remote CPUs use both these fields when doing load calculation.
*/
unsigned long nr_running;
- unsigned long raw_weighted_load;
-#ifdef CONFIG_SMP
- unsigned long cpu_load[3];
+ #define CPU_LOAD_IDX_MAX 5
+ unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned char idle_at_tick;
#ifdef CONFIG_NO_HZ
unsigned char in_nohz_recently;
#endif
+ /* capture load from *all* tasks on this cpu: */
+ struct load_weight load;
+ unsigned long nr_load_updates;
+ u64 nr_switches;
+
+ struct cfs_rq cfs;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /* list of leaf cfs_rq on this cpu: */
+ struct list_head leaf_cfs_rq_list;
#endif
- unsigned long long nr_switches;
+ struct rt_rq rt;
/*
* This is part of a global counter where only the total sum
@@ -260,14 +372,18 @@ struct rq {
*/
unsigned long nr_uninterruptible;
- unsigned long expired_timestamp;
- /* Cached timestamp set by update_cpu_clock() */
- unsigned long long most_recent_timestamp;
struct task_struct *curr, *idle;
unsigned long next_balance;
struct mm_struct *prev_mm;
- struct prio_array *active, *expired, arrays[2];
- int best_expired_prio;
+
+ u64 clock, prev_clock_raw;
+ s64 clock_max_delta;
+
+ unsigned int clock_warps, clock_overflows;
+ u64 idle_clock;
+ unsigned int clock_deep_idle_events;
+ u64 tick_timestamp;
+
atomic_t nr_iowait;
#ifdef CONFIG_SMP
@@ -276,7 +392,8 @@ struct rq {
/* For active balancing */
int active_balance;
int push_cpu;
- int cpu; /* cpu of this runqueue */
+ /* cpu of this runqueue: */
+ int cpu;
struct task_struct *migration_thread;
struct list_head migration_queue;
@@ -287,26 +404,34 @@ struct rq {
struct sched_info rq_sched_info;
/* sys_sched_yield() stats */
- unsigned long yld_exp_empty;
- unsigned long yld_act_empty;
- unsigned long yld_both_empty;
- unsigned long yld_cnt;
+ unsigned int yld_exp_empty;
+ unsigned int yld_act_empty;
+ unsigned int yld_both_empty;
+ unsigned int yld_count;
/* schedule() stats */
- unsigned long sched_switch;
- unsigned long sched_cnt;
- unsigned long sched_goidle;
+ unsigned int sched_switch;
+ unsigned int sched_count;
+ unsigned int sched_goidle;
/* try_to_wake_up() stats */
- unsigned long ttwu_cnt;
- unsigned long ttwu_local;
+ unsigned int ttwu_count;
+ unsigned int ttwu_local;
+
+ /* BKL stats */
+ unsigned int bkl_count;
#endif
struct lock_class_key rq_lock_key;
};
-static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
static DEFINE_MUTEX(sched_hotcpu_mutex);
+static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
+{
+ rq->curr->sched_class->check_preempt_curr(rq, p);
+}
+
static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
@@ -317,6 +442,53 @@ static inline int cpu_of(struct rq *rq)
}
/*
+ * Update the per-runqueue clock, as fine-grained as the platform can give
+ * us, but without assuming monotonicity, etc.:
+ */
+static void __update_rq_clock(struct rq *rq)
+{
+ u64 prev_raw = rq->prev_clock_raw;
+ u64 now = sched_clock();
+ s64 delta = now - prev_raw;
+ u64 clock = rq->clock;
+
+#ifdef CONFIG_SCHED_DEBUG
+ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+#endif
+ /*
+ * Protect against sched_clock() occasionally going backwards:
+ */
+ if (unlikely(delta < 0)) {
+ clock++;
+ rq->clock_warps++;
+ } else {
+ /*
+ * Catch too large forward jumps too:
+ */
+ if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) {
+ if (clock < rq->tick_timestamp + TICK_NSEC)
+ clock = rq->tick_timestamp + TICK_NSEC;
+ else
+ clock++;
+ rq->clock_overflows++;
+ } else {
+ if (unlikely(delta > rq->clock_max_delta))
+ rq->clock_max_delta = delta;
+ clock += delta;
+ }
+ }
+
+ rq->prev_clock_raw = now;
+ rq->clock = clock;
+}
+
+static void update_rq_clock(struct rq *rq)
+{
+ if (likely(smp_processor_id() == cpu_of(rq)))
+ __update_rq_clock(rq);
+}
+
+/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
* See detach_destroy_domains: synchronize_sched for details.
*
@@ -331,6 +503,61 @@ static inline int cpu_of(struct rq *rq)
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+/*
+ * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
+ */
+#ifdef CONFIG_SCHED_DEBUG
+# define const_debug __read_mostly
+#else
+# define const_debug static const
+#endif
+
+/*
+ * Debugging: various feature bits
+ */
+enum {
+ SCHED_FEAT_NEW_FAIR_SLEEPERS = 1,
+ SCHED_FEAT_WAKEUP_PREEMPT = 2,
+ SCHED_FEAT_START_DEBIT = 4,
+ SCHED_FEAT_TREE_AVG = 8,
+ SCHED_FEAT_APPROX_AVG = 16,
+};
+
+const_debug unsigned int sysctl_sched_features =
+ SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 |
+ SCHED_FEAT_WAKEUP_PREEMPT * 1 |
+ SCHED_FEAT_START_DEBIT * 1 |
+ SCHED_FEAT_TREE_AVG * 0 |
+ SCHED_FEAT_APPROX_AVG * 0;
+
+#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
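
The feature bits are meant to be tested through the sched_feat() helper at the call sites (the real users live in sched_fair.c); an illustrative wrapper, only to show how the macro is queried:

/* Illustrative only: querying a feature bit via the macro above. */
static int wakeup_preempt_enabled(void)
{
        return sched_feat(WAKEUP_PREEMPT) != 0;
}
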
+
+/*
+ * Number of tasks to iterate in a single balance run.
+ * Limited because this is done with IRQs disabled.
+ */
+const_debug unsigned int sysctl_sched_nr_migrate = 32;
+
+/*
+ * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
+ * clock constructed from sched_clock():
+ */
+unsigned long long cpu_clock(int cpu)
+{
+ unsigned long long now;
+ unsigned long flags;
+ struct rq *rq;
+
+ local_irq_save(flags);
+ rq = cpu_rq(cpu);
+ update_rq_clock(rq);
+ now = rq->clock;
+ local_irq_restore(flags);
+
+ return now;
+}
+EXPORT_SYMBOL_GPL(cpu_clock);
+
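
A typical caller of cpu_clock() would time a section on the local cpu; the sketch below is our assumption about usage (it presumes preemption is held off across the section so both reads refer to the same cpu):

/* Illustrative caller of cpu_clock(): elapsed nanoseconds of fn(). */
static unsigned long long time_section(void (*fn)(void))
{
        int cpu = raw_smp_processor_id();
        unsigned long long t0 = cpu_clock(cpu);

        fn();
        return cpu_clock(cpu) - t0;
}
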
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
@@ -415,16 +642,13 @@ static inline void finish_lock_switch(st
static inline struct rq *__task_rq_lock(struct task_struct *p)
__acquires(rq->lock)
{
- struct rq *rq;
-
-repeat_lock_task:
- rq = task_rq(p);
- spin_lock(&rq->lock);
- if (unlikely(rq != task_rq(p))) {
+ for (;;) {
+ struct rq *rq = task_rq(p);
+ spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p)))
+ return rq;
spin_unlock(&rq->lock);
- goto repeat_lock_task;
}
- return rq;
}
/*
@@ -437,18 +661,17 @@ static struct rq *task_rq_lock(struct ta
{
struct rq *rq;
-repeat_lock_task:
- local_irq_save(*flags);
- rq = task_rq(p);
- spin_lock(&rq->lock);
- if (unlikely(rq != task_rq(p))) {
+ for (;;) {
+ local_irq_save(*flags);
+ rq = task_rq(p);
+ spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p)))
+ return rq;
spin_unlock_irqrestore(&rq->lock, *flags);
- goto repeat_lock_task;
}
- return rq;
}
-static inline void __task_rq_unlock(struct rq *rq)
+static void __task_rq_unlock(struct rq *rq)
__releases(rq->lock)
{
spin_unlock(&rq->lock);
@@ -460,138 +683,10 @@ static inline void task_rq_unlock(struct
spin_unlock_irqrestore(&rq->lock, *flags);
}
-#ifdef CONFIG_SCHEDSTATS
-/*
- * bump this up when changing the output format or the meaning of an existing
- * format, so that tools can adapt (or abort)
- */
-#define SCHEDSTAT_VERSION 14
-
-static int show_schedstat(struct seq_file *seq, void *v)
-{
- int cpu;
-
- seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
- seq_printf(seq, "timestamp %lu\n", jiffies);
- for_each_online_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
-#ifdef CONFIG_SMP
- struct sched_domain *sd;
- int dcnt = 0;
-#endif
-
- /* runqueue-specific stats */
- seq_printf(seq,
- "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
- cpu, rq->yld_both_empty,
- rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
- rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
- rq->ttwu_cnt, rq->ttwu_local,
- rq->rq_sched_info.cpu_time,
- rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
-
- seq_printf(seq, "\n");
-
-#ifdef CONFIG_SMP
- /* domain-specific stats */
- preempt_disable();
- for_each_domain(cpu, sd) {
- enum idle_type itype;
- char mask_str[NR_CPUS];
-
- cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
- seq_printf(seq, "domain%d %s", dcnt++, mask_str);
- for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
- itype++) {
- seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
- "%lu",
- sd->lb_cnt[itype],
- sd->lb_balanced[itype],
- sd->lb_failed[itype],
- sd->lb_imbalance[itype],
- sd->lb_gained[itype],
- sd->lb_hot_gained[itype],
- sd->lb_nobusyq[itype],
- sd->lb_nobusyg[itype]);
- }
- seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
- " %lu %lu %lu\n",
- sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
- sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
- sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
- sd->ttwu_wake_remote, sd->ttwu_move_affine,
- sd->ttwu_move_balance);
- }
- preempt_enable();
-#endif
- }
- return 0;
-}
-
-static int schedstat_open(struct inode *inode, struct file *file)
-{
- unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
- char *buf = kmalloc(size, GFP_KERNEL);
- struct seq_file *m;
- int res;
-
- if (!buf)
- return -ENOMEM;
- res = single_open(file, show_schedstat, NULL);
- if (!res) {
- m = file->private_data;
- m->buf = buf;
- m->size = size;
- } else
- kfree(buf);
- return res;
-}
-
-const struct file_operations proc_schedstat_operations = {
- .open = schedstat_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-/*
- * Expects runqueue lock to be held for atomicity of update
- */
-static inline void
-rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
-{
- if (rq) {
- rq->rq_sched_info.run_delay += delta_jiffies;
- rq->rq_sched_info.pcnt++;
- }
-}
-
-/*
- * Expects runqueue lock to be held for atomicity of update
- */
-static inline void
-rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
-{
- if (rq)
- rq->rq_sched_info.cpu_time += delta_jiffies;
-}
-# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
-# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
-#else /* !CONFIG_SCHEDSTATS */
-static inline void
-rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
-{}
-static inline void
-rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
-{}
-# define schedstat_inc(rq, field) do { } while (0)
-# define schedstat_add(rq, field, amt) do { } while (0)
-#endif
-
/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
-static inline struct rq *this_rq_lock(void)
+static struct rq *this_rq_lock(void)
__acquires(rq->lock)
{
struct rq *rq;
@@ -603,241 +698,294 @@ static inline struct rq *this_rq_lock(vo
return rq;
}
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
/*
- * Called when a process is dequeued from the active array and given
- * the cpu. We should note that with the exception of interactive
- * tasks, the expired queue will become the active queue after the active
- * queue is empty, without explicitly dequeuing and requeuing tasks in the
- * expired queue. (Interactive tasks may be requeued directly to the
- * active queue, thus delaying tasks in the expired queue from running;
- * see scheduler_tick()).
- *
- * This function is only called from sched_info_arrive(), rather than
- * dequeue_task(). Even though a task may be queued and dequeued multiple
- * times as it is shuffled about, we're really interested in knowing how
- * long it was from the *first* time it was queued to the time that it
- * finally hit a cpu.
+ * We are going deep-idle (irqs are disabled):
*/
-static inline void sched_info_dequeued(struct task_struct *t)
+void sched_clock_idle_sleep_event(void)
{
- t->sched_info.last_queued = 0;
+ struct rq *rq = cpu_rq(smp_processor_id());
+
+ spin_lock(&rq->lock);
+ __update_rq_clock(rq);
+ spin_unlock(&rq->lock);
+ rq->clock_deep_idle_events++;
}
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
/*
- * Called when a task finally hits the cpu. We can now calculate how
- * long it was waiting to run. We also note when it began so that we
- * can keep stats on how long its timeslice is.
+ * We just idled delta nanoseconds (called with irqs disabled):
*/
-static void sched_info_arrive(struct task_struct *t)
+void sched_clock_idle_wakeup_event(u64 delta_ns)
{
- unsigned long now = jiffies, delta_jiffies = 0;
-
- if (t->sched_info.last_queued)
- delta_jiffies = now - t->sched_info.last_queued;
- sched_info_dequeued(t);
- t->sched_info.run_delay += delta_jiffies;
- t->sched_info.last_arrival = now;
- t->sched_info.pcnt++;
+ struct rq *rq = cpu_rq(smp_processor_id());
+ u64 now = sched_clock();
- rq_sched_info_arrive(task_rq(t), delta_jiffies);
+ rq->idle_clock += delta_ns;
+ /*
+ * Override the previous timestamp and ignore all
+ * sched_clock() deltas that occurred while we idled,
+ * and use the PM-provided delta_ns to advance the
+ * rq clock:
+ */
+ spin_lock(&rq->lock);
+ rq->prev_clock_raw = now;
+ rq->clock += delta_ns;
+ spin_unlock(&rq->lock);
}
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
/*
- * Called when a process is queued into either the active or expired
- * array. The time is noted and later used to determine how long we
- * had to wait for us to reach the cpu. Since the expired queue will
- * become the active queue after active queue is empty, without dequeuing
- * and requeuing any tasks, we are interested in queuing to either. It
- * is unusual but not impossible for tasks to be dequeued and immediately
- * requeued in the same or another array: this can happen in sched_yield(),
- * set_user_nice(), and even load_balance() as it moves tasks from runqueue
- * to runqueue.
+ * resched_task - mark a task 'to be rescheduled now'.
*
- * This function is only called from enqueue_task(), but also only updates
- * the timestamp if it is already not set. It's assumed that
- * sched_info_dequeued() will clear that stamp when appropriate.
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
*/
-static inline void sched_info_queued(struct task_struct *t)
+#ifdef CONFIG_SMP
+
+#ifndef tsk_is_polling
+#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
+#endif
+
+static void resched_task(struct task_struct *p)
{
- if (unlikely(sched_info_on()))
- if (!t->sched_info.last_queued)
- t->sched_info.last_queued = jiffies;
+ int cpu;
+
+ assert_spin_locked(&task_rq(p)->lock);
+
+ if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+ return;
+
+ set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+
+ cpu = task_cpu(p);
+ if (cpu == smp_processor_id())
+ return;
+
+ /* NEED_RESCHED must be visible before we test polling */
+ smp_mb();
+ if (!tsk_is_polling(p))
+ smp_send_reschedule(cpu);
}
-/*
- * Called when a process ceases being the active-running process, either
- * voluntarily or involuntarily. Now we can calculate how long we ran.
- */
-static inline void sched_info_depart(struct task_struct *t)
+static void resched_cpu(int cpu)
{
- unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
- t->sched_info.cpu_time += delta_jiffies;
- rq_sched_info_depart(task_rq(t), delta_jiffies);
+ if (!spin_trylock_irqsave(&rq->lock, flags))
+ return;
+ resched_task(cpu_curr(cpu));
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
+#else
+static inline void resched_task(struct task_struct *p)
+{
+ assert_spin_locked(&task_rq(p)->lock);
+ set_tsk_need_resched(p);
}
+#endif
+
+#if BITS_PER_LONG == 32
+# define WMULT_CONST (~0UL)
+#else
+# define WMULT_CONST (1UL << 32)
+#endif
+
+#define WMULT_SHIFT 32
/*
- * Called when tasks are switched involuntarily due, typically, to expiring
- * their time slice. (This may also be called when switching to or from
- * the idle task.) We are only called when prev != next.
+ * Shift right and round:
*/
-static inline void
-__sched_info_switch(struct task_struct *prev, struct task_struct *next)
+#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+
+static unsigned long
+calc_delta_mine(unsigned long delta_exec, unsigned long weight,
+ struct load_weight *lw)
{
- struct rq *rq = task_rq(prev);
+ u64 tmp;
+ if (unlikely(!lw->inv_weight))
+ lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1;
+
+ tmp = (u64)delta_exec * weight;
/*
- * prev now departs the cpu. It's not interesting to record
- * stats about how efficient we were at scheduling the idle
- * process, however.
+ * Check whether we'd overflow the 64-bit multiplication:
*/
- if (prev != rq->idle)
- sched_info_depart(prev);
+ if (unlikely(tmp > WMULT_CONST))
+ tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
+ WMULT_SHIFT/2);
+ else
+ tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
- if (next != rq->idle)
- sched_info_arrive(next);
+ return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
-static inline void
-sched_info_switch(struct task_struct *prev, struct task_struct *next)
+
+static inline unsigned long
+calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
{
- if (unlikely(sched_info_on()))
- __sched_info_switch(prev, next);
+ return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
}
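
To sanity-check the fixed-point math in calc_delta_mine(), here is a stand-alone userspace sketch. The 64-bit overflow branch is omitted because the sample numbers stay below WMULT_CONST, and the weight 335 is the nice +5 entry of the weight table added further down in this patch:

/* delta * NICE_0_LOAD / weight, done as a multiply by ~2^32/weight. */
#include <stdio.h>
#include <stdint.h>

#define WMULT_SHIFT 32
#define SRR(x, y) (((x) + (1ULL << ((y) - 1))) >> (y))

static unsigned long scale(unsigned long delta_exec, unsigned long nice0_load,
                           unsigned long weight)
{
        uint64_t inv = ((1ULL << 32) - weight / 2) / weight + 1; /* ~2^32/weight */
        uint64_t tmp = (uint64_t)delta_exec * nice0_load;

        return (unsigned long)SRR(tmp * inv, WMULT_SHIFT);
}

int main(void)
{
        /* 4ms of runtime charged to a nice +5 task (weight 335): */
        printf("scaled: %lu  plain division: %lu\n",
               scale(4000000, 1024, 335), 4000000UL * 1024 / 335);
        return 0;
}
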
-#else
-#define sched_info_queued(t) do { } while (0)
-#define sched_info_switch(t, next) do { } while (0)
-#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
-/*
- * Adding/removing a task to/from a priority array:
- */
-static void dequeue_task(struct task_struct *p, struct prio_array *array)
+static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
- array->nr_active--;
- list_del(&p->run_list);
- if (list_empty(array->queue + p->prio))
- __clear_bit(p->prio, array->bitmap);
+ lw->weight += inc;
}
-static void enqueue_task(struct task_struct *p, struct prio_array *array)
+static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
- sched_info_queued(p);
- list_add_tail(&p->run_list, array->queue + p->prio);
- __set_bit(p->prio, array->bitmap);
- array->nr_active++;
- p->array = array;
+ lw->weight -= dec;
}
/*
- * Put task to the end of the run list without the overhead of dequeue
- * followed by enqueue.
+ * To aid in avoiding the subversion of "niceness" due to uneven distribution
+ * of tasks with abnormal "nice" values across CPUs the contribution that
+ * each task makes to its run queue's load is weighted according to its
+ * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
+ * scaled version of the new time slice allocation that they receive on time
+ * slice expiry etc.
*/
-static void requeue_task(struct task_struct *p, struct prio_array *array)
-{
- list_move_tail(&p->run_list, array->queue + p->prio);
-}
-static inline void
-enqueue_task_head(struct task_struct *p, struct prio_array *array)
-{
- list_add(&p->run_list, array->queue + p->prio);
- __set_bit(p->prio, array->bitmap);
- array->nr_active++;
- p->array = array;
-}
+#define WEIGHT_IDLEPRIO 2
+#define WMULT_IDLEPRIO (1 << 31)
/*
- * __normal_prio - return the priority that is based on the static
- * priority but is modified by bonuses/penalties.
- *
- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
- * into the -5 ... 0 ... +5 bonus/penalty range.
- *
- * We use 25% of the full 0...39 priority range so that:
- *
- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
+ * Nice levels are multiplicative, with a gentle 10% change for every
+ * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
+ * nice 1, it will get ~10% less CPU time than another CPU-bound task
+ * that remained on nice 0.
+ *
+ * The "10% effect" is relative and cumulative: from _any_ nice level,
+ * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
+ * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
+ * If a task goes up by ~10% and another task goes down by ~10% then
+ * the relative distance between them is ~25%.)
+ */
+static const int prio_to_weight[40] = {
+ /* -20 */ 88761, 71755, 56483, 46273, 36291,
+ /* -15 */ 29154, 23254, 18705, 14949, 11916,
+ /* -10 */ 9548, 7620, 6100, 4904, 3906,
+ /* -5 */ 3121, 2501, 1991, 1586, 1277,
+ /* 0 */ 1024, 820, 655, 526, 423,
+ /* 5 */ 335, 272, 215, 172, 137,
+ /* 10 */ 110, 87, 70, 56, 45,
+ /* 15 */ 36, 29, 23, 18, 15,
+};
+
+/*
+ * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
*
- * Both properties are important to certain workloads.
+ * In cases where the weight does not change often, we can use the
+ * precalculated inverse to speed up the arithmetic by turning divisions
+ * into multiplications:
+ */
+static const u32 prio_to_wmult[40] = {
+ /* -20 */ 48388, 59856, 76040, 92818, 118348,
+ /* -15 */ 147320, 184698, 229616, 287308, 360437,
+ /* -10 */ 449829, 563644, 704093, 875809, 1099582,
+ /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
+ /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
+ /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
+ /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
+ /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
+};
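
Plugging the table in: two CPU-hogs on one cpu at nice 0 (weight 1024) and nice 1 (weight 820) split the cpu roughly 55.5%/44.5%, and 1024/820 is the ~1.25 multiplier mentioned above. A stand-alone check (illustration only):

#include <stdio.h>

int main(void)
{
        double w0 = 1024, w1 = 820;     /* prio_to_weight[20], [21] */

        printf("nice 0: %.1f%%  nice 1: %.1f%%  ratio: %.3f\n",
               100 * w0 / (w0 + w1),    /* ~55.5% */
               100 * w1 / (w0 + w1),    /* ~44.5% */
               w0 / w1);                /* ~1.25  */
        return 0;
}
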
+
+static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
+
+/*
+ * runqueue iterator, to support SMP load-balancing between different
+ * scheduling classes, without having to expose their internal data
+ * structures to the load-balancing proper:
*/
+struct rq_iterator {
+ void *arg;
+ struct task_struct *(*start)(void *);
+ struct task_struct *(*next)(void *);
+};
-static inline int __normal_prio(struct task_struct *p)
-{
- int bonus, prio;
+#ifdef CONFIG_SMP
+static unsigned long
+balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move, struct sched_domain *sd,
+ enum cpu_idle_type idle, int *all_pinned,
+ int *this_best_prio, struct rq_iterator *iterator);
- bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
+static int
+iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ struct rq_iterator *iterator);
+#endif
- prio = p->static_prio - bonus;
- if (prio < MAX_RT_PRIO)
- prio = MAX_RT_PRIO;
- if (prio > MAX_PRIO-1)
- prio = MAX_PRIO-1;
- return prio;
+static inline void inc_load(struct rq *rq, unsigned long load)
+{
+ update_load_add(&rq->load, load);
}
-/*
- * To aid in avoiding the subversion of "niceness" due to uneven distribution
- * of tasks with abnormal "nice" values across CPUs the contribution that
- * each task makes to its run queue's load is weighted according to its
- * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
- * scaled version of the new time slice allocation that they receive on time
- * slice expiry etc.
- */
+static inline void dec_load(struct rq *rq, unsigned long load)
+{
+ update_load_sub(&rq->load, load);
+}
-/*
- * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
- * If static_prio_timeslice() is ever changed to break this assumption then
- * this code will need modification
- */
-#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
-#define LOAD_WEIGHT(lp) \
- (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
-#define PRIO_TO_LOAD_WEIGHT(prio) \
- LOAD_WEIGHT(static_prio_timeslice(prio))
-#define RTPRIO_TO_LOAD_WEIGHT(rp) \
- (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
+#include "sched_stats.h"
+#include "sched_idletask.c"
+#include "sched_fair.c"
+#include "sched_rt.c"
+#ifdef CONFIG_SCHED_DEBUG
+# include "sched_debug.c"
+#endif
-static void set_load_weight(struct task_struct *p)
+#define sched_class_highest (&rt_sched_class)
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
{
- if (has_rt_policy(p)) {
-#ifdef CONFIG_SMP
- if (p == task_rq(p)->migration_thread)
- /*
- * The migration thread does the actual balancing.
- * Giving its load any weight will skew balancing
- * adversely.
- */
- p->load_weight = 0;
- else
-#endif
- p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
- } else
- p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+ rq->nr_running++;
}
-static inline void
-inc_raw_weighted_load(struct rq *rq, const struct task_struct *p)
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
{
- rq->raw_weighted_load += p->load_weight;
+ rq->nr_running--;
}
-static inline void
-dec_raw_weighted_load(struct rq *rq, const struct task_struct *p)
+static void set_load_weight(struct task_struct *p)
+{
+ if (task_has_rt_policy(p)) {
+ p->se.load.weight = prio_to_weight[0] * 2;
+ p->se.load.inv_weight = prio_to_wmult[0] >> 1;
+ return;
+ }
+
+ /*
+ * SCHED_IDLE tasks get minimal weight:
+ */
+ if (p->policy == SCHED_IDLE) {
+ p->se.load.weight = WEIGHT_IDLEPRIO;
+ p->se.load.inv_weight = WMULT_IDLEPRIO;
+ return;
+ }
+
+ p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
+ p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
+}
+
+static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
{
- rq->raw_weighted_load -= p->load_weight;
+ sched_info_queued(p);
+ p->sched_class->enqueue_task(rq, p, wakeup);
+ p->se.on_rq = 1;
}
-static inline void inc_nr_running(struct task_struct *p, struct rq *rq)
+static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
{
- rq->nr_running++;
- inc_raw_weighted_load(rq, p);
+ p->sched_class->dequeue_task(rq, p, sleep);
+ p->se.on_rq = 0;
}
-static inline void dec_nr_running(struct task_struct *p, struct rq *rq)
+/*
+ * __normal_prio - return the priority that is based on the static prio
+ */
+static inline int __normal_prio(struct task_struct *p)
{
- rq->nr_running--;
- dec_raw_weighted_load(rq, p);
+ return p->static_prio;
}
/*
@@ -851,7 +999,7 @@ static inline int normal_prio(struct tas
{
int prio;
- if (has_rt_policy(p))
+ if (task_has_rt_policy(p))
prio = MAX_RT_PRIO-1 - p->rt_priority;
else
prio = __normal_prio(p);
@@ -879,238 +1027,111 @@ static int effective_prio(struct task_st
}
/*
- * __activate_task - move a task to the runqueue.
+ * activate_task - move a task to the runqueue.
*/
-static void __activate_task(struct task_struct *p, struct rq *rq)
+static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
{
- struct prio_array *target = rq->active;
+ if (p->state == TASK_UNINTERRUPTIBLE)
+ rq->nr_uninterruptible--;
- if (batch_task(p))
- target = rq->expired;
- enqueue_task(p, target);
+ enqueue_task(rq, p, wakeup);
inc_nr_running(p, rq);
}
/*
- * __activate_idle_task - move idle task to the _front_ of runqueue.
+ * deactivate_task - remove a task from the runqueue.
*/
-static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
+static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
{
- enqueue_task_head(p, rq->active);
- inc_nr_running(p, rq);
+ if (p->state == TASK_UNINTERRUPTIBLE)
+ rq->nr_uninterruptible++;
+
+ dequeue_task(rq, p, sleep);
+ dec_nr_running(p, rq);
}
-/*
- * Recalculate p->normal_prio and p->prio after having slept,
- * updating the sleep-average too:
+/**
+ * task_curr - is this task currently executing on a CPU?
+ * @p: the task in question.
*/
-static int recalc_task_prio(struct task_struct *p, unsigned long long now)
+inline int task_curr(const struct task_struct *p)
{
- /* Caller must always ensure 'now >= p->timestamp' */
- unsigned long sleep_time = now - p->timestamp;
-
- if (batch_task(p))
- sleep_time = 0;
-
- if (likely(sleep_time > 0)) {
- /*
- * This ceiling is set to the lowest priority that would allow
- * a task to be reinserted into the active array on timeslice
- * completion.
- */
- unsigned long ceiling = INTERACTIVE_SLEEP(p);
-
- if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
- /*
- * Prevents user tasks from achieving best priority
- * with one single large enough sleep.
- */
- p->sleep_avg = ceiling;
- /*
- * Using INTERACTIVE_SLEEP() as a ceiling places a
- * nice(0) task 1ms sleep away from promotion, and
- * gives it 700ms to round-robin with no chance of
- * being demoted. This is more than generous, so
- * mark this sleep as non-interactive to prevent the
- * on-runqueue bonus logic from intervening should
- * this task not receive cpu immediately.
- */
- p->sleep_type = SLEEP_NONINTERACTIVE;
- } else {
- /*
- * Tasks waking from uninterruptible sleep are
- * limited in their sleep_avg rise as they
- * are likely to be waiting on I/O
- */
- if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
- if (p->sleep_avg >= ceiling)
- sleep_time = 0;
- else if (p->sleep_avg + sleep_time >=
- ceiling) {
- p->sleep_avg = ceiling;
- sleep_time = 0;
- }
- }
-
- /*
- * This code gives a bonus to interactive tasks.
- *
- * The boost works by updating the 'average sleep time'
- * value here, based on ->timestamp. The more time a
- * task spends sleeping, the higher the average gets -
- * and the higher the priority boost gets as well.
- */
- p->sleep_avg += sleep_time;
-
- }
- if (p->sleep_avg > NS_MAX_SLEEP_AVG)
- p->sleep_avg = NS_MAX_SLEEP_AVG;
- }
-
- return effective_prio(p);
+ return cpu_curr(task_cpu(p)) == p;
}
-/*
- * activate_task - move a task to the runqueue and do priority recalculation
- *
- * Update all the scheduling statistics stuff. (sleep average
- * calculation, priority modifiers, etc.)
- */
-static void activate_task(struct task_struct *p, struct rq *rq, int local)
+/* Used instead of source_load when we know the type == 0 */
+unsigned long weighted_cpuload(const int cpu)
{
- unsigned long long now;
-
- if (rt_task(p))
- goto out;
+ return cpu_rq(cpu)->load.weight;
+}
- now = sched_clock();
+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+ set_task_cfs_rq(p, cpu);
#ifdef CONFIG_SMP
- if (!local) {
- /* Compensate for drifting sched_clock */
- struct rq *this_rq = this_rq();
- now = (now - this_rq->most_recent_timestamp)
- + rq->most_recent_timestamp;
- }
-#endif
-
- /*
- * Sleep time is in units of nanosecs, so shift by 20 to get a
- * milliseconds-range estimation of the amount of time that the task
- * spent sleeping:
- */
- if (unlikely(prof_on == SLEEP_PROFILING)) {
- if (p->state == TASK_UNINTERRUPTIBLE)
- profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
- (now - p->timestamp) >> 20);
- }
-
- p->prio = recalc_task_prio(p, now);
-
/*
- * This checks to make sure it's not an uninterruptible task
- * that is now waking up.
+ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
+ * successfully executed on another CPU. We must ensure that updates of
+ * per-task data have been completed by this moment.
*/
- if (p->sleep_type == SLEEP_NORMAL) {
- /*
- * Tasks which were woken up by interrupts (ie. hw events)
- * are most likely of interactive nature. So we give them
- * the credit of extending their sleep time to the period
- * of time they spend on the runqueue, waiting for execution
- * on a CPU, first time around:
- */
- if (in_interrupt())
- p->sleep_type = SLEEP_INTERRUPTED;
- else {
- /*
- * Normal first-time wakeups get a credit too for
- * on-runqueue time, but it will be weighted down:
- */
- p->sleep_type = SLEEP_INTERACTIVE;
- }
- }
- p->timestamp = now;
-out:
- __activate_task(p, rq);
+ smp_wmb();
+ task_thread_info(p)->cpu = cpu;
+#endif
}
-/*
- * deactivate_task - remove a task from the runqueue.
- */
-static void deactivate_task(struct task_struct *p, struct rq *rq)
-{
- dec_nr_running(p, rq);
- dequeue_task(p, p->array);
- p->array = NULL;
-}
+#ifdef CONFIG_SMP
/*
- * resched_task - mark a task 'to be rescheduled now'.
- *
- * On UP this means the setting of the need_resched flag, on SMP it
- * might also involve a cross-CPU call to trigger the scheduler on
- * the target CPU.
+ * Is this task likely cache-hot:
*/
-#ifdef CONFIG_SMP
-
-#ifndef tsk_is_polling
-#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
-#endif
-
-static void resched_task(struct task_struct *p)
+static inline int
+task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
{
- int cpu;
-
- assert_spin_locked(&task_rq(p)->lock);
+ s64 delta;
- if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
- return;
+ if (p->sched_class != &fair_sched_class)
+ return 0;
- set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+ if (sysctl_sched_migration_cost == -1)
+ return 1;
+ if (sysctl_sched_migration_cost == 0)
+ return 0;
- cpu = task_cpu(p);
- if (cpu == smp_processor_id())
- return;
+ delta = now - p->se.exec_start;
- /* NEED_RESCHED must be visible before we test polling */
- smp_mb();
- if (!tsk_is_polling(p))
- smp_send_reschedule(cpu);
+ return delta < (s64)sysctl_sched_migration_cost;
}
-static void resched_cpu(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
- if (!spin_trylock_irqsave(&rq->lock, flags))
- return;
- resched_task(cpu_curr(cpu));
- spin_unlock_irqrestore(&rq->lock, flags);
-}
-#else
-static inline void resched_task(struct task_struct *p)
+void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
- assert_spin_locked(&task_rq(p)->lock);
- set_tsk_need_resched(p);
-}
-#endif
+ int old_cpu = task_cpu(p);
+ struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
+ struct cfs_rq *old_cfsrq = task_cfs_rq(p),
+ *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
+ u64 clock_offset;
-/**
- * task_curr - is this task currently executing on a CPU?
- * @p: the task in question.
- */
-inline int task_curr(const struct task_struct *p)
-{
- return cpu_curr(task_cpu(p)) == p;
-}
+ clock_offset = old_rq->clock - new_rq->clock;
-/* Used instead of source_load when we know the type == 0 */
-unsigned long weighted_cpuload(const int cpu)
-{
- return cpu_rq(cpu)->raw_weighted_load;
+#ifdef CONFIG_SCHEDSTATS
+ if (p->se.wait_start)
+ p->se.wait_start -= clock_offset;
+ if (p->se.sleep_start)
+ p->se.sleep_start -= clock_offset;
+ if (p->se.block_start)
+ p->se.block_start -= clock_offset;
+ if (old_cpu != new_cpu) {
+ schedstat_inc(p, se.nr_migrations);
+ if (task_hot(p, old_rq->clock, NULL))
+ schedstat_inc(p, se.nr_forced2_migrations);
+ }
+#endif
+ p->se.vruntime -= old_cfsrq->min_vruntime -
+ new_cfsrq->min_vruntime;
+
+ __set_task_cpu(p, new_cpu);
}
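
The min_vruntime adjustment above keeps a migrating task's position relative to its queue. For example, if the task's vruntime was 105 ms on a queue whose min_vruntime is 100 ms, and the destination queue's min_vruntime is 40 ms, then after the adjustment vruntime = 105 - (100 - 40) = 45 ms, i.e. the task is still 5 ms ahead of the local minimum (values in ms only for illustration; the fields hold nanoseconds).
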
-#ifdef CONFIG_SMP
struct migration_req {
struct list_head list;
@@ -1133,7 +1154,7 @@ migrate_task(struct task_struct *p, int
* If the task is not on a runqueue (and not running), then
* it is sufficient to simply update the task's cpu field.
*/
- if (!p->array && !task_running(rq, p)) {
+ if (!p->se.on_rq && !task_running(rq, p)) {
set_task_cpu(p, dest_cpu);
return 0;
}
@@ -1158,73 +1179,74 @@ migrate_task(struct task_struct *p, int
void wait_task_inactive(struct task_struct *p)
{
unsigned long flags;
+ int running, on_rq;
struct rq *rq;
- struct prio_array *array;
- int running;
-repeat:
- /*
- * We do the initial early heuristics without holding
- * any task-queue locks at all. We'll only try to get
- * the runqueue lock when things look like they will
- * work out!
- */
- rq = task_rq(p);
+ for (;;) {
+ /*
+ * We do the initial early heuristics without holding
+ * any task-queue locks at all. We'll only try to get
+ * the runqueue lock when things look like they will
+ * work out!
+ */
+ rq = task_rq(p);
- /*
- * If the task is actively running on another CPU
- * still, just relax and busy-wait without holding
- * any locks.
- *
- * NOTE! Since we don't hold any locks, it's not
- * even sure that "rq" stays as the right runqueue!
- * But we don't care, since "task_running()" will
- * return false if the runqueue has changed and p
- * is actually now running somewhere else!
- */
- while (task_running(rq, p))
- cpu_relax();
+ /*
+ * If the task is actively running on another CPU
+ * still, just relax and busy-wait without holding
+ * any locks.
+ *
+ * NOTE! Since we don't hold any locks, it's not
+ * even sure that "rq" stays as the right runqueue!
+ * But we don't care, since "task_running()" will
+ * return false if the runqueue has changed and p
+ * is actually now running somewhere else!
+ */
+ while (task_running(rq, p))
+ cpu_relax();
- /*
- * Ok, time to look more closely! We need the rq
- * lock now, to be *sure*. If we're wrong, we'll
- * just go back and repeat.
- */
- rq = task_rq_lock(p, &flags);
- running = task_running(rq, p);
- array = p->array;
- task_rq_unlock(rq, &flags);
+ /*
+ * Ok, time to look more closely! We need the rq
+ * lock now, to be *sure*. If we're wrong, we'll
+ * just go back and repeat.
+ */
+ rq = task_rq_lock(p, &flags);
+ running = task_running(rq, p);
+ on_rq = p->se.on_rq;
+ task_rq_unlock(rq, &flags);
- /*
- * Was it really running after all now that we
- * checked with the proper locks actually held?
- *
- * Oops. Go back and try again..
- */
- if (unlikely(running)) {
- cpu_relax();
- goto repeat;
- }
+ /*
+ * Was it really running after all now that we
+ * checked with the proper locks actually held?
+ *
+ * Oops. Go back and try again..
+ */
+ if (unlikely(running)) {
+ cpu_relax();
+ continue;
+ }
- /*
- * It's not enough that it's not actively running,
- * it must be off the runqueue _entirely_, and not
- * preempted!
- *
- * So if it wa still runnable (but just not actively
- * running right now), it's preempted, and we should
- * yield - it could be a while.
- */
- if (unlikely(array)) {
- yield();
- goto repeat;
- }
+ /*
+ * It's not enough that it's not actively running,
+ * it must be off the runqueue _entirely_, and not
+ * preempted!
+ *
+ * So if it was still runnable (but just not actively
+ * running right now), it's preempted, and we should
+ * yield - it could be a while.
+ */
+ if (unlikely(on_rq)) {
+ schedule_timeout_uninterruptible(1);
+ continue;
+ }
- /*
- * Ahh, all good. It wasn't running, and it wasn't
- * runnable, which means that it will never become
- * running in the future either. We're all done!
- */
+ /*
+ * Ahh, all good. It wasn't running, and it wasn't
+ * runnable, which means that it will never become
+ * running in the future either. We're all done!
+ */
+ break;
+ }
}
/***
@@ -1258,28 +1280,30 @@ void kick_process(struct task_struct *p)
* We want to under-estimate the load of migration sources, to
* balance conservatively.
*/
-static inline unsigned long source_load(int cpu, int type)
+static unsigned long source_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
+ unsigned long total = weighted_cpuload(cpu);
if (type == 0)
- return rq->raw_weighted_load;
+ return total;
- return min(rq->cpu_load[type-1], rq->raw_weighted_load);
+ return min(rq->cpu_load[type-1], total);
}
/*
* Return a high guess at the load of a migration-target cpu weighted
* according to the scheduling class and "nice" value.
*/
-static inline unsigned long target_load(int cpu, int type)
+static unsigned long target_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
+ unsigned long total = weighted_cpuload(cpu);
if (type == 0)
- return rq->raw_weighted_load;
+ return total;
- return max(rq->cpu_load[type-1], rq->raw_weighted_load);
+ return max(rq->cpu_load[type-1], total);
}
/*
@@ -1288,9 +1312,10 @@ static inline unsigned long target_load(
static inline unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
+ unsigned long total = weighted_cpuload(cpu);
unsigned long n = rq->nr_running;
- return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
+ return n ? total / n : SCHED_LOAD_SCALE;
}
/*
@@ -1312,7 +1337,7 @@ find_idlest_group(struct sched_domain *s
/* Skip over this group if it has no CPUs allowed */
if (!cpus_intersects(group->cpumask, p->cpus_allowed))
- goto nextgroup;
+ continue;
local_group = cpu_isset(this_cpu, group->cpumask);
@@ -1340,9 +1365,7 @@ find_idlest_group(struct sched_domain *s
min_load = avg_load;
idlest = group;
}
-nextgroup:
- group = group->next;
- } while (group != sd->groups);
+ } while (group = group->next, group != sd->groups);
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
@@ -1392,9 +1415,9 @@ static int sched_balance_self(int cpu, i
struct sched_domain *tmp, *sd = NULL;
for_each_domain(cpu, tmp) {
- /*
- * If power savings logic is enabled for a domain, stop there.
- */
+ /*
+ * If power savings logic is enabled for a domain, stop there.
+ */
if (tmp->flags & SD_POWERSAVINGS_BALANCE)
break;
if (tmp->flags & flag)
@@ -1474,12 +1497,17 @@ static int wake_idle(int cpu, struct tas
if (sd->flags & SD_WAKE_IDLE) {
cpus_and(tmp, sd->span, p->cpus_allowed);
for_each_cpu_mask(i, tmp) {
- if (idle_cpu(i))
+ if (idle_cpu(i)) {
+ if (i != task_cpu(p)) {
+ schedstat_inc(p,
+ se.nr_wakeups_idle);
+ }
return i;
+ }
}
- }
- else
+ } else {
break;
+ }
}
return cpu;
}
@@ -1506,7 +1534,7 @@ static inline int wake_idle(int cpu, str
*/
static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
{
- int cpu, this_cpu, success = 0;
+ int cpu, orig_cpu, this_cpu, success = 0;
unsigned long flags;
long old_state;
struct rq *rq;
@@ -1521,10 +1549,11 @@ static int try_to_wake_up(struct task_st
if (!(old_state & state))
goto out;
- if (p->array)
+ if (p->se.on_rq)
goto out_running;
cpu = task_cpu(p);
+ orig_cpu = cpu;
this_cpu = smp_processor_id();
#ifdef CONFIG_SMP
@@ -1533,7 +1562,7 @@ static int try_to_wake_up(struct task_st
new_cpu = cpu;
- schedstat_inc(rq, ttwu_cnt);
+ schedstat_inc(rq, ttwu_count);
if (cpu == this_cpu) {
schedstat_inc(rq, ttwu_local);
goto out_set_cpu;
@@ -1568,7 +1597,14 @@ static int try_to_wake_up(struct task_st
unsigned long tl = this_load;
unsigned long tl_per_task;
- tl_per_task = cpu_avg_load_per_task(this_cpu);
+ /*
+ * Attract cache-cold tasks on sync wakeups:
+ */
+ if (sync && !task_hot(p, rq->clock, this_sd))
+ goto out_set_cpu;
+
+ schedstat_inc(p, se.nr_wakeups_affine_attempts);
+ tl_per_task = cpu_avg_load_per_task(this_cpu);
/*
* If sync wakeup then subtract the (maximum possible)
@@ -1576,17 +1612,18 @@ static int try_to_wake_up(struct task_st
* of the current CPU:
*/
if (sync)
- tl -= current->load_weight;
+ tl -= current->se.load.weight;
if ((tl <= load &&
tl + target_load(cpu, idx) <= tl_per_task) ||
- 100*(tl + p->load_weight) <= imbalance*load) {
+ 100*(tl + p->se.load.weight) <= imbalance*load) {
/*
* This domain has SD_WAKE_AFFINE and
* p is cache cold in this domain, and
* there is no bad imbalance.
*/
schedstat_inc(this_sd, ttwu_move_affine);
+ schedstat_inc(p, se.nr_wakeups_affine);
goto out_set_cpu;
}
}
@@ -1598,6 +1635,7 @@ static int try_to_wake_up(struct task_st
if (this_sd->flags & SD_WAKE_BALANCE) {
if (imbalance*this_load <= 100*load) {
schedstat_inc(this_sd, ttwu_move_balance);
+ schedstat_inc(p, se.nr_wakeups_passive);
goto out_set_cpu;
}
}
@@ -1614,7 +1652,7 @@ out_set_cpu:
old_state = p->state;
if (!(old_state & state))
goto out;
- if (p->array)
+ if (p->se.on_rq)
goto out_running;
this_cpu = smp_processor_id();
@@ -1623,37 +1661,18 @@ out_set_cpu:
out_activate:
#endif /* CONFIG_SMP */
- if (old_state == TASK_UNINTERRUPTIBLE) {
- rq->nr_uninterruptible--;
- /*
- * Tasks on involuntary sleep don't earn
- * sleep_avg beyond just interactive state.
- */
- p->sleep_type = SLEEP_NONINTERACTIVE;
- } else
-
- /*
- * Tasks that have marked their sleep as noninteractive get
- * woken up with their sleep average not weighted in an
- * interactive way.
- */
- if (old_state & TASK_NONINTERACTIVE)
- p->sleep_type = SLEEP_NONINTERACTIVE;
-
-
- activate_task(p, rq, cpu == this_cpu);
- /*
- * Sync wakeups (i.e. those types of wakeups where the waker
- * has indicated that it will leave the CPU in short order)
- * don't trigger a preemption, if the woken up task will run on
- * this cpu. (in this case the 'I will reschedule' promise of
- * the waker guarantees that the freshly woken up task is going
- * to be considered on this CPU.)
- */
- if (!sync || cpu != this_cpu) {
- if (TASK_PREEMPTS_CURR(p, rq))
- resched_task(rq->curr);
- }
+ schedstat_inc(p, se.nr_wakeups);
+ if (sync)
+ schedstat_inc(p, se.nr_wakeups_sync);
+ if (orig_cpu != cpu)
+ schedstat_inc(p, se.nr_wakeups_migrate);
+ if (cpu == this_cpu)
+ schedstat_inc(p, se.nr_wakeups_local);
+ else
+ schedstat_inc(p, se.nr_wakeups_remote);
+ update_rq_clock(rq);
+ activate_task(rq, p, 1);
+ check_preempt_curr(rq, p);
success = 1;
out_running:
@@ -1676,19 +1695,36 @@ int fastcall wake_up_state(struct task_s
return try_to_wake_up(p, state, 0);
}
-static void task_running_tick(struct rq *rq, struct task_struct *p);
/*
* Perform scheduler related setup for a newly forked process p.
* p is forked by current.
+ *
+ * __sched_fork() is basic setup used by init_idle() too:
*/
-void fastcall sched_fork(struct task_struct *p, int clone_flags)
+static void __sched_fork(struct task_struct *p)
{
- int cpu = get_cpu();
+ p->se.exec_start = 0;
+ p->se.sum_exec_runtime = 0;
+ p->se.prev_sum_exec_runtime = 0;
-#ifdef CONFIG_SMP
- cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
+#ifdef CONFIG_SCHEDSTATS
+ p->se.wait_start = 0;
+ p->se.sum_sleep_runtime = 0;
+ p->se.sleep_start = 0;
+ p->se.block_start = 0;
+ p->se.sleep_max = 0;
+ p->se.block_max = 0;
+ p->se.exec_max = 0;
+ p->se.slice_max = 0;
+ p->se.wait_max = 0;
+#endif
+
+ INIT_LIST_HEAD(&p->run_list);
+ p->se.on_rq = 0;
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
- set_task_cpu(p, cpu);
/*
* We mark the process as running here, but have not actually
@@ -1697,16 +1733,31 @@ void fastcall sched_fork(struct task_str
* event cannot wake it up and insert it on the runqueue either.
*/
p->state = TASK_RUNNING;
+}
+
+/*
+ * fork()/clone()-time setup:
+ */
+void sched_fork(struct task_struct *p, int clone_flags)
+{
+ int cpu = get_cpu();
+
+ __sched_fork(p);
+
+#ifdef CONFIG_SMP
+ cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
+#endif
+ set_task_cpu(p, cpu);
/*
* Make sure we do not leak PI boosting priority to the child:
*/
p->prio = current->normal_prio;
+ if (!rt_prio(p->prio))
+ p->sched_class = &fair_sched_class;
- INIT_LIST_HEAD(&p->run_list);
- p->array = NULL;
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
- if (unlikely(sched_info_on()))
+ if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -1716,30 +1767,6 @@ void fastcall sched_fork(struct task_str
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
- /*
- * Share the timeslice between parent and child, thus the
- * total amount of pending timeslices in the system doesn't change,
- * resulting in more scheduling fairness.
- */
- local_irq_disable();
- p->time_slice = (current->time_slice + 1) >> 1;
- /*
- * The remainder of the first timeslice might be recovered by
- * the parent if the child exits early enough.
- */
- p->first_time_slice = 1;
- current->time_slice >>= 1;
- p->timestamp = sched_clock();
- if (unlikely(!current->time_slice)) {
- /*
- * This case is rare, it happens when the parent has only
- * a single jiffy left from its timeslice. Taking the
- * runqueue lock is not a problem.
- */
- current->time_slice = 1;
- task_running_tick(cpu_rq(cpu), current);
- }
- local_irq_enable();
put_cpu();
}
@@ -1752,113 +1779,91 @@ void fastcall sched_fork(struct task_str
*/
void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
- struct rq *rq, *this_rq;
unsigned long flags;
- int this_cpu, cpu;
+ struct rq *rq;
rq = task_rq_lock(p, &flags);
BUG_ON(p->state != TASK_RUNNING);
- this_cpu = smp_processor_id();
- cpu = task_cpu(p);
-
- /*
- * We decrease the sleep average of forking parents
- * and children as well, to keep max-interactive tasks
- * from forking tasks that are max-interactive. The parent
- * (current) is done further down, under its lock.
- */
- p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
- CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+ update_rq_clock(rq);
p->prio = effective_prio(p);
- if (likely(cpu == this_cpu)) {
- if (!(clone_flags & CLONE_VM)) {
- /*
- * The VM isn't cloned, so we're in a good position to
- * do child-runs-first in anticipation of an exec. This
- * usually avoids a lot of COW overhead.
- */
- if (unlikely(!current->array))
- __activate_task(p, rq);
- else {
- p->prio = current->prio;
- p->normal_prio = current->normal_prio;
- list_add_tail(&p->run_list, &current->run_list);
- p->array = current->array;
- p->array->nr_active++;
- inc_nr_running(p, rq);
- }
- set_need_resched();
- } else
- /* Run child last */
- __activate_task(p, rq);
- /*
- * We skip the following code due to cpu == this_cpu
- *
- * task_rq_unlock(rq, &flags);
- * this_rq = task_rq_lock(current, &flags);
- */
- this_rq = rq;
+ if (!p->sched_class->task_new || !current->se.on_rq) {
+ activate_task(rq, p, 0);
} else {
- this_rq = cpu_rq(this_cpu);
-
- /*
- * Not the local CPU - must adjust timestamp. This should
- * get optimised away in the !CONFIG_SMP case.
- */
- p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
- + rq->most_recent_timestamp;
- __activate_task(p, rq);
- if (TASK_PREEMPTS_CURR(p, rq))
- resched_task(rq->curr);
-
/*
- * Parent and child are on different CPUs, now get the
- * parent runqueue to update the parent's ->sleep_avg:
+ * Let the scheduling class do new task startup
+ * management (if any):
*/
- task_rq_unlock(rq, &flags);
- this_rq = task_rq_lock(current, &flags);
+ p->sched_class->task_new(rq, p);
+ inc_nr_running(p, rq);
}
- current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
- PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
- task_rq_unlock(this_rq, &flags);
+ check_preempt_curr(rq, p);
+ task_rq_unlock(rq, &flags);
}
-/*
- * Potentially available exiting-child timeslices are
- * retrieved here - this way the parent does not get
- * penalized for creating too many threads.
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+
+/**
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
+ * @notifier: notifier struct to register
+ */
+void preempt_notifier_register(struct preempt_notifier *notifier)
+{
+ hlist_add_head(&notifier->link, &current->preempt_notifiers);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_register);
+
+/**
+ * preempt_notifier_unregister - no longer interested in preemption notifications
+ * @notifier: notifier struct to unregister
*
- * (this cannot be used to 'generate' timeslices
- * artificially, because any timeslice recovered here
- * was given away by the parent in the first place.)
+ * This is safe to call from within a preemption notifier.
*/
-void fastcall sched_exit(struct task_struct *p)
+void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
- unsigned long flags;
- struct rq *rq;
+ hlist_del(&notifier->link);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
- /*
- * If the child was a (relative-) CPU hog then decrease
- * the sleep_avg of the parent as well.
- */
- rq = task_rq_lock(p->parent, &flags);
- if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
- p->parent->time_slice += p->time_slice;
- if (unlikely(p->parent->time_slice > task_timeslice(p)))
- p->parent->time_slice = task_timeslice(p);
- }
- if (p->sleep_avg < p->parent->sleep_avg)
- p->parent->sleep_avg = p->parent->sleep_avg /
- (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
- (EXIT_WEIGHT + 1);
- task_rq_unlock(rq, &flags);
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+ struct preempt_notifier *notifier;
+ struct hlist_node *node;
+
+ hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ notifier->ops->sched_in(notifier, raw_smp_processor_id());
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+ struct preempt_notifier *notifier;
+ struct hlist_node *node;
+
+ hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ notifier->ops->sched_out(notifier, next);
+}
+
+#else
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
}
+#endif
+
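
For reference, a hedged sketch of a client of the notifier calls above. The callback signatures are inferred from the fire_*() helpers in this hunk; struct preempt_ops and preempt_notifier_init() are assumed to be the declarations in include/linux/preempt.h:

static void my_sched_in(struct preempt_notifier *pn, int cpu)
{
        /* current has just been given 'cpu' again */
}

static void my_sched_out(struct preempt_notifier *pn,
                         struct task_struct *next)
{
        /* current is about to be switched out in favour of 'next' */
}

static struct preempt_ops my_preempt_ops = {
        .sched_in  = my_sched_in,
        .sched_out = my_sched_out,
};

static struct preempt_notifier my_notifier;

/* From the interested task's own context (assumed API):
 *      preempt_notifier_init(&my_notifier, &my_preempt_ops);
 *      preempt_notifier_register(&my_notifier);
 */
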
/**
* prepare_task_switch - prepare to switch tasks
* @rq: the runqueue preparing to switch
+ * @prev: the current task that is being switched out
* @next: the task we are going to switch to.
*
* This is called with the rq lock held and interrupts off. It must
@@ -1868,8 +1873,11 @@ void fastcall sched_exit(struct task_str
* prepare_task_switch sets up locking and calls architecture specific
* hooks.
*/
-static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
+static inline void
+prepare_task_switch(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
{
+ fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
}
@@ -1889,7 +1897,7 @@ static inline void prepare_task_switch(s
* with the lock held can cause deadlocks; see schedule() for
* details.)
*/
-static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
+static void finish_task_switch(struct rq *rq, struct task_struct *prev)
__releases(rq->lock)
{
struct mm_struct *mm = rq->prev_mm;
@@ -1911,13 +1919,14 @@ static inline void finish_task_switch(st
prev_state = prev->state;
finish_arch_switch(prev);
finish_lock_switch(rq, prev);
+ fire_sched_in_preempt_notifiers(current);
if (mm)
mmdrop(mm);
if (unlikely(prev_state == TASK_DEAD)) {
/*
* Remove function-return probe instances associated with this
* task and put them back on the free list.
- */
+ */
kprobe_flush_task(prev);
put_task_struct(prev);
}
@@ -1938,20 +1947,22 @@ asmlinkage void schedule_tail(struct tas
preempt_enable();
#endif
if (current->set_child_tid)
- put_user(current->pid, current->set_child_tid);
+ put_user(task_pid_vnr(current), current->set_child_tid);
}
/*
* context_switch - switch to the new MM and the new
* thread's register state.
*/
-static inline struct task_struct *
+static inline void
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
- struct mm_struct *mm = next->mm;
- struct mm_struct *oldmm = prev->active_mm;
+ struct mm_struct *mm, *oldmm;
+ prepare_task_switch(rq, prev, next);
+ mm = next->mm;
+ oldmm = prev->active_mm;
/*
* For paravirt, this is coupled with an exit in switch_to to
* combine the page table reload and the switch backend into
@@ -1959,16 +1970,15 @@ context_switch(struct rq *rq, struct tas
*/
arch_enter_lazy_cpu_mode();
- if (!mm) {
+ if (unlikely(!mm)) {
next->active_mm = oldmm;
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next);
} else
switch_mm(oldmm, mm, next);
- if (!prev->mm) {
+ if (unlikely(!prev->mm)) {
prev->active_mm = NULL;
- WARN_ON(rq->prev_mm);
rq->prev_mm = oldmm;
}
/*
@@ -1984,7 +1994,13 @@ context_switch(struct rq *rq, struct tas
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
- return prev;
+ barrier();
+ /*
+ * this_rq must be evaluated again because prev may have moved
+ * CPUs since it called schedule(), thus the 'rq' on its stack
+ * frame will be invalid.
+ */
+ finish_task_switch(this_rq(), prev);
}
/*
@@ -2057,17 +2073,38 @@ unsigned long nr_active(void)
return running + uninterruptible;
}
-#ifdef CONFIG_SMP
-
/*
- * Is this task likely cache-hot:
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC).
*/
-static inline int
-task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
+static void update_cpu_load(struct rq *this_rq)
{
- return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time;
+ unsigned long this_load = this_rq->load.weight;
+ int i, scale;
+
+ this_rq->nr_load_updates++;
+
+ /* Update our load: */
+ for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ unsigned long old_load, new_load;
+
+ /* scale is effectively 1 << i now, and >> i divides by scale */
+
+ old_load = this_rq->cpu_load[i];
+ new_load = this_load;
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (new_load > old_load)
+ new_load += scale-1;
+ this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
+ }
}
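
For reference while reading the hunk above: each cpu_load[i] is an exponentially decaying average of the instantaneous weighted load, with larger i decaying more slowly, and the scale-1 round-up keeps a rising load from settling one below its target. Below is a standalone userspace sketch of the same arithmetic, separate from the patch itself; the constant load of 10 and the five-tick loop are demo assumptions.

/* Userspace sketch of the cpu_load[] averaging used above. */
#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

static unsigned long cpu_load[CPU_LOAD_IDX_MAX];

static void update_cpu_load_demo(unsigned long this_load)
{
    int i;
    unsigned long scale;

    for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
        unsigned long old_load = cpu_load[i];
        unsigned long new_load = this_load;

        /* round up while the load is rising, as in the hunk above */
        if (new_load > old_load)
            new_load += scale - 1;
        cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
    }
}

int main(void)
{
    int tick;

    for (tick = 0; tick < 5; tick++) {
        update_cpu_load_demo(10);   /* instantaneous load of 10 */
        printf("tick %d: %lu %lu %lu %lu %lu\n", tick,
               cpu_load[0], cpu_load[1], cpu_load[2],
               cpu_load[3], cpu_load[4]);
    }
    return 0;
}

Running it shows cpu_load[1] climbing 5, 8, 9, 10 instead of sticking at 9, which is exactly what the round-up comment is about.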
+#ifdef CONFIG_SMP
+
/*
* double_rq_lock - safely lock two runqueues
*
@@ -2091,6 +2128,8 @@ static void double_rq_lock(struct rq *rq
spin_lock(&rq1->lock);
}
}
+ update_rq_clock(rq1);
+ update_rq_clock(rq2);
}
/*
@@ -2184,23 +2223,17 @@ void sched_exec(void)
* pull_task - move a task from a remote runqueue to the local runqueue.
* Both runqueues must be locked.
*/
-static void pull_task(struct rq *src_rq, struct prio_array *src_array,
- struct task_struct *p, struct rq *this_rq,
- struct prio_array *this_array, int this_cpu)
+static void pull_task(struct rq *src_rq, struct task_struct *p,
+ struct rq *this_rq, int this_cpu)
{
- dequeue_task(p, src_array);
- dec_nr_running(p, src_rq);
+ deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
- inc_nr_running(p, this_rq);
- enqueue_task(p, this_array);
- p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
- + this_rq->most_recent_timestamp;
+ activate_task(this_rq, p, 0);
/*
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
- if (TASK_PREEMPTS_CURR(p, this_rq))
- resched_task(this_rq->curr);
+ check_preempt_curr(this_rq, p);
}
/*
@@ -2208,7 +2241,7 @@ static void pull_task(struct rq *src_rq,
*/
static
int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
- struct sched_domain *sd, enum idle_type idle,
+ struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned)
{
/*
@@ -2217,12 +2250,16 @@ int can_migrate_task(struct task_struct
* 2) cannot be migrated to this CPU due to cpus_allowed, or
* 3) are cache-hot on their current CPU.
*/
- if (!cpu_isset(this_cpu, p->cpus_allowed))
+ if (!cpu_isset(this_cpu, p->cpus_allowed)) {
+ schedstat_inc(p, se.nr_failed_migrations_affine);
return 0;
+ }
*all_pinned = 0;
- if (task_running(rq, p))
+ if (task_running(rq, p)) {
+ schedstat_inc(p, se.nr_failed_migrations_running);
return 0;
+ }
/*
* Aggressive migration if:
@@ -2230,131 +2267,75 @@ int can_migrate_task(struct task_struct
* 2) too many balance attempts have failed.
*/
- if (sd->nr_balance_failed > sd->cache_nice_tries) {
+ if (!task_hot(p, rq->clock, sd) ||
+ sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
- if (task_hot(p, rq->most_recent_timestamp, sd))
+ if (task_hot(p, rq->clock, sd)) {
schedstat_inc(sd, lb_hot_gained[idle]);
+ schedstat_inc(p, se.nr_forced_migrations);
+ }
#endif
return 1;
}
- if (task_hot(p, rq->most_recent_timestamp, sd))
+ if (task_hot(p, rq->clock, sd)) {
+ schedstat_inc(p, se.nr_failed_migrations_hot);
return 0;
+ }
return 1;
}
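
The filter above boils down to three checks: CPU affinity, currently running, and cache-hot without enough failed balance attempts. A small standalone model of that decision, separate from the patch; struct demo_task and its flag fields are invented for illustration.

/* Simplified model of the can_migrate_task() filter above. */
#include <stdio.h>

struct demo_task {
    int allowed_here;   /* cpu_isset(this_cpu, p->cpus_allowed) */
    int running;        /* task_running(rq, p) */
    int cache_hot;      /* task_hot(p, rq->clock, sd) */
};

static int can_migrate(const struct demo_task *p,
                       int nr_balance_failed, int cache_nice_tries)
{
    if (!p->allowed_here)       /* 1) pinned by cpus_allowed */
        return 0;
    if (p->running)             /* 2) currently executing */
        return 0;
    /* 3) cache-hot tasks move only after repeated balance failures */
    if (p->cache_hot && nr_balance_failed <= cache_nice_tries)
        return 0;
    return 1;
}

int main(void)
{
    struct demo_task hot = { 1, 0, 1 };

    /* stays put at first, migrates once balancing kept failing */
    printf("%d %d\n", can_migrate(&hot, 0, 2), can_migrate(&hot, 3, 2));
    return 0;
}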
-#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
-
-/*
- * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
- * load from busiest to this_rq, as part of a balancing operation within
- * "domain". Returns the number of tasks moved.
- *
- * Called with both runqueues locked.
- */
-static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_nr_move, unsigned long max_load_move,
- struct sched_domain *sd, enum idle_type idle,
- int *all_pinned)
+static unsigned long
+balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move, struct sched_domain *sd,
+ enum cpu_idle_type idle, int *all_pinned,
+ int *this_best_prio, struct rq_iterator *iterator)
{
- int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
- best_prio_seen, skip_for_load;
- struct prio_array *array, *dst_array;
- struct list_head *head, *curr;
- struct task_struct *tmp;
- long rem_load_move;
+ int loops = 0, pulled = 0, pinned = 0, skip_for_load;
+ struct task_struct *p;
+ long rem_load_move = max_load_move;
- if (max_nr_move == 0 || max_load_move == 0)
+ if (max_load_move == 0)
goto out;
- rem_load_move = max_load_move;
pinned = 1;
- this_best_prio = rq_best_prio(this_rq);
- best_prio = rq_best_prio(busiest);
- /*
- * Enable handling of the case where there is more than one task
- * with the best priority. If the current running task is one
- * of those with prio==best_prio we know it won't be moved
- * and therefore it's safe to override the skip (based on load) of
- * any task we find with that prio.
- */
- best_prio_seen = best_prio == busiest->curr->prio;
-
- /*
- * We first consider expired tasks. Those will likely not be
- * executed in the near future, and they are most likely to
- * be cache-cold, thus switching CPUs has the least effect
- * on them.
- */
- if (busiest->expired->nr_active) {
- array = busiest->expired;
- dst_array = this_rq->expired;
- } else {
- array = busiest->active;
- dst_array = this_rq->active;
- }
-new_array:
- /* Start searching at priority 0: */
- idx = 0;
-skip_bitmap:
- if (!idx)
- idx = sched_find_first_bit(array->bitmap);
- else
- idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
- if (idx >= MAX_PRIO) {
- if (array == busiest->expired && busiest->active->nr_active) {
- array = busiest->active;
- dst_array = this_rq->active;
- goto new_array;
- }
+ /*
+ * Start the load-balancing iterator:
+ */
+ p = iterator->start(iterator->arg);
+next:
+ if (!p || loops++ > sysctl_sched_nr_migrate)
goto out;
- }
-
- head = array->queue + idx;
- curr = head->prev;
-skip_queue:
- tmp = list_entry(curr, struct task_struct, run_list);
-
- curr = curr->prev;
-
/*
- * To help distribute high priority tasks accross CPUs we don't
+ * To help distribute high priority tasks across CPUs we don't
* skip a task if it will be the highest priority task (i.e. smallest
* prio value) on its new queue regardless of its load weight
*/
- skip_for_load = tmp->load_weight > rem_load_move;
- if (skip_for_load && idx < this_best_prio)
- skip_for_load = !best_prio_seen && idx == best_prio;
- if (skip_for_load ||
- !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
-
- best_prio_seen |= idx == best_prio;
- if (curr != head)
- goto skip_queue;
- idx++;
- goto skip_bitmap;
+ skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
+ SCHED_LOAD_SCALE_FUZZ;
+ if ((skip_for_load && p->prio >= *this_best_prio) ||
+ !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
+ p = iterator->next(iterator->arg);
+ goto next;
}
- pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
+ pull_task(busiest, p, this_rq, this_cpu);
pulled++;
- rem_load_move -= tmp->load_weight;
+ rem_load_move -= p->se.load.weight;
/*
- * We only want to steal up to the prescribed number of tasks
- * and the prescribed amount of weighted load.
+ * We only want to steal up to the prescribed amount of weighted load.
*/
- if (pulled < max_nr_move && rem_load_move > 0) {
- if (idx < this_best_prio)
- this_best_prio = idx;
- if (curr != head)
- goto skip_queue;
- idx++;
- goto skip_bitmap;
+ if (rem_load_move > 0) {
+ if (p->prio < *this_best_prio)
+ *this_best_prio = p->prio;
+ p = iterator->next(iterator->arg);
+ goto next;
}
out:
/*
- * Right now, this is the only place pull_task() is called,
+ * Right now, this is one of only two places pull_task() is called,
* so we can safely collect pull_task() stats here rather than
* inside pull_task().
*/
@@ -2362,7 +2343,80 @@ out:
if (all_pinned)
*all_pinned = pinned;
- return pulled;
+
+ return max_load_move - rem_load_move;
+}
+
+/*
+ * move_tasks tries to move up to max_load_move weighted load from busiest to
+ * this_rq, as part of a balancing operation within domain "sd".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned)
+{
+ const struct sched_class *class = sched_class_highest;
+ unsigned long total_load_moved = 0;
+ int this_best_prio = this_rq->curr->prio;
+
+ do {
+ total_load_moved +=
+ class->load_balance(this_rq, this_cpu, busiest,
+ max_load_move - total_load_moved,
+ sd, idle, all_pinned, &this_best_prio);
+ class = class->next;
+ } while (class && max_load_move > total_load_moved);
+
+ return total_load_moved > 0;
+}
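
move_tasks() now just walks the scheduling classes from highest to lowest, letting each class move what it can until the requested weighted load is covered. A standalone sketch of that loop, separate from the patch; the two demo classes and their load numbers are invented.

/* Userspace sketch of the per-class iteration in move_tasks() above. */
#include <stdio.h>
#include <stddef.h>

struct sched_class_demo {
    const char *name;
    const struct sched_class_demo *next;
    unsigned long (*load_balance)(unsigned long max_load_move);
};

static unsigned long rt_balance(unsigned long max)   { return max > 3 ? 3 : max; }
static unsigned long fair_balance(unsigned long max) { return max; }

static const struct sched_class_demo fair_class = { "fair", NULL, fair_balance };
static const struct sched_class_demo rt_class   = { "rt", &fair_class, rt_balance };

int main(void)
{
    const struct sched_class_demo *class = &rt_class;  /* highest first */
    unsigned long max_load_move = 10, total = 0;

    do {
        unsigned long moved = class->load_balance(max_load_move - total);

        printf("%s moved %lu\n", class->name, moved);
        total += moved;
        class = class->next;
    } while (class && max_load_move > total);

    printf("total %lu, success=%d\n", total, total > 0);
    return 0;
}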
+
+static int
+iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ struct rq_iterator *iterator)
+{
+ struct task_struct *p = iterator->start(iterator->arg);
+ int pinned = 0;
+
+ while (p) {
+ if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
+ pull_task(busiest, p, this_rq, this_cpu);
+ /*
+ * Right now, this is only the second place pull_task()
+ * is called, so we can safely collect pull_task()
+ * stats here rather than inside pull_task().
+ */
+ schedstat_inc(sd, lb_gained[idle]);
+
+ return 1;
+ }
+ p = iterator->next(iterator->arg);
+ }
+
+ return 0;
+}
+
+/*
+ * move_one_task tries to move exactly one task from busiest to this_rq, as
+ * part of active balancing operations within "domain".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle)
+{
+ const struct sched_class *class;
+
+ for (class = sched_class_highest; class; class = class->next)
+ if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
+ return 1;
+
+ return 0;
}
/*
@@ -2372,15 +2426,15 @@ out:
*/
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
- unsigned long *imbalance, enum idle_type idle, int *sd_idle,
- cpumask_t *cpus, int *balance)
+ unsigned long *imbalance, enum cpu_idle_type idle,
+ int *sd_idle, cpumask_t *cpus, int *balance)
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
unsigned long max_pull;
unsigned long busiest_load_per_task, busiest_nr_running;
unsigned long this_load_per_task, this_nr_running;
- int load_idx;
+ int load_idx, group_imb = 0;
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
int power_savings_balance = 1;
unsigned long leader_nr_running = 0, min_load_per_task = 0;
@@ -2391,17 +2445,18 @@ find_busiest_group(struct sched_domain *
max_load = this_load = total_load = total_pwr = 0;
busiest_load_per_task = busiest_nr_running = 0;
this_load_per_task = this_nr_running = 0;
- if (idle == NOT_IDLE)
+ if (idle == CPU_NOT_IDLE)
load_idx = sd->busy_idx;
- else if (idle == NEWLY_IDLE)
+ else if (idle == CPU_NEWLY_IDLE)
load_idx = sd->newidle_idx;
else
load_idx = sd->idle_idx;
do {
- unsigned long load, group_capacity;
+ unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
int local_group;
int i;
+ int __group_imb = 0;
unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long sum_nr_running, sum_weighted_load;
@@ -2412,6 +2467,8 @@ find_busiest_group(struct sched_domain *
/* Tally up the load of all CPUs in the group */
sum_weighted_load = sum_nr_running = avg_load = 0;
+ max_cpu_load = 0;
+ min_cpu_load = ~0UL;
for_each_cpu_mask(i, group->cpumask) {
struct rq *rq;
@@ -2421,7 +2478,7 @@ find_busiest_group(struct sched_domain *
rq = cpu_rq(i);
- if (*sd_idle && !idle_cpu(i))
+ if (*sd_idle && rq->nr_running)
*sd_idle = 0;
/* Bias balancing toward cpus of our domain */
@@ -2432,20 +2489,27 @@ find_busiest_group(struct sched_domain *
}
load = target_load(i, load_idx);
- } else
+ } else {
load = source_load(i, load_idx);
+ if (load > max_cpu_load)
+ max_cpu_load = load;
+ if (min_cpu_load > load)
+ min_cpu_load = load;
+ }
avg_load += load;
sum_nr_running += rq->nr_running;
- sum_weighted_load += rq->raw_weighted_load;
+ sum_weighted_load += weighted_cpuload(i);
}
/*
* First idle cpu or the first cpu(busiest) in this sched group
* is eligible for doing load balancing at this and above
- * domains.
+ * domains. In the newly idle case, we will allow all the cpu's
+ * to do the newly idle load balance.
*/
- if (local_group && balance_cpu != this_cpu && balance) {
+ if (idle != CPU_NEWLY_IDLE && local_group &&
+ balance_cpu != this_cpu && balance) {
*balance = 0;
goto ret;
}
@@ -2457,6 +2521,9 @@ find_busiest_group(struct sched_domain *
avg_load = sg_div_cpu_power(group,
avg_load * SCHED_LOAD_SCALE);
+ if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
+ __group_imb = 1;
+
group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
if (local_group) {
@@ -2465,11 +2532,12 @@ find_busiest_group(struct sched_domain *
this_nr_running = sum_nr_running;
this_load_per_task = sum_weighted_load;
} else if (avg_load > max_load &&
- sum_nr_running > group_capacity) {
+ (sum_nr_running > group_capacity || __group_imb)) {
max_load = avg_load;
busiest = group;
busiest_nr_running = sum_nr_running;
busiest_load_per_task = sum_weighted_load;
+ group_imb = __group_imb;
}
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
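
The new __group_imb flag simply marks a group whose busiest and least-busy CPUs differ by more than one full load unit, so such a group can be picked as busiest even when it is not over capacity. A standalone sketch of the test, separate from the patch; the load figures are invented and SCHED_LOAD_SCALE is kept at its usual 1024.

/* Sketch of the __group_imb test introduced above. */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

static int group_is_imbalanced(const unsigned long *load, int n)
{
    unsigned long max_cpu_load = 0, min_cpu_load = ~0UL;
    int i;

    for (i = 0; i < n; i++) {
        if (load[i] > max_cpu_load)
            max_cpu_load = load[i];
        if (load[i] < min_cpu_load)
            min_cpu_load = load[i];
    }
    return (max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE;
}

int main(void)
{
    unsigned long even[]   = { 2048, 2048 };
    unsigned long skewed[] = { 3072, 1024 };

    printf("%d %d\n", group_is_imbalanced(even, 2),
           group_is_imbalanced(skewed, 2));
    return 0;
}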
@@ -2477,8 +2545,9 @@ find_busiest_group(struct sched_domain *
* Busy processors will not participate in power savings
* balance.
*/
- if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
- goto group_next;
+ if (idle == CPU_NOT_IDLE ||
+ !(sd->flags & SD_POWERSAVINGS_BALANCE))
+ goto group_next;
/*
* If the local group is idle or completely loaded
@@ -2488,42 +2557,42 @@ find_busiest_group(struct sched_domain *
!this_nr_running))
power_savings_balance = 0;
- /*
+ /*
* If a group is already running at full capacity or idle,
* don't include that group in power savings calculations
- */
- if (!power_savings_balance || sum_nr_running >= group_capacity
+ */
+ if (!power_savings_balance || sum_nr_running >= group_capacity
|| !sum_nr_running)
- goto group_next;
+ goto group_next;
- /*
+ /*
* Calculate the group which has the least non-idle load.
- * This is the group from where we need to pick up the load
- * for saving power
- */
- if ((sum_nr_running < min_nr_running) ||
- (sum_nr_running == min_nr_running &&
+ * This is the group from where we need to pick up the load
+ * for saving power
+ */
+ if ((sum_nr_running < min_nr_running) ||
+ (sum_nr_running == min_nr_running &&
first_cpu(group->cpumask) <
first_cpu(group_min->cpumask))) {
- group_min = group;
- min_nr_running = sum_nr_running;
+ group_min = group;
+ min_nr_running = sum_nr_running;
min_load_per_task = sum_weighted_load /
sum_nr_running;
- }
+ }
- /*
+ /*
* Calculate the group which is almost near its
- * capacity but still has some space to pick up some load
- * from other group and save more power
- */
- if (sum_nr_running <= group_capacity - 1) {
- if (sum_nr_running > leader_nr_running ||
- (sum_nr_running == leader_nr_running &&
- first_cpu(group->cpumask) >
- first_cpu(group_leader->cpumask))) {
- group_leader = group;
- leader_nr_running = sum_nr_running;
- }
+ * capacity but still has some space to pick up some load
+ * from other group and save more power
+ */
+ if (sum_nr_running <= group_capacity - 1) {
+ if (sum_nr_running > leader_nr_running ||
+ (sum_nr_running == leader_nr_running &&
+ first_cpu(group->cpumask) >
+ first_cpu(group_leader->cpumask))) {
+ group_leader = group;
+ leader_nr_running = sum_nr_running;
+ }
}
group_next:
#endif
@@ -2540,6 +2609,9 @@ group_next:
goto out_balanced;
busiest_load_per_task /= busiest_nr_running;
+ if (group_imb)
+ busiest_load_per_task = min(busiest_load_per_task, avg_load);
+
/*
* We're trying to get all the cpus to the average_load, so we don't
* want to push ourselves above the average load, nor do we wish to
@@ -2592,7 +2664,8 @@ small_imbalance:
} else
this_load_per_task = SCHED_LOAD_SCALE;
- if (max_load - this_load >= busiest_load_per_task * imbn) {
+ if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
+ busiest_load_per_task * imbn) {
*imbalance = busiest_load_per_task;
return busiest;
}
@@ -2629,17 +2702,15 @@ small_imbalance:
pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain throughput */
- if (pwr_move <= pwr_now)
- goto out_balanced;
-
- *imbalance = busiest_load_per_task;
+ if (pwr_move > pwr_now)
+ *imbalance = busiest_load_per_task;
}
return busiest;
out_balanced:
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+ if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
goto ret;
if (this == group_leader && group_leader != group_min) {
@@ -2656,7 +2727,7 @@ ret:
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
static struct rq *
-find_busiest_queue(struct sched_group *group, enum idle_type idle,
+find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
unsigned long imbalance, cpumask_t *cpus)
{
struct rq *busiest = NULL, *rq;
@@ -2664,17 +2735,19 @@ find_busiest_queue(struct sched_group *g
int i;
for_each_cpu_mask(i, group->cpumask) {
+ unsigned long wl;
if (!cpu_isset(i, *cpus))
continue;
rq = cpu_rq(i);
+ wl = weighted_cpuload(i);
- if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
+ if (rq->nr_running == 1 && wl > imbalance)
continue;
- if (rq->raw_weighted_load > max_load) {
- max_load = rq->raw_weighted_load;
+ if (wl > max_load) {
+ max_load = wl;
busiest = rq;
}
}
@@ -2688,20 +2761,15 @@ find_busiest_queue(struct sched_group *g
*/
#define MAX_PINNED_INTERVAL 512
-static inline unsigned long minus_1_or_zero(unsigned long n)
-{
- return n > 0 ? n - 1 : 0;
-}
-
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
static int load_balance(int this_cpu, struct rq *this_rq,
- struct sched_domain *sd, enum idle_type idle,
+ struct sched_domain *sd, enum cpu_idle_type idle,
int *balance)
{
- int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
+ int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
struct sched_group *group;
unsigned long imbalance;
struct rq *busiest;
@@ -2711,14 +2779,14 @@ static int load_balance(int this_cpu, st
/*
* When power savings policy is enabled for the parent domain, idle
* sibling can pick up load irrespective of busy siblings. In this case,
- * let the state of idle sibling percolate up as IDLE, instead of
- * portraying it as NOT_IDLE.
+ * let the state of idle sibling percolate up as CPU_IDLE, instead of
+ * portraying it as CPU_NOT_IDLE.
*/
- if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
+ if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
sd_idle = 1;
- schedstat_inc(sd, lb_cnt[idle]);
+ schedstat_inc(sd, lb_count[idle]);
redo:
group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
@@ -2742,18 +2810,17 @@ redo:
schedstat_add(sd, lb_imbalance[idle], imbalance);
- nr_moved = 0;
+ ld_moved = 0;
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
* an imbalance but busiest->nr_running <= 1, the group is
- * still unbalanced. nr_moved simply stays zero, so it is
+ * still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
local_irq_save(flags);
double_rq_lock(this_rq, busiest);
- nr_moved = move_tasks(this_rq, this_cpu, busiest,
- minus_1_or_zero(busiest->nr_running),
+ ld_moved = move_tasks(this_rq, this_cpu, busiest,
imbalance, sd, idle, &all_pinned);
double_rq_unlock(this_rq, busiest);
local_irq_restore(flags);
@@ -2761,7 +2828,7 @@ redo:
/*
* some other cpu did the load balance for us.
*/
- if (nr_moved && this_cpu != smp_processor_id())
+ if (ld_moved && this_cpu != smp_processor_id())
resched_cpu(this_cpu);
/* All tasks on this runqueue were pinned by CPU affinity */
@@ -2773,7 +2840,7 @@ redo:
}
}
- if (!nr_moved) {
+ if (!ld_moved) {
schedstat_inc(sd, lb_failed[idle]);
sd->nr_balance_failed++;
@@ -2822,10 +2889,10 @@ redo:
sd->balance_interval *= 2;
}
- if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+ if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1;
- return nr_moved;
+ return ld_moved;
out_balanced:
schedstat_inc(sd, lb_balanced[idle]);
@@ -2848,7 +2915,7 @@ out_one_pinned:
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*
- * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
+ * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
* this_rq is locked.
*/
static int
@@ -2857,68 +2924,71 @@ load_balance_newidle(int this_cpu, struc
struct sched_group *group;
struct rq *busiest = NULL;
unsigned long imbalance;
- int nr_moved = 0;
+ int ld_moved = 0;
int sd_idle = 0;
+ int all_pinned = 0;
cpumask_t cpus = CPU_MASK_ALL;
/*
* When power savings policy is enabled for the parent domain, idle
* sibling can pick up load irrespective of busy siblings. In this case,
* let the state of idle sibling percolate up as IDLE, instead of
- * portraying it as NOT_IDLE.
+ * portraying it as CPU_NOT_IDLE.
*/
if (sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
sd_idle = 1;
- schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
+ schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
redo:
- group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
+ group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
&sd_idle, &cpus, NULL);
if (!group) {
- schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
+ schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
goto out_balanced;
}
- busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance,
+ busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
&cpus);
if (!busiest) {
- schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
+ schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
goto out_balanced;
}
BUG_ON(busiest == this_rq);
- schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
+ schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
- nr_moved = 0;
+ ld_moved = 0;
if (busiest->nr_running > 1) {
/* Attempt to move tasks */
double_lock_balance(this_rq, busiest);
- nr_moved = move_tasks(this_rq, this_cpu, busiest,
- minus_1_or_zero(busiest->nr_running),
- imbalance, sd, NEWLY_IDLE, NULL);
+ /* this_rq->clock is already updated */
+ update_rq_clock(busiest);
+ ld_moved = move_tasks(this_rq, this_cpu, busiest,
+ imbalance, sd, CPU_NEWLY_IDLE,
+ &all_pinned);
spin_unlock(&busiest->lock);
- if (!nr_moved) {
+ if (unlikely(all_pinned)) {
cpu_clear(cpu_of(busiest), cpus);
if (!cpus_empty(cpus))
goto redo;
}
}
- if (!nr_moved) {
- schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+ if (!ld_moved) {
+ schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1;
} else
sd->nr_balance_failed = 0;
- return nr_moved;
+ return ld_moved;
out_balanced:
- schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+ schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1;
@@ -2934,8 +3004,8 @@ out_balanced:
static void idle_balance(int this_cpu, struct rq *this_rq)
{
struct sched_domain *sd;
- int pulled_task = 0;
- unsigned long next_balance = jiffies + 60 * HZ;
+ int pulled_task = -1;
+ unsigned long next_balance = jiffies + HZ;
for_each_domain(this_cpu, sd) {
unsigned long interval;
@@ -2954,12 +3024,13 @@ static void idle_balance(int this_cpu, s
if (pulled_task)
break;
}
- if (!pulled_task)
+ if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
/*
* We are going idle. next_balance may be set based on
* a busy processor. So reset next_balance.
*/
this_rq->next_balance = next_balance;
+ }
}
/*
@@ -2991,6 +3062,8 @@ static void active_load_balance(struct r
/* move a task from busiest_rq to target_rq */
double_lock_balance(busiest_rq, target_rq);
+ update_rq_clock(busiest_rq);
+ update_rq_clock(target_rq);
/* Search for an sd spanning us and the target CPU. */
for_each_domain(target_cpu, sd) {
@@ -3000,11 +3073,10 @@ static void active_load_balance(struct r
}
if (likely(sd)) {
- schedstat_inc(sd, alb_cnt);
+ schedstat_inc(sd, alb_count);
- if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
- RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE,
- NULL))
+ if (move_one_task(target_rq, target_cpu, busiest_rq,
+ sd, CPU_IDLE))
schedstat_inc(sd, alb_pushed);
else
schedstat_inc(sd, alb_failed);
@@ -3012,32 +3084,6 @@ static void active_load_balance(struct r
spin_unlock(&target_rq->lock);
}
-static void update_load(struct rq *this_rq)
-{
- unsigned long this_load;
- unsigned int i, scale;
-
- this_load = this_rq->raw_weighted_load;
-
- /* Update our load: */
- for (i = 0, scale = 1; i < 3; i++, scale += scale) {
- unsigned long old_load, new_load;
-
- /* scale is effectively 1 << i now, and >> i divides by scale */
-
- old_load = this_rq->cpu_load[i];
- new_load = this_load;
- /*
- * Round up the averaging division if load is increasing. This
- * prevents us from getting stuck on 9 if the load is 10, for
- * example.
- */
- if (new_load > old_load)
- new_load += scale-1;
- this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
- }
-}
-
#ifdef CONFIG_NO_HZ
static struct {
atomic_t load_balancer;
@@ -3120,7 +3166,7 @@ static DEFINE_SPINLOCK(balancing);
*
* Balancing parameters are set up in arch_init_sched_domains.
*/
-static inline void rebalance_domains(int cpu, enum idle_type idle)
+static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
int balance = 1;
struct rq *rq = cpu_rq(cpu);
@@ -3128,19 +3174,23 @@ static inline void rebalance_domains(int
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
+ int update_next_balance = 0;
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
interval = sd->balance_interval;
- if (idle != SCHED_IDLE)
+ if (idle != CPU_IDLE)
interval *= sd->busy_factor;
/* scale ms to jiffies */
interval = msecs_to_jiffies(interval);
if (unlikely(!interval))
interval = 1;
+ if (interval > HZ*NR_CPUS/10)
+ interval = HZ*NR_CPUS/10;
+
if (sd->flags & SD_SERIALIZE) {
if (!spin_trylock(&balancing))
@@ -3154,15 +3204,17 @@ static inline void rebalance_domains(int
* longer idle, or one of our SMT siblings is
* not idle.
*/
- idle = NOT_IDLE;
+ idle = CPU_NOT_IDLE;
}
sd->last_balance = jiffies;
}
if (sd->flags & SD_SERIALIZE)
spin_unlock(&balancing);
out:
- if (time_after(next_balance, sd->last_balance + interval))
+ if (time_after(next_balance, sd->last_balance + interval)) {
next_balance = sd->last_balance + interval;
+ update_next_balance = 1;
+ }
/*
* Stop the load balance at this level. There is another
@@ -3172,7 +3224,14 @@ out:
if (!balance)
break;
}
- rq->next_balance = next_balance;
+
+ /*
+ * next_balance will be updated only when there is a need.
+ * When the cpu is attached to null domain for ex, it will not be
+ * updated.
+ */
+ if (likely(update_next_balance))
+ rq->next_balance = next_balance;
}
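
rebalance_domains() above scales the per-domain interval by busy_factor when the CPU is not idle, converts it to jiffies, and now also clamps it to HZ*NR_CPUS/10. A standalone sketch of that computation, separate from the patch; HZ, NR_CPUS and the msecs_to_jiffies approximation are demo assumptions.

/* Sketch of the balance-interval computation above. */
#include <stdio.h>

#define HZ      250
#define NR_CPUS 8

static unsigned long msecs_to_jiffies_demo(unsigned long ms)
{
    return ms * HZ / 1000;              /* close enough for illustration */
}

static unsigned long balance_interval(unsigned long interval_ms,
                                      unsigned int busy_factor, int cpu_idle)
{
    unsigned long interval = interval_ms;

    if (!cpu_idle)
        interval *= busy_factor;        /* balance less often when busy */
    interval = msecs_to_jiffies_demo(interval);
    if (!interval)
        interval = 1;
    if (interval > HZ * NR_CPUS / 10)   /* new upper clamp in the patch */
        interval = HZ * NR_CPUS / 10;
    return interval;
}

int main(void)
{
    printf("idle: %lu jiffies, busy: %lu jiffies\n",
           balance_interval(64, 32, 1), balance_interval(64, 32, 0));
    return 0;
}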
/*
@@ -3182,11 +3241,12 @@ out:
*/
static void run_rebalance_domains(struct softirq_action *h)
{
- int local_cpu = smp_processor_id();
- struct rq *local_rq = cpu_rq(local_cpu);
- enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE;
+ int this_cpu = smp_processor_id();
+ struct rq *this_rq = cpu_rq(this_cpu);
+ enum cpu_idle_type idle = this_rq->idle_at_tick ?
+ CPU_IDLE : CPU_NOT_IDLE;
- rebalance_domains(local_cpu, idle);
+ rebalance_domains(this_cpu, idle);
#ifdef CONFIG_NO_HZ
/*
@@ -3194,13 +3254,13 @@ static void run_rebalance_domains(struct
* balancing on behalf of the other idle cpus whose ticks are
* stopped.
*/
- if (local_rq->idle_at_tick &&
- atomic_read(&nohz.load_balancer) == local_cpu) {
+ if (this_rq->idle_at_tick &&
+ atomic_read(&nohz.load_balancer) == this_cpu) {
cpumask_t cpus = nohz.cpu_mask;
struct rq *rq;
int balance_cpu;
- cpu_clear(local_cpu, cpus);
+ cpu_clear(this_cpu, cpus);
for_each_cpu_mask(balance_cpu, cpus) {
/*
* If this cpu gets work to do, stop the load balancing
@@ -3210,11 +3270,11 @@ static void run_rebalance_domains(struct
if (need_resched())
break;
- rebalance_domains(balance_cpu, SCHED_IDLE);
+ rebalance_domains(balance_cpu, CPU_IDLE);
rq = cpu_rq(balance_cpu);
- if (time_after(local_rq->next_balance, rq->next_balance))
- local_rq->next_balance = rq->next_balance;
+ if (time_after(this_rq->next_balance, rq->next_balance))
+ this_rq->next_balance = rq->next_balance;
}
}
#endif
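
The NO_HZ branch above has the designated idle load balancer walk nohz.cpu_mask, minus itself, and run rebalance_domains() for each tickless CPU. A standalone sketch of that mask walk, separate from the patch; the 8-bit mask and the CPU numbers are invented.

/* Sketch of "one CPU balances on behalf of the tickless idle CPUs". */
#include <stdio.h>

#define NR_CPUS_DEMO 8

int main(void)
{
    unsigned int nohz_mask = 0xB4;      /* idle CPUs 2, 4, 5, 7 (invented) */
    int this_cpu = 2;                   /* we are the designated balancer */
    int cpu;

    nohz_mask &= ~(1u << this_cpu);     /* cpu_clear(this_cpu, cpus) */
    for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++) {
        if (!(nohz_mask & (1u << cpu)))
            continue;
        printf("rebalance_domains(%d, CPU_IDLE)\n", cpu);
    }
    return 0;
}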
@@ -3227,9 +3287,8 @@ static void run_rebalance_domains(struct
* idle load balancing owner or decide to stop the periodic load balancing,
* if the whole system is idle.
*/
-static inline void trigger_load_balance(int cpu)
+static inline void trigger_load_balance(struct rq *rq, int cpu)
{
- struct rq *rq = cpu_rq(cpu);
#ifdef CONFIG_NO_HZ
/*
* If we were in the nohz mode recently and busy at the current
@@ -3281,13 +3340,16 @@ static inline void trigger_load_balance(
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
}
-#else
+
+#else /* CONFIG_SMP */
+
/*
* on UP we do not need to balance between CPUs:
*/
static inline void idle_balance(int cpu, struct rq *rq)
{
}
+
#endif
DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -3295,57 +3357,31 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
EXPORT_PER_CPU_SYMBOL(kstat);
/*
- * This is called on clock ticks and on context switches.
- * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ * Return p->sum_exec_runtime plus any more ns on the sched_clock
+ * that have not yet been banked in case the task is currently running.
*/
-static inline void
-update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
+unsigned long long task_sched_runtime(struct task_struct *p)
{
- p->sched_time += now - p->last_ran;
- p->last_ran = rq->most_recent_timestamp = now;
-}
-
-/*
- * Return current->sched_time plus any more ns on the sched_clock
- * that have not yet been banked.
- */
-unsigned long long current_sched_time(const struct task_struct *p)
-{
- unsigned long long ns;
- unsigned long flags;
-
- local_irq_save(flags);
- ns = p->sched_time + sched_clock() - p->last_ran;
- local_irq_restore(flags);
-
- return ns;
-}
-
-/*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks. We also ignore the interactivity
- * if a better static_prio task has expired:
- */
-static inline int expired_starving(struct rq *rq)
-{
- if (rq->curr->static_prio > rq->best_expired_prio)
- return 1;
- if (!STARVATION_LIMIT || !rq->expired_timestamp)
- return 0;
- if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
- return 1;
- return 0;
+ unsigned long flags;
+ u64 ns, delta_exec;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &flags);
+ ns = p->se.sum_exec_runtime;
+ if (rq->curr == p) {
+ update_rq_clock(rq);
+ delta_exec = rq->clock - p->se.exec_start;
+ if ((s64)delta_exec > 0)
+ ns += delta_exec;
+ }
+ task_rq_unlock(rq, &flags);
+
+ return ns;
}
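
task_sched_runtime() returns the banked runtime plus, if the task is currently on a CPU, the nanoseconds of the running slice that have not been banked yet. A standalone userspace sketch of the same idea, separate from the patch; clock_gettime(CLOCK_MONOTONIC) stands in for the scheduler clock and the rq locking is omitted.

/* Userspace sketch of "banked + not-yet-banked" runtime. */
#define _POSIX_C_SOURCE 199309L
#include <stdio.h>
#include <time.h>

static unsigned long long now_ns(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

struct fake_task {
    unsigned long long sum_exec_runtime; /* banked nanoseconds */
    unsigned long long exec_start;       /* when the current slice began */
    int currently_running;
};

static unsigned long long task_runtime(const struct fake_task *p)
{
    unsigned long long ns = p->sum_exec_runtime;

    if (p->currently_running) {
        long long delta = (long long)(now_ns() - p->exec_start);

        if (delta > 0)
            ns += delta;
    }
    return ns;
}

int main(void)
{
    struct fake_task t = { 5000000ULL, now_ns(), 1 };

    printf("runtime so far: %llu ns\n", task_runtime(&t));
    return 0;
}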
/*
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
- * @hardirq_offset: the offset to subtract from hardirq_count()
* @cputime: the cpu time spent in user space since the last update
*/
void account_user_time(struct task_struct *p, cputime_t cputime)
@@ -3364,6 +3400,35 @@ void account_user_time(struct task_struc
}
/*
+ * Account guest cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in virtual machine since the last update
+ */
+static void account_guest_time(struct task_struct *p, cputime_t cputime)
+{
+ cputime64_t tmp;
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+ tmp = cputime_to_cputime64(cputime);
+
+ p->utime = cputime_add(p->utime, cputime);
+ p->gtime = cputime_add(p->gtime, cputime);
+
+ cpustat->user = cputime64_add(cpustat->user, tmp);
+ cpustat->guest = cputime64_add(cpustat->guest, tmp);
+}
+
+/*
+ * Account scaled user cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in user space since the last update
+ */
+void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
+{
+ p->utimescaled = cputime_add(p->utimescaled, cputime);
+}
+
+/*
* Account system cpu time to a process.
* @p: the process that the cpu time gets accounted to
* @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3376,6 +3441,9 @@ void account_system_time(struct task_str
struct rq *rq = this_rq();
cputime64_t tmp;
+ if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
+ return account_guest_time(p, cputime);
+
p->stime = cputime_add(p->stime, cputime);
/* Add system time to cpustat. */
@@ -3395,6 +3463,17 @@ void account_system_time(struct task_str
}
/*
+ * Account scaled system cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in kernel space since the last update
+ */
+void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
+{
+ p->stimescaled = cputime_add(p->stimescaled, cputime);
+}
+
+/*
* Account for involuntary wait time.
* @p: the process from which the cpu time has been stolen
* @steal: the cpu time spent in involuntary wait
@@ -3415,81 +3494,6 @@ void account_steal_time(struct task_stru
cpustat->steal = cputime64_add(cpustat->steal, tmp);
}
-static void task_running_tick(struct rq *rq, struct task_struct *p)
-{
- if (p->array != rq->active) {
- /* Task has expired but was not scheduled yet */
- set_tsk_need_resched(p);
- return;
- }
- spin_lock(&rq->lock);
- /*
- * The task was running during this tick - update the
- * time slice counter. Note: we do not update a thread's
- * priority until it either goes to sleep or uses up its
- * timeslice. This makes it possible for interactive tasks
- * to use up their timeslices at their highest priority levels.
- */
- if (rt_task(p)) {
- /*
- * RR tasks need a special form of timeslice management.
- * FIFO tasks have no timeslices.
- */
- if ((p->policy == SCHED_RR) && !--p->time_slice) {
- p->time_slice = task_timeslice(p);
- p->first_time_slice = 0;
- set_tsk_need_resched(p);
-
- /* put it at the end of the queue: */
- requeue_task(p, rq->active);
- }
- goto out_unlock;
- }
- if (!--p->time_slice) {
- dequeue_task(p, rq->active);
- set_tsk_need_resched(p);
- p->prio = effective_prio(p);
- p->time_slice = task_timeslice(p);
- p->first_time_slice = 0;
-
- if (!rq->expired_timestamp)
- rq->expired_timestamp = jiffies;
- if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
- enqueue_task(p, rq->expired);
- if (p->static_prio < rq->best_expired_prio)
- rq->best_expired_prio = p->static_prio;
- } else
- enqueue_task(p, rq->active);
- } else {
- /*
- * Prevent a too long timeslice allowing a task to monopolize
- * the CPU. We do this by splitting up the timeslice into
- * smaller pieces.
- *
- * Note: this does not mean the task's timeslices expire or
- * get lost in any way, they just might be preempted by
- * another task of equal priority. (one with higher
- * priority would have preempted this task already.) We
- * requeue this task to the end of the list on this priority
- * level, which is in essence a round-robin of tasks with
- * equal priority.
- *
- * This only applies to tasks in the interactive
- * delta range with at least TIMESLICE_GRANULARITY to requeue.
- */
- if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
- p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
- (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
- (p->array == rq->active)) {
-
- requeue_task(p, rq->active);
- set_tsk_need_resched(p);
- }
- }
-out_unlock:
- spin_unlock(&rq->lock);
-}
-
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -3499,20 +3503,27 @@ out_unlock:
*/
void scheduler_tick(void)
{
- unsigned long long now = sched_clock();
- struct task_struct *p = current;
int cpu = smp_processor_id();
- int idle_at_tick = idle_cpu(cpu);
struct rq *rq = cpu_rq(cpu);
+ struct task_struct *curr = rq->curr;
+ u64 next_tick = rq->tick_timestamp + TICK_NSEC;
- update_cpu_clock(p, rq, now);
+ spin_lock(&rq->lock);
+ __update_rq_clock(rq);
+ /*
+ * Let rq->clock advance by at least TICK_NSEC:
+ */
+ if (unlikely(rq->clock < next_tick))
+ rq->clock = next_tick;
+ rq->tick_timestamp = rq->clock;
+ update_cpu_load(rq);
+ if (curr != rq->idle) /* FIXME: needed? */
+ curr->sched_class->task_tick(rq, curr);
+ spin_unlock(&rq->lock);
- if (!idle_at_tick)
- task_running_tick(rq, p);
#ifdef CONFIG_SMP
- update_load(rq);
- rq->idle_at_tick = idle_at_tick;
- trigger_load_balance(cpu);
+ rq->idle_at_tick = idle_cpu(cpu);
+ trigger_load_balance(rq, cpu);
#endif
}
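
The tick path above forces rq->clock to advance by at least TICK_NSEC per tick, so a slow or jittery sched_clock() cannot make a tick appear shorter than it was. A standalone sketch of that clamp, separate from the patch; the 1 ms TICK_NSEC and the sample clock values are invented.

/* Sketch of the "advance rq->clock by at least TICK_NSEC" step above. */
#include <stdio.h>

#define TICK_NSEC 1000000ULL    /* demo value: 1 ms tick */

static unsigned long long clamp_tick(unsigned long long clock,
                                     unsigned long long tick_timestamp)
{
    unsigned long long next_tick = tick_timestamp + TICK_NSEC;

    if (clock < next_tick)      /* clock ran slow: force a full tick forward */
        clock = next_tick;
    return clock;
}

int main(void)
{
    /* only 0.4 ms elapsed since the last tick -> pushed to 1 ms */
    printf("%llu\n", clamp_tick(400000ULL, 0ULL));
    /* already past the next tick -> left alone */
    printf("%llu\n", clamp_tick(1500000ULL, 0ULL));
    return 0;
}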
@@ -3554,170 +3565,145 @@ EXPORT_SYMBOL(sub_preempt_count);
#endif
-static inline int interactive_sleep(enum sleep_type sleep_type)
+/*
+ * Print scheduling while atomic bug:
+ */
+static noinline void __schedule_bug(struct task_struct *prev)
{
- return (sleep_type == SLEEP_INTERACTIVE ||
- sleep_type == SLEEP_INTERRUPTED);
+ struct pt_regs *regs = get_irq_regs();
+
+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
+ prev->comm, prev->pid, preempt_count());
+
+ debug_show_held_locks(prev);
+ if (irqs_disabled())
+ print_irqtrace_events(prev);
+
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
}
/*
- * schedule() is the main scheduler function.
+ * Various schedule()-time debugging checks and statistics:
*/
-asmlinkage void __sched schedule(void)
+static inline void schedule_debug(struct task_struct *prev)
{
- struct task_struct *prev, *next;
- struct prio_array *array;
- struct list_head *queue;
- unsigned long long now;
- unsigned long run_time;
- int cpu, idx, new_prio;
- long *switch_count;
- struct rq *rq;
-
/*
* Test if we are atomic. Since do_exit() needs to call into
* schedule() atomically, we ignore that path for now.
* Otherwise, whine if we are scheduling when we should not be.
*/
- if (unlikely(in_atomic() && !current->exit_state)) {
- printk(KERN_ERR "BUG: scheduling while atomic: "
- "%s/0x%08x/%d\n",
- current->comm, preempt_count(), current->pid);
- debug_show_held_locks(current);
- if (irqs_disabled())
- print_irqtrace_events(current);
- dump_stack();
- }
+ if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
+ __schedule_bug(prev);
+
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
-need_resched:
- preempt_disable();
- prev = current;
- release_kernel_lock(prev);
-need_resched_nonpreemptible:
- rq = this_rq();
+ schedstat_inc(this_rq(), sched_count);
+#ifdef CONFIG_SCHEDSTATS
+ if (unlikely(prev->lock_depth >= 0)) {
+ schedstat_inc(this_rq(), bkl_count);
+ schedstat_inc(prev, sched_info.bkl_count);
+ }
+#endif
+}
+
+/*
+ * Pick up the highest-prio task:
+ */
+static inline struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev)
+{
+ const struct sched_class *class;
+ struct task_struct *p;
/*
- * The idle thread is not allowed to schedule!
- * Remove this check after it has been exercised a bit.
+ * Optimization: we know that if all tasks are in
+ * the fair class we can call that function directly:
*/
- if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
- printk(KERN_ERR "bad: scheduling from the idle thread!\n");
- dump_stack();
+ if (likely(rq->nr_running == rq->cfs.nr_running)) {
+ p = fair_sched_class.pick_next_task(rq);
+ if (likely(p))
+ return p;
}
- schedstat_inc(rq, sched_cnt);
- now = sched_clock();
- if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
- run_time = now - prev->timestamp;
- if (unlikely((long long)(now - prev->timestamp) < 0))
- run_time = 0;
- } else
- run_time = NS_MAX_SLEEP_AVG;
+ class = sched_class_highest;
+ for ( ; ; ) {
+ p = class->pick_next_task(rq);
+ if (p)
+ return p;
+ /*
+ * Will never be NULL as the idle class always
+ * returns a non-NULL p:
+ */
+ class = class->next;
+ }
+}
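
pick_next_task() first tries the common case where every runnable task belongs to the fair class, and only otherwise walks the class list, relying on the idle class always returning something. A standalone sketch of that control flow, separate from the patch; the three demo classes and their pick functions are invented.

/* Userspace sketch of pick_next_task()'s fast path vs. class walk. */
#include <stdio.h>
#include <stddef.h>

struct class_demo {
    const char *name;
    const struct class_demo *next;
    const char *(*pick)(void);
};

static const char *rt_pick(void)   { return NULL; }        /* no RT task queued */
static const char *fair_pick(void) { return "fair-task"; }
static const char *idle_pick(void) { return "idle-task"; } /* never NULL */

static const struct class_demo idle_c = { "idle", NULL,    idle_pick };
static const struct class_demo fair_c = { "fair", &idle_c, fair_pick };
static const struct class_demo rt_c   = { "rt",   &fair_c, rt_pick };

static const char *pick_next(int nr_running, int cfs_nr_running)
{
    const struct class_demo *class;
    const char *p;

    /* fast path: everything runnable is in the fair class */
    if (nr_running == cfs_nr_running) {
        p = fair_pick();
        if (p)
            return p;
    }
    for (class = &rt_c; ; class = class->next) {
        p = class->pick();
        if (p)
            return p;   /* the idle class guarantees termination */
    }
}

int main(void)
{
    printf("%s\n", pick_next(3, 3));    /* fast path */
    printf("%s\n", pick_next(3, 2));    /* full class walk */
    return 0;
}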
+
+/*
+ * schedule() is the main scheduler function.
+ */
+asmlinkage void __sched schedule(void)
+{
+ struct task_struct *prev, *next;
+ long *switch_count;
+ struct rq *rq;
+ int cpu;
+
+need_resched:
+ preempt_disable();
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+ rcu_qsctr_inc(cpu);
+ prev = rq->curr;
+ switch_count = &prev->nivcsw;
+
+ release_kernel_lock(prev);
+need_resched_nonpreemptible:
+
+ schedule_debug(prev);
/*
- * Tasks charged proportionately less run_time at high sleep_avg to
- * delay them losing their interactive status
+ * Do the rq-clock update outside the rq lock:
*/
- run_time /= (CURRENT_BONUS(prev) ? : 1);
-
- spin_lock_irq(&rq->lock);
+ local_irq_disable();
+ __update_rq_clock(rq);
+ spin_lock(&rq->lock);
+ clear_tsk_need_resched(prev);
- switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
- switch_count = &prev->nvcsw;
if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
- unlikely(signal_pending(prev))))
+ unlikely(signal_pending(prev)))) {
prev->state = TASK_RUNNING;
- else {
- if (prev->state == TASK_UNINTERRUPTIBLE)
- rq->nr_uninterruptible++;
- deactivate_task(prev, rq);
+ } else {
+ deactivate_task(rq, prev, 1);
}
+ switch_count = &prev->nvcsw;
}
- cpu = smp_processor_id();
- if (unlikely(!rq->nr_running)) {
+ if (unlikely(!rq->nr_running))
idle_balance(cpu, rq);
- if (!rq->nr_running) {
- next = rq->idle;
- rq->expired_timestamp = 0;
- goto switch_tasks;
- }
- }
- array = rq->active;
- if (unlikely(!array->nr_active)) {
- /*
- * Switch the active and expired arrays.
- */
- schedstat_inc(rq, sched_switch);
- rq->active = rq->expired;
- rq->expired = array;
- array = rq->active;
- rq->expired_timestamp = 0;
- rq->best_expired_prio = MAX_PRIO;
- }
-
- idx = sched_find_first_bit(array->bitmap);
- queue = array->queue + idx;
- next = list_entry(queue->next, struct task_struct, run_list);
-
- if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
- unsigned long long delta = now - next->timestamp;
- if (unlikely((long long)(now - next->timestamp) < 0))
- delta = 0;
-
- if (next->sleep_type == SLEEP_INTERACTIVE)
- delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
-
- array = next->array;
- new_prio = recalc_task_prio(next, next->timestamp + delta);
-
- if (unlikely(next->prio != new_prio)) {
- dequeue_task(next, array);
- next->prio = new_prio;
- enqueue_task(next, array);
- }
- }
- next->sleep_type = SLEEP_NORMAL;
-switch_tasks:
- if (next == rq->idle)
- schedstat_inc(rq, sched_goidle);
- prefetch(next);
- prefetch_stack(next);
- clear_tsk_need_resched(prev);
- rcu_qsctr_inc(task_cpu(prev));
-
- update_cpu_clock(prev, rq, now);
-
- prev->sleep_avg -= run_time;
- if ((long)prev->sleep_avg <= 0)
- prev->sleep_avg = 0;
- prev->timestamp = prev->last_ran = now;
+ prev->sched_class->put_prev_task(rq, prev);
+ next = pick_next_task(rq, prev);
sched_info_switch(prev, next);
+
if (likely(prev != next)) {
- next->timestamp = next->last_ran = now;
rq->nr_switches++;
rq->curr = next;
++*switch_count;
- prepare_task_switch(rq, next);
- prev = context_switch(rq, prev, next);
- barrier();
- /*
- * this_rq must be evaluated again because prev may have moved
- * CPUs since it called schedule(), thus the 'rq' on its stack
- * frame will be invalid.
- */
- finish_task_switch(this_rq(), prev);
+ context_switch(rq, prev, next); /* unlocks the rq */
} else
spin_unlock_irq(&rq->lock);
- prev = current;
- if (unlikely(reacquire_kernel_lock(prev) < 0))
+ if (unlikely(reacquire_kernel_lock(current) < 0)) {
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
goto need_resched_nonpreemptible;
+ }
preempt_enable_no_resched();
if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
goto need_resched;
@@ -3744,27 +3730,30 @@ asmlinkage void __sched preempt_schedule
if (likely(ti->preempt_count || irqs_disabled()))
return;
-need_resched:
- add_preempt_count(PREEMPT_ACTIVE);
- /*
- * We keep the big kernel semaphore locked, but we
- * clear ->lock_depth so that schedule() doesnt
- * auto-release the semaphore:
- */
+ do {
+ add_preempt_count(PREEMPT_ACTIVE);
+
+ /*
+ * We keep the big kernel semaphore locked, but we
+ * clear ->lock_depth so that schedule() doesnt
+ * auto-release the semaphore:
+ */
#ifdef CONFIG_PREEMPT_BKL
- saved_lock_depth = task->lock_depth;
- task->lock_depth = -1;
+ saved_lock_depth = task->lock_depth;
+ task->lock_depth = -1;
#endif
- schedule();
+ schedule();
#ifdef CONFIG_PREEMPT_BKL
- task->lock_depth = saved_lock_depth;
+ task->lock_depth = saved_lock_depth;
#endif
- sub_preempt_count(PREEMPT_ACTIVE);
+ sub_preempt_count(PREEMPT_ACTIVE);
- /* we could miss a preemption opportunity between schedule and now */
- barrier();
- if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
- goto need_resched;
+ /*
+ * Check again in case we missed a preemption opportunity
+ * between schedule and now.
+ */
+ barrier();
+ } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
}
EXPORT_SYMBOL(preempt_schedule);
@@ -3784,29 +3773,32 @@ asmlinkage void __sched preempt_schedule
/* Catch callers which need to be fixed */
BUG_ON(ti->preempt_count || !irqs_disabled());
-need_resched:
- add_preempt_count(PREEMPT_ACTIVE);
- /*
- * We keep the big kernel semaphore locked, but we
- * clear ->lock_depth so that schedule() doesnt
- * auto-release the semaphore:
- */
+ do {
+ add_preempt_count(PREEMPT_ACTIVE);
+
+ /*
+ * We keep the big kernel semaphore locked, but we
+ * clear ->lock_depth so that schedule() doesnt
+ * auto-release the semaphore:
+ */
#ifdef CONFIG_PREEMPT_BKL
- saved_lock_depth = task->lock_depth;
- task->lock_depth = -1;
+ saved_lock_depth = task->lock_depth;
+ task->lock_depth = -1;
#endif
- local_irq_enable();
- schedule();
- local_irq_disable();
+ local_irq_enable();
+ schedule();
+ local_irq_disable();
#ifdef CONFIG_PREEMPT_BKL
- task->lock_depth = saved_lock_depth;
+ task->lock_depth = saved_lock_depth;
#endif
- sub_preempt_count(PREEMPT_ACTIVE);
+ sub_preempt_count(PREEMPT_ACTIVE);
- /* we could miss a preemption opportunity between schedule and now */
- barrier();
- if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
- goto need_resched;
+ /*
+ * Check again in case we missed a preemption opportunity
+ * between schedule and now.
+ */
+ barrier();
+ } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
}
#endif /* CONFIG_PREEMPT */
@@ -3830,10 +3822,9 @@ EXPORT_SYMBOL(default_wake_function);
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, int sync, void *key)
{
- struct list_head *tmp, *next;
+ wait_queue_t *curr, *next;
- list_for_each_safe(tmp, next, &q->task_list) {
- wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
+ list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
unsigned flags = curr->flags;
if (curr->func(curr, mode, sync, key) &&
@@ -3899,7 +3890,7 @@ __wake_up_sync(wait_queue_head_t *q, uns
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
-void fastcall complete(struct completion *x)
+void complete(struct completion *x)
{
unsigned long flags;
@@ -3911,7 +3902,7 @@ void fastcall complete(struct completion
}
EXPORT_SYMBOL(complete);
-void fastcall complete_all(struct completion *x)
+void complete_all(struct completion *x)
{
unsigned long flags;
@@ -3923,196 +3914,120 @@ void fastcall complete_all(struct comple
}
EXPORT_SYMBOL(complete_all);
-void fastcall __sched wait_for_completion(struct completion *x)
-{
- might_sleep();
-
- spin_lock_irq(&x->wait.lock);
- if (!x->done) {
- DECLARE_WAITQUEUE(wait, current);
-
- wait.flags |= WQ_FLAG_EXCLUSIVE;
- __add_wait_queue_tail(&x->wait, &wait);
- do {
- __set_current_state(TASK_UNINTERRUPTIBLE);
- spin_unlock_irq(&x->wait.lock);
- schedule();
- spin_lock_irq(&x->wait.lock);
- } while (!x->done);
- __remove_wait_queue(&x->wait, &wait);
- }
- x->done--;
- spin_unlock_irq(&x->wait.lock);
-}
-EXPORT_SYMBOL(wait_for_completion);
-
-unsigned long fastcall __sched
-wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+static inline long __sched
+do_wait_for_common(struct completion *x, long timeout, int state)
{
- might_sleep();
-
- spin_lock_irq(&x->wait.lock);
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue_tail(&x->wait, &wait);
do {
- __set_current_state(TASK_UNINTERRUPTIBLE);
+ if (state == TASK_INTERRUPTIBLE &&
+ signal_pending(current)) {
+ __remove_wait_queue(&x->wait, &wait);
+ return -ERESTARTSYS;
+ }
+ __set_current_state(state);
spin_unlock_irq(&x->wait.lock);
timeout = schedule_timeout(timeout);
spin_lock_irq(&x->wait.lock);
if (!timeout) {
__remove_wait_queue(&x->wait, &wait);
- goto out;
+ return timeout;
}
} while (!x->done);
__remove_wait_queue(&x->wait, &wait);
}
x->done--;
-out:
- spin_unlock_irq(&x->wait.lock);
return timeout;
}
-EXPORT_SYMBOL(wait_for_completion_timeout);
-int fastcall __sched wait_for_completion_interruptible(struct completion *x)
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
{
- int ret = 0;
-
might_sleep();
spin_lock_irq(&x->wait.lock);
- if (!x->done) {
- DECLARE_WAITQUEUE(wait, current);
-
- wait.flags |= WQ_FLAG_EXCLUSIVE;
- __add_wait_queue_tail(&x->wait, &wait);
- do {
- if (signal_pending(current)) {
- ret = -ERESTARTSYS;
- __remove_wait_queue(&x->wait, &wait);
- goto out;
- }
- __set_current_state(TASK_INTERRUPTIBLE);
- spin_unlock_irq(&x->wait.lock);
- schedule();
- spin_lock_irq(&x->wait.lock);
- } while (!x->done);
- __remove_wait_queue(&x->wait, &wait);
- }
- x->done--;
-out:
+ timeout = do_wait_for_common(x, timeout, state);
spin_unlock_irq(&x->wait.lock);
-
- return ret;
+ return timeout;
}
-EXPORT_SYMBOL(wait_for_completion_interruptible);
-unsigned long fastcall __sched
-wait_for_completion_interruptible_timeout(struct completion *x,
- unsigned long timeout)
+void __sched wait_for_completion(struct completion *x)
{
- might_sleep();
+ wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion);
- spin_lock_irq(&x->wait.lock);
- if (!x->done) {
- DECLARE_WAITQUEUE(wait, current);
+unsigned long __sched
+wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_timeout);
- wait.flags |= WQ_FLAG_EXCLUSIVE;
- __add_wait_queue_tail(&x->wait, &wait);
- do {
- if (signal_pending(current)) {
- timeout = -ERESTARTSYS;
- __remove_wait_queue(&x->wait, &wait);
- goto out;
- }
- __set_current_state(TASK_INTERRUPTIBLE);
- spin_unlock_irq(&x->wait.lock);
- timeout = schedule_timeout(timeout);
- spin_lock_irq(&x->wait.lock);
- if (!timeout) {
- __remove_wait_queue(&x->wait, &wait);
- goto out;
- }
- } while (!x->done);
- __remove_wait_queue(&x->wait, &wait);
- }
- x->done--;
-out:
- spin_unlock_irq(&x->wait.lock);
- return timeout;
+int __sched wait_for_completion_interruptible(struct completion *x)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible);
+
+unsigned long __sched
+wait_for_completion_interruptible_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
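
The rewrite above folds the four wait_for_completion variants into one helper parameterized by task state and timeout, with three outcomes: -ERESTARTSYS for a pending signal during an interruptible wait, 0 on timeout, or the remaining timeout on success. A very rough standalone model of just that control flow, separate from the patch; the simulated completion, the fake scheduling step and the constants are all invented, and the wait-queue locking is omitted.

/* Toy model of the consolidated do_wait_for_common() return values. */
#include <stdio.h>

#define TASK_UNINTERRUPTIBLE 0
#define TASK_INTERRUPTIBLE   1
#define ERESTARTSYS          512

struct wait_sim {
    int done;            /* completion already signalled? */
    int signal_pending;  /* only matters for interruptible waits */
};

static long do_wait_for_common_demo(struct wait_sim *x, long timeout, int state)
{
    while (!x->done) {
        if (state == TASK_INTERRUPTIBLE && x->signal_pending)
            return -ERESTARTSYS;
        if (--timeout == 0)
            return 0;            /* timed out */
        x->done = 1;             /* pretend the completer ran meanwhile */
    }
    x->done--;
    return timeout;
}

int main(void)
{
    struct wait_sim a = { 0, 0 }, b = { 0, 1 };

    printf("plain wait -> %ld\n",
           do_wait_for_common_demo(&a, 10, TASK_UNINTERRUPTIBLE));
    printf("interruptible, signal pending -> %ld\n",
           do_wait_for_common_demo(&b, 10, TASK_INTERRUPTIBLE));
    return 0;
}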
+static long __sched
+sleep_on_common(wait_queue_head_t *q, int state, long timeout)
+{
+ unsigned long flags;
+ wait_queue_t wait;
-#define SLEEP_ON_VAR \
- unsigned long flags; \
- wait_queue_t wait; \
init_waitqueue_entry(&wait, current);
-#define SLEEP_ON_HEAD \
- spin_lock_irqsave(&q->lock,flags); \
- __add_wait_queue(q, &wait); \
- spin_unlock(&q->lock);
+ __set_current_state(state);
-#define SLEEP_ON_TAIL \
- spin_lock_irq(&q->lock); \
- __remove_wait_queue(q, &wait); \
+ spin_lock_irqsave(&q->lock, flags);
+ __add_wait_queue(q, &wait);
+ spin_unlock(&q->lock);
+ timeout = schedule_timeout(timeout);
+ spin_lock_irq(&q->lock);
+ __remove_wait_queue(q, &wait);
spin_unlock_irqrestore(&q->lock, flags);
-void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
-{
- SLEEP_ON_VAR
-
- current->state = TASK_INTERRUPTIBLE;
+ return timeout;
+}
- SLEEP_ON_HEAD
- schedule();
- SLEEP_ON_TAIL
+void __sched interruptible_sleep_on(wait_queue_head_t *q)
+{
+ sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(interruptible_sleep_on);
-long fastcall __sched
+long __sched
interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
- SLEEP_ON_VAR
-
- current->state = TASK_INTERRUPTIBLE;
-
- SLEEP_ON_HEAD
- timeout = schedule_timeout(timeout);
- SLEEP_ON_TAIL
-
- return timeout;
+ return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
}
EXPORT_SYMBOL(interruptible_sleep_on_timeout);
-void fastcall __sched sleep_on(wait_queue_head_t *q)
+void __sched sleep_on(wait_queue_head_t *q)
{
- SLEEP_ON_VAR
-
- current->state = TASK_UNINTERRUPTIBLE;
-
- SLEEP_ON_HEAD
- schedule();
- SLEEP_ON_TAIL
+ sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(sleep_on);
-long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
- SLEEP_ON_VAR
-
- current->state = TASK_UNINTERRUPTIBLE;
-
- SLEEP_ON_HEAD
- timeout = schedule_timeout(timeout);
- SLEEP_ON_TAIL
-
- return timeout;
+ return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
}
-
EXPORT_SYMBOL(sleep_on_timeout);
#ifdef CONFIG_RT_MUTEXES
@@ -4129,39 +4044,46 @@ EXPORT_SYMBOL(sleep_on_timeout);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- struct prio_array *array;
unsigned long flags;
+ int oldprio, on_rq, running;
struct rq *rq;
- int oldprio;
BUG_ON(prio < 0 || prio > MAX_PRIO);
rq = task_rq_lock(p, &flags);
+ update_rq_clock(rq);
oldprio = p->prio;
- array = p->array;
- if (array)
- dequeue_task(p, array);
+ on_rq = p->se.on_rq;
+ running = task_running(rq, p);
+ if (on_rq) {
+ dequeue_task(rq, p, 0);
+ if (running)
+ p->sched_class->put_prev_task(rq, p);
+ }
+
+ if (rt_prio(prio))
+ p->sched_class = &rt_sched_class;
+ else
+ p->sched_class = &fair_sched_class;
+
p->prio = prio;
- if (array) {
- /*
- * If changing to an RT priority then queue it
- * in the active array!
- */
- if (rt_task(p))
- array = rq->active;
- enqueue_task(p, array);
+ if (on_rq) {
+ if (running)
+ p->sched_class->set_curr_task(rq);
+ enqueue_task(rq, p, 0);
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
- if (task_running(rq, p)) {
+ if (running) {
if (p->prio > oldprio)
resched_task(rq->curr);
- } else if (TASK_PREEMPTS_CURR(p, rq))
- resched_task(rq->curr);
+ } else {
+ check_preempt_curr(rq, p);
+ }
}
task_rq_unlock(rq, &flags);
}
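
The resched decision at the end of rt_mutex_setprio() is: if the boosted or deboosted task is running, reschedule only when its priority got worse; if it is merely queued, ask whether it should now preempt the current task. A rough standalone model, separate from the patch; the second branch is a simplification of check_preempt_curr(), which the patch actually delegates to the scheduling class.

/* Toy model of the reschedule decision above (lower prio value = higher prio). */
#include <stdio.h>

static int needs_resched(int running, int old_prio, int new_prio, int curr_prio)
{
    if (running)
        return new_prio > old_prio;  /* we got demoted while running */
    return new_prio < curr_prio;     /* we now beat the current task */
}

int main(void)
{
    printf("%d\n", needs_resched(1, 120, 130, 120)); /* demoted: resched */
    printf("%d\n", needs_resched(0, 120, 100, 110)); /* preempts current */
    printf("%d\n", needs_resched(0, 120, 115, 110)); /* stays queued */
    return 0;
}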
@@ -4170,8 +4092,7 @@ void rt_mutex_setprio(struct task_struct
void set_user_nice(struct task_struct *p, long nice)
{
- struct prio_array *array;
- int old_prio, delta;
+ int old_prio, delta, on_rq;
unsigned long flags;
struct rq *rq;
@@ -4182,21 +4103,20 @@ void set_user_nice(struct task_struct *p
* the task might be in the middle of scheduling on another CPU.
*/
rq = task_rq_lock(p, &flags);
+ update_rq_clock(rq);
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
* it wont have any effect on scheduling until the task is
- * not SCHED_NORMAL/SCHED_BATCH:
+ * SCHED_FIFO/SCHED_RR:
*/
- if (has_rt_policy(p)) {
+ if (task_has_rt_policy(p)) {
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
- array = p->array;
- if (array) {
- dequeue_task(p, array);
- dec_raw_weighted_load(rq, p);
- }
+ on_rq = p->se.on_rq;
+ if (on_rq)
+ dequeue_task(rq, p, 0);
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
@@ -4204,9 +4124,8 @@ void set_user_nice(struct task_struct *p
p->prio = effective_prio(p);
delta = p->prio - old_prio;
- if (array) {
- enqueue_task(p, array);
- inc_raw_weighted_load(rq, p);
+ if (on_rq) {
+ enqueue_task(rq, p, 0);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -4320,26 +4239,34 @@ struct task_struct *idle_task(int cpu)
* find_process_by_pid - find a process with a matching PID value.
* @pid: the pid in question.
*/
-static inline struct task_struct *find_process_by_pid(pid_t pid)
+static struct task_struct *find_process_by_pid(pid_t pid)
{
- return pid ? find_task_by_pid(pid) : current;
+ return pid ? find_task_by_vpid(pid) : current;
}
/* Actually do priority change: must hold rq lock. */
-static void __setscheduler(struct task_struct *p, int policy, int prio)
+static void
+__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
{
- BUG_ON(p->array);
+ BUG_ON(p->se.on_rq);
p->policy = policy;
+ switch (p->policy) {
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_IDLE:
+ p->sched_class = &fair_sched_class;
+ break;
+ case SCHED_FIFO:
+ case SCHED_RR:
+ p->sched_class = &rt_sched_class;
+ break;
+ }
+
p->rt_priority = prio;
p->normal_prio = normal_prio(p);
/* we are holding p->pi_lock already */
p->prio = rt_mutex_getprio(p);
- /*
- * SCHED_BATCH tasks are treated as perpetual CPU hogs:
- */
- if (policy == SCHED_BATCH)
- p->sleep_avg = 0;
set_load_weight(p);
}
@@ -4354,8 +4281,7 @@ static void __setscheduler(struct task_s
int sched_setscheduler(struct task_struct *p, int policy,
struct sched_param *param)
{
- int retval, oldprio, oldpolicy = -1;
- struct prio_array *array;
+ int retval, oldprio, oldpolicy = -1, on_rq, running;
unsigned long flags;
struct rq *rq;
@@ -4366,27 +4292,27 @@ recheck:
if (policy < 0)
policy = oldpolicy = p->policy;
else if (policy != SCHED_FIFO && policy != SCHED_RR &&
- policy != SCHED_NORMAL && policy != SCHED_BATCH)
+ policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+ policy != SCHED_IDLE)
return -EINVAL;
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
- * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
- * SCHED_BATCH is 0.
+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
+ * SCHED_BATCH and SCHED_IDLE is 0.
*/
if (param->sched_priority < 0 ||
(p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
(!p->mm && param->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
- if (is_rt_policy(policy) != (param->sched_priority != 0))
+ if (rt_policy(policy) != (param->sched_priority != 0))
return -EINVAL;
/*
* Allow unprivileged RT tasks to decrease priority:
*/
if (!capable(CAP_SYS_NICE)) {
- if (is_rt_policy(policy)) {
+ if (rt_policy(policy)) {
unsigned long rlim_rtprio;
- unsigned long flags;
if (!lock_task_sighand(p, &flags))
return -ESRCH;
@@ -4402,6 +4328,12 @@ recheck:
param->sched_priority > rlim_rtprio)
return -EPERM;
}
+ /*
+ * Like positive nice levels, don't allow tasks to
+ * move out of SCHED_IDLE either:
+ */
+ if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
+ return -EPERM;
/* can't change other user's priorities */
if ((current->euid != p->euid) &&
@@ -4429,23 +4361,33 @@ recheck:
spin_unlock_irqrestore(&p->pi_lock, flags);
goto recheck;
}
- array = p->array;
- if (array)
- deactivate_task(p, rq);
+ update_rq_clock(rq);
+ on_rq = p->se.on_rq;
+ running = task_running(rq, p);
+ if (on_rq) {
+ deactivate_task(rq, p, 0);
+ if (running)
+ p->sched_class->put_prev_task(rq, p);
+ }
+
oldprio = p->prio;
- __setscheduler(p, policy, param->sched_priority);
- if (array) {
- __activate_task(p, rq);
+ __setscheduler(rq, p, policy, param->sched_priority);
+
+ if (on_rq) {
+ if (running)
+ p->sched_class->set_curr_task(rq);
+ activate_task(rq, p, 0);
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
- if (task_running(rq, p)) {
+ if (running) {
if (p->prio > oldprio)
resched_task(rq->curr);
- } else if (TASK_PREEMPTS_CURR(p, rq))
- resched_task(rq->curr);
+ } else {
+ check_preempt_curr(rq, p);
+ }
}
__task_rq_unlock(rq);
spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -4511,10 +4453,10 @@ asmlinkage long sys_sched_setparam(pid_t
asmlinkage long sys_sched_getscheduler(pid_t pid)
{
struct task_struct *p;
- int retval = -EINVAL;
+ int retval;
if (pid < 0)
- goto out_nounlock;
+ return -EINVAL;
retval = -ESRCH;
read_lock(&tasklist_lock);
@@ -4525,8 +4467,6 @@ asmlinkage long sys_sched_getscheduler(p
retval = p->policy;
}
read_unlock(&tasklist_lock);
-
-out_nounlock:
return retval;
}
@@ -4539,10 +4479,10 @@ asmlinkage long sys_sched_getparam(pid_t
{
struct sched_param lp;
struct task_struct *p;
- int retval = -EINVAL;
+ int retval;
if (!param || pid < 0)
- goto out_nounlock;
+ return -EINVAL;
read_lock(&tasklist_lock);
p = find_process_by_pid(pid);
@@ -4562,7 +4502,6 @@ asmlinkage long sys_sched_getparam(pid_t
*/
retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
-out_nounlock:
return retval;
out_unlock:
@@ -4605,8 +4544,21 @@ long sched_setaffinity(pid_t pid, cpumas
cpus_allowed = cpuset_cpus_allowed(p);
cpus_and(new_mask, new_mask, cpus_allowed);
+ again:
retval = set_cpus_allowed(p, new_mask);
+ if (!retval) {
+ cpus_allowed = cpuset_cpus_allowed(p);
+ if (!cpus_subset(new_mask, cpus_allowed)) {
+ /*
+ * We must have raced with a concurrent cpuset
+ * update. Just reset the cpus_allowed to the
+ * cpuset's cpus_allowed
+ */
+ new_mask = cpus_allowed;
+ goto again;
+ }
+ }
out_unlock:
put_task_struct(p);
mutex_unlock(&sched_hotcpu_mutex);
@@ -4683,10 +4635,8 @@ long sched_getaffinity(pid_t pid, cpumas
out_unlock:
read_unlock(&tasklist_lock);
mutex_unlock(&sched_hotcpu_mutex);
- if (retval)
- return retval;
- return 0;
+ return retval;
}
/**
@@ -4717,41 +4667,15 @@ asmlinkage long sys_sched_getaffinity(pi
/**
* sys_sched_yield - yield the current processor to other threads.
*
- * This function yields the current CPU by moving the calling thread
- * to the expired array. If there are no other threads running on this
- * CPU then this function will return.
+ * This function yields the current CPU to other tasks. If there are no
+ * other threads running on this CPU then this function will return.
*/
asmlinkage long sys_sched_yield(void)
{
struct rq *rq = this_rq_lock();
- struct prio_array *array = current->array, *target = rq->expired;
-
- schedstat_inc(rq, yld_cnt);
- /*
- * We implement yielding by moving the task into the expired
- * queue.
- *
- * (special rule: RT tasks will just roundrobin in the active
- * array.)
- */
- if (rt_task(current))
- target = rq->active;
- if (array->nr_active == 1) {
- schedstat_inc(rq, yld_act_empty);
- if (!rq->expired->nr_active)
- schedstat_inc(rq, yld_both_empty);
- } else if (!rq->expired->nr_active)
- schedstat_inc(rq, yld_exp_empty);
-
- if (array != target) {
- dequeue_task(current, array);
- enqueue_task(current, target);
- } else
- /*
- * requeue_task is cheaper so perform that if possible.
- */
- requeue_task(current, array);
+ schedstat_inc(rq, yld_count);
+ current->sched_class->yield_task(rq);
/*
* Since we are going to call schedule() anyway, there's
@@ -4902,6 +4826,7 @@ asmlinkage long sys_sched_get_priority_m
break;
case SCHED_NORMAL:
case SCHED_BATCH:
+ case SCHED_IDLE:
ret = 0;
break;
}
@@ -4926,6 +4851,7 @@ asmlinkage long sys_sched_get_priority_m
break;
case SCHED_NORMAL:
case SCHED_BATCH:
+ case SCHED_IDLE:
ret = 0;
}
return ret;
@@ -4943,11 +4869,12 @@ asmlinkage
long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
{
struct task_struct *p;
- int retval = -EINVAL;
+ unsigned int time_slice;
+ int retval;
struct timespec t;
if (pid < 0)
- goto out_nounlock;
+ return -EINVAL;
retval = -ESRCH;
read_lock(&tasklist_lock);
@@ -4959,12 +4886,24 @@ long sys_sched_rr_get_interval(pid_t pid
if (retval)
goto out_unlock;
- jiffies_to_timespec(p->policy == SCHED_FIFO ?
- 0 : task_timeslice(p), &t);
+ if (p->policy == SCHED_FIFO)
+ time_slice = 0;
+ else if (p->policy == SCHED_RR)
+ time_slice = DEF_TIMESLICE;
+ else {
+ struct sched_entity *se = &p->se;
+ unsigned long flags;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &flags);
+ time_slice = DEF_TIMESLICE;
+ task_rq_unlock(rq, &flags);
+ }
read_unlock(&tasklist_lock);
+ jiffies_to_timespec(time_slice, &t);
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
-out_nounlock:
return retval;
+
out_unlock:
read_unlock(&tasklist_lock);
return retval;
@@ -4978,18 +4917,18 @@ static void show_task(struct task_struct
unsigned state;
state = p->state ? __ffs(p->state) + 1 : 0;
- printk("%-13.13s %c", p->comm,
+ printk(KERN_INFO "%-13.13s %c", p->comm,
state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
-#if (BITS_PER_LONG == 32)
+#if BITS_PER_LONG == 32
if (state == TASK_RUNNING)
- printk(" running ");
+ printk(KERN_CONT " running ");
else
- printk(" %08lX ", thread_saved_pc(p));
+ printk(KERN_CONT " %08lx ", thread_saved_pc(p));
#else
if (state == TASK_RUNNING)
- printk(" running task ");
+ printk(KERN_CONT " running task ");
else
- printk(" %016lx ", thread_saved_pc(p));
+ printk(KERN_CONT " %016lx ", thread_saved_pc(p));
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
{
@@ -4999,11 +4938,8 @@ static void show_task(struct task_struct
free = (unsigned long)n - (unsigned long)end_of_stack(p);
}
#endif
- printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
- if (!p->mm)
- printk(" (L-TLB)\n");
- else
- printk(" (NOTLB)\n");
+ printk(KERN_CONT "%5lu %5d %6d\n", free,
+ task_pid_nr(p), task_pid_nr(p->parent));
if (state != TASK_RUNNING)
show_stack(p, NULL);
@@ -5013,14 +4949,12 @@ void show_state_filter(unsigned long sta
{
struct task_struct *g, *p;
-#if (BITS_PER_LONG == 32)
- printk("\n"
- " free sibling\n");
- printk(" task PC stack pid father child younger older\n");
+#if BITS_PER_LONG == 32
+ printk(KERN_INFO
+ " task PC stack pid father\n");
#else
- printk("\n"
- " free sibling\n");
- printk(" task PC stack pid father child younger older\n");
+ printk(KERN_INFO
+ " task PC stack pid father\n");
#endif
read_lock(&tasklist_lock);
do_each_thread(g, p) {
@@ -5035,6 +4969,9 @@ void show_state_filter(unsigned long sta
touch_all_softlockup_watchdogs();
+#ifdef CONFIG_SCHED_DEBUG
+ sysrq_sched_debug_show();
+#endif
read_unlock(&tasklist_lock);
/*
* Only show locks if all tasks are dumped:
@@ -5043,6 +4980,11 @@ void show_state_filter(unsigned long sta
debug_show_all_locks();
}
+void __cpuinit init_idle_bootup_task(struct task_struct *idle)
+{
+ idle->sched_class = &idle_sched_class;
+}
+
/**
* init_idle - set up an idle thread for a given CPU
* @idle: task in question
@@ -5056,13 +4998,12 @@ void __cpuinit init_idle(struct task_str
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- idle->timestamp = sched_clock();
- idle->sleep_avg = 0;
- idle->array = NULL;
+ __sched_fork(idle);
+ idle->se.exec_start = sched_clock();
+
idle->prio = idle->normal_prio = MAX_PRIO;
- idle->state = TASK_RUNNING;
idle->cpus_allowed = cpumask_of_cpu(cpu);
- set_task_cpu(idle, cpu);
+ __set_task_cpu(idle, cpu);
spin_lock_irqsave(&rq->lock, flags);
rq->curr = rq->idle = idle;
@@ -5077,6 +5018,10 @@ void __cpuinit init_idle(struct task_str
#else
task_thread_info(idle)->preempt_count = 0;
#endif
+ /*
+ * The idle tasks have their own, simple scheduling class:
+ */
+ idle->sched_class = &idle_sched_class;
}
/*
@@ -5088,6 +5033,32 @@ void __cpuinit init_idle(struct task_str
*/
cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
+/*
+ * Increase the granularity value when there are more CPUs,
+ * because with more CPUs the 'effective latency' as visible
+ * to users decreases. But the relationship is not linear,
+ * so pick a second-best guess by going with the log2 of the
+ * number of CPUs.
+ *
+ * This idea comes from the SD scheduler of Con Kolivas:
+ */
+static inline void sched_init_granularity(void)
+{
+ unsigned int factor = 1 + ilog2(num_online_cpus());
+ const unsigned long limit = 200000000;
+
+ sysctl_sched_min_granularity *= factor;
+ if (sysctl_sched_min_granularity > limit)
+ sysctl_sched_min_granularity = limit;
+
+ sysctl_sched_latency *= factor;
+ if (sysctl_sched_latency > limit)
+ sysctl_sched_latency = limit;
+
+ sysctl_sched_wakeup_granularity *= factor;
+ sysctl_sched_batch_wakeup_granularity *= factor;
+}
+
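+/*
+ * For illustration only: a worked example of the scaling above, assuming
+ * stock CFS defaults of roughly 4 ms sysctl_sched_min_granularity and
+ * 20 ms sysctl_sched_latency (the real defaults live in sched_fair.c and
+ * may differ):
+ *
+ *   factor = 1 + ilog2(num_online_cpus()), each result capped at 200 ms
+ *
+ *      1 CPU   -> factor  1: min_granularity ~4 ms,  latency ~20 ms
+ *      2 CPUs  -> factor  2: min_granularity ~8 ms,  latency ~40 ms
+ *      8 CPUs  -> factor  4: min_granularity ~16 ms, latency ~80 ms
+ *     64 CPUs  -> factor  7: min_granularity ~28 ms, latency ~140 ms
+ *   1024 CPUs  -> factor 11: min_granularity ~44 ms, latency capped at 200 ms
+ */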
#ifdef CONFIG_SMP
/*
* This is how migration works:
@@ -5161,7 +5132,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
struct rq *rq_dest, *rq_src;
- int ret = 0;
+ int ret = 0, on_rq;
if (unlikely(cpu_is_offline(dest_cpu)))
return ret;
@@ -5177,20 +5148,14 @@ static int __migrate_task(struct task_st
if (!cpu_isset(dest_cpu, p->cpus_allowed))
goto out;
+ on_rq = p->se.on_rq;
+ if (on_rq)
+ deactivate_task(rq_src, p, 0);
+
set_task_cpu(p, dest_cpu);
- if (p->array) {
- /*
- * Sync timestamp with rq_dest's before activating.
- * The same thing could be achieved by doing this step
- * afterwards, and pretending it was a local activate.
- * This way is cleaner and logically correct.
- */
- p->timestamp = p->timestamp - rq_src->most_recent_timestamp
- + rq_dest->most_recent_timestamp;
- deactivate_task(p, rq_src);
- __activate_task(p, rq_dest);
- if (TASK_PREEMPTS_CURR(p, rq_dest))
- resched_task(rq_dest->curr);
+ if (on_rq) {
+ activate_task(rq_dest, p, 0);
+ check_preempt_curr(rq_dest, p);
}
ret = 1;
out:
@@ -5262,8 +5227,19 @@ wait_to_die:
}
#ifdef CONFIG_HOTPLUG_CPU
+
+static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
+{
+ int ret;
+
+ local_irq_disable();
+ ret = __migrate_task(p, src_cpu, dest_cpu);
+ local_irq_enable();
+ return ret;
+}
+
/*
- * Figure out where task on dead CPU should go, use force if neccessary.
+ * Figure out where task on dead CPU should go, use force if necessary.
* NOTE: interrupts should be disabled by the caller
*/
static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
@@ -5273,35 +5249,42 @@ static void move_task_off_dead_cpu(int d
struct rq *rq;
int dest_cpu;
-restart:
- /* On same node? */
- mask = node_to_cpumask(cpu_to_node(dead_cpu));
- cpus_and(mask, mask, p->cpus_allowed);
- dest_cpu = any_online_cpu(mask);
-
- /* On any allowed CPU? */
- if (dest_cpu == NR_CPUS)
- dest_cpu = any_online_cpu(p->cpus_allowed);
-
- /* No more Mr. Nice Guy. */
- if (dest_cpu == NR_CPUS) {
- rq = task_rq_lock(p, &flags);
- cpus_setall(p->cpus_allowed);
- dest_cpu = any_online_cpu(p->cpus_allowed);
- task_rq_unlock(rq, &flags);
+ do {
+ /* On same node? */
+ mask = node_to_cpumask(cpu_to_node(dead_cpu));
+ cpus_and(mask, mask, p->cpus_allowed);
+ dest_cpu = any_online_cpu(mask);
+
+ /* On any allowed CPU? */
+ if (dest_cpu == NR_CPUS)
+ dest_cpu = any_online_cpu(p->cpus_allowed);
+
+ /* No more Mr. Nice Guy. */
+ if (dest_cpu == NR_CPUS) {
+ cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p);
+ /*
+ * Try to stay on the same cpuset, where the
+ * current cpuset may be a subset of all cpus.
+ * The cpuset_cpus_allowed_locked() variant of
+ * cpuset_cpus_allowed() will not block. It must be
+ * called within calls to cpuset_lock/cpuset_unlock.
+ */
+ rq = task_rq_lock(p, &flags);
+ p->cpus_allowed = cpus_allowed;
+ dest_cpu = any_online_cpu(p->cpus_allowed);
+ task_rq_unlock(rq, &flags);
- /*
- * Don't tell them about moving exiting tasks or
- * kernel threads (both mm NULL), since they never
- * leave kernel.
- */
- if (p->mm && printk_ratelimit())
- printk(KERN_INFO "process %d (%s) no "
- "longer affine to cpu%d\n",
- p->pid, p->comm, dead_cpu);
- }
- if (!__migrate_task(p, dead_cpu, dest_cpu))
- goto restart;
+ /*
+ * Don't tell them about moving exiting tasks or
+ * kernel threads (both mm NULL), since they never
+ * leave kernel.
+ */
+ if (p->mm && printk_ratelimit())
+ printk(KERN_INFO "process %d (%s) no "
+ "longer affine to cpu%d\n",
+ task_pid_nr(p), p->comm, dead_cpu);
+ }
+ } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
}
/*
@@ -5329,7 +5312,7 @@ static void migrate_live_tasks(int src_c
{
struct task_struct *p, *t;
- write_lock_irq(&tasklist_lock);
+ read_lock(&tasklist_lock);
do_each_thread(t, p) {
if (p == current)
@@ -5339,12 +5322,13 @@ static void migrate_live_tasks(int src_c
move_task_off_dead_cpu(src_cpu, p);
} while_each_thread(t, p);
- write_unlock_irq(&tasklist_lock);
+ read_unlock(&tasklist_lock);
}
-/* Schedules idle task to be the next runnable task on current CPU.
- * It does so by boosting its priority to highest possible and adding it to
- * the _front_ of the runqueue. Used by CPU offline code.
+/*
+ * Schedules idle task to be the next runnable task on current CPU.
+ * It does so by boosting its priority to highest possible.
+ * Used by CPU offline code.
*/
void sched_idle_next(void)
{
@@ -5362,10 +5346,10 @@ void sched_idle_next(void)
*/
spin_lock_irqsave(&rq->lock, flags);
- __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
+ __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
- /* Add idle task to the _front_ of its priority queue: */
- __activate_idle_task(p, rq);
+ update_rq_clock(rq);
+ activate_task(rq, p, 0);
spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -5391,7 +5375,7 @@ static void migrate_dead(unsigned int de
struct rq *rq = cpu_rq(dead_cpu);
/* Must be exiting, otherwise would be on tasklist. */
- BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
+ BUG_ON(!p->exit_state);
/* Cannot have done final schedule yet: would have vanished. */
BUG_ON(p->state == TASK_DEAD);
@@ -5402,11 +5386,10 @@ static void migrate_dead(unsigned int de
* Drop lock around migration; if someone else moves it,
* that's OK. No task can be added to this CPU, so iteration is
* fine.
- * NOTE: interrupts should be left disabled --dev@
*/
- spin_unlock(&rq->lock);
+ spin_unlock_irq(&rq->lock);
move_task_off_dead_cpu(dead_cpu, p);
- spin_lock(&rq->lock);
+ spin_lock_irq(&rq->lock);
put_task_struct(p);
}
@@ -5415,20 +5398,186 @@ static void migrate_dead(unsigned int de
static void migrate_dead_tasks(unsigned int dead_cpu)
{
struct rq *rq = cpu_rq(dead_cpu);
- unsigned int arr, i;
+ struct task_struct *next;
+
+ for ( ; ; ) {
+ if (!rq->nr_running)
+ break;
+ update_rq_clock(rq);
+ next = pick_next_task(rq, rq->curr);
+ if (!next)
+ break;
+ migrate_dead(dead_cpu, next);
- for (arr = 0; arr < 2; arr++) {
- for (i = 0; i < MAX_PRIO; i++) {
- struct list_head *list = &rq->arrays[arr].queue[i];
-
- while (!list_empty(list))
- migrate_dead(dead_cpu, list_entry(list->next,
- struct task_struct, run_list));
- }
}
}
#endif /* CONFIG_HOTPLUG_CPU */
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+
+static struct ctl_table sd_ctl_dir[] = {
+ {
+ .procname = "sched_domain",
+ .mode = 0555,
+ },
+ {0, },
+};
+
+static struct ctl_table sd_ctl_root[] = {
+ {
+ .ctl_name = CTL_KERN,
+ .procname = "kernel",
+ .mode = 0555,
+ .child = sd_ctl_dir,
+ },
+ {0, },
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+ struct ctl_table *entry =
+ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+
+ return entry;
+}
+
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+ struct ctl_table *entry;
+
+ /*
+ * In the intermediate directories, both the child directory and
+ * procname are dynamically allocated and could fail but the mode
+ * will always be set. In the lowest directory the names are
+ * static strings and all have proc handlers.
+ */
+ for (entry = *tablep; entry->mode; entry++) {
+ if (entry->child)
+ sd_free_ctl_entry(&entry->child);
+ if (entry->proc_handler == NULL)
+ kfree(entry->procname);
+ }
+
+ kfree(*tablep);
+ *tablep = NULL;
+}
+
+static void
+set_table_entry(struct ctl_table *entry,
+ const char *procname, void *data, int maxlen,
+ mode_t mode, proc_handler *proc_handler)
+{
+ entry->procname = procname;
+ entry->data = data;
+ entry->maxlen = maxlen;
+ entry->mode = mode;
+ entry->proc_handler = proc_handler;
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+ struct ctl_table *table = sd_alloc_ctl_entry(12);
+
+ if (table == NULL)
+ return NULL;
+
+ set_table_entry(&table[0], "min_interval", &sd->min_interval,
+ sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[1], "max_interval", &sd->max_interval,
+ sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[9], "cache_nice_tries",
+ &sd->cache_nice_tries,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[10], "flags", &sd->flags,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ /* &table[11] is terminator */
+
+ return table;
+}
+
+static ctl_table * sd_alloc_ctl_cpu_table(int cpu)
+{
+ struct ctl_table *entry, *table;
+ struct sched_domain *sd;
+ int domain_num = 0, i;
+ char buf[32];
+
+ for_each_domain(cpu, sd)
+ domain_num++;
+ entry = table = sd_alloc_ctl_entry(domain_num + 1);
+ if (table == NULL)
+ return NULL;
+
+ i = 0;
+ for_each_domain(cpu, sd) {
+ snprintf(buf, 32, "domain%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_domain_table(sd);
+ entry++;
+ i++;
+ }
+ return table;
+}
+
+static struct ctl_table_header *sd_sysctl_header;
+static void register_sched_domain_sysctl(void)
+{
+ int i, cpu_num = num_online_cpus();
+ struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+ char buf[32];
+
+ WARN_ON(sd_ctl_dir[0].child);
+ sd_ctl_dir[0].child = entry;
+
+ if (entry == NULL)
+ return;
+
+ for_each_online_cpu(i) {
+ snprintf(buf, 32, "cpu%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_cpu_table(i);
+ entry++;
+ }
+
+ WARN_ON(sd_sysctl_header);
+ sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+
+/* may be called multiple times per register */
+static void unregister_sched_domain_sysctl(void)
+{
+ if (sd_sysctl_header)
+ unregister_sysctl_table(sd_sysctl_header);
+ sd_sysctl_header = NULL;
+ if (sd_ctl_dir[0].child)
+ sd_free_ctl_entry(&sd_ctl_dir[0].child);
+}
+#else
+static void register_sched_domain_sysctl(void)
+{
+}
+static void unregister_sched_domain_sysctl(void)
+{
+}
+#endif
+
/*
* migration_call - callback that gets triggered when a CPU is added.
* Here we can start up the necessary migration thread for the new CPU.
@@ -5448,21 +5597,21 @@ migration_call(struct notifier_block *nf
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
+ p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
if (IS_ERR(p))
return NOTIFY_BAD;
p->flags |= PF_NOFREEZE;
kthread_bind(p, cpu);
/* Must be high prio: stop_machine expects to yield to it. */
rq = task_rq_lock(p, &flags);
- __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
+ __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
task_rq_unlock(rq, &flags);
cpu_rq(cpu)->migration_thread = p;
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- /* Strictly unneccessary, as first user will wake it. */
+ /* Strictly unnecessary, as first user will wake it. */
wake_up_process(cpu_rq(cpu)->migration_thread);
break;
@@ -5480,17 +5629,21 @@ migration_call(struct notifier_block *nf
case CPU_DEAD:
case CPU_DEAD_FROZEN:
+ cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
migrate_live_tasks(cpu);
rq = cpu_rq(cpu);
kthread_stop(rq->migration_thread);
rq->migration_thread = NULL;
/* Idle task back to normal (off runqueue, low prio) */
- rq = task_rq_lock(rq->idle, &flags);
- deactivate_task(rq->idle, rq);
+ spin_lock_irq(&rq->lock);
+ update_rq_clock(rq);
+ deactivate_task(rq, rq->idle, 0);
rq->idle->static_prio = MAX_PRIO;
- __setscheduler(rq->idle, SCHED_NORMAL, 0);
+ __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
+ rq->idle->sched_class = &idle_sched_class;
migrate_dead_tasks(cpu);
- task_rq_unlock(rq, &flags);
+ spin_unlock_irq(&rq->lock);
+ cpuset_unlock();
migrate_nr_uninterruptible(rq);
BUG_ON(rq->nr_running != 0);
@@ -5524,7 +5677,7 @@ static struct notifier_block __cpuinitda
.priority = 10
};
-int __init migration_init(void)
+void __init migration_init(void)
{
void *cpu = (void *)(long)smp_processor_id();
int err;
@@ -5534,8 +5687,6 @@ int __init migration_init(void)
BUG_ON(err == NOTIFY_BAD);
migration_call(&migration_notifier, CPU_ONLINE, cpu);
register_cpu_notifier(&migration_notifier);
-
- return 0;
}
#endif
@@ -5545,100 +5696,102 @@ int __init migration_init(void)
int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
-#undef SCHED_DOMAIN_DEBUG
-#ifdef SCHED_DOMAIN_DEBUG
-static void sched_domain_debug(struct sched_domain *sd, int cpu)
+#ifdef CONFIG_SCHED_DEBUG
+
+static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level)
{
- int level = 0;
+ struct sched_group *group = sd->groups;
+ cpumask_t groupmask;
+ char str[NR_CPUS];
- if (!sd) {
- printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
- return;
+ cpumask_scnprintf(str, NR_CPUS, sd->span);
+ cpus_clear(groupmask);
+
+ printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
+
+ if (!(sd->flags & SD_LOAD_BALANCE)) {
+ printk("does not load-balance\n");
+ if (sd->parent)
+ printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
+ " has parent");
+ return -1;
}
- printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+ printk(KERN_CONT "span %s\n", str);
+ if (!cpu_isset(cpu, sd->span)) {
+ printk(KERN_ERR "ERROR: domain->span does not contain "
+ "CPU%d\n", cpu);
+ }
+ if (!cpu_isset(cpu, group->cpumask)) {
+ printk(KERN_ERR "ERROR: domain->groups does not contain"
+ " CPU%d\n", cpu);
+ }
+
+ printk(KERN_DEBUG "%*s groups:", level + 1, "");
do {
- int i;
- char str[NR_CPUS];
- struct sched_group *group = sd->groups;
- cpumask_t groupmask;
-
- cpumask_scnprintf(str, NR_CPUS, sd->span);
- cpus_clear(groupmask);
-
- printk(KERN_DEBUG);
- for (i = 0; i < level + 1; i++)
- printk(" ");
- printk("domain %d: ", level);
-
- if (!(sd->flags & SD_LOAD_BALANCE)) {
- printk("does not load-balance\n");
- if (sd->parent)
- printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
- " has parent");
+ if (!group) {
+ printk("\n");
+ printk(KERN_ERR "ERROR: group is NULL\n");
break;
}
- printk("span %s\n", str);
+ if (!group->__cpu_power) {
+ printk(KERN_CONT "\n");
+ printk(KERN_ERR "ERROR: domain->cpu_power not "
+ "set\n");
+ break;
+ }
- if (!cpu_isset(cpu, sd->span))
- printk(KERN_ERR "ERROR: domain->span does not contain "
- "CPU%d\n", cpu);
- if (!cpu_isset(cpu, group->cpumask))
- printk(KERN_ERR "ERROR: domain->groups does not contain"
- " CPU%d\n", cpu);
-
- printk(KERN_DEBUG);
- for (i = 0; i < level + 2; i++)
- printk(" ");
- printk("groups:");
- do {
- if (!group) {
- printk("\n");
- printk(KERN_ERR "ERROR: group is NULL\n");
- break;
- }
+ if (!cpus_weight(group->cpumask)) {
+ printk(KERN_CONT "\n");
+ printk(KERN_ERR "ERROR: empty group\n");
+ break;
+ }
- if (!group->__cpu_power) {
- printk("\n");
- printk(KERN_ERR "ERROR: domain->cpu_power not "
- "set\n");
- }
+ if (cpus_intersects(groupmask, group->cpumask)) {
+ printk(KERN_CONT "\n");
+ printk(KERN_ERR "ERROR: repeated CPUs\n");
+ break;
+ }
- if (!cpus_weight(group->cpumask)) {
- printk("\n");
- printk(KERN_ERR "ERROR: empty group\n");
- }
+ cpus_or(groupmask, groupmask, group->cpumask);
- if (cpus_intersects(groupmask, group->cpumask)) {
- printk("\n");
- printk(KERN_ERR "ERROR: repeated CPUs\n");
- }
+ cpumask_scnprintf(str, NR_CPUS, group->cpumask);
+ printk(KERN_CONT " %s", str);
+
+ group = group->next;
+ } while (group != sd->groups);
+ printk(KERN_CONT "\n");
+
+ if (!cpus_equal(sd->span, groupmask))
+ printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+
+ if (sd->parent && !cpus_subset(groupmask, sd->parent->span))
+ printk(KERN_ERR "ERROR: parent span is not a superset "
+ "of domain->span\n");
+ return 0;
+}
- cpus_or(groupmask, groupmask, group->cpumask);
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
+{
+ int level = 0;
- cpumask_scnprintf(str, NR_CPUS, group->cpumask);
- printk(" %s", str);
+ if (!sd) {
+ printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+ return;
+ }
- group = group->next;
- } while (group != sd->groups);
- printk("\n");
-
- if (!cpus_equal(sd->span, groupmask))
- printk(KERN_ERR "ERROR: groups don't span "
- "domain->span\n");
+ printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+ for (;;) {
+ if (sched_domain_debug_one(sd, cpu, level))
+ break;
level++;
sd = sd->parent;
if (!sd)
- continue;
-
- if (!cpus_subset(groupmask, sd->span))
- printk(KERN_ERR "ERROR: parent span is not a superset "
- "of domain->span\n");
-
- } while (sd);
+ break;
+ }
}
#else
# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -5747,7 +5900,7 @@ static int __init isolated_cpu_setup(cha
return 1;
}
-__setup ("isolcpus=", isolated_cpu_setup);
+__setup("isolcpus=", isolated_cpu_setup);
/*
* init_sched_build_groups takes the cpumask we wish to span, and a pointer
@@ -5797,483 +5950,6 @@ init_sched_build_groups(cpumask_t span,
#define SD_NODES_PER_DOMAIN 16
-/*
- * Self-tuning task migration cost measurement between source and target CPUs.
- *
- * This is done by measuring the cost of manipulating buffers of varying
- * sizes. For a given buffer-size here are the steps that are taken:
- *
- * 1) the source CPU reads+dirties a shared buffer
- * 2) the target CPU reads+dirties the same shared buffer
- *
- * We measure how long they take, in the following 4 scenarios:
- *
- * - source: CPU1, target: CPU2 | cost1
- * - source: CPU2, target: CPU1 | cost2
- * - source: CPU1, target: CPU1 | cost3
- * - source: CPU2, target: CPU2 | cost4
- *
- * We then calculate the cost3+cost4-cost1-cost2 difference - this is
- * the cost of migration.
- *
- * We then start off from a small buffer-size and iterate up to larger
- * buffer sizes, in 5% steps - measuring each buffer-size separately, and
- * doing a maximum search for the cost. (The maximum cost for a migration
- * normally occurs when the working set size is around the effective cache
- * size.)
- */
-#define SEARCH_SCOPE 2
-#define MIN_CACHE_SIZE (64*1024U)
-#define DEFAULT_CACHE_SIZE (5*1024*1024U)
-#define ITERATIONS 1
-#define SIZE_THRESH 130
-#define COST_THRESH 130
-
-/*
- * The migration cost is a function of 'domain distance'. Domain
- * distance is the number of steps a CPU has to iterate down its
- * domain tree to share a domain with the other CPU. The farther
- * two CPUs are from each other, the larger the distance gets.
- *
- * Note that we use the distance only to cache measurement results,
- * the distance value is not used numerically otherwise. When two
- * CPUs have the same distance it is assumed that the migration
- * cost is the same. (this is a simplification but quite practical)
- */
-#define MAX_DOMAIN_DISTANCE 32
-
-static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
- { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
-/*
- * Architectures may override the migration cost and thus avoid
- * boot-time calibration. Unit is nanoseconds. Mostly useful for
- * virtualized hardware:
- */
-#ifdef CONFIG_DEFAULT_MIGRATION_COST
- CONFIG_DEFAULT_MIGRATION_COST
-#else
- -1LL
-#endif
-};
-
-/*
- * Allow override of migration cost - in units of microseconds.
- * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
- * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
- */
-static int __init migration_cost_setup(char *str)
-{
- int ints[MAX_DOMAIN_DISTANCE+1], i;
-
- str = get_options(str, ARRAY_SIZE(ints), ints);
-
- printk("#ints: %d\n", ints[0]);
- for (i = 1; i <= ints[0]; i++) {
- migration_cost[i-1] = (unsigned long long)ints[i]*1000;
- printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
- }
- return 1;
-}
-
-__setup ("migration_cost=", migration_cost_setup);
-
-/*
- * Global multiplier (divisor) for migration-cutoff values,
- * in percentiles. E.g. use a value of 150 to get 1.5 times
- * longer cache-hot cutoff times.
- *
- * (We scale it from 100 to 128 to long long handling easier.)
- */
-
-#define MIGRATION_FACTOR_SCALE 128
-
-static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
-
-static int __init setup_migration_factor(char *str)
-{
- get_option(&str, &migration_factor);
- migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
- return 1;
-}
-
-__setup("migration_factor=", setup_migration_factor);
-
-/*
- * Estimated distance of two CPUs, measured via the number of domains
- * we have to pass for the two CPUs to be in the same span:
- */
-static unsigned long domain_distance(int cpu1, int cpu2)
-{
- unsigned long distance = 0;
- struct sched_domain *sd;
-
- for_each_domain(cpu1, sd) {
- WARN_ON(!cpu_isset(cpu1, sd->span));
- if (cpu_isset(cpu2, sd->span))
- return distance;
- distance++;
- }
- if (distance >= MAX_DOMAIN_DISTANCE) {
- WARN_ON(1);
- distance = MAX_DOMAIN_DISTANCE-1;
- }
-
- return distance;
-}
-
-static unsigned int migration_debug;
-
-static int __init setup_migration_debug(char *str)
-{
- get_option(&str, &migration_debug);
- return 1;
-}
-
-__setup("migration_debug=", setup_migration_debug);
-
-/*
- * Maximum cache-size that the scheduler should try to measure.
- * Architectures with larger caches should tune this up during
- * bootup. Gets used in the domain-setup code (i.e. during SMP
- * bootup).
- */
-unsigned int max_cache_size;
-
-static int __init setup_max_cache_size(char *str)
-{
- get_option(&str, &max_cache_size);
- return 1;
-}
-
-__setup("max_cache_size=", setup_max_cache_size);
-
-/*
- * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
- * is the operation that is timed, so we try to generate unpredictable
- * cachemisses that still end up filling the L2 cache:
- */
-static void touch_cache(void *__cache, unsigned long __size)
-{
- unsigned long size = __size / sizeof(long);
- unsigned long chunk1 = size / 3;
- unsigned long chunk2 = 2 * size / 3;
- unsigned long *cache = __cache;
- int i;
-
- for (i = 0; i < size/6; i += 8) {
- switch (i % 6) {
- case 0: cache[i]++;
- case 1: cache[size-1-i]++;
- case 2: cache[chunk1-i]++;
- case 3: cache[chunk1+i]++;
- case 4: cache[chunk2-i]++;
- case 5: cache[chunk2+i]++;
- }
- }
-}
-
-/*
- * Measure the cache-cost of one task migration. Returns in units of nsec.
- */
-static unsigned long long
-measure_one(void *cache, unsigned long size, int source, int target)
-{
- cpumask_t mask, saved_mask;
- unsigned long long t0, t1, t2, t3, cost;
-
- saved_mask = current->cpus_allowed;
-
- /*
- * Flush source caches to RAM and invalidate them:
- */
- sched_cacheflush();
-
- /*
- * Migrate to the source CPU:
- */
- mask = cpumask_of_cpu(source);
- set_cpus_allowed(current, mask);
- WARN_ON(smp_processor_id() != source);
-
- /*
- * Dirty the working set:
- */
- t0 = sched_clock();
- touch_cache(cache, size);
- t1 = sched_clock();
-
- /*
- * Migrate to the target CPU, dirty the L2 cache and access
- * the shared buffer. (which represents the working set
- * of a migrated task.)
- */
- mask = cpumask_of_cpu(target);
- set_cpus_allowed(current, mask);
- WARN_ON(smp_processor_id() != target);
-
- t2 = sched_clock();
- touch_cache(cache, size);
- t3 = sched_clock();
-
- cost = t1-t0 + t3-t2;
-
- if (migration_debug >= 2)
- printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
- source, target, t1-t0, t1-t0, t3-t2, cost);
- /*
- * Flush target caches to RAM and invalidate them:
- */
- sched_cacheflush();
-
- set_cpus_allowed(current, saved_mask);
-
- return cost;
-}
-
-/*
- * Measure a series of task migrations and return the average
- * result. Since this code runs early during bootup the system
- * is 'undisturbed' and the average latency makes sense.
- *
- * The algorithm in essence auto-detects the relevant cache-size,
- * so it will properly detect different cachesizes for different
- * cache-hierarchies, depending on how the CPUs are connected.
- *
- * Architectures can prime the upper limit of the search range via
- * max_cache_size, otherwise the search range defaults to 20MB...64K.
- */
-static unsigned long long
-measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
-{
- unsigned long long cost1, cost2;
- int i;
-
- /*
- * Measure the migration cost of 'size' bytes, over an
- * average of 10 runs:
- *
- * (We perturb the cache size by a small (0..4k)
- * value to compensate size/alignment related artifacts.
- * We also subtract the cost of the operation done on
- * the same CPU.)
- */
- cost1 = 0;
-
- /*
- * dry run, to make sure we start off cache-cold on cpu1,
- * and to get any vmalloc pagefaults in advance:
- */
- measure_one(cache, size, cpu1, cpu2);
- for (i = 0; i < ITERATIONS; i++)
- cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
-
- measure_one(cache, size, cpu2, cpu1);
- for (i = 0; i < ITERATIONS; i++)
- cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
-
- /*
- * (We measure the non-migrating [cached] cost on both
- * cpu1 and cpu2, to handle CPUs with different speeds)
- */
- cost2 = 0;
-
- measure_one(cache, size, cpu1, cpu1);
- for (i = 0; i < ITERATIONS; i++)
- cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
-
- measure_one(cache, size, cpu2, cpu2);
- for (i = 0; i < ITERATIONS; i++)
- cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
-
- /*
- * Get the per-iteration migration cost:
- */
- do_div(cost1, 2 * ITERATIONS);
- do_div(cost2, 2 * ITERATIONS);
-
- return cost1 - cost2;
-}
-
-static unsigned long long measure_migration_cost(int cpu1, int cpu2)
-{
- unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
- unsigned int max_size, size, size_found = 0;
- long long cost = 0, prev_cost;
- void *cache;
-
- /*
- * Search from max_cache_size*5 down to 64K - the real relevant
- * cachesize has to lie somewhere inbetween.
- */
- if (max_cache_size) {
- max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
- size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
- } else {
- /*
- * Since we have no estimation about the relevant
- * search range
- */
- max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
- size = MIN_CACHE_SIZE;
- }
-
- if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
- printk("cpu %d and %d not both online!\n", cpu1, cpu2);
- return 0;
- }
-
- /*
- * Allocate the working set:
- */
- cache = vmalloc(max_size);
- if (!cache) {
- printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
- return 1000000; /* return 1 msec on very small boxen */
- }
-
- while (size <= max_size) {
- prev_cost = cost;
- cost = measure_cost(cpu1, cpu2, cache, size);
-
- /*
- * Update the max:
- */
- if (cost > 0) {
- if (max_cost < cost) {
- max_cost = cost;
- size_found = size;
- }
- }
- /*
- * Calculate average fluctuation, we use this to prevent
- * noise from triggering an early break out of the loop:
- */
- fluct = abs(cost - prev_cost);
- avg_fluct = (avg_fluct + fluct)/2;
-
- if (migration_debug)
- printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
- "(%8Ld %8Ld)\n",
- cpu1, cpu2, size,
- (long)cost / 1000000,
- ((long)cost / 100000) % 10,
- (long)max_cost / 1000000,
- ((long)max_cost / 100000) % 10,
- domain_distance(cpu1, cpu2),
- cost, avg_fluct);
-
- /*
- * If we iterated at least 20% past the previous maximum,
- * and the cost has dropped by more than 20% already,
- * (taking fluctuations into account) then we assume to
- * have found the maximum and break out of the loop early:
- */
- if (size_found && (size*100 > size_found*SIZE_THRESH))
- if (cost+avg_fluct <= 0 ||
- max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
-
- if (migration_debug)
- printk("-> found max.\n");
- break;
- }
- /*
- * Increase the cachesize in 10% steps:
- */
- size = size * 10 / 9;
- }
-
- if (migration_debug)
- printk("[%d][%d] working set size found: %d, cost: %Ld\n",
- cpu1, cpu2, size_found, max_cost);
-
- vfree(cache);
-
- /*
- * A task is considered 'cache cold' if at least 2 times
- * the worst-case cost of migration has passed.
- *
- * (this limit is only listened to if the load-balancing
- * situation is 'nice' - if there is a large imbalance we
- * ignore it for the sake of CPU utilization and
- * processing fairness.)
- */
- return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
-}
-
-static void calibrate_migration_costs(const cpumask_t *cpu_map)
-{
- int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
- unsigned long j0, j1, distance, max_distance = 0;
- struct sched_domain *sd;
-
- j0 = jiffies;
-
- /*
- * First pass - calculate the cacheflush times:
- */
- for_each_cpu_mask(cpu1, *cpu_map) {
- for_each_cpu_mask(cpu2, *cpu_map) {
- if (cpu1 == cpu2)
- continue;
- distance = domain_distance(cpu1, cpu2);
- max_distance = max(max_distance, distance);
- /*
- * No result cached yet?
- */
- if (migration_cost[distance] == -1LL)
- migration_cost[distance] =
- measure_migration_cost(cpu1, cpu2);
- }
- }
- /*
- * Second pass - update the sched domain hierarchy with
- * the new cache-hot-time estimations:
- */
- for_each_cpu_mask(cpu, *cpu_map) {
- distance = 0;
- for_each_domain(cpu, sd) {
- sd->cache_hot_time = migration_cost[distance];
- distance++;
- }
- }
- /*
- * Print the matrix:
- */
- if (migration_debug)
- printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
- max_cache_size,
-#ifdef CONFIG_X86
- cpu_khz/1000
-#else
- -1
-#endif
- );
- if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
- printk("migration_cost=");
- for (distance = 0; distance <= max_distance; distance++) {
- if (distance)
- printk(",");
- printk("%ld", (long)migration_cost[distance] / 1000);
- }
- printk("\n");
- }
- j1 = jiffies;
- if (migration_debug)
- printk("migration: %ld seconds\n", (j1-j0) / HZ);
-
- /*
- * Move back to the original CPU. NUMA-Q gets confused
- * if we migrate to another quad during bootup.
- */
- if (raw_smp_processor_id() != orig_cpu) {
- cpumask_t mask = cpumask_of_cpu(orig_cpu),
- saved_mask = current->cpus_allowed;
-
- set_cpus_allowed(current, mask);
- set_cpus_allowed(current, saved_mask);
- }
-}
-
#ifdef CONFIG_NUMA
/**
@@ -6380,7 +6056,7 @@ static int cpu_to_core_group(int cpu, co
struct sched_group **sg)
{
int group;
- cpumask_t mask = cpu_sibling_map[cpu];
+ cpumask_t mask = cpu_sibling_map(cpu);
cpus_and(mask, mask, *cpu_map);
group = first_cpu(mask);
if (sg)
@@ -6409,7 +6085,7 @@ static int cpu_to_phys_group(int cpu, co
cpus_and(mask, mask, *cpu_map);
group = first_cpu(mask);
#elif defined(CONFIG_SCHED_SMT)
- cpumask_t mask = cpu_sibling_map[cpu];
+ cpumask_t mask = cpu_sibling_map(cpu);
cpus_and(mask, mask, *cpu_map);
group = first_cpu(mask);
#else
@@ -6453,24 +6129,23 @@ static void init_numa_sched_groups_power
if (!sg)
return;
-next_sg:
- for_each_cpu_mask(j, sg->cpumask) {
- struct sched_domain *sd;
+ do {
+ for_each_cpu_mask(j, sg->cpumask) {
+ struct sched_domain *sd;
- sd = &per_cpu(phys_domains, j);
- if (j != first_cpu(sd->groups->cpumask)) {
- /*
- * Only add "power" once for each
- * physical package.
- */
- continue;
- }
+ sd = &per_cpu(phys_domains, j);
+ if (j != first_cpu(sd->groups->cpumask)) {
+ /*
+ * Only add "power" once for each
+ * physical package.
+ */
+ continue;
+ }
- sg_inc_cpu_power(sg, sd->groups->__cpu_power);
- }
- sg = sg->next;
- if (sg != group_head)
- goto next_sg;
+ sg_inc_cpu_power(sg, sd->groups->__cpu_power);
+ }
+ sg = sg->next;
+ } while (sg != group_head);
}
#endif
@@ -6574,7 +6249,6 @@ static void init_sched_groups_power(int
static int build_sched_domains(const cpumask_t *cpu_map)
{
int i;
- struct sched_domain *sd;
#ifdef CONFIG_NUMA
struct sched_group **sched_group_nodes = NULL;
int sd_allnodes = 0;
@@ -6582,7 +6256,7 @@ static int build_sched_domains(const cpu
/*
* Allocate the per-node list of sched groups
*/
- sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+ sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
GFP_KERNEL);
if (!sched_group_nodes) {
printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -6601,8 +6275,8 @@ static int build_sched_domains(const cpu
cpus_and(nodemask, nodemask, *cpu_map);
#ifdef CONFIG_NUMA
- if (cpus_weight(*cpu_map)
- > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+ if (cpus_weight(*cpu_map) >
+ SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
sd = &per_cpu(allnodes_domains, i);
*sd = SD_ALLNODES_INIT;
sd->span = *cpu_map;
@@ -6645,7 +6319,7 @@ static int build_sched_domains(const cpu
p = sd;
sd = &per_cpu(cpu_domains, i);
*sd = SD_SIBLING_INIT;
- sd->span = cpu_sibling_map[i];
+ sd->span = cpu_sibling_map(i);
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
p->child = sd;
@@ -6656,12 +6330,13 @@ static int build_sched_domains(const cpu
#ifdef CONFIG_SCHED_SMT
/* Set up CPU (sibling) groups */
for_each_cpu_mask(i, *cpu_map) {
- cpumask_t this_sibling_map = cpu_sibling_map[i];
+ cpumask_t this_sibling_map = cpu_sibling_map(i);
cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
if (i != first_cpu(this_sibling_map))
continue;
- init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group);
+ init_sched_build_groups(this_sibling_map, cpu_map,
+ &cpu_to_cpu_group);
}
#endif
@@ -6672,11 +6347,11 @@ static int build_sched_domains(const cpu
cpus_and(this_core_map, this_core_map, *cpu_map);
if (i != first_cpu(this_core_map))
continue;
- init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group);
+ init_sched_build_groups(this_core_map, cpu_map,
+ &cpu_to_core_group);
}
#endif
-
/* Set up physical groups */
for (i = 0; i < MAX_NUMNODES; i++) {
cpumask_t nodemask = node_to_cpumask(i);
@@ -6691,7 +6366,8 @@ static int build_sched_domains(const cpu
#ifdef CONFIG_NUMA
/* Set up node groups */
if (sd_allnodes)
- init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group);
+ init_sched_build_groups(*cpu_map, cpu_map,
+ &cpu_to_allnodes_group);
for (i = 0; i < MAX_NUMNODES; i++) {
/* Set up node groups */
@@ -6719,6 +6395,7 @@ static int build_sched_domains(const cpu
sched_group_nodes[i] = sg;
for_each_cpu_mask(j, nodemask) {
struct sched_domain *sd;
+
sd = &per_cpu(node_domains, j);
sd->groups = sg;
}
@@ -6763,19 +6440,22 @@ static int build_sched_domains(const cpu
/* Calculate CPU power for physical packages and nodes */
#ifdef CONFIG_SCHED_SMT
for_each_cpu_mask(i, *cpu_map) {
- sd = &per_cpu(cpu_domains, i);
+ struct sched_domain *sd = &per_cpu(cpu_domains, i);
+
init_sched_groups_power(i, sd);
}
#endif
#ifdef CONFIG_SCHED_MC
for_each_cpu_mask(i, *cpu_map) {
- sd = &per_cpu(core_domains, i);
+ struct sched_domain *sd = &per_cpu(core_domains, i);
+
init_sched_groups_power(i, sd);
}
#endif
for_each_cpu_mask(i, *cpu_map) {
- sd = &per_cpu(phys_domains, i);
+ struct sched_domain *sd = &per_cpu(phys_domains, i);
+
init_sched_groups_power(i, sd);
}
@@ -6803,10 +6483,6 @@ static int build_sched_domains(const cpu
#endif
cpu_attach_domain(sd, i);
}
- /*
- * Tune cache-hot values:
- */
- calibrate_migration_costs(cpu_map);
return 0;
@@ -6816,22 +6492,33 @@ error:
return -ENOMEM;
#endif
}
+
+static cpumask_t *doms_cur; /* current sched domains */
+static int ndoms_cur; /* number of sched domains in 'doms_cur' */
+
+/*
+ * Special case: If a kmalloc of a doms_cur partition (array of
+ * cpumask_t) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask_t fallback_doms.
+ */
+static cpumask_t fallback_doms;
+
/*
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * For now this just excludes isolated cpus, but could be used to
+ * exclude other special cases in the future.
*/
static int arch_init_sched_domains(const cpumask_t *cpu_map)
{
- cpumask_t cpu_default_map;
int err;
- /*
- * Setup mask for cpus without special case scheduling requirements.
- * For now this just excludes isolated cpus, but could be used to
- * exclude other special cases in the future.
- */
- cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
-
- err = build_sched_domains(&cpu_default_map);
+ ndoms_cur = 1;
+ doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+ if (!doms_cur)
+ doms_cur = &fallback_doms;
+ cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
+ err = build_sched_domains(doms_cur);
+ register_sched_domain_sysctl();
return err;
}
@@ -6849,6 +6536,8 @@ static void detach_destroy_domains(const
{
int i;
+ unregister_sched_domain_sysctl();
+
for_each_cpu_mask(i, *cpu_map)
cpu_attach_domain(NULL, i);
synchronize_sched();
@@ -6856,34 +6545,78 @@ static void detach_destroy_domains(const
}
/*
- * Partition sched domains as specified by the cpumasks below.
- * This attaches all cpus from the cpumasks to the NULL domain,
- * waits for a RCU quiescent period, recalculates sched
- * domain information and then attaches them back to the
- * correct sched domains
+ * Partition sched domains as specified by the 'ndoms_new'
+ * cpumasks in the array doms_new[] of cpumasks. This compares
+ * doms_new[] to the current sched domain partitioning, doms_cur[].
+ * It destroys each deleted domain and builds each new domain.
+ *
+ * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
+ * The masks don't intersect (don't overlap.) We should setup one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
+ * not be load balanced. If the same cpumask appears both in the
+ * current 'doms_cur' domains and in the new 'doms_new', we can leave
+ * it as it is.
+ *
+ * The passed in 'doms_new' should be kmalloc'd. This routine takes
+ * ownership of it and will kfree it when done with it. If the caller
+ * failed the kmalloc call, then it can pass in doms_new == NULL,
+ * and partition_sched_domains() will fallback to the single partition
+ * 'fallback_doms'.
+ *
* Call with hotplug lock held
*/
-int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
{
- cpumask_t change_map;
- int err = 0;
+ int i, j;
- cpus_and(*partition1, *partition1, cpu_online_map);
- cpus_and(*partition2, *partition2, cpu_online_map);
- cpus_or(change_map, *partition1, *partition2);
-
- /* Detach sched domains from all of the affected cpus */
- detach_destroy_domains(&change_map);
- if (!cpus_empty(*partition1))
- err = build_sched_domains(partition1);
- if (!err && !cpus_empty(*partition2))
- err = build_sched_domains(partition2);
+ lock_doms_cur();
- return err;
+ /* always unregister in case we don't destroy any domains */
+ unregister_sched_domain_sysctl();
+
+ if (doms_new == NULL) {
+ ndoms_new = 1;
+ doms_new = &fallback_doms;
+ cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+ }
+
+ /* Destroy deleted domains */
+ for (i = 0; i < ndoms_cur; i++) {
+ for (j = 0; j < ndoms_new; j++) {
+ if (cpus_equal(doms_cur[i], doms_new[j]))
+ goto match1;
+ }
+ /* no match - a current sched domain not in new doms_new[] */
+ detach_destroy_domains(doms_cur + i);
+match1:
+ ;
+ }
+
+ /* Build new domains */
+ for (i = 0; i < ndoms_new; i++) {
+ for (j = 0; j < ndoms_cur; j++) {
+ if (cpus_equal(doms_new[i], doms_cur[j]))
+ goto match2;
+ }
+ /* no match - add a new doms_new */
+ build_sched_domains(doms_new + i);
+match2:
+ ;
+ }
+
+ /* Remember the new sched domains */
+ if (doms_cur != &fallback_doms)
+ kfree(doms_cur);
+ doms_cur = doms_new;
+ ndoms_cur = ndoms_new;
+
+ register_sched_domain_sysctl();
+
+ unlock_doms_cur();
}
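/*
 * A minimal userspace sketch of the reconcile pattern used above: any mask
 * present only in the current set is destroyed, any mask present only in
 * the new set is built, and matching entries are left alone. reconcile()
 * and the integer ids below are made up for illustration; the kernel code
 * works on cpumask_t, not ints.
 */
#include <stdio.h>

static void reconcile(const int *cur, int n_cur, const int *new, int n_new)
{
	int i, j;

	for (i = 0; i < n_cur; i++) {
		for (j = 0; j < n_new; j++)
			if (cur[i] == new[j])
				goto match1;
		printf("destroy domain %d\n", cur[i]);	/* disappeared */
match1:
		;
	}

	for (i = 0; i < n_new; i++) {
		for (j = 0; j < n_cur; j++)
			if (new[i] == cur[j])
				goto match2;
		printf("build domain %d\n", new[i]);	/* newly added */
match2:
		;
	}
}

int main(void)
{
	int cur[] = { 1, 2, 3 };
	int new[] = { 2, 3, 4 };

	reconcile(cur, 3, new, 3);	/* destroys 1, builds 4 */
	return 0;
}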
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-int arch_reinit_sched_domains(void)
+static int arch_reinit_sched_domains(void)
{
int err;
@@ -6912,24 +6645,6 @@ static ssize_t sched_power_savings_store
return ret ? ret : count;
}
-int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
-{
- int err = 0;
-
-#ifdef CONFIG_SCHED_SMT
- if (smt_capable())
- err = sysfs_create_file(&cls->kset.kobj,
- &attr_sched_smt_power_savings.attr);
-#endif
-#ifdef CONFIG_SCHED_MC
- if (!err && mc_capable())
- err = sysfs_create_file(&cls->kset.kobj,
- &attr_sched_mc_power_savings.attr);
-#endif
- return err;
-}
-#endif
-
#ifdef CONFIG_SCHED_MC
static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
{
@@ -6940,8 +6655,8 @@ static ssize_t sched_mc_power_savings_st
{
return sched_power_savings_store(buf, count, 0);
}
-SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
- sched_mc_power_savings_store);
+static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
+ sched_mc_power_savings_store);
#endif
#ifdef CONFIG_SCHED_SMT
@@ -6954,8 +6669,26 @@ static ssize_t sched_smt_power_savings_s
{
return sched_power_savings_store(buf, count, 1);
}
-SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
- sched_smt_power_savings_store);
+static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+ sched_smt_power_savings_store);
+#endif
+
+int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+{
+ int err = 0;
+
+#ifdef CONFIG_SCHED_SMT
+ if (smt_capable())
+ err = sysfs_create_file(&cls->kset.kobj,
+ &attr_sched_smt_power_savings.attr);
+#endif
+#ifdef CONFIG_SCHED_MC
+ if (!err && mc_capable())
+ err = sysfs_create_file(&cls->kset.kobj,
+ &attr_sched_mc_power_savings.attr);
+#endif
+ return err;
+}
#endif
/*
@@ -7013,10 +6746,24 @@ void __init sched_init_smp(void)
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed(current, non_isolated_cpus) < 0)
BUG();
+ sched_init_granularity();
+
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ lb_monitor_task = kthread_create(load_balance_monitor, NULL,
+ "load_balance_monitor");
+ if (!IS_ERR(lb_monitor_task)) {
+ lb_monitor_task->flags |= PF_NOFREEZE;
+ wake_up_process(lb_monitor_task);
+ } else {
+ printk("Could not create load balance monitor thread"
+ "(error = %ld) \n", PTR_ERR(lb_monitor_task));
+ }
+#endif
}
#else
void __init sched_init_smp(void)
{
+ sched_init_granularity();
}
#endif /* CONFIG_SMP */
@@ -7030,28 +6777,61 @@ int in_sched_functions(unsigned long add
&& addr < (unsigned long)__sched_text_end);
}
+static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
+{
+ cfs_rq->tasks_timeline = AVL_ROOT;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ cfs_rq->rq = rq;
+#endif
+ cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+}
+
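+/*
+ * Note on the AVL_ROOT initializer used in init_cfs_rq() above: the
+ * rbtree-to-avltree conversion needs linux/avltree.h to provide an
+ * empty-root initializer equivalent to RB_ROOT. The sketch below is one
+ * plausible shape for that header; the struct layout and names are
+ * assumptions chosen to mirror rbtree.h, not an existing kernel API.
+ *
+ *	struct avl_node {
+ *		struct avl_node *avl_parent;
+ *		struct avl_node *avl_left;
+ *		struct avl_node *avl_right;
+ *		int balance;		/- balance factor, kept in -1..+1 -/
+ *	};
+ *
+ *	struct avl_root {
+ *		struct avl_node *avl_node;
+ *	};
+ *
+ *	/- empty tree, analogous to RB_ROOT / RB_EMPTY_ROOT -/
+ *	#define AVL_ROOT		(struct avl_root) { NULL, }
+ *	#define AVL_EMPTY_ROOT(root)	((root)->avl_node == NULL)
+ */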
void __init sched_init(void)
{
- int i, j, k;
int highest_cpu = 0;
+ int i, j;
for_each_possible_cpu(i) {
- struct prio_array *array;
+ struct rt_prio_array *array;
struct rq *rq;
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
lockdep_set_class(&rq->lock, &rq->rq_lock_key);
rq->nr_running = 0;
- rq->active = rq->arrays;
- rq->expired = rq->arrays + 1;
- rq->best_expired_prio = MAX_PRIO;
+ rq->clock = 1;
+ init_cfs_rq(&rq->cfs, rq);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+ {
+ struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
+ struct sched_entity *se =
+ &per_cpu(init_sched_entity, i);
+
+ init_cfs_rq_p[i] = cfs_rq;
+ init_cfs_rq(cfs_rq, rq);
+ cfs_rq->tg = &init_task_group;
+ list_add(&cfs_rq->leaf_cfs_rq_list,
+ &rq->leaf_cfs_rq_list);
+
+ init_sched_entity_p[i] = se;
+ se->cfs_rq = &rq->cfs;
+ se->my_q = cfs_rq;
+ se->load.weight = init_task_group_load;
+ se->load.inv_weight =
+ div64_64(1ULL<<32, init_task_group_load);
+ se->parent = NULL;
+ }
+ init_task_group.shares = init_task_group_load;
+ mutex_init(&init_task_group.lock);
+#endif
+ for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
+ rq->cpu_load[j] = 0;
#ifdef CONFIG_SMP
rq->sd = NULL;
- for (j = 1; j < 3; j++)
- rq->cpu_load[j] = 0;
rq->active_balance = 0;
+ rq->next_balance = jiffies;
rq->push_cpu = 0;
rq->cpu = i;
rq->migration_thread = NULL;
@@ -7059,20 +6839,22 @@ void __init sched_init(void)
#endif
atomic_set(&rq->nr_iowait, 0);
- for (j = 0; j < 2; j++) {
- array = rq->arrays + j;
- for (k = 0; k < MAX_PRIO; k++) {
- INIT_LIST_HEAD(array->queue + k);
- __clear_bit(k, array->bitmap);
- }
- // delimiter for bitsearch
- __set_bit(MAX_PRIO, array->bitmap);
+ array = &rq->rt.active;
+ for (j = 0; j < MAX_RT_PRIO; j++) {
+ INIT_LIST_HEAD(array->queue + j);
+ __clear_bit(j, array->bitmap);
}
highest_cpu = i;
+ /* delimiter for bitsearch: */
+ __set_bit(MAX_RT_PRIO, array->bitmap);
}
set_load_weight(&init_task);
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ INIT_HLIST_HEAD(&init_task.preempt_notifiers);
+#endif
+
#ifdef CONFIG_SMP
nr_cpu_ids = highest_cpu + 1;
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
@@ -7095,6 +6877,10 @@ void __init sched_init(void)
* when this runqueue becomes "idle".
*/
init_idle(current, smp_processor_id());
+ /*
+ * During early bootup we pretend to be a normal task:
+ */
+ current->sched_class = &fair_sched_class;
}
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -7123,30 +6909,56 @@ EXPORT_SYMBOL(__might_sleep);
#endif
#ifdef CONFIG_MAGIC_SYSRQ
+static void normalize_task(struct rq *rq, struct task_struct *p)
+{
+ int on_rq;
+ update_rq_clock(rq);
+ on_rq = p->se.on_rq;
+ if (on_rq)
+ deactivate_task(rq, p, 0);
+ __setscheduler(rq, p, SCHED_NORMAL, 0);
+ if (on_rq) {
+ activate_task(rq, p, 0);
+ resched_task(rq->curr);
+ }
+}
+
void normalize_rt_tasks(void)
{
- struct prio_array *array;
struct task_struct *g, *p;
unsigned long flags;
struct rq *rq;
read_lock_irq(&tasklist_lock);
-
do_each_thread(g, p) {
- if (!rt_task(p))
+ /*
+ * Only normalize user tasks:
+ */
+ if (!p->mm)
+ continue;
+
+ p->se.exec_start = 0;
+#ifdef CONFIG_SCHEDSTATS
+ p->se.wait_start = 0;
+ p->se.sleep_start = 0;
+ p->se.block_start = 0;
+#endif
+ task_rq(p)->clock = 0;
+
+ if (!rt_task(p)) {
+ /*
+ * Renice negative nice level userspace
+ * tasks back to 0:
+ */
+ if (TASK_NICE(p) < 0 && p->mm)
+ set_user_nice(p, 0);
continue;
+ }
spin_lock_irqsave(&p->pi_lock, flags);
rq = __task_rq_lock(p);
- array = p->array;
- if (array)
- deactivate_task(p, task_rq(p));
- __setscheduler(p, SCHED_NORMAL, 0);
- if (array) {
- __activate_task(p, task_rq(p));
- resched_task(rq->curr);
- }
+ normalize_task(rq, p);
__task_rq_unlock(rq);
spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -7200,3 +7012,497 @@ void set_curr_task(int cpu, struct task_
}
#endif
+
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+
+/* Distribute shares of all task groups among their schedulable entities,
+ * to reflect load distribution across cpus.
+ */
+static int rebalance_shares(struct sched_domain *sd, int this_cpu)
+{
+ struct cfs_rq *cfs_rq;
+ struct rq *rq = cpu_rq(this_cpu);
+ cpumask_t sdspan = sd->span;
+ int balanced = 1;
+
+ /* Walk through all the task groups that we have */
+ for_each_leaf_cfs_rq(rq, cfs_rq) {
+ int i;
+ unsigned long total_load = 0, total_shares;
+ struct task_group *tg = cfs_rq->tg;
+
+ /* Gather total task load of this group across cpus */
+ for_each_cpu_mask(i, sdspan)
+ total_load += tg->cfs_rq[i]->load.weight;
+
+ /* Nothing to do if this group has no load or if its load
+ * hasn't changed since the last time we checked.
+ */
+ if (!total_load || total_load == tg->last_total_load)
+ continue;
+
+ tg->last_total_load = total_load;
+
+ /* tg->shares represents the number of cpu shares the task group
+ * is eligible to hold on a single cpu. On N cpus, it is
+ * eligible to hold (N * tg->shares) number of cpu shares.
+ */
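+ /* Worked example (assuming the default group share of
+ * NICE_0_LOAD, i.e. 1024): on a 4-cpu domain total_shares is
+ * 4096; a cpu carrying half of the group's task load ends up
+ * with local_shares = 4096 / 2 = 2048, while a cpu with no load
+ * from this group falls back to MIN_GROUP_SHARES.
+ */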
+ total_shares = tg->shares * cpus_weight(sdspan);
+
+ /* redistribute total_shares across cpus as per the task load
+ * distribution.
+ */
+ for_each_cpu_mask(i, sdspan) {
+ unsigned long local_load, local_shares, irqflags;
+
+ local_load = tg->cfs_rq[i]->load.weight;
+ local_shares = (local_load * total_shares) / total_load;
+ if (!local_shares)
+ local_shares = MIN_GROUP_SHARES;
+ if (local_shares == tg->se[i]->load.weight)
+ continue;
+
+ spin_lock_irqsave(&cpu_rq(i)->lock, irqflags);
+ set_se_shares(tg->se[i], local_shares);
+ spin_unlock_irqrestore(&cpu_rq(i)->lock, irqflags);
+ balanced = 0;
+ }
+ }
+
+ return balanced;
+}
+
+/*
+ * How frequently should we rebalance_shares() across cpus?
+ *
+ * The more frequently we rebalance shares, the more accurately cpu
+ * bandwidth is distributed between task groups. However, a higher
+ * frequency also implies increased scheduling overhead.
+ *
+ * sysctl_sched_min_bal_int_shares represents the minimum interval between
+ * consecutive calls to rebalance_shares() in the same sched domain.
+ *
+ * sysctl_sched_max_bal_int_shares represents the maximum interval between
+ * consecutive calls to rebalance_shares() in the same sched domain.
+ *
+ * These settings allow for the appropriate tradeoff between accuracy of
+ * fairness and the associated overhead.
+ *
+ */
+
+/* default: 8ms, units: milliseconds */
+const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
+
+/* default: 128ms, units: milliseconds */
+const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
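+
+/*
+ * With these defaults the monitor thread below backs off
+ * 8ms -> 16ms -> 32ms -> 64ms -> 128ms as long as every pass finds
+ * the groups already balanced, and drops back to 8ms as soon as a
+ * pass has to move shares around.
+ */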
+
+static int load_balance_monitor(void *unused)
+{
+ unsigned int timeout = sysctl_sched_min_bal_int_shares;
+
+ while (!kthread_should_stop()) {
+ int i, cpu, balanced = 1;
+
+ lock_cpu_hotplug(); /* Prevent cpus going down or coming up */
+ lock_doms_cur(); /* lockout changes to doms_cur[] array */
+
+ rcu_read_lock(); /* to walk rq->sd chain on various cpus */
+
+ for (i = 0; i < ndoms_cur; i++) {
+ cpumask_t cpumap = doms_cur[i];
+ struct sched_domain *sd = NULL, *sd_prev = NULL;
+
+ cpu = first_cpu(cpumap);
+
+ /* Find the highest domain at which to balance shares */
+ for_each_domain(cpu, sd) {
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ continue;
+ sd_prev = sd;
+ }
+
+ sd = sd_prev;
+ /* sd == NULL? No load balancing required in this domain */
+ if (!sd)
+ continue;
+
+ balanced &= rebalance_shares(sd, cpu);
+ }
+
+ rcu_read_unlock();
+
+ unlock_doms_cur();
+ unlock_cpu_hotplug();
+
+ if (!balanced)
+ timeout = sysctl_sched_min_bal_int_shares;
+ else if (timeout < sysctl_sched_max_bal_int_shares)
+ timeout *= 2;
+
+ msleep_interruptible(timeout);
+ }
+
+ return 0;
+}
+
+#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/* allocate runqueue etc for a new task group */
+struct task_group *sched_create_group(void)
+{
+ struct task_group *tg;
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se;
+ struct rq *rq;
+ int i;
+
+ tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+ if (!tg)
+ return ERR_PTR(-ENOMEM);
+
+ tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
+ if (!tg->cfs_rq)
+ goto err;
+ tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
+ if (!tg->se)
+ goto err;
+
+ for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+
+ cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL,
+ cpu_to_node(i));
+ if (!cfs_rq)
+ goto err;
+
+ se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL,
+ cpu_to_node(i));
+ if (!se)
+ goto err;
+
+ memset(cfs_rq, 0, sizeof(struct cfs_rq));
+ memset(se, 0, sizeof(struct sched_entity));
+
+ tg->cfs_rq[i] = cfs_rq;
+ init_cfs_rq(cfs_rq, rq);
+ cfs_rq->tg = tg;
+
+ tg->se[i] = se;
+ se->cfs_rq = &rq->cfs;
+ se->my_q = cfs_rq;
+ se->load.weight = NICE_0_LOAD;
+ se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
+ se->parent = NULL;
+ }
+
+ for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+ cfs_rq = tg->cfs_rq[i];
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+ }
+
+ tg->shares = NICE_0_LOAD;
+ mutex_init(&tg->lock);
+
+ return tg;
+
+err:
+ for_each_possible_cpu(i) {
+ if (tg->cfs_rq)
+ kfree(tg->cfs_rq[i]);
+ if (tg->se)
+ kfree(tg->se[i]);
+ }
+ kfree(tg->cfs_rq);
+ kfree(tg->se);
+ kfree(tg);
+
+ return ERR_PTR(-ENOMEM);
+}
+
+/* rcu callback to free various structures associated with a task group */
+static void free_sched_group(struct rcu_head *rhp)
+{
+ struct task_group *tg = container_of(rhp, struct task_group, rcu);
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se;
+ int i;
+
+ /* now it should be safe to free those cfs_rqs */
+ for_each_possible_cpu(i) {
+ cfs_rq = tg->cfs_rq[i];
+ kfree(cfs_rq);
+
+ se = tg->se[i];
+ kfree(se);
+ }
+
+ kfree(tg->cfs_rq);
+ kfree(tg->se);
+ kfree(tg);
+}
+
+/* Destroy runqueue etc associated with a task group */
+void sched_destroy_group(struct task_group *tg)
+{
+ struct cfs_rq *cfs_rq = NULL;
+ int i;
+
+ for_each_possible_cpu(i) {
+ cfs_rq = tg->cfs_rq[i];
+ list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+ }
+
+ BUG_ON(!cfs_rq);
+
+ /* wait for possible concurrent references to cfs_rqs to complete */
+ call_rcu(&tg->rcu, free_sched_group);
+}
+
+/* change task's runqueue when it moves between groups.
+ * The caller of this function should have put the task in its new group
+ * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
+ * reflect its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+ int on_rq, running;
+ unsigned long flags;
+ struct rq *rq;
+
+ rq = task_rq_lock(tsk, &flags);
+
+ if (tsk->sched_class != &fair_sched_class) {
+ set_task_cfs_rq(tsk, task_cpu(tsk));
+ goto done;
+ }
+
+ update_rq_clock(rq);
+
+ running = task_running(rq, tsk);
+ on_rq = tsk->se.on_rq;
+
+ if (on_rq) {
+ dequeue_task(rq, tsk, 0);
+ if (unlikely(running))
+ tsk->sched_class->put_prev_task(rq, tsk);
+ }
+
+ set_task_cfs_rq(tsk, task_cpu(tsk));
+
+ if (on_rq) {
+ if (unlikely(running))
+ tsk->sched_class->set_curr_task(rq);
+ enqueue_task(rq, tsk, 0);
+ }
+
+done:
+ task_rq_unlock(rq, &flags);
+}
+
+/* rq->lock to be locked by caller */
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+ struct cfs_rq *cfs_rq = se->cfs_rq;
+ struct rq *rq = cfs_rq->rq;
+ int on_rq;
+
+ if (!shares)
+ shares = MIN_GROUP_SHARES;
+
+ on_rq = se->on_rq;
+ if (on_rq) {
+ dequeue_entity(cfs_rq, se, 0);
+ dec_load(rq, se->load.weight);
+ }
+
+ se->load.weight = shares;
+ se->load.inv_weight = div64_64((1ULL<<32), shares);
+
+ if (on_rq) {
+ enqueue_entity(cfs_rq, se, 0);
+ inc_load(rq, se->load.weight);
+ }
+}
+
+int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+{
+ int i;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+
+ mutex_lock(&tg->lock);
+
+ if (tg->shares == shares)
+ goto done;
+
+ if (shares < MIN_GROUP_SHARES)
+ shares = MIN_GROUP_SHARES;
+
+ /* Prevent any load balance activity (rebalance_shares,
+ * load_balance_fair) from referring to this group first,
+ * by taking it off the rq->leaf_cfs_rq_list on each cpu.
+ */
+ for_each_possible_cpu(i) {
+ cfs_rq = tg->cfs_rq[i];
+ list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+ }
+
+ /* wait for any ongoing reference to this group to finish */
+ synchronize_sched();
+
+ /* Now we are free to modify the group's share on each cpu
+ * w/o tripping rebalance_shares or load_balance_fair.
+ */
+ tg->shares = shares;
+ for_each_possible_cpu(i) {
+ spin_lock_irq(&cpu_rq(i)->lock);
+ set_se_shares(tg->se[i], shares);
+ spin_unlock_irq(&cpu_rq(i)->lock);
+ }
+
+ /* Enable load balance activity on this group, by inserting it back on
+ * each cpu's rq->leaf_cfs_rq_list.
+ */
+ for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+ cfs_rq = tg->cfs_rq[i];
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+ }
+
+done:
+ mutex_unlock(&tg->lock);
+ return 0;
+}
+
+unsigned long sched_group_shares(struct task_group *tg)
+{
+ return tg->shares;
+}
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_FAIR_CGROUP_SCHED
+
+/* return corresponding task_group object of a cgroup */
+static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
+{
+ return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
+ struct task_group, css);
+}
+
+static struct cgroup_subsys_state *
+cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct task_group *tg;
+
+ if (!cgrp->parent) {
+ /* This is early initialization for the top cgroup */
+ init_task_group.css.cgroup = cgrp;
+ return &init_task_group.css;
+ }
+
+ /* We support only a 1-level deep hierarchical scheduler at the moment */
+ if (cgrp->parent->parent)
+ return ERR_PTR(-EINVAL);
+
+ tg = sched_create_group();
+ if (IS_ERR(tg))
+ return ERR_PTR(-ENOMEM);
+
+ /* Bind the cgroup to task_group object we just created */
+ tg->css.cgroup = cgrp;
+
+ return &tg->css;
+}
+
+static void cpu_cgroup_destroy(struct cgroup_subsys *ss,
+ struct cgroup *cgrp)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+
+ sched_destroy_group(tg);
+}
+
+static int cpu_cgroup_can_attach(struct cgroup_subsys *ss,
+ struct cgroup *cgrp, struct task_struct *tsk)
+{
+ /* We don't support RT-tasks being in separate groups */
+ if (tsk->sched_class != &fair_sched_class)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void
+cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup *old_cont, struct task_struct *tsk)
+{
+ sched_move_task(tsk);
+}
+
+static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+ u64 shareval)
+{
+ return sched_group_set_shares(cgroup_tg(cgrp), shareval);
+}
+
+static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+
+ return (u64) tg->shares;
+}
+
+static u64 cpu_usage_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+ unsigned long flags;
+ u64 res = 0;
+ int i;
+
+ for_each_possible_cpu(i) {
+ /*
+ * Lock to prevent races with updating 64-bit counters
+ * on 32-bit arches.
+ */
+ spin_lock_irqsave(&cpu_rq(i)->lock, flags);
+ res += tg->se[i]->sum_exec_runtime;
+ spin_unlock_irqrestore(&cpu_rq(i)->lock, flags);
+ }
+ /* Convert from ns to ms */
+ do_div(res, NSEC_PER_MSEC);
+
+ return res;
+}
+
+static struct cftype cpu_files[] = {
+ {
+ .name = "shares",
+ .read_uint = cpu_shares_read_uint,
+ .write_uint = cpu_shares_write_uint,
+ },
+ {
+ .name = "usage",
+ .read_uint = cpu_usage_read,
+ },
+};
+
+static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
+}
+
+struct cgroup_subsys cpu_cgroup_subsys = {
+ .name = "cpu",
+ .create = cpu_cgroup_create,
+ .destroy = cpu_cgroup_destroy,
+ .can_attach = cpu_cgroup_can_attach,
+ .attach = cpu_cgroup_attach,
+ .populate = cpu_cgroup_populate,
+ .subsys_id = cpu_cgroup_subsys_id,
+ .early_init = 1,
+};
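+
+/*
+ * Typical usage from userspace (assuming the cgroup filesystem is
+ * available and this subsystem is compiled in):
+ *
+ * mount -t cgroup -o cpu none /dev/cpuctl
+ * mkdir /dev/cpuctl/grp1
+ * echo <pid> > /dev/cpuctl/grp1/tasks
+ * echo 2048 > /dev/cpuctl/grp1/cpu.shares (double the default share)
+ * cat /dev/cpuctl/grp1/cpu.usage (group cpu time, in ms)
+ */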
+
+#endif /* CONFIG_FAIR_CGROUP_SCHED */
Index: linux-cfs-2.6.22.13.q/kernel/sched_debug.c
===================================================================
--- /dev/null
+++ linux-cfs-2.6.22.13.q/kernel/sched_debug.c
@@ -0,0 +1,394 @@
+/*
+ * kernel/sched_debug.c
+ *
+ * Print the CFS AVL tree
+ *
+ * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/utsname.h>
+
+/*
+ * This allows printing both to /proc/sched_debug and
+ * to the console
+ */
+#define SEQ_printf(m, x...) \
+ do { \
+ if (m) \
+ seq_printf(m, x); \
+ else \
+ printk(x); \
+ } while (0)
+
+/*
+ * Ease the printing of nsec fields:
+ */
+static long long nsec_high(long long nsec)
+{
+ if (nsec < 0) {
+ nsec = -nsec;
+ do_div(nsec, 1000000);
+ return -nsec;
+ }
+ do_div(nsec, 1000000);
+
+ return nsec;
+}
+
+static unsigned long nsec_low(long long nsec)
+{
+ if (nsec < 0)
+ nsec = -nsec;
+
+ return do_div(nsec, 1000000);
+}
+
+#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
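+
+/*
+ * e.g. SPLIT_NS(3500000) expands to nsec_high() = 3 and
+ * nsec_low() = 500000, so a "%Ld.%06ld" format prints the
+ * nanosecond value as "3.500000" (i.e. in milliseconds).
+ */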
+
+static void
+print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
+{
+ if (rq->curr == p)
+ SEQ_printf(m, "R");
+ else
+ SEQ_printf(m, " ");
+
+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
+ p->comm, p->pid,
+ SPLIT_NS(p->se.vruntime),
+ (long long)(p->nvcsw + p->nivcsw),
+ p->prio);
+#ifdef CONFIG_SCHEDSTATS
+ SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n",
+ SPLIT_NS(p->se.vruntime),
+ SPLIT_NS(p->se.sum_exec_runtime),
+ SPLIT_NS(p->se.sum_sleep_runtime));
+#else
+ SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n",
+ 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
+#endif
+}
+
+static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
+{
+ struct task_struct *g, *p;
+ unsigned long flags;
+
+ SEQ_printf(m,
+ "\nrunnable tasks:\n"
+ " task PID tree-key switches prio"
+ " exec-runtime sum-exec sum-sleep\n"
+ "------------------------------------------------------"
+ "----------------------------------------------------\n");
+
+ read_lock_irqsave(&tasklist_lock, flags);
+
+ do_each_thread(g, p) {
+ if (!p->se.on_rq || task_cpu(p) != rq_cpu)
+ continue;
+
+ print_task(m, rq, p);
+ } while_each_thread(g, p);
+
+ read_unlock_irqrestore(&tasklist_lock, flags);
+}
+
+void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
+{
+ s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
+ spread, rq0_min_vruntime, spread0;
+ struct rq *rq = &per_cpu(runqueues, cpu);
+ struct sched_entity *last;
+ unsigned long flags;
+
+ SEQ_printf(m, "\ncfs_rq\n");
+
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
+ SPLIT_NS(cfs_rq->exec_clock));
+
+ spin_lock_irqsave(&rq->lock, flags);
+ if (cfs_rq->avl_leftmost)
+ MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
+ last = __pick_last_entity(cfs_rq);
+ if (last)
+ max_vruntime = last->vruntime;
+ min_vruntime = rq->cfs.min_vruntime;
+ rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
+ spin_unlock_irqrestore(&rq->lock, flags);
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
+ SPLIT_NS(MIN_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
+ SPLIT_NS(min_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
+ SPLIT_NS(max_vruntime));
+ spread = max_vruntime - MIN_vruntime;
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
+ SPLIT_NS(spread));
+ spread0 = min_vruntime - rq0_min_vruntime;
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
+ SPLIT_NS(spread0));
+ SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
+ SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
+#ifdef CONFIG_SCHEDSTATS
+ SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
+ rq->bkl_count);
+#endif
+ SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
+ cfs_rq->nr_spread_over);
+}
+
+static void print_cpu(struct seq_file *m, int cpu)
+{
+ struct rq *rq = &per_cpu(runqueues, cpu);
+
+#ifdef CONFIG_X86
+ {
+ unsigned int freq = cpu_khz ? : 1;
+
+ SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
+ cpu, freq / 1000, (freq % 1000));
+ }
+#else
+ SEQ_printf(m, "\ncpu#%d\n", cpu);
+#endif
+
+#define P(x) \
+ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x))
+#define PN(x) \
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
+
+ P(nr_running);
+ SEQ_printf(m, " .%-30s: %lu\n", "load",
+ rq->load.weight);
+ P(nr_switches);
+ P(nr_load_updates);
+ P(nr_uninterruptible);
+ SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies);
+ PN(next_balance);
+ P(curr->pid);
+ PN(clock);
+ PN(idle_clock);
+ PN(prev_clock_raw);
+ P(clock_warps);
+ P(clock_overflows);
+ P(clock_deep_idle_events);
+ PN(clock_max_delta);
+ P(cpu_load[0]);
+ P(cpu_load[1]);
+ P(cpu_load[2]);
+ P(cpu_load[3]);
+ P(cpu_load[4]);
+#undef P
+#undef PN
+
+ print_cfs_stats(m, cpu);
+
+ print_rq(m, rq, cpu);
+}
+
+static int sched_debug_show(struct seq_file *m, void *v)
+{
+ u64 now = ktime_to_ns(ktime_get());
+ int cpu;
+
+ SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n",
+ init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+
+ SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now));
+
+#define P(x) \
+ SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
+#define PN(x) \
+ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
+ PN(sysctl_sched_latency);
+ PN(sysctl_sched_min_granularity);
+ PN(sysctl_sched_wakeup_granularity);
+ PN(sysctl_sched_batch_wakeup_granularity);
+ PN(sysctl_sched_child_runs_first);
+ P(sysctl_sched_features);
+#undef PN
+#undef P
+
+ for_each_online_cpu(cpu)
+ print_cpu(m, cpu);
+
+ SEQ_printf(m, "\n");
+
+ return 0;
+}
+
+static void sysrq_sched_debug_show(void)
+{
+ sched_debug_show(NULL, NULL);
+}
+
+static int sched_debug_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_debug_show, NULL);
+}
+
+static const struct file_operations sched_debug_fops = {
+ .open = sched_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init init_sched_debug_procfs(void)
+{
+ struct proc_dir_entry *pe;
+
+ pe = create_proc_entry("sched_debug", 0644, NULL);
+ if (!pe)
+ return -ENOMEM;
+
+ pe->proc_fops = &sched_debug_fops;
+
+ return 0;
+}
+
+__initcall(init_sched_debug_procfs);
+
+void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+{
+ unsigned long nr_switches;
+ unsigned long flags;
+ int num_threads = 1;
+
+ rcu_read_lock();
+ if (lock_task_sighand(p, &flags)) {
+ num_threads = atomic_read(&p->signal->count);
+ unlock_task_sighand(p, &flags);
+ }
+ rcu_read_unlock();
+
+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
+ SEQ_printf(m,
+ "---------------------------------------------------------\n");
+#define __P(F) \
+ SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F)
+#define P(F) \
+ SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
+#define __PN(F) \
+ SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN(F) \
+ SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+
+ PN(se.exec_start);
+ PN(se.vruntime);
+ PN(se.sum_exec_runtime);
+
+ nr_switches = p->nvcsw + p->nivcsw;
+
+#ifdef CONFIG_SCHEDSTATS
+ PN(se.wait_start);
+ PN(se.sleep_start);
+ PN(se.block_start);
+ PN(se.sleep_max);
+ PN(se.block_max);
+ PN(se.exec_max);
+ PN(se.slice_max);
+ PN(se.wait_max);
+ P(sched_info.bkl_count);
+ P(se.nr_migrations);
+ P(se.nr_migrations_cold);
+ P(se.nr_failed_migrations_affine);
+ P(se.nr_failed_migrations_running);
+ P(se.nr_failed_migrations_hot);
+ P(se.nr_forced_migrations);
+ P(se.nr_forced2_migrations);
+ P(se.nr_wakeups);
+ P(se.nr_wakeups_sync);
+ P(se.nr_wakeups_migrate);
+ P(se.nr_wakeups_local);
+ P(se.nr_wakeups_remote);
+ P(se.nr_wakeups_affine);
+ P(se.nr_wakeups_affine_attempts);
+ P(se.nr_wakeups_passive);
+ P(se.nr_wakeups_idle);
+
+ {
+ u64 avg_atom, avg_per_cpu;
+
+ avg_atom = p->se.sum_exec_runtime;
+ if (nr_switches)
+ do_div(avg_atom, nr_switches);
+ else
+ avg_atom = -1LL;
+
+ avg_per_cpu = p->se.sum_exec_runtime;
+ if (p->se.nr_migrations)
+ avg_per_cpu = div64_64(avg_per_cpu, p->se.nr_migrations);
+ else
+ avg_per_cpu = -1LL;
+
+ __PN(avg_atom);
+ __PN(avg_per_cpu);
+ }
+#endif
+ __P(nr_switches);
+ SEQ_printf(m, "%-35s:%21Ld\n",
+ "nr_voluntary_switches", (long long)p->nvcsw);
+ SEQ_printf(m, "%-35s:%21Ld\n",
+ "nr_involuntary_switches", (long long)p->nivcsw);
+
+ P(se.load.weight);
+ P(policy);
+ P(prio);
+#undef PN
+#undef __PN
+#undef P
+#undef __P
+
+ {
+ u64 t0, t1;
+
+ t0 = sched_clock();
+ t1 = sched_clock();
+ SEQ_printf(m, "%-35s:%21Ld\n",
+ "clock-delta", (long long)(t1-t0));
+ }
+}
+
+void proc_sched_set_task(struct task_struct *p)
+{
+#ifdef CONFIG_SCHEDSTATS
+ p->se.wait_max = 0;
+ p->se.sleep_max = 0;
+ p->se.sum_sleep_runtime = 0;
+ p->se.block_max = 0;
+ p->se.exec_max = 0;
+ p->se.slice_max = 0;
+ p->se.nr_migrations = 0;
+ p->se.nr_migrations_cold = 0;
+ p->se.nr_failed_migrations_affine = 0;
+ p->se.nr_failed_migrations_running = 0;
+ p->se.nr_failed_migrations_hot = 0;
+ p->se.nr_forced_migrations = 0;
+ p->se.nr_forced2_migrations = 0;
+ p->se.nr_wakeups = 0;
+ p->se.nr_wakeups_sync = 0;
+ p->se.nr_wakeups_migrate = 0;
+ p->se.nr_wakeups_local = 0;
+ p->se.nr_wakeups_remote = 0;
+ p->se.nr_wakeups_affine = 0;
+ p->se.nr_wakeups_affine_attempts = 0;
+ p->se.nr_wakeups_passive = 0;
+ p->se.nr_wakeups_idle = 0;
+ p->sched_info.bkl_count = 0;
+#endif
+ p->se.sum_exec_runtime = 0;
+ p->se.prev_sum_exec_runtime = 0;
+ p->nvcsw = 0;
+ p->nivcsw = 0;
+}
Index: linux-cfs-2.6.22.13.q/kernel/sched_fair.c
===================================================================
--- /dev/null
+++ linux-cfs-2.6.22.13.q/kernel/sched_fair.c
@@ -0,0 +1,1164 @@
+/*
+ * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@xxxxxxxxxx>
+ *
+ * Interactivity improvements by Mike Galbraith
+ * (C) 2007 Mike Galbraith <efault@xxxxxx>
+ *
+ * Various enhancements by Dmitry Adamushko.
+ * (C) 2007 Dmitry Adamushko <dmitry.adamushko@xxxxxxxxx>
+ *
+ * Group scheduling enhancements by Srivatsa Vaddagiri
+ * Copyright IBM Corporation, 2007
+ * Author: Srivatsa Vaddagiri <vatsa@xxxxxxxxxxxxxxxxxx>
+ *
+ * Scaled math optimizations by Thomas Gleixner
+ * Copyright (C) 2007, Thomas Gleixner <tglx@xxxxxxxxxxxxx>
+ *
+ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@xxxxxxxxxx>
+ */
+
+/*
+ * Targeted preemption latency for CPU-bound tasks:
+ * (default: 20ms * ilog(ncpus), units: nanoseconds)
+ *
+ * NOTE: this latency value is not the same as the concept of
+ * 'timeslice length' - timeslices in CFS are of variable length
+ * and have no persistent notion like in traditional, time-slice
+ * based scheduling concepts.
+ *
+ * (to see the precise effective timeslice length of your workload,
+ * run vmstat and monitor the context-switches (cs) field)
+ */
+unsigned int sysctl_sched_latency = 20000000ULL;
+
+/*
+ * Minimal preemption granularity for CPU-bound tasks:
+ * (default: 1 msec * ilog(ncpus), units: nanoseconds)
+ */
+unsigned int sysctl_sched_min_granularity = 1000000ULL;
+
+/*
+ * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
+ */
+static unsigned int sched_nr_latency = 20;
+
+/*
+ * After fork, child runs first. (default) If set to 0 then
+ * parent will (try to) run first.
+ */
+const_debug unsigned int sysctl_sched_child_runs_first = 1;
+
+/*
+ * sys_sched_yield() compat mode
+ *
+ * This option switches the aggressive yield implementation of the
+ * old scheduler back on.
+ */
+unsigned int __read_mostly sysctl_sched_compat_yield;
+
+/*
+ * SCHED_BATCH wake-up granularity.
+ * (default: 10 msec * ilog(ncpus), units: nanoseconds)
+ *
+ * This option delays the preemption effects of decoupled workloads
+ * and reduces their over-scheduling. Synchronous workloads will still
+ * have immediate wakeup/sleep latencies.
+ */
+unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
+
+/*
+ * SCHED_OTHER wake-up granularity.
+ * (default: 10 msec * ilog(ncpus), units: nanoseconds)
+ *
+ * This option delays the preemption effects of decoupled workloads
+ * and reduces their over-scheduling. Synchronous workloads will still
+ * have immediate wakeup/sleep latencies.
+ */
+unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+
+const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+
+/**************************************************************
+ * CFS operations on generic schedulable entities:
+ */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* cpu runqueue to which this cfs_rq is attached */
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->rq;
+}
+
+/* An entity is a task if it doesn't "own" a runqueue */
+#define entity_is_task(se) (!se->my_q)
+
+#else /* CONFIG_FAIR_GROUP_SCHED */
+
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+ return container_of(cfs_rq, struct rq, cfs);
+}
+
+#define entity_is_task(se) 1
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+ return container_of(se, struct task_struct, se);
+}
+
+
+/**************************************************************
+ * Scheduling class tree data structure manipulation methods:
+ */
+
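+/*
+ * max_vruntime()/min_vruntime() compare through a signed delta
+ * rather than directly, so the comparison stays correct even if
+ * the u64 vruntime values should ever wrap around.
+ */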
+static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
+{
+ s64 delta = (s64)(vruntime - min_vruntime);
+ if (delta > 0)
+ min_vruntime = vruntime;
+
+ return min_vruntime;
+}
+
+static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
+{
+ s64 delta = (s64)(vruntime - min_vruntime);
+ if (delta < 0)
+ min_vruntime = vruntime;
+
+ return min_vruntime;
+}
+
+static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return se->vruntime - cfs_rq->min_vruntime;
+}
+
+/*
+ * Enqueue an entity into the AVL tree:
+ */
+static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct avl_node **link = &cfs_rq->tasks_timeline.avl_node;
+ struct avl_node *parent = NULL;
+ struct sched_entity *entry;
+ s64 key = entity_key(cfs_rq, se);
+ int leftmost = 1;
+
+ /*
+ * Find the right place in the AVL tree:
+ */
+ while (*link) {
+ parent = *link;
+ entry = avl_entry(parent, struct sched_entity, run_node);
+ /*
+ * We don't care about collisions. Nodes with
+ * the same key stay together.
+ */
+ if (key < entity_key(cfs_rq, entry)) {
+ link = &parent->avl_left;
+ } else {
+ link = &parent->avl_right;
+ leftmost = 0;
+ }
+ }
+
+ /*
+ * Maintain a cache of leftmost tree entries (it is frequently
+ * used):
+ */
+ if (leftmost)
+ cfs_rq->avl_leftmost = &se->run_node;
+
+ avl_link_node(&se->run_node, parent, link);
+ insert_avl_node(&se->run_node, &cfs_rq->tasks_timeline);
+}
+
+static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (cfs_rq->avl_leftmost == &se->run_node)
+ cfs_rq->avl_leftmost = avl_next(&se->run_node);
+
+ delete_avl_node(&se->run_node, &cfs_rq->tasks_timeline);
+}
+
+static inline struct avl_node *first_fair(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->avl_leftmost;
+}
+
+static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
+{
+ return avl_entry(first_fair(cfs_rq), struct sched_entity, run_node);
+}
+
+static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
+{
+ struct avl_node **link = &cfs_rq->tasks_timeline.avl_node;
+ struct sched_entity *se = NULL;
+ struct avl_node *parent;
+
+ while (*link) {
+ parent = *link;
+ se = avl_entry(parent, struct sched_entity, run_node);
+ link = &parent->avl_right;
+ }
+
+ return se;
+}
+
+/**************************************************************
+ * Scheduling class statistics methods:
+ */
+
+#ifdef CONFIG_SCHED_DEBUG
+int sched_nr_latency_handler(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+
+ sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
+ sysctl_sched_min_granularity);
+
+ return 0;
+}
+#endif
+
+/*
+ * The idea is to set a period in which each task runs once.
+ *
+ * When there are too many tasks (more than sched_nr_latency) we have to stretch
+ * this period because otherwise the slices get too small.
+ *
+ * p = (nr <= nl) ? l : l*nr/nl
+ */
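+/*
+ * Example with the defaults above (l = 20ms, nl = 20): up to 20
+ * runnable tasks share a 20ms period; with 40 runnable tasks the
+ * period is stretched to 20ms * 40/20 = 40ms, so each slice stays
+ * roughly sysctl_sched_min_granularity (1ms) long.
+ */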
+static u64 __sched_period(unsigned long nr_running)
+{
+ u64 period = sysctl_sched_latency;
+ unsigned long nr_latency = sched_nr_latency;
+
+ if (unlikely(nr_running > nr_latency)) {
+ period *= nr_running;
+ do_div(period, nr_latency);
+ }
+
+ return period;
+}
+
+/*
+ * We calculate the wall-time slice from the period by taking a part
+ * proportional to the weight.
+ *
+ * s = p*w/rw
+ */
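+/*
+ * Example: with four runnable nice-0 tasks the period stays at 20ms
+ * (nr <= nl); each task carries 1/4 of the runqueue weight and so
+ * gets a 20ms * 1/4 = 5ms wall-time slice.
+ */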
+static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ u64 slice = __sched_period(cfs_rq->nr_running);
+
+ slice *= se->load.weight;
+ do_div(slice, cfs_rq->load.weight);
+
+ return slice;
+}
+
+/*
+ * We calculate the vruntime slice.
+ *
+ * vs = s/w = p/rw
+ */
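+/*
+ * Example: for the same four nice-0 tasks rq_weight is
+ * 4 * NICE_0_LOAD, so vs = 20ms / 4 = 5ms of vruntime, i.e. the
+ * slice expressed in nice-0 (weight-normalized) time.
+ */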
+static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
+{
+ u64 vslice = __sched_period(nr_running);
+
+ vslice *= NICE_0_LOAD;
+ do_div(vslice, rq_weight);
+
+ return vslice;
+}
+
+static u64 sched_vslice(struct cfs_rq *cfs_rq)
+{
+ return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running);
+}
+
+static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return __sched_vslice(cfs_rq->load.weight + se->load.weight,
+ cfs_rq->nr_running + 1);
+}
+
+/*
+ * Update the current task's runtime statistics. Skip current tasks that
+ * are not in our scheduling class.
+ */
+static inline void
+__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
+ unsigned long delta_exec)
+{
+ unsigned long delta_exec_weighted;
+ u64 vruntime;
+
+ schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
+
+ curr->sum_exec_runtime += delta_exec;
+ schedstat_add(cfs_rq, exec_clock, delta_exec);
+ delta_exec_weighted = delta_exec;
+ if (unlikely(curr->load.weight != NICE_0_LOAD)) {
+ delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
+ &curr->load);
+ }
+ curr->vruntime += delta_exec_weighted;
+
+ /*
+ * maintain cfs_rq->min_vruntime to be a monotonically increasing
+ * value tracking the leftmost vruntime in the tree.
+ */
+ if (first_fair(cfs_rq)) {
+ vruntime = min_vruntime(curr->vruntime,
+ __pick_next_entity(cfs_rq)->vruntime);
+ } else
+ vruntime = curr->vruntime;
+
+ cfs_rq->min_vruntime =
+ max_vruntime(cfs_rq->min_vruntime, vruntime);
+}
+
+static void update_curr(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ u64 now = rq_of(cfs_rq)->clock;
+ unsigned long delta_exec;
+
+ if (unlikely(!curr))
+ return;
+
+ /*
+ * Get the amount of time the current task was running
+ * since the last time we changed load (this cannot
+ * overflow on 32 bits):
+ */
+ delta_exec = (unsigned long)(now - curr->exec_start);
+
+ __update_curr(cfs_rq, curr, delta_exec);
+ curr->exec_start = now;
+}
+
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
+}
+
+/*
+ * Task is being enqueued - update stats:
+ */
+static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ /*
+ * Are we enqueueing a waiting task? (for current tasks
+ * a dequeue/enqueue event is a NOP)
+ */
+ if (se != cfs_rq->curr)
+ update_stats_wait_start(cfs_rq, se);
+}
+
+static void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ schedstat_set(se->wait_max, max(se->wait_max,
+ rq_of(cfs_rq)->clock - se->wait_start));
+ schedstat_set(se->wait_start, 0);
+}
+
+static inline void
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ /*
+ * Mark the end of the wait period if dequeueing a
+ * waiting task:
+ */
+ if (se != cfs_rq->curr)
+ update_stats_wait_end(cfs_rq, se);
+}
+
+/*
+ * We are picking a new current task - update its stats:
+ */
+static inline void
+update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ /*
+ * We are starting a new run period:
+ */
+ se->exec_start = rq_of(cfs_rq)->clock;
+}
+
+/**************************************************
+ * Scheduling class queueing methods:
+ */
+
+static void
+account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ update_load_add(&cfs_rq->load, se->load.weight);
+ cfs_rq->nr_running++;
+ se->on_rq = 1;
+}
+
+static void
+account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ update_load_sub(&cfs_rq->load, se->load.weight);
+ cfs_rq->nr_running--;
+ se->on_rq = 0;
+}
+
+static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+#ifdef CONFIG_SCHEDSTATS
+ if (se->sleep_start) {
+ u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > se->sleep_max))
+ se->sleep_max = delta;
+
+ se->sleep_start = 0;
+ se->sum_sleep_runtime += delta;
+ }
+ if (se->block_start) {
+ u64 delta = rq_of(cfs_rq)->clock - se->block_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > se->block_max))
+ se->block_max = delta;
+
+ se->block_start = 0;
+ se->sum_sleep_runtime += delta;
+
+ /*
+ * Blocking time is in units of nanosecs, so shift by 20
+ * (2^20 ns is roughly 1.05 ms) to get a milliseconds-range
+ * estimate of the amount of time that the task spent sleeping:
+ */
+ if (unlikely(prof_on == SLEEP_PROFILING)) {
+ struct task_struct *tsk = task_of(se);
+
+ profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
+ delta >> 20);
+ }
+ }
+#endif
+}
+
+static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+#ifdef CONFIG_SCHED_DEBUG
+ s64 d = se->vruntime - cfs_rq->min_vruntime;
+
+ if (d < 0)
+ d = -d;
+
+ if (d > 3*sysctl_sched_latency)
+ schedstat_inc(cfs_rq, nr_spread_over);
+#endif
+}
+
+static void
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
+{
+ u64 vruntime;
+
+ vruntime = cfs_rq->min_vruntime;
+
+ if (sched_feat(TREE_AVG)) {
+ struct sched_entity *last = __pick_last_entity(cfs_rq);
+ if (last) {
+ vruntime += last->vruntime;
+ vruntime >>= 1;
+ }
+ } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
+ vruntime += sched_vslice(cfs_rq)/2;
+
+ /*
+ * The 'current' period is already promised to the current tasks;
+ * however, the extra weight of the new task will slow them down a
+ * little. Place the new task so that it fits in the slot that
+ * stays open at the end.
+ */
+ if (initial && sched_feat(START_DEBIT))
+ vruntime += sched_vslice_add(cfs_rq, se);
+
+ if (!initial) {
+ /* Sleeps up to a single latency don't count. */
+ if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) &&
+ task_of(se)->policy != SCHED_BATCH)
+ vruntime -= sysctl_sched_latency;
+
+ /* ensure we never gain time by being placed backwards. */
+ vruntime = max_vruntime(se->vruntime, vruntime);
+ }
+
+ se->vruntime = vruntime;
+}
+
+static void
+enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
+{
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+
+ if (wakeup) {
+ place_entity(cfs_rq, se, 0);
+ enqueue_sleeper(cfs_rq, se);
+ }
+
+ update_stats_enqueue(cfs_rq, se);
+ check_spread(cfs_rq, se);
+ if (se != cfs_rq->curr)
+ __enqueue_entity(cfs_rq, se);
+ account_entity_enqueue(cfs_rq, se);
+}
+
+static void
+dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
+{
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+
+ update_stats_dequeue(cfs_rq, se);
+ if (sleep) {
+#ifdef CONFIG_SCHEDSTATS
+ if (entity_is_task(se)) {
+ struct task_struct *tsk = task_of(se);
+
+ if (tsk->state & TASK_INTERRUPTIBLE)
+ se->sleep_start = rq_of(cfs_rq)->clock;
+ if (tsk->state & TASK_UNINTERRUPTIBLE)
+ se->block_start = rq_of(cfs_rq)->clock;
+ }
+#endif
+ }
+
+ if (se != cfs_rq->curr)
+ __dequeue_entity(cfs_rq, se);
+ account_entity_dequeue(cfs_rq, se);
+}
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void
+check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+{
+ unsigned long ideal_runtime, delta_exec;
+
+ ideal_runtime = sched_slice(cfs_rq, curr);
+ delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
+ if (delta_exec > ideal_runtime)
+ resched_task(rq_of(cfs_rq)->curr);
+}
+
+static void
+set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ /* 'current' is not kept within the tree. */
+ if (se->on_rq) {
+ /*
+ * Any task has to be enqueued before it gets to execute on
+ * a CPU. So account for the time it spent waiting on the
+ * runqueue.
+ */
+ update_stats_wait_end(cfs_rq, se);
+ __dequeue_entity(cfs_rq, se);
+ }
+
+ update_stats_curr_start(cfs_rq, se);
+ cfs_rq->curr = se;
+#ifdef CONFIG_SCHEDSTATS
+ /*
+ * Track our maximum slice length, if the CPU's load is at
+ * least twice that of our own weight (i.e. don't track it
+ * when there are only lesser-weight tasks around):
+ */
+ if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+ se->slice_max = max(se->slice_max,
+ se->sum_exec_runtime - se->prev_sum_exec_runtime);
+ }
+#endif
+ se->prev_sum_exec_runtime = se->sum_exec_runtime;
+}
+
+static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *se = NULL;
+
+ if (first_fair(cfs_rq)) {
+ se = __pick_next_entity(cfs_rq);
+ set_next_entity(cfs_rq, se);
+ }
+
+ return se;
+}
+
+static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
+{
+ /*
+ * If still on the runqueue then deactivate_task()
+ * was not called and update_curr() has to be done:
+ */
+ if (prev->on_rq)
+ update_curr(cfs_rq);
+
+ check_spread(cfs_rq, prev);
+ if (prev->on_rq) {
+ update_stats_wait_start(cfs_rq, prev);
+ /* Put 'current' back into the tree. */
+ __enqueue_entity(cfs_rq, prev);
+ }
+ cfs_rq->curr = NULL;
+}
+
+static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+{
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+
+ if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
+ check_preempt_tick(cfs_rq, curr);
+}
+
+/**************************************************
+ * CFS operations on tasks:
+ */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* Walk up scheduling entities hierarchy */
+#define for_each_sched_entity(se) \
+ for (; se; se = se->parent)
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+ return p->se.cfs_rq;
+}
+
+/* runqueue on which this entity is (to be) queued */
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+ return se->cfs_rq;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return grp->my_q;
+}
+
+/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
+ * another cpu ('this_cpu')
+ */
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+ return cfs_rq->tg->cfs_rq[this_cpu];
+}
+
+/* Iterate through all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+ list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+
+/* Do the two (enqueued) entities belong to the same group? */
+static inline int
+is_same_group(struct sched_entity *se, struct sched_entity *pse)
+{
+ if (se->cfs_rq == pse->cfs_rq)
+ return 1;
+
+ return 0;
+}
+
+static inline struct sched_entity *parent_entity(struct sched_entity *se)
+{
+ return se->parent;
+}
+
+#define GROUP_IMBALANCE_PCT 20
+
+#else /* CONFIG_FAIR_GROUP_SCHED */
+
+#define for_each_sched_entity(se) \
+ for (; se; se = NULL)
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+ return &task_rq(p)->cfs;
+}
+
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->cfs;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return NULL;
+}
+
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+ return &cpu_rq(this_cpu)->cfs;
+}
+
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+
+static inline int
+is_same_group(struct sched_entity *se, struct sched_entity *pse)
+{
+ return 1;
+}
+
+static inline struct sched_entity *parent_entity(struct sched_entity *se)
+{
+ return NULL;
+}
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+/*
+ * The enqueue_task method is called before nr_running is
+ * increased. Here we update the fair scheduling stats and
+ * then put the task into the AVL tree:
+ */
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+ unsigned long old_load = rq->cfs.load.weight, new_load, delta_load;
+
+ for_each_sched_entity(se) {
+ if (se->on_rq)
+ break;
+ cfs_rq = cfs_rq_of(se);
+ enqueue_entity(cfs_rq, se, wakeup);
+ wakeup = 1;
+ }
+
+ new_load = rq->cfs.load.weight;
+ delta_load = new_load - old_load;
+ inc_load(rq, delta_load);
+}
+
+/*
+ * The dequeue_task method is called before nr_running is
+ * decreased. We remove the task from the AVL tree and
+ * update the fair scheduling stats:
+ */
+static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+ unsigned long old_load = rq->cfs.load.weight, new_load, delta_load;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ dequeue_entity(cfs_rq, se, sleep);
+ /* Don't dequeue parent if it has other entities besides us */
+ if (cfs_rq->load.weight)
+ break;
+ sleep = 1;
+ }
+
+ new_load = rq->cfs.load.weight;
+ delta_load = old_load - new_load;
+ dec_load(rq, delta_load);
+}
+
+/*
+ * sched_yield() support is very simple - we dequeue and enqueue.
+ *
+ * If compat_yield is turned on then we requeue to the end of the tree.
+ */
+static void yield_task_fair(struct rq *rq)
+{
+ struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr);
+ struct sched_entity *rightmost, *se = &rq->curr->se;
+
+ /*
+ * Are we the only task in the tree?
+ */
+ if (unlikely(cfs_rq->nr_running == 1))
+ return;
+
+ if (likely(!sysctl_sched_compat_yield)) {
+ __update_rq_clock(rq);
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+
+ return;
+ }
+ /*
+ * Find the rightmost entry in the AVL tree:
+ */
+ rightmost = __pick_last_entity(cfs_rq);
+ /*
+ * Already in the rightmost position?
+ */
+ if (unlikely(rightmost->vruntime < se->vruntime))
+ return;
+
+ /*
+ * Minimally necessary key value to be last in the tree:
+ * Upon rescheduling, sched_class::put_prev_task() will place
+ * 'current' within the tree based on its new key value.
+ */
+ se->vruntime = rightmost->vruntime + 1;
+}
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
+{
+ struct task_struct *curr = rq->curr;
+ struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+ struct sched_entity *se = &curr->se, *pse = &p->se;
+ unsigned long gran;
+
+ if (unlikely(rt_prio(p->prio))) {
+ update_rq_clock(rq);
+ update_curr(cfs_rq);
+ resched_task(curr);
+ return;
+ }
+ /*
+ * Batch tasks do not preempt (their preemption is driven by
+ * the tick):
+ */
+ if (unlikely(p->policy == SCHED_BATCH))
+ return;
+
+ if (!sched_feat(WAKEUP_PREEMPT))
+ return;
+
+ while (!is_same_group(se, pse)) {
+ se = parent_entity(se);
+ pse = parent_entity(pse);
+ }
+
+ gran = sysctl_sched_wakeup_granularity;
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ gran = calc_delta_fair(gran, &se->load);
+
+ if (pse->vruntime + gran < se->vruntime)
+ resched_task(curr);
+}
+
+static struct task_struct *pick_next_task_fair(struct rq *rq)
+{
+ struct cfs_rq *cfs_rq = &rq->cfs;
+ struct sched_entity *se;
+
+ if (unlikely(!cfs_rq->nr_running))
+ return NULL;
+
+ do {
+ se = pick_next_entity(cfs_rq);
+ cfs_rq = group_cfs_rq(se);
+ } while (cfs_rq);
+
+ return task_of(se);
+}
+
+/*
+ * Account for a descheduled task:
+ */
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+{
+ struct sched_entity *se = &prev->se;
+ struct cfs_rq *cfs_rq;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ put_prev_entity(cfs_rq, se);
+ }
+}
+
+#ifdef CONFIG_SMP
+/**************************************************
+ * Fair scheduling class load-balancing methods:
+ */
+
+/*
+ * Load-balancing iterator. Note: while the runqueue stays locked
+ * during the whole iteration, the current task might be
+ * dequeued so the iterator has to be dequeue-safe. Here we
+ * achieve that by always pre-iterating before returning
+ * the current task:
+ */
+static struct task_struct *
+__load_balance_iterator(struct cfs_rq *cfs_rq, struct avl_node *curr)
+{
+ struct task_struct *p;
+
+ if (!curr)
+ return NULL;
+
+ p = avl_entry(curr, struct task_struct, se.run_node);
+ cfs_rq->avl_load_balance_curr = avl_next(curr);
+
+ return p;
+}
+
+static struct task_struct *load_balance_start_fair(void *arg)
+{
+ struct cfs_rq *cfs_rq = arg;
+
+ return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
+}
+
+static struct task_struct *load_balance_next_fair(void *arg)
+{
+ struct cfs_rq *cfs_rq = arg;
+
+ return __load_balance_iterator(cfs_rq, cfs_rq->avl_load_balance_curr);
+}
+
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned, int *this_best_prio)
+{
+ struct cfs_rq *busy_cfs_rq;
+ long rem_load_move = max_load_move;
+ struct rq_iterator cfs_rq_iterator;
+ unsigned long load_moved;
+
+ cfs_rq_iterator.start = load_balance_start_fair;
+ cfs_rq_iterator.next = load_balance_next_fair;
+
+ for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
+ unsigned long maxload, task_load, group_weight;
+ unsigned long thisload, per_task_load;
+ struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
+
+ task_load = busy_cfs_rq->load.weight;
+ group_weight = se->load.weight;
+
+ /*
+ * 'group_weight' is contributed by tasks of total weight
+ * 'task_load'. To move 'rem_load_move' worth of weight only,
+ * we need to move a maximum task load of:
+ *
+ * maxload = (remload / group_weight) * task_load;
+ */
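+ /*
+ * e.g. if this group weighs group_weight = 2048 on the busiest
+ * cpu, built from tasks totalling task_load = 3072, and we want
+ * to move rem_load_move = 1024 of weight, then
+ * maxload = 1024 * 3072 / 2048 = 1536 worth of task load.
+ */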
+ maxload = (rem_load_move * task_load) / group_weight;
+
+ if (!maxload || !task_load)
+ continue;
+
+ per_task_load = task_load/busy_cfs_rq->nr_running;
+
+ /* balance_tasks will try to forcibly move at least one task if
+ * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
+ * maxload is less than GROUP_IMBALANCE_PCT% of the per_task_load.
+ */
+ if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
+ continue;
+
+ *this_best_prio = 0;
+ thisload = this_cfs_rq->load.weight;
+#else
+# define maxload rem_load_move
+#endif
+ /*
+ * pass busy_cfs_rq argument into
+ * load_balance_[start|next]_fair iterators
+ */
+ cfs_rq_iterator.arg = busy_cfs_rq;
+ load_moved = balance_tasks(this_rq, this_cpu, busiest,
+ maxload, sd, idle, all_pinned,
+ this_best_prio,
+ &cfs_rq_iterator);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /* load_moved holds the task load that was moved. The
+ * effective weight moved would be:
+ * load_moved_eff = load_moved/task_load * group_weight;
+ */
+ load_moved = (group_weight * load_moved) / task_load;
+
+ /* Adjust shares on both cpus to reflect load_moved */
+ group_weight -= load_moved;
+ set_se_shares(se, group_weight);
+
+ se = busy_cfs_rq->tg->se[this_cpu];
+ if (!thisload)
+ group_weight = load_moved;
+ else
+ group_weight = se->load.weight + load_moved;
+ set_se_shares(se, group_weight);
+#endif
+
+ rem_load_move -= load_moved;
+ if (rem_load_move <= 0)
+ break;
+ }
+
+ return max_load_move - rem_load_move;
+}
+
+static int
+move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle)
+{
+ struct cfs_rq *busy_cfs_rq;
+ struct rq_iterator cfs_rq_iterator;
+
+ cfs_rq_iterator.start = load_balance_start_fair;
+ cfs_rq_iterator.next = load_balance_next_fair;
+
+ for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+ /*
+ * pass busy_cfs_rq argument into
+ * load_balance_[start|next]_fair iterators
+ */
+ cfs_rq_iterator.arg = busy_cfs_rq;
+ if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
+ &cfs_rq_iterator))
+ return 1;
+ }
+
+ return 0;
+}
+#endif
+
+/*
+ * scheduler tick hitting a task of our scheduling class:
+ */
+static void task_tick_fair(struct rq *rq, struct task_struct *curr)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &curr->se;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ entity_tick(cfs_rq, se);
+ }
+}
+
+#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
+
+/*
+ * Share the fairness runtime between parent and child, so that the
+ * total amount of CPU pressure stays equal - new tasks
+ * get a chance to run but frequent forkers are not allowed to
+ * monopolize the CPU. Note: the parent runqueue is locked,
+ * the child is not running yet.
+ */
+static void task_new_fair(struct rq *rq, struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = task_cfs_rq(p);
+ struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
+ int this_cpu = smp_processor_id();
+
+ sched_info_queued(p);
+
+ update_curr(cfs_rq);
+ place_entity(cfs_rq, se, 1);
+
+ /* 'curr' will be NULL if the child belongs to a different group */
+ if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
+ curr && curr->vruntime < se->vruntime) {
+ /*
+ * Upon rescheduling, sched_class::put_prev_task() will place
+ * 'current' within the tree based on its new key value.
+ */
+ swap(curr->vruntime, se->vruntime);
+ }
+
+ enqueue_task_fair(rq, p, 0);
+ resched_task(rq->curr);
+}
+
+/* Account for a task changing its policy or group.
+ *
+ * This routine is mostly called to set cfs_rq->curr field when a task
+ * migrates between groups/classes.
+ */
+static void set_curr_task_fair(struct rq *rq)
+{
+ struct sched_entity *se = &rq->curr->se;
+
+ for_each_sched_entity(se)
+ set_next_entity(cfs_rq_of(se), se);
+}
+
+/*
+ * All the scheduling class methods:
+ */
+static const struct sched_class fair_sched_class = {
+ .next = &idle_sched_class,
+ .enqueue_task = enqueue_task_fair,
+ .dequeue_task = dequeue_task_fair,
+ .yield_task = yield_task_fair,
+
+ .check_preempt_curr = check_preempt_wakeup,
+
+ .pick_next_task = pick_next_task_fair,
+ .put_prev_task = put_prev_task_fair,
+
+#ifdef CONFIG_SMP
+ .load_balance = load_balance_fair,
+ .move_one_task = move_one_task_fair,
+#endif
+
+ .set_curr_task = set_curr_task_fair,
+ .task_tick = task_tick_fair,
+ .task_new = task_new_fair,
+};
+
+#ifdef CONFIG_SCHED_DEBUG
+static void print_cfs_stats(struct seq_file *m, int cpu)
+{
+ struct cfs_rq *cfs_rq;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
+#endif
+
+ rcu_read_lock();
+ for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+ print_cfs_rq(m, cpu, cfs_rq);
+ rcu_read_unlock();
+}
+#endif
Index: linux-cfs-2.6.22.13.q/kernel/sched_idletask.c
===================================================================
--- /dev/null
+++ linux-cfs-2.6.22.13.q/kernel/sched_idletask.c
@@ -0,0 +1,89 @@
+/*
+ * idle-task scheduling class.
+ *
+ * (NOTE: these are not related to SCHED_IDLE tasks which are
+ * handled in sched_fair.c)
+ */
+
+/*
+ * Idle tasks are unconditionally rescheduled:
+ */
+static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p)
+{
+ resched_task(rq->idle);
+}
+
+static struct task_struct *pick_next_task_idle(struct rq *rq)
+{
+ schedstat_inc(rq, sched_goidle);
+
+ return rq->idle;
+}
+
+/*
+ * It is not legal to sleep in the idle task - print a warning
+ * message if some code attempts to do it:
+ */
+static void
+dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep)
+{
+ spin_unlock_irq(&rq->lock);
+ printk(KERN_ERR "bad: scheduling from the idle thread!\n");
+ dump_stack();
+ spin_lock_irq(&rq->lock);
+}
+
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
+{
+}
+
+#ifdef CONFIG_SMP
+static unsigned long
+load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned, int *this_best_prio)
+{
+ return 0;
+}
+
+static int
+move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle)
+{
+ return 0;
+}
+#endif
+
+static void task_tick_idle(struct rq *rq, struct task_struct *curr)
+{
+}
+
+static void set_curr_task_idle(struct rq *rq)
+{
+}
+
+/*
+ * Simple, special scheduling class for the per-CPU idle tasks:
+ */
+const struct sched_class idle_sched_class = {
+ /* .next is NULL */
+ /* no enqueue/yield_task for idle tasks */
+
+ /* dequeue is not valid, we print a debug message there: */
+ .dequeue_task = dequeue_task_idle,
+
+ .check_preempt_curr = check_preempt_curr_idle,
+
+ .pick_next_task = pick_next_task_idle,
+ .put_prev_task = put_prev_task_idle,
+
+#ifdef CONFIG_SMP
+ .load_balance = load_balance_idle,
+ .move_one_task = move_one_task_idle,
+#endif
+
+ .set_curr_task = set_curr_task_idle,
+ .task_tick = task_tick_idle,
+ /* no .task_new for idle tasks */
+};
Index: linux-cfs-2.6.22.13.q/kernel/sched_rt.c
===================================================================
--- /dev/null
+++ linux-cfs-2.6.22.13.q/kernel/sched_rt.c
@@ -0,0 +1,259 @@
+/*
+ * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
+ * policies)
+ */
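+
+/*
+ * The rt runqueue (rq->rt.active) keeps one FIFO list per priority
+ * level plus a bitmap of non-empty levels; pick_next_task_rt() below
+ * simply takes the first task on the highest-priority non-empty list
+ * found via sched_find_first_bit().
+ */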
+
+/*
+ * Update the current task's runtime statistics. Skip current tasks that
+ * are not in our scheduling class.
+ */
+static void update_curr_rt(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ u64 delta_exec;
+
+ if (!task_has_rt_policy(curr))
+ return;
+
+ delta_exec = rq->clock - curr->se.exec_start;
+ if (unlikely((s64)delta_exec < 0))
+ delta_exec = 0;
+
+ schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
+
+ curr->se.sum_exec_runtime += delta_exec;
+ curr->se.exec_start = rq->clock;
+}
+
+static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
+{
+ struct rt_prio_array *array = &rq->rt.active;
+
+ list_add_tail(&p->run_list, array->queue + p->prio);
+ __set_bit(p->prio, array->bitmap);
+ inc_load(rq, p->se.load.weight);
+}
+
+/*
+ * Adding/removing a task to/from a priority array:
+ */
+static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
+{
+ struct rt_prio_array *array = &rq->rt.active;
+
+ update_curr_rt(rq);
+
+ list_del(&p->run_list);
+ if (list_empty(array->queue + p->prio))
+ __clear_bit(p->prio, array->bitmap);
+ dec_load(rq, p->se.load.weight);
+}
+
+/*
+ * Put task to the end of the run list without the overhead of dequeue
+ * followed by enqueue.
+ */
+static void requeue_task_rt(struct rq *rq, struct task_struct *p)
+{
+ struct rt_prio_array *array = &rq->rt.active;
+
+ list_move_tail(&p->run_list, array->queue + p->prio);
+}
+
+static void
+yield_task_rt(struct rq *rq)
+{
+ requeue_task_rt(rq, rq->curr);
+}
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
+{
+ if (p->prio < rq->curr->prio)
+ resched_task(rq->curr);
+}
+
+static struct task_struct *pick_next_task_rt(struct rq *rq)
+{
+ struct rt_prio_array *array = &rq->rt.active;
+ struct task_struct *next;
+ struct list_head *queue;
+ int idx;
+
+ idx = sched_find_first_bit(array->bitmap);
+ if (idx >= MAX_RT_PRIO)
+ return NULL;
+
+ queue = array->queue + idx;
+ next = list_entry(queue->next, struct task_struct, run_list);
+
+ next->se.exec_start = rq->clock;
+
+ return next;
+}
+
+static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
+{
+ update_curr_rt(rq);
+ p->se.exec_start = 0;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Load-balancing iterator. Note: while the runqueue stays locked
+ * during the whole iteration, the current task might be
+ * dequeued so the iterator has to be dequeue-safe. Here we
+ * achieve that by always pre-iterating before returning
+ * the current task:
+ */
+static struct task_struct *load_balance_start_rt(void *arg)
+{
+ struct rq *rq = arg;
+ struct rt_prio_array *array = &rq->rt.active;
+ struct list_head *head, *curr;
+ struct task_struct *p;
+ int idx;
+
+ idx = sched_find_first_bit(array->bitmap);
+ if (idx >= MAX_RT_PRIO)
+ return NULL;
+
+ head = array->queue + idx;
+ curr = head->prev;
+
+ p = list_entry(curr, struct task_struct, run_list);
+
+ curr = curr->prev;
+
+ rq->rt.rt_load_balance_idx = idx;
+ rq->rt.rt_load_balance_head = head;
+ rq->rt.rt_load_balance_curr = curr;
+
+ return p;
+}
+
+static struct task_struct *load_balance_next_rt(void *arg)
+{
+ struct rq *rq = arg;
+ struct rt_prio_array *array = &rq->rt.active;
+ struct list_head *head, *curr;
+ struct task_struct *p;
+ int idx;
+
+ idx = rq->rt.rt_load_balance_idx;
+ head = rq->rt.rt_load_balance_head;
+ curr = rq->rt.rt_load_balance_curr;
+
+ /*
+ * If we arrived back to the head again then
+ * iterate to the next queue (if any):
+ */
+ if (unlikely(head == curr)) {
+ int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
+
+ if (next_idx >= MAX_RT_PRIO)
+ return NULL;
+
+ idx = next_idx;
+ head = array->queue + idx;
+ curr = head->prev;
+
+ rq->rt.rt_load_balance_idx = idx;
+ rq->rt.rt_load_balance_head = head;
+ }
+
+ p = list_entry(curr, struct task_struct, run_list);
+
+ curr = curr->prev;
+
+ rq->rt.rt_load_balance_curr = curr;
+
+ return p;
+}
+
+static unsigned long
+load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned, int *this_best_prio)
+{
+ struct rq_iterator rt_rq_iterator;
+
+ rt_rq_iterator.start = load_balance_start_rt;
+ rt_rq_iterator.next = load_balance_next_rt;
+ /* pass 'busiest' rq argument into
+ * load_balance_[start|next]_rt iterators
+ */
+ rt_rq_iterator.arg = busiest;
+
+ return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd,
+ idle, all_pinned, this_best_prio, &rt_rq_iterator);
+}
+
+static int
+move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle)
+{
+ struct rq_iterator rt_rq_iterator;
+
+ rt_rq_iterator.start = load_balance_start_rt;
+ rt_rq_iterator.next = load_balance_next_rt;
+ rt_rq_iterator.arg = busiest;
+
+ return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
+ &rt_rq_iterator);
+}
+#endif
+
+static void task_tick_rt(struct rq *rq, struct task_struct *p)
+{
+ /*
+ * RR tasks need a special form of timeslice management.
+ * FIFO tasks have no timeslices.
+ */
+ if (p->policy != SCHED_RR)
+ return;
+
+ if (--p->time_slice)
+ return;
+
+ p->time_slice = DEF_TIMESLICE;
+
+ /*
+ * Requeue to the end of queue if we are not the only element
+ * on the queue:
+ */
+ if (p->run_list.prev != p->run_list.next) {
+ requeue_task_rt(rq, p);
+ set_tsk_need_resched(p);
+ }
+}
+
+static void set_curr_task_rt(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ p->se.exec_start = rq->clock;
+}
+
+const struct sched_class rt_sched_class = {
+ .next = &fair_sched_class,
+ .enqueue_task = enqueue_task_rt,
+ .dequeue_task = dequeue_task_rt,
+ .yield_task = yield_task_rt,
+
+ .check_preempt_curr = check_preempt_curr_rt,
+
+ .pick_next_task = pick_next_task_rt,
+ .put_prev_task = put_prev_task_rt,
+
+#ifdef CONFIG_SMP
+ .load_balance = load_balance_rt,
+ .move_one_task = move_one_task_rt,
+#endif
+
+ .set_curr_task = set_curr_task_rt,
+ .task_tick = task_tick_rt,
+};
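
For reference, the O(1) pick in pick_next_task_rt() amounts to finding the lowest set bit in a per-priority bitmap and taking the head of that priority's FIFO list. A minimal user-space sketch of that lookup follows; NPRIO, rt_queue and rt_pick_first are illustrative names, and a GCC builtin stands in for sched_find_first_bit().

#include <stdio.h>
#include <string.h>

#define NPRIO 100
#define BITS_PER_LONG (8 * sizeof(unsigned long))
#define NWORDS ((NPRIO + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct rt_queue {
	unsigned long bitmap[NWORDS];	/* one bit per priority level */
};

/* Return the lowest runnable priority, or NPRIO if nothing is queued. */
static int rt_pick_first(struct rt_queue *q)
{
	unsigned int w;

	for (w = 0; w < NWORDS; w++)
		if (q->bitmap[w])
			return w * BITS_PER_LONG + __builtin_ctzl(q->bitmap[w]);
	return NPRIO;
}

int main(void)
{
	struct rt_queue q;

	memset(&q, 0, sizeof(q));
	/* Mark priority 42 as having a runnable task. */
	q.bitmap[42 / BITS_PER_LONG] |= 1UL << (42 % BITS_PER_LONG);
	printf("next prio: %d\n", rt_pick_first(&q));	/* prints 42 */
	return 0;
}
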
Index: linux-cfs-2.6.22.13.q/kernel/sched_stats.h
===================================================================
--- /dev/null
+++ linux-cfs-2.6.22.13.q/kernel/sched_stats.h
@@ -0,0 +1,236 @@
+
+#ifdef CONFIG_SCHEDSTATS
+/*
+ * bump this up when changing the output format or the meaning of an existing
+ * format, so that tools can adapt (or abort)
+ */
+#define SCHEDSTAT_VERSION 14
+
+static int show_schedstat(struct seq_file *seq, void *v)
+{
+ int cpu;
+
+ seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+ seq_printf(seq, "timestamp %lu\n", jiffies);
+ for_each_online_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+#ifdef CONFIG_SMP
+ struct sched_domain *sd;
+ int dcount = 0;
+#endif
+
+ /* runqueue-specific stats */
+ seq_printf(seq,
+ "cpu%d %u %u %u %u %u %u %u %u %u %llu %llu %lu",
+ cpu, rq->yld_both_empty,
+ rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
+ rq->sched_switch, rq->sched_count, rq->sched_goidle,
+ rq->ttwu_count, rq->ttwu_local,
+ rq->rq_sched_info.cpu_time,
+ rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
+
+ seq_printf(seq, "\n");
+
+#ifdef CONFIG_SMP
+ /* domain-specific stats */
+ preempt_disable();
+ for_each_domain(cpu, sd) {
+ enum cpu_idle_type itype;
+ char mask_str[NR_CPUS];
+
+ cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
+ seq_printf(seq, "domain%d %s", dcount++, mask_str);
+ for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
+ itype++) {
+ seq_printf(seq, " %u %u %u %u %u %u %u %u",
+ sd->lb_count[itype],
+ sd->lb_balanced[itype],
+ sd->lb_failed[itype],
+ sd->lb_imbalance[itype],
+ sd->lb_gained[itype],
+ sd->lb_hot_gained[itype],
+ sd->lb_nobusyq[itype],
+ sd->lb_nobusyg[itype]);
+ }
+ seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u %u\n",
+ sd->alb_count, sd->alb_failed, sd->alb_pushed,
+ sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
+ sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
+ sd->ttwu_wake_remote, sd->ttwu_move_affine,
+ sd->ttwu_move_balance);
+ }
+ preempt_enable();
+#endif
+ }
+ return 0;
+}
+
+static int schedstat_open(struct inode *inode, struct file *file)
+{
+ unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
+ char *buf = kmalloc(size, GFP_KERNEL);
+ struct seq_file *m;
+ int res;
+
+ if (!buf)
+ return -ENOMEM;
+ res = single_open(file, show_schedstat, NULL);
+ if (!res) {
+ m = file->private_data;
+ m->buf = buf;
+ m->size = size;
+ } else
+ kfree(buf);
+ return res;
+}
+
+const struct file_operations proc_schedstat_operations = {
+ .open = schedstat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
+{
+ if (rq) {
+ rq->rq_sched_info.run_delay += delta;
+ rq->rq_sched_info.pcount++;
+ }
+}
+
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long long delta)
+{
+ if (rq)
+ rq->rq_sched_info.cpu_time += delta;
+}
+# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
+# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
+# define schedstat_set(var, val) do { var = (val); } while (0)
+#else /* !CONFIG_SCHEDSTATS */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
+{}
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long long delta)
+{}
+# define schedstat_inc(rq, field) do { } while (0)
+# define schedstat_add(rq, field, amt) do { } while (0)
+# define schedstat_set(var, val) do { } while (0)
+#endif
+
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+/*
+ * Called when a process is dequeued from the active array and given
+ * the cpu. We should note that with the exception of interactive
+ * tasks, the expired queue will become the active queue after the active
+ * queue is empty, without explicitly dequeuing and requeuing tasks in the
+ * expired queue. (Interactive tasks may be requeued directly to the
+ * active queue, thus delaying tasks in the expired queue from running;
+ * see scheduler_tick()).
+ *
+ * This function is only called from sched_info_arrive(), rather than
+ * dequeue_task(). Even though a task may be queued and dequeued multiple
+ * times as it is shuffled about, we're really interested in knowing how
+ * long it was from the *first* time it was queued to the time that it
+ * finally hit a cpu.
+ */
+static inline void sched_info_dequeued(struct task_struct *t)
+{
+ t->sched_info.last_queued = 0;
+}
+
+/*
+ * Called when a task finally hits the cpu. We can now calculate how
+ * long it was waiting to run. We also note when it began so that we
+ * can keep stats on how long its timeslice is.
+ */
+static void sched_info_arrive(struct task_struct *t)
+{
+ unsigned long long now = task_rq(t)->clock, delta = 0;
+
+ if (t->sched_info.last_queued)
+ delta = now - t->sched_info.last_queued;
+ sched_info_dequeued(t);
+ t->sched_info.run_delay += delta;
+ t->sched_info.last_arrival = now;
+ t->sched_info.pcount++;
+
+ rq_sched_info_arrive(task_rq(t), delta);
+}
+
+/*
+ * Called when a process is queued into either the active or expired
+ * array. The time is noted and later used to determine how long the task
+ * had to wait to reach the cpu. Since the expired queue will become the
+ * active queue after the active queue is empty, without dequeuing
+ * and requeuing any tasks, we are interested in queuing to either. It
+ * is unusual but not impossible for tasks to be dequeued and immediately
+ * requeued in the same or another array: this can happen in sched_yield(),
+ * set_user_nice(), and even load_balance() as it moves tasks from runqueue
+ * to runqueue.
+ *
+ * This function is only called from enqueue_task(), and it only updates
+ * the timestamp if it is not already set. It's assumed that
+ * sched_info_dequeued() will clear that stamp when appropriate.
+ */
+static inline void sched_info_queued(struct task_struct *t)
+{
+ if (unlikely(sched_info_on()))
+ if (!t->sched_info.last_queued)
+ t->sched_info.last_queued = task_rq(t)->clock;
+}
+
+/*
+ * Called when a process ceases being the active-running process, either
+ * voluntarily or involuntarily. Now we can calculate how long we ran.
+ */
+static inline void sched_info_depart(struct task_struct *t)
+{
+ unsigned long long delta = task_rq(t)->clock -
+ t->sched_info.last_arrival;
+
+ t->sched_info.cpu_time += delta;
+ rq_sched_info_depart(task_rq(t), delta);
+}
+
+/*
+ * Called when tasks are switched involuntarily due, typically, to expiring
+ * their time slice. (This may also be called when switching to or from
+ * the idle task.) We are only called when prev != next.
+ */
+static inline void
+__sched_info_switch(struct task_struct *prev, struct task_struct *next)
+{
+ struct rq *rq = task_rq(prev);
+
+ /*
+ * prev now departs the cpu. It's not interesting to record
+ * stats about how efficient we were at scheduling the idle
+ * process, however.
+ */
+ if (prev != rq->idle)
+ sched_info_depart(prev);
+
+ if (next != rq->idle)
+ sched_info_arrive(next);
+}
+static inline void
+sched_info_switch(struct task_struct *prev, struct task_struct *next)
+{
+ if (unlikely(sched_info_on()))
+ __sched_info_switch(prev, next);
+}
+#else
+#define sched_info_queued(t) do { } while (0)
+#define sched_info_switch(t, next) do { } while (0)
+#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+
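
The delay accounting above reduces to two timestamps per task. The following user-space sketch (demo_* names are stand-ins, not the kernel's fields) shows the arithmetic that sched_info_queued(), sched_info_arrive() and sched_info_depart() perform: wait time accumulates between enqueue and arrival, run time between arrival and departure.

#include <stdio.h>

struct demo_sched_info {
	unsigned long long last_queued;
	unsigned long long last_arrival;
	unsigned long long run_delay;	/* total time spent waiting */
	unsigned long long cpu_time;	/* total time spent running */
};

static void demo_queued(struct demo_sched_info *si, unsigned long long now)
{
	if (!si->last_queued)		/* only the first enqueue counts */
		si->last_queued = now;
}

static void demo_arrive(struct demo_sched_info *si, unsigned long long now)
{
	if (si->last_queued)
		si->run_delay += now - si->last_queued;
	si->last_queued = 0;
	si->last_arrival = now;
}

static void demo_depart(struct demo_sched_info *si, unsigned long long now)
{
	si->cpu_time += now - si->last_arrival;
}

int main(void)
{
	struct demo_sched_info si = { 0 };

	demo_queued(&si, 100);		/* woken up at t=100  */
	demo_arrive(&si, 130);		/* got the CPU at t=130 */
	demo_depart(&si, 180);		/* preempted at t=180  */
	printf("waited %llu, ran %llu\n", si.run_delay, si.cpu_time);
	return 0;
}
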
Index: linux-cfs-2.6.22.13.q/kernel/softirq.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/kernel/softirq.c
+++ linux-cfs-2.6.22.13.q/kernel/softirq.c
@@ -488,7 +488,6 @@ void __init softirq_init(void)
static int ksoftirqd(void * __bind_cpu)
{
- set_user_nice(current, 19);
current->flags |= PF_NOFREEZE;
set_current_state(TASK_INTERRUPTIBLE);
Index: linux-cfs-2.6.22.13.q/kernel/sys.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/kernel/sys.c
+++ linux-cfs-2.6.22.13.q/kernel/sys.c
@@ -31,11 +31,13 @@
#include <linux/cn_proc.h>
#include <linux/getcpu.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/seccomp.h>
#include <linux/cpu.h>
#include <linux/compat.h>
#include <linux/syscalls.h>
#include <linux/kprobes.h>
+#include <linux/user_namespace.h>
#include <asm/uaccess.h>
#include <asm/io.h>
@@ -1080,13 +1082,13 @@ static int set_user(uid_t new_ruid, int
{
struct user_struct *new_user;
- new_user = alloc_uid(new_ruid);
+ new_user = alloc_uid(current->nsproxy->user_ns, new_ruid);
if (!new_user)
return -EAGAIN;
if (atomic_read(&new_user->processes) >=
current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
- new_user != &root_user) {
+ new_user != current->nsproxy->user_ns->root_user) {
free_uid(new_user);
return -EAGAIN;
}
Index: linux-cfs-2.6.22.13.q/kernel/sysctl.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/kernel/sysctl.c
+++ linux-cfs-2.6.22.13.q/kernel/sysctl.c
@@ -202,11 +202,135 @@ static ctl_table root_table[] = {
.mode = 0555,
.child = dev_table,
},
-
+/*
+ * NOTE: do not add new entries to this table unless you have read
+ * Documentation/sysctl/ctl_unnumbered.txt
+ */
{ .ctl_name = 0 }
};
-static ctl_table kern_table[] = {
+#ifdef CONFIG_SCHED_DEBUG
+static unsigned long min_sched_granularity_ns = 100000; /* 100 usecs */
+static unsigned long max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
+static unsigned long min_wakeup_granularity_ns; /* 0 usecs */
+static unsigned long max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
+#endif
+
+static struct ctl_table kern_table[] = {
+#ifdef CONFIG_SCHED_DEBUG
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_min_granularity_ns",
+ .data = &sysctl_sched_min_granularity,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &sched_nr_latency_handler,
+ .strategy = &sysctl_intvec,
+ .extra1 = &min_sched_granularity_ns,
+ .extra2 = &max_sched_granularity_ns,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_latency_ns",
+ .data = &sysctl_sched_latency,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &sched_nr_latency_handler,
+ .strategy = &sysctl_intvec,
+ .extra1 = &min_sched_granularity_ns,
+ .extra2 = &max_sched_granularity_ns,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_wakeup_granularity_ns",
+ .data = &sysctl_sched_wakeup_granularity,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &min_wakeup_granularity_ns,
+ .extra2 = &max_wakeup_granularity_ns,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_batch_wakeup_granularity_ns",
+ .data = &sysctl_sched_batch_wakeup_granularity,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &min_wakeup_granularity_ns,
+ .extra2 = &max_wakeup_granularity_ns,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_child_runs_first",
+ .data = &sysctl_sched_child_runs_first,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_features",
+ .data = &sysctl_sched_features,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_migration_cost",
+ .data = &sysctl_sched_migration_cost,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_nr_migrate",
+ .data = &sysctl_sched_nr_migrate,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_min_bal_int_shares",
+ .data = &sysctl_sched_min_bal_int_shares,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_max_bal_int_shares",
+ .data = &sysctl_sched_max_bal_int_shares,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#endif
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_compat_yield",
+ .data = &sysctl_sched_compat_yield,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#ifdef CONFIG_PROVE_LOCKING
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "prove_locking",
+ .data = &prove_locking,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
{
.ctl_name = KERN_PANIC,
.procname = "panic",
Index: linux-cfs-2.6.22.13.q/kernel/timer.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/kernel/timer.c
+++ linux-cfs-2.6.22.13.q/kernel/timer.c
@@ -812,10 +812,13 @@ void update_process_times(int user_tick)
int cpu = smp_processor_id();
/* Note: this timer irq context must be accounted for as well. */
- if (user_tick)
+ if (user_tick) {
account_user_time(p, jiffies_to_cputime(1));
- else
+ account_user_time_scaled(p, jiffies_to_cputime(1));
+ } else {
account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
+ account_system_time_scaled(p, jiffies_to_cputime(1));
+ }
run_local_timers();
if (rcu_pending(cpu))
rcu_check_callbacks(cpu, user_tick);
Index: linux-cfs-2.6.22.13.q/kernel/tsacct.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/kernel/tsacct.c
+++ linux-cfs-2.6.22.13.q/kernel/tsacct.c
@@ -62,6 +62,10 @@ void bacct_add_tsk(struct taskstats *sta
rcu_read_unlock();
stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
+ stats->ac_utimescaled =
+ cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC;
+ stats->ac_stimescaled =
+ cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
stats->ac_minflt = tsk->min_flt;
stats->ac_majflt = tsk->maj_flt;
Index: linux-cfs-2.6.22.13.q/kernel/user.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/kernel/user.c
+++ linux-cfs-2.6.22.13.q/kernel/user.c
@@ -14,20 +14,19 @@
#include <linux/bitops.h>
#include <linux/key.h>
#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/user_namespace.h>
/*
* UID task count cache, to get fast user lookup in "alloc_uid"
* when changing user ID's (ie setuid() and friends).
*/
-#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 8)
-#define UIDHASH_SZ (1 << UIDHASH_BITS)
#define UIDHASH_MASK (UIDHASH_SZ - 1)
#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
-#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid)))
+#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid)))
static struct kmem_cache *uid_cachep;
-static struct list_head uidhash_table[UIDHASH_SZ];
/*
* The uidhash_lock is mostly taken from process context, but it is
@@ -51,31 +50,33 @@ struct user_struct root_user = {
.uid_keyring = &root_user_keyring,
.session_keyring = &root_session_keyring,
#endif
+#ifdef CONFIG_FAIR_USER_SCHED
+ .tg = &init_task_group,
+#endif
};
/*
* These routines must be called with the uidhash spinlock held!
*/
-static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent)
+static inline void uid_hash_insert(struct user_struct *up,
+ struct hlist_head *hashent)
{
- list_add(&up->uidhash_list, hashent);
+ hlist_add_head(&up->uidhash_node, hashent);
}
static inline void uid_hash_remove(struct user_struct *up)
{
- list_del(&up->uidhash_list);
+ hlist_del_init(&up->uidhash_node);
}
-static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent)
+static inline struct user_struct *uid_hash_find(uid_t uid,
+ struct hlist_head *hashent)
{
- struct list_head *up;
-
- list_for_each(up, hashent) {
- struct user_struct *user;
+ struct user_struct *user;
+ struct hlist_node *h;
- user = list_entry(up, struct user_struct, uidhash_list);
-
- if(user->uid == uid) {
+ hlist_for_each_entry(user, h, hashent, uidhash_node) {
+ if (user->uid == uid) {
atomic_inc(&user->__count);
return user;
}
@@ -84,6 +85,203 @@ static inline struct user_struct *uid_ha
return NULL;
}
+#ifdef CONFIG_FAIR_USER_SCHED
+
+static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */
+static DEFINE_MUTEX(uids_mutex);
+
+static void sched_destroy_user(struct user_struct *up)
+{
+ sched_destroy_group(up->tg);
+}
+
+static int sched_create_user(struct user_struct *up)
+{
+ int rc = 0;
+
+ up->tg = sched_create_group();
+ if (IS_ERR(up->tg))
+ rc = -ENOMEM;
+
+ return rc;
+}
+
+static void sched_switch_user(struct task_struct *p)
+{
+ sched_move_task(p);
+}
+
+static inline void uids_mutex_lock(void)
+{
+ mutex_lock(&uids_mutex);
+}
+
+static inline void uids_mutex_unlock(void)
+{
+ mutex_unlock(&uids_mutex);
+}
+
+/* return cpu shares held by the user */
+ssize_t cpu_shares_show(struct kset *kset, char *buffer)
+{
+ struct user_struct *up = container_of(kset, struct user_struct, kset);
+
+ return sprintf(buffer, "%lu\n", sched_group_shares(up->tg));
+}
+
+/* modify cpu shares held by the user */
+ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size)
+{
+ struct user_struct *up = container_of(kset, struct user_struct, kset);
+ unsigned long shares;
+ int rc;
+
+ sscanf(buffer, "%lu", &shares);
+
+ rc = sched_group_set_shares(up->tg, shares);
+
+ return (rc ? rc : size);
+}
+
+static void user_attr_init(struct subsys_attribute *sa, char *name, int mode)
+{
+ sa->attr.name = name;
+ sa->attr.mode = mode;
+ sa->show = cpu_shares_show;
+ sa->store = cpu_shares_store;
+}
+
+/* Create "/sys/kernel/uids/<uid>" directory and
+ * "/sys/kernel/uids/<uid>/cpu_share" file for this user.
+ */
+static int user_kobject_create(struct user_struct *up)
+{
+ struct kset *kset = &up->kset;
+ struct kobject *kobj = &kset->kobj;
+ int error;
+
+ memset(kset, 0, sizeof(struct kset));
+ kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */
+ kobject_set_name(kobj, "%d", up->uid);
+ kset_init(kset);
+ user_attr_init(&up->user_attr, "cpu_share", 0644);
+
+ error = kobject_add(kobj);
+ if (error)
+ goto done;
+
+ error = sysfs_create_file(kobj, &up->user_attr.attr);
+ if (error)
+ kobject_del(kobj);
+
+ kobject_uevent(kobj, KOBJ_ADD);
+
+done:
+ return error;
+}
+
+/* create these in sysfs filesystem:
+ * "/sys/kernel/uids" directory
+ * "/sys/kernel/uids/0" directory (for root user)
+ * "/sys/kernel/uids/0/cpu_share" file (for root user)
+ */
+int __init uids_kobject_init(void)
+{
+ int error;
+
+ /* create under /sys/kernel dir */
+ uids_kobject.parent = &kernel_subsys.kobj;
+ uids_kobject.kset = &kernel_subsys;
+ kobject_set_name(&uids_kobject, "uids");
+ kobject_init(&uids_kobject);
+
+ error = kobject_add(&uids_kobject);
+ if (!error)
+ error = user_kobject_create(&root_user);
+
+ return error;
+}
+
+/* work function to remove sysfs directory for a user and free up
+ * corresponding structures.
+ */
+static void remove_user_sysfs_dir(struct work_struct *w)
+{
+ struct user_struct *up = container_of(w, struct user_struct, work);
+ struct kobject *kobj = &up->kset.kobj;
+ unsigned long flags;
+ int remove_user = 0;
+
+ /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
+ * atomic.
+ */
+ uids_mutex_lock();
+
+ local_irq_save(flags);
+
+ if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
+ uid_hash_remove(up);
+ remove_user = 1;
+ spin_unlock_irqrestore(&uidhash_lock, flags);
+ } else {
+ local_irq_restore(flags);
+ }
+
+ if (!remove_user)
+ goto done;
+
+ sysfs_remove_file(kobj, &up->user_attr.attr);
+ kobject_uevent(kobj, KOBJ_REMOVE);
+ kobject_del(kobj);
+
+ sched_destroy_user(up);
+ key_put(up->uid_keyring);
+ key_put(up->session_keyring);
+ kmem_cache_free(uid_cachep, up);
+
+done:
+ uids_mutex_unlock();
+}
+
+/* IRQs are disabled and uidhash_lock is held upon function entry.
+ * IRQ state (as stored in flags) is restored and uidhash_lock released
+ * upon function exit.
+ */
+static inline void free_user(struct user_struct *up, unsigned long flags)
+{
+ /* restore back the count */
+ atomic_inc(&up->__count);
+ spin_unlock_irqrestore(&uidhash_lock, flags);
+
+ INIT_WORK(&up->work, remove_user_sysfs_dir);
+ schedule_work(&up->work);
+}
+
+#else /* CONFIG_FAIR_USER_SCHED */
+
+static void sched_destroy_user(struct user_struct *up) { }
+static int sched_create_user(struct user_struct *up) { return 0; }
+static void sched_switch_user(struct task_struct *p) { }
+static inline int user_kobject_create(struct user_struct *up) { return 0; }
+static inline void uids_mutex_lock(void) { }
+static inline void uids_mutex_unlock(void) { }
+
+/* IRQs are disabled and uidhash_lock is held upon function entry.
+ * IRQ state (as stored in flags) is restored and uidhash_lock released
+ * upon function exit.
+ */
+static inline void free_user(struct user_struct *up, unsigned long flags)
+{
+ uid_hash_remove(up);
+ spin_unlock_irqrestore(&uidhash_lock, flags);
+ sched_destroy_user(up);
+ key_put(up->uid_keyring);
+ key_put(up->session_keyring);
+ kmem_cache_free(uid_cachep, up);
+}
+
+#endif /* CONFIG_FAIR_USER_SCHED */
+
/*
* Locate the user_struct for the passed UID. If found, take a ref on it. The
* caller must undo that ref with free_uid().
@@ -94,9 +292,10 @@ struct user_struct *find_user(uid_t uid)
{
struct user_struct *ret;
unsigned long flags;
+ struct user_namespace *ns = current->nsproxy->user_ns;
spin_lock_irqsave(&uidhash_lock, flags);
- ret = uid_hash_find(uid, uidhashentry(uid));
+ ret = uid_hash_find(uid, uidhashentry(ns, uid));
spin_unlock_irqrestore(&uidhash_lock, flags);
return ret;
}
@@ -109,22 +308,22 @@ void free_uid(struct user_struct *up)
return;
local_irq_save(flags);
- if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
- uid_hash_remove(up);
- spin_unlock_irqrestore(&uidhash_lock, flags);
- key_put(up->uid_keyring);
- key_put(up->session_keyring);
- kmem_cache_free(uid_cachep, up);
- } else {
+ if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
+ free_user(up, flags);
+ else
local_irq_restore(flags);
- }
}
-struct user_struct * alloc_uid(uid_t uid)
+struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
{
- struct list_head *hashent = uidhashentry(uid);
+ struct hlist_head *hashent = uidhashentry(ns, uid);
struct user_struct *up;
+ /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert()
+ * atomic.
+ */
+ uids_mutex_lock();
+
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
spin_unlock_irq(&uidhash_lock);
@@ -153,6 +352,22 @@ struct user_struct * alloc_uid(uid_t uid
return NULL;
}
+ if (sched_create_user(new) < 0) {
+ key_put(new->uid_keyring);
+ key_put(new->session_keyring);
+ kmem_cache_free(uid_cachep, new);
+ return NULL;
+ }
+
+ if (user_kobject_create(new)) {
+ sched_destroy_user(new);
+ key_put(new->uid_keyring);
+ key_put(new->session_keyring);
+ kmem_cache_free(uid_cachep, new);
+ uids_mutex_unlock();
+ return NULL;
+ }
+
/*
* Before adding this, check whether we raced
* on adding the same user already..
@@ -160,6 +375,11 @@ struct user_struct * alloc_uid(uid_t uid
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
+ /* This case is not possible when CONFIG_FAIR_USER_SCHED
+ * is defined, since we serialize alloc_uid() using
+ * uids_mutex. Hence no need to call
+ * sched_destroy_user() or remove_user_sysfs_dir().
+ */
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
@@ -170,6 +390,9 @@ struct user_struct * alloc_uid(uid_t uid
spin_unlock_irq(&uidhash_lock);
}
+
+ uids_mutex_unlock();
+
return up;
}
@@ -187,6 +410,7 @@ void switch_uid(struct user_struct *new_
atomic_dec(&old_user->processes);
switch_uid_keyring(new_user);
current->user = new_user;
+ sched_switch_user(current);
/*
* We need to synchronize with __sigqueue_alloc()
@@ -202,6 +426,30 @@ void switch_uid(struct user_struct *new_
suid_keys(current);
}
+void release_uids(struct user_namespace *ns)
+{
+ int i;
+ unsigned long flags;
+ struct hlist_head *head;
+ struct hlist_node *nd;
+
+ spin_lock_irqsave(&uidhash_lock, flags);
+ /*
+ * collapse the chains so that the user_structs will
+ * still be alive, but not in the hashes; a subsequent
+ * free_uid() will free them.
+ */
+ for (i = 0; i < UIDHASH_SZ; i++) {
+ head = ns->uidhash_table + i;
+ while (!hlist_empty(head)) {
+ nd = head->first;
+ hlist_del_init(nd);
+ }
+ }
+ spin_unlock_irqrestore(&uidhash_lock, flags);
+
+ free_uid(ns->root_user);
+}
static int __init uid_cache_init(void)
{
@@ -211,11 +459,11 @@ static int __init uid_cache_init(void)
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
for(n = 0; n < UIDHASH_SZ; ++n)
- INIT_LIST_HEAD(uidhash_table + n);
+ INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
/* Insert the root user immediately (init already runs as root) */
spin_lock_irq(&uidhash_lock);
- uid_hash_insert(&root_user, uidhashentry(0));
+ uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0));
spin_unlock_irq(&uidhash_lock);
return 0;
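
The bucket selection done by uidhashentry() above is just the __uidhashfn() arithmetic. A stand-alone sketch of it follows; UIDHASH_BITS of 8 mirrors the non-CONFIG_BASE_SMALL default removed from this file.

#include <stdio.h>

#define UIDHASH_BITS 8
#define UIDHASH_SZ (1 << UIDHASH_BITS)
#define UIDHASH_MASK (UIDHASH_SZ - 1)

/* Same fold-and-mask as __uidhashfn(): mixes the high bits into the low ones. */
static unsigned int uidhashfn(unsigned int uid)
{
	return ((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK;
}

int main(void)
{
	unsigned int uid;

	for (uid = 1000; uid < 1004; uid++)
		printf("uid %u -> bucket %u\n", uid, uidhashfn(uid));
	return 0;
}
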
Index: linux-cfs-2.6.22.13.q/kernel/user_namespace.c
===================================================================
--- /dev/null
+++ linux-cfs-2.6.22.13.q/kernel/user_namespace.c
@@ -0,0 +1,88 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/nsproxy.h>
+#include <linux/user_namespace.h>
+
+struct user_namespace init_user_ns = {
+ .kref = {
+ .refcount = ATOMIC_INIT(2),
+ },
+ .root_user = &root_user,
+};
+
+EXPORT_SYMBOL_GPL(init_user_ns);
+
+#ifdef CONFIG_USER_NS
+
+/*
+ * Clone a new ns copying an original user ns, setting refcount to 1
+ * @old_ns: namespace to clone
+ * Return ERR_PTR(-ENOMEM) on allocation failure, the new ns otherwise
+ */
+static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
+{
+ struct user_namespace *ns;
+ struct user_struct *new_user;
+ int n;
+
+ ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
+ if (!ns)
+ return ERR_PTR(-ENOMEM);
+
+ kref_init(&ns->kref);
+
+ for (n = 0; n < UIDHASH_SZ; ++n)
+ INIT_HLIST_HEAD(ns->uidhash_table + n);
+
+ /* Insert new root user. */
+ ns->root_user = alloc_uid(ns, 0);
+ if (!ns->root_user) {
+ kfree(ns);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* Reset current->user with a new one */
+ new_user = alloc_uid(ns, current->uid);
+ if (!new_user) {
+ free_uid(ns->root_user);
+ kfree(ns);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ switch_uid(new_user);
+ return ns;
+}
+
+struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns)
+{
+ struct user_namespace *new_ns;
+
+ BUG_ON(!old_ns);
+ get_user_ns(old_ns);
+
+ if (!(flags & CLONE_NEWUSER))
+ return old_ns;
+
+ new_ns = clone_user_ns(old_ns);
+
+ put_user_ns(old_ns);
+ return new_ns;
+}
+
+void free_user_ns(struct kref *kref)
+{
+ struct user_namespace *ns;
+
+ ns = container_of(kref, struct user_namespace, kref);
+ release_uids(ns);
+ kfree(ns);
+}
+
+#endif /* CONFIG_USER_NS */
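
The namespace lifetime above follows the usual kref pattern: copy_user_ns() takes a reference, and free_user_ns() runs only when the last one is dropped. A toy user-space version of the same pattern (the demo_* names are made up) is:

#include <stdio.h>
#include <stdlib.h>

struct demo_ns {
	int refcount;
};

static struct demo_ns *demo_get(struct demo_ns *ns)
{
	ns->refcount++;
	return ns;
}

static void demo_put(struct demo_ns *ns)
{
	if (--ns->refcount == 0) {
		printf("freeing namespace\n");	/* free_user_ns() analogue */
		free(ns);
	}
}

int main(void)
{
	struct demo_ns *ns = malloc(sizeof(*ns));

	ns->refcount = 1;	/* creator's reference */
	demo_get(ns);		/* e.g. copy_user_ns() taking another ref */
	demo_put(ns);		/* ...and dropping it again */
	demo_put(ns);		/* last reference gone: namespace is freed */
	return 0;
}
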
Index: linux-cfs-2.6.22.13.q/lib/Kconfig.debug
===================================================================
--- linux-cfs-2.6.22.13.q.orig/lib/Kconfig.debug
+++ linux-cfs-2.6.22.13.q/lib/Kconfig.debug
@@ -105,6 +105,15 @@ config DETECT_SOFTLOCKUP
can be detected via the NMI-watchdog, on platforms that
support it.)
+config SCHED_DEBUG
+ bool "Collect scheduler debugging info"
+ depends on DEBUG_KERNEL && PROC_FS
+ default y
+ help
+ If you say Y here, the /proc/sched_debug file will be provided
+ that can help debug the scheduler. The runtime overhead of this
+ option is minimal.
+
config SCHEDSTATS
bool "Collect scheduler statistics"
depends on DEBUG_KERNEL && PROC_FS
Index: linux-cfs-2.6.22.13.q/mm/memory_hotplug.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/mm/memory_hotplug.c
+++ linux-cfs-2.6.22.13.q/mm/memory_hotplug.c
@@ -217,6 +217,10 @@ int online_pages(unsigned long pfn, unsi
zone->zone_pgdat->node_present_pages += onlined_pages;
setup_per_zone_pages_min();
+ if (onlined_pages) {
+ kswapd_run(zone_to_nid(zone));
+ node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
+ }
if (need_zonelists_rebuild)
build_all_zonelists();
@@ -271,9 +275,6 @@ int add_memory(int nid, u64 start, u64 s
if (!pgdat)
return -ENOMEM;
new_pgdat = 1;
- ret = kswapd_run(nid);
- if (ret)
- goto error;
}
/* call arch's memory hotadd */
Index: linux-cfs-2.6.22.13.q/mm/page_alloc.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/mm/page_alloc.c
+++ linux-cfs-2.6.22.13.q/mm/page_alloc.c
@@ -47,13 +47,21 @@
#include "internal.h"
/*
- * MCD - HACK: Find somewhere to initialize this EARLY, or make this
- * initializer cleaner
+ * Array of node states.
*/
-nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
-EXPORT_SYMBOL(node_online_map);
-nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
-EXPORT_SYMBOL(node_possible_map);
+nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
+ [N_POSSIBLE] = NODE_MASK_ALL,
+ [N_ONLINE] = { { [0] = 1UL } },
+#ifndef CONFIG_NUMA
+ [N_NORMAL_MEMORY] = { { [0] = 1UL } },
+#ifdef CONFIG_HIGHMEM
+ [N_HIGH_MEMORY] = { { [0] = 1UL } },
+#endif
+ [N_CPU] = { { [0] = 1UL } },
+#endif /* NUMA */
+};
+EXPORT_SYMBOL(node_states);
+
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
long nr_swap_pages;
@@ -1819,14 +1827,35 @@ static void __meminit build_zonelist_cac
#endif /* CONFIG_NUMA */
+/* Any regular memory on that node ? */
+static void check_for_regular_memory(pg_data_t *pgdat)
+{
+#ifdef CONFIG_HIGHMEM
+ enum zone_type zone_type;
+
+ for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
+ struct zone *zone = &pgdat->node_zones[zone_type];
+ if (zone->present_pages)
+ node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+ }
+#endif
+}
+
/* return values int ....just for stop_machine_run() */
static int __meminit __build_all_zonelists(void *dummy)
{
int nid;
for_each_online_node(nid) {
- build_zonelists(NODE_DATA(nid));
- build_zonelist_cache(NODE_DATA(nid));
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ build_zonelists(pgdat);
+ build_zonelist_cache(pgdat);
+
+ /* Any memory on that node */
+ if (pgdat->node_present_pages)
+ node_set_state(nid, N_HIGH_MEMORY);
+ check_for_regular_memory(pgdat);
}
return 0;
}
@@ -2064,6 +2093,9 @@ static struct per_cpu_pageset boot_pages
static int __cpuinit process_zones(int cpu)
{
struct zone *zone, *dzone;
+ int node = cpu_to_node(cpu);
+
+ node_set_state(node, N_CPU); /* this node has a cpu */
for_each_zone(zone) {
@@ -2071,7 +2103,7 @@ static int __cpuinit process_zones(int c
continue;
zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
- GFP_KERNEL, cpu_to_node(cpu));
+ GFP_KERNEL, node);
if (!zone_pcp(zone, cpu))
goto bad;
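
The node_states[] array introduced above is simply one bitmask per node state. This user-space sketch (demo_* names are illustrative, and node ids are assumed to fit in one word) shows the node_set_state()/node_state() style of test that zone_reclaim() relies on further down:

#include <stdio.h>

enum demo_node_state { N_POSSIBLE, N_ONLINE, N_HIGH_MEMORY, N_CPU, NR_STATES };

static unsigned long demo_states[NR_STATES];

static void demo_set(int nid, enum demo_node_state s)
{
	demo_states[s] |= 1UL << nid;
}

static int demo_test(int nid, enum demo_node_state s)
{
	return !!(demo_states[s] & (1UL << nid));
}

int main(void)
{
	demo_set(0, N_ONLINE);
	demo_set(0, N_CPU);	/* node 0 has a cpu, cf. process_zones() */
	printf("node 0 has cpu: %d, node 1 has cpu: %d\n",
	       demo_test(0, N_CPU), demo_test(1, N_CPU));
	return 0;
}
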
Index: linux-cfs-2.6.22.13.q/mm/vmscan.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/mm/vmscan.c
+++ linux-cfs-2.6.22.13.q/mm/vmscan.c
@@ -1686,7 +1686,6 @@ static int __zone_reclaim(struct zone *z
int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
- cpumask_t mask;
int node_id;
/*
@@ -1723,8 +1722,7 @@ int zone_reclaim(struct zone *zone, gfp_
* as wide as possible.
*/
node_id = zone_to_nid(zone);
- mask = node_to_cpumask(node_id);
- if (!cpus_empty(mask) && node_id != numa_node_id())
+ if (node_state(node_id, N_CPU) && node_id != numa_node_id())
return 0;
return __zone_reclaim(zone, gfp_mask, order);
}
Index: linux-cfs-2.6.22.13.q/net/unix/af_unix.c
===================================================================
--- linux-cfs-2.6.22.13.q.orig/net/unix/af_unix.c
+++ linux-cfs-2.6.22.13.q/net/unix/af_unix.c
@@ -307,7 +307,7 @@ static void unix_write_space(struct sock
read_lock(&sk->sk_callback_lock);
if (unix_writable(sk)) {
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
+ wake_up_interruptible_sync(sk->sk_sleep);
sk_wake_async(sk, 2, POLL_OUT);
}
read_unlock(&sk->sk_callback_lock);
@@ -1611,7 +1611,7 @@ static int unix_dgram_recvmsg(struct kio
if (!skb)
goto out_unlock;
- wake_up_interruptible(&u->peer_wait);
+ wake_up_interruptible_sync(&u->peer_wait);
if (msg->msg_name)
unix_copy_addr(msg, skb->sk);