authorLinus Torvalds <torvalds@linux-foundation.org>2008-10-10 15:42:31 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-10-10 15:42:31 -0400
commitb11ce8a26d26ed9019a8803aa90d580b52f23e79 (patch)
tree332f7b59487335229119c0ede371af3a9783d577
parentf6bccf695431da0e9bd773550ae91b8cb9ffb227 (diff)
parenta5d8c3483a6e19aca95ef6a2c5890e33bfa5b293 (diff)
Merge branch 'sched-v28-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-v28-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (38 commits)
  sched debug: add name to sched_domain sysctl entries
  sched: sync wakeups vs avg_overlap
  sched: remove redundant code in cpu_cgroup_create()
  sched_rt.c: resch needed in rt_rq_enqueue() for the root rt_rq
  cpusets: scan_for_empty_cpusets(), cpuset doesn't seem to be so const
  sched: minor optimizations in wake_affine and select_task_rq_fair
  sched: maintain only task entities in cfs_rq->tasks list
  sched: fixup buddy selection
  sched: more sanity checks on the bandwidth settings
  sched: add some comments to the bandwidth code
  sched: fixlet for group load balance
  sched: rework wakeup preemption
  CFS scheduler: documentation about scheduling policies
  sched: clarify ifdef tangle
  sched: fix list traversal to use _rcu variant
  sched: turn off WAKEUP_OVERLAP
  sched: wakeup preempt when small overlap
  kernel/cpu.c: create a CPU_STARTING cpu_chain notifier
  kernel/cpu.c: Move the CPU_DYING notifiers
  sched: fix __load_balance_iterator() for cfq with only one task
  ...
-rw-r--r--  Documentation/kernel-doc-nano-HOWTO.txt      |   4
-rw-r--r--  Documentation/scheduler/sched-design-CFS.txt | 395
-rw-r--r--  arch/alpha/kernel/smp.c                      |   3
-rw-r--r--  arch/arm/kernel/smp.c                        |   1
-rw-r--r--  arch/cris/arch-v32/kernel/smp.c              |   1
-rw-r--r--  arch/ia64/kernel/smpboot.c                   |   1
-rw-r--r--  arch/m32r/kernel/smpboot.c                   |   2
-rw-r--r--  arch/mips/kernel/smp.c                       |   2
-rw-r--r--  arch/powerpc/kernel/smp.c                    |   1
-rw-r--r--  arch/s390/kernel/smp.c                       |   2
-rw-r--r--  arch/sh/kernel/smp.c                         |   2
-rw-r--r--  arch/sparc/kernel/sun4d_smp.c                |   1
-rw-r--r--  arch/sparc/kernel/sun4m_smp.c                |   2
-rw-r--r--  arch/um/kernel/smp.c                         |   1
-rw-r--r--  arch/x86/kernel/smpboot.c                    |   1
-rw-r--r--  arch/x86/mach-voyager/voyager_smp.c          |   2
-rw-r--r--  include/linux/completion.h                   |  41
-rw-r--r--  include/linux/cpu.h                          |   1
-rw-r--r--  include/linux/notifier.h                     |  10
-rw-r--r--  include/linux/proportions.h                  |   2
-rw-r--r--  include/linux/sched.h                        |   9
-rw-r--r--  kernel/cpu.c                                 |  24
-rw-r--r--  kernel/cpuset.c                              |   2
-rw-r--r--  kernel/sched.c                               | 377
-rw-r--r--  kernel/sched_fair.c                          | 234
-rw-r--r--  kernel/sched_features.h                      |   1
-rw-r--r--  kernel/sched_idletask.c                      |   6
-rw-r--r--  kernel/sched_rt.c                            |  57
-rw-r--r--  kernel/user.c                                |   4
29 files changed, 702 insertions, 487 deletions
diff --git a/Documentation/kernel-doc-nano-HOWTO.txt b/Documentation/kernel-doc-nano-HOWTO.txt
index 0bd32748a467..c6841eee9598 100644
--- a/Documentation/kernel-doc-nano-HOWTO.txt
+++ b/Documentation/kernel-doc-nano-HOWTO.txt
@@ -168,10 +168,10 @@ if ($#ARGV < 0) {
 mkdir $ARGV[0],0777;
 $state = 0;
 while (<STDIN>) {
-    if (/^\.TH \"[^\"]*\" 4 \"([^\"]*)\"/) {
+    if (/^\.TH \"[^\"]*\" 9 \"([^\"]*)\"/) {
 	if ($state == 1) { close OUT }
 	$state = 1;
-	$fn = "$ARGV[0]/$1.4";
+	$fn = "$ARGV[0]/$1.9";
 	print STDERR "Creating $fn\n";
 	open OUT, ">$fn" or die "can't open $fn: $!\n";
 	print OUT $_;
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
index 88bcb8767335..9d8eb553884c 100644
--- a/Documentation/scheduler/sched-design-CFS.txt
+++ b/Documentation/scheduler/sched-design-CFS.txt
@@ -1,151 +1,242 @@
+=============
+CFS Scheduler
+=============
 
2This is the CFS scheduler.
3
480% of CFS's design can be summed up in a single sentence: CFS basically
5models an "ideal, precise multi-tasking CPU" on real hardware.
6
7"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100%
8physical power and which can run each task at precise equal speed, in
9parallel, each at 1/nr_running speed. For example: if there are 2 tasks
10running then it runs each at 50% physical power - totally in parallel.
11
12On real hardware, we can run only a single task at once, so while that
13one task runs, the other tasks that are waiting for the CPU are at a
14disadvantage - the current task gets an unfair amount of CPU time. In
15CFS this fairness imbalance is expressed and tracked via the per-task
16p->wait_runtime (nanosec-unit) value. "wait_runtime" is the amount of
17time the task should now run on the CPU for it to become completely fair
18and balanced.
19
20( small detail: on 'ideal' hardware, the p->wait_runtime value would
21 always be zero - no task would ever get 'out of balance' from the
22 'ideal' share of CPU time. )
23
24CFS's task picking logic is based on this p->wait_runtime value and it
25is thus very simple: it always tries to run the task with the largest
26p->wait_runtime value. In other words, CFS tries to run the task with
27the 'gravest need' for more CPU time. So CFS always tries to split up
28CPU time between runnable tasks as close to 'ideal multitasking
29hardware' as possible.
30
31Most of the rest of CFS's design just falls out of this really simple
32concept, with a few add-on embellishments like nice levels,
33multiprocessing and various algorithm variants to recognize sleepers.
34
35In practice it works like this: the system runs a task a bit, and when
36the task schedules (or a scheduler tick happens) the task's CPU usage is
37'accounted for': the (small) time it just spent using the physical CPU
38is deducted from p->wait_runtime. [minus the 'fair share' it would have
39gotten anyway]. Once p->wait_runtime gets low enough so that another
40task becomes the 'leftmost task' of the time-ordered rbtree it maintains
41(plus a small amount of 'granularity' distance relative to the leftmost
42task so that we do not over-schedule tasks and trash the cache) then the
43new leftmost task is picked and the current task is preempted.
44
45The rq->fair_clock value tracks the 'CPU time a runnable task would have
46fairly gotten, had it been runnable during that time'. So by using
47rq->fair_clock values we can accurately timestamp and measure the
48'expected CPU time' a task should have gotten. All runnable tasks are
49sorted in the rbtree by the "rq->fair_clock - p->wait_runtime" key, and
50CFS picks the 'leftmost' task and sticks to it. As the system progresses
51forwards, newly woken tasks are put into the tree more and more to the
52right - slowly but surely giving a chance for every task to become the
53'leftmost task' and thus get on the CPU within a deterministic amount of
54time.
55
56Some implementation details:
57
58 - the introduction of Scheduling Classes: an extensible hierarchy of
59 scheduler modules. These modules encapsulate scheduling policy
60 details and are handled by the scheduler core without the core
61 code assuming about them too much.
62
63 - sched_fair.c implements the 'CFS desktop scheduler': it is a
64 replacement for the vanilla scheduler's SCHED_OTHER interactivity
65 code.
66
67 I'd like to give credit to Con Kolivas for the general approach here:
68 he has proven via RSDL/SD that 'fair scheduling' is possible and that
69 it results in better desktop scheduling. Kudos Con!
70
71 The CFS patch uses a completely different approach and implementation
72 from RSDL/SD. My goal was to make CFS's interactivity quality exceed
73 that of RSDL/SD, which is a high standard to meet :-) Testing
74 feedback is welcome to decide this one way or another. [ and, in any
75 case, all of SD's logic could be added via a kernel/sched_sd.c module
76 as well, if Con is interested in such an approach. ]
77
78 CFS's design is quite radical: it does not use runqueues, it uses a
79 time-ordered rbtree to build a 'timeline' of future task execution,
80 and thus has no 'array switch' artifacts (by which both the vanilla
81 scheduler and RSDL/SD are affected).
82
83 CFS uses nanosecond granularity accounting and does not rely on any
84 jiffies or other HZ detail. Thus the CFS scheduler has no notion of
85 'timeslices' and has no heuristics whatsoever. There is only one
86 central tunable (you have to switch on CONFIG_SCHED_DEBUG):
87
88 /proc/sys/kernel/sched_granularity_ns
89
90 which can be used to tune the scheduler from 'desktop' (low
91 latencies) to 'server' (good batching) workloads. It defaults to a
92 setting suitable for desktop workloads. SCHED_BATCH is handled by the
93 CFS scheduler module too.
94
95 Due to its design, the CFS scheduler is not prone to any of the
96 'attacks' that exist today against the heuristics of the stock
97 scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all
98 work fine and do not impact interactivity and produce the expected
99 behavior.
100
101 the CFS scheduler has a much stronger handling of nice levels and
102 SCHED_BATCH: both types of workloads should be isolated much more
103 agressively than under the vanilla scheduler.
104
105 ( another detail: due to nanosec accounting and timeline sorting,
106 sched_yield() support is very simple under CFS, and in fact under
107 CFS sched_yield() behaves much better than under any other
108 scheduler i have tested so far. )
109
110 - sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler
111 way than the vanilla scheduler does. It uses 100 runqueues (for all
112 100 RT priority levels, instead of 140 in the vanilla scheduler)
113 and it needs no expired array.
114
115 - reworked/sanitized SMP load-balancing: the runqueue-walking
116 assumptions are gone from the load-balancing code now, and
117 iterators of the scheduling modules are used. The balancing code got
118 quite a bit simpler as a result.
119
120
121Group scheduler extension to CFS
122================================
123
124Normally the scheduler operates on individual tasks and strives to provide
125fair CPU time to each task. Sometimes, it may be desirable to group tasks
126and provide fair CPU time to each such task group. For example, it may
127be desirable to first provide fair CPU time to each user on the system
128and then to each task belonging to a user.
129
130CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
131SCHED_NORMAL/BATCH tasks be be grouped and divides CPU time fairly among such
132groups. At present, there are two (mutually exclusive) mechanisms to group
133tasks for CPU bandwidth control purpose:
134
135 - Based on user id (CONFIG_FAIR_USER_SCHED)
136 In this option, tasks are grouped according to their user id.
137 - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
138 This options lets the administrator create arbitrary groups
139 of tasks, using the "cgroup" pseudo filesystem. See
140 Documentation/cgroups.txt for more information about this
141 filesystem.
 
-Only one of these options to group tasks can be chosen and not both.
+1. OVERVIEW
+
8CFS stands for "Completely Fair Scheduler," and is the new "desktop" process
9scheduler implemented by Ingo Molnar and merged in Linux 2.6.23. It is the
10replacement for the previous vanilla scheduler's SCHED_OTHER interactivity
11code.
12
1380% of CFS's design can be summed up in a single sentence: CFS basically models
14an "ideal, precise multi-tasking CPU" on real hardware.
15
16"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100% physical
17power and which can run each task at precise equal speed, in parallel, each at
181/nr_running speed. For example: if there are 2 tasks running, then it runs
19each at 50% physical power --- i.e., actually in parallel.
20
21On real hardware, we can run only a single task at once, so we have to
22introduce the concept of "virtual runtime." The virtual runtime of a task
23specifies when its next timeslice would start execution on the ideal
24multi-tasking CPU described above. In practice, the virtual runtime of a task
25is its actual runtime normalized to the total number of running tasks.
26
27
28
292. FEW IMPLEMENTATION DETAILS
30
31In CFS the virtual runtime is expressed and tracked via the per-task
32p->se.vruntime (nanosec-unit) value. This way, it's possible to accurately
33timestamp and measure the "expected CPU time" a task should have gotten.
34
35[ small detail: on "ideal" hardware, at any time all tasks would have the same
36 p->se.vruntime value --- i.e., tasks would execute simultaneously and no task
37 would ever get "out of balance" from the "ideal" share of CPU time. ]
38
39CFS's task picking logic is based on this p->se.vruntime value and it is thus
40very simple: it always tries to run the task with the smallest p->se.vruntime
41value (i.e., the task which executed least so far). CFS always tries to split
42up CPU time between runnable tasks as close to "ideal multitasking hardware" as
43possible.
44
45Most of the rest of CFS's design just falls out of this really simple concept,
46with a few add-on embellishments like nice levels, multiprocessing and various
47algorithm variants to recognize sleepers.
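
To make the normalization above concrete, here is a hypothetical sketch (not code from this patch, and not the kernel's exact arithmetic) of how weight-normalized vruntime could be accumulated:

/* Hypothetical sketch of weight-normalized vruntime accounting.
 * The names (toy_entity, TOY_NICE_0_WEIGHT) are made up for the example. */
#define TOY_NICE_0_WEIGHT 1024ULL

struct toy_entity {
	unsigned long long weight;	/* load weight derived from the nice level */
	unsigned long long vruntime;	/* nanoseconds, weight-normalized */
};

static void toy_account(struct toy_entity *se, unsigned long long delta_exec_ns)
{
	/* A nice-0 task advances vruntime at wall-clock rate; a task with
	 * twice the weight advances it half as fast, and so on. */
	se->vruntime += delta_exec_ns * TOY_NICE_0_WEIGHT / se->weight;
}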
48
49
50
513. THE RBTREE
52
53CFS's design is quite radical: it does not use the old data structures for the
54runqueues, but it uses a time-ordered rbtree to build a "timeline" of future
55task execution, and thus has no "array switch" artifacts (by which both the
56previous vanilla scheduler and RSDL/SD are affected).
57
58CFS also maintains the rq->cfs.min_vruntime value, which is a monotonic
59increasing value tracking the smallest vruntime among all tasks in the
60runqueue. The total amount of work done by the system is tracked using
61min_vruntime; that value is used to place newly activated entities on the left
62side of the tree as much as possible.
63
64The total number of running tasks in the runqueue is accounted through the
65rq->cfs.load value, which is the sum of the weights of the tasks queued on the
66runqueue.
67
68CFS maintains a time-ordered rbtree, where all runnable tasks are sorted by the
69p->se.vruntime key (there is a subtraction using rq->cfs.min_vruntime to
70account for possible wraparounds). CFS picks the "leftmost" task from this
71tree and sticks to it.
72As the system progresses forwards, the executed tasks are put into the tree
73more and more to the right --- slowly but surely giving a chance for every task
74to become the "leftmost task" and thus get on the CPU within a deterministic
75amount of time.
76
77Summing up, CFS works like this: it runs a task a bit, and when the task
78schedules (or a scheduler tick happens) the task's CPU usage is "accounted
79for": the (small) time it just spent using the physical CPU is added to
80p->se.vruntime. Once p->se.vruntime gets high enough so that another task
81becomes the "leftmost task" of the time-ordered rbtree it maintains (plus a
82small amount of "granularity" distance relative to the leftmost task so that we
83do not over-schedule tasks and trash the cache), then the new leftmost task is
84picked and the current task is preempted.
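
The picking rule can be illustrated with a deliberately simplified sketch; the real CFS keeps entities in the rbtree and takes the cached leftmost node, while this hypothetical code just scans for the smallest vruntime:

/* Hypothetical sketch: choose the runnable entity with the smallest vruntime.
 * A linear scan keeps the example self-contained; CFS uses the rbtree. */
struct toy_se {
	unsigned long long vruntime;	/* nanoseconds, weight-normalized */
};

static struct toy_se *toy_pick_next(struct toy_se *se, int nr_running)
{
	struct toy_se *leftmost = NULL;
	int i;

	for (i = 0; i < nr_running; i++) {
		if (!leftmost || se[i].vruntime < leftmost->vruntime)
			leftmost = &se[i];
	}
	return leftmost;	/* NULL when nothing is runnable */
}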
85
86
87
884. SOME FEATURES OF CFS
89
90CFS uses nanosecond granularity accounting and does not rely on any jiffies or
91other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the
92way the previous scheduler had, and has no heuristics whatsoever. There is
93only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
94
95 /proc/sys/kernel/sched_granularity_ns
96
97which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
98"server" (i.e., good batching) workloads. It defaults to a setting suitable
99for desktop workloads. SCHED_BATCH is handled by the CFS scheduler module too.
100
101Due to its design, the CFS scheduler is not prone to any of the "attacks" that
102exist today against the heuristics of the stock scheduler: fiftyp.c, thud.c,
103chew.c, ring-test.c, massive_intr.c all work fine and do not impact
104interactivity and produce the expected behavior.
105
106The CFS scheduler has a much stronger handling of nice levels and SCHED_BATCH
107than the previous vanilla scheduler: both types of workloads are isolated much
108more aggressively.
109
110SMP load-balancing has been reworked/sanitized: the runqueue-walking
111assumptions are gone from the load-balancing code now, and iterators of the
112scheduling modules are used. The balancing code got quite a bit simpler as a
113result.
114
115
116
1175. Scheduling policies
118
119CFS implements three scheduling policies:
120
121 - SCHED_NORMAL (traditionally called SCHED_OTHER): The scheduling
122 policy that is used for regular tasks.
123
124 - SCHED_BATCH: Does not preempt nearly as often as regular tasks
125 would, thereby allowing tasks to run longer and make better use of
126 caches but at the cost of interactivity. This is well suited for
127 batch jobs.
128
129 - SCHED_IDLE: This is even weaker than nice 19, but its not a true
130 idle timer scheduler in order to avoid to get into priority
131 inversion problems which would deadlock the machine.
132
133SCHED_FIFO/_RR are implemented in sched_rt.c and are as specified by
134POSIX.
135
136The command chrt from util-linux-ng 2.13.1.1 can set all of these except
137SCHED_IDLE.
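
For reference, a task can also request one of these policies programmatically through the POSIX sched_setscheduler() interface; the following is a minimal userspace sketch (SCHED_BATCH and SCHED_IDLE are Linux extensions, so glibc needs _GNU_SOURCE):

/* Minimal userspace sketch: move the calling task to SCHED_BATCH.
 * SCHED_BATCH and SCHED_IDLE carry no realtime priority, so sched_priority
 * must be 0.  Illustrative only. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 0 };

	if (sched_setscheduler(0, SCHED_BATCH, &sp) == -1)
		perror("sched_setscheduler");
	return 0;
}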
 
-Group scheduler tunables:
 
-When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
-each new user and a "cpu_share" file is added in that directory.
+
+6. SCHEDULING CLASSES
+
143The new CFS scheduler has been designed in such a way to introduce "Scheduling
144Classes," an extensible hierarchy of scheduler modules. These modules
145encapsulate scheduling policy details and are handled by the scheduler core
146without the core code assuming too much about them.
147
148sched_fair.c implements the CFS scheduler described above.
149
150sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler way than
151the previous vanilla scheduler did. It uses 100 runqueues (for all 100 RT
152priority levels, instead of 140 in the previous scheduler) and it needs no
153expired array.
154
155Scheduling classes are implemented through the sched_class structure, which
156contains hooks to functions that must be called whenever an interesting event
157occurs.
158
159This is the (partial) list of the hooks:
160
161 - enqueue_task(...)
162
163 Called when a task enters a runnable state.
164 It puts the scheduling entity (task) into the red-black tree and
165 increments the nr_running variable.
166
167 - dequeue_tree(...)
168
169 When a task is no longer runnable, this function is called to keep the
170 corresponding scheduling entity out of the red-black tree. It decrements
171 the nr_running variable.
172
173 - yield_task(...)
174
175 This function is basically just a dequeue followed by an enqueue, unless the
176 compat_yield sysctl is turned on; in that case, it places the scheduling
177 entity at the right-most end of the red-black tree.
178
179 - check_preempt_curr(...)
180
181 This function checks if a task that entered the runnable state should
182 preempt the currently running task.
183
184 - pick_next_task(...)
185
186 This function chooses the most appropriate task eligible to run next.
187
188 - set_curr_task(...)
189
190 This function is called when a task changes its scheduling class or changes
191 its task group.
192
193 - task_tick(...)
194
195 This function is mostly called from time tick functions; it might lead to
196 process switch. This drives the running preemption.
197
198 - task_new(...)
199
200 The core scheduler gives the scheduling module an opportunity to manage new
201 task startup. The CFS scheduling module uses it for group scheduling, while
202 the scheduling module for a real-time task does not use it.
203
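
To give the hook list above a concrete shape, here is a hypothetical, simplified function-pointer table in the spirit of a scheduling class; the names and layout are illustrative and do not match the kernel's real struct sched_class:

/* Hypothetical sketch of a "scheduling class" as a table of hooks.
 * toy_rq / toy_task stand in for the kernel's rq / task_struct. */
struct toy_rq;
struct toy_task;

struct toy_sched_class {
	void (*enqueue_task)(struct toy_rq *rq, struct toy_task *p);
	void (*dequeue_task)(struct toy_rq *rq, struct toy_task *p);
	void (*yield_task)(struct toy_rq *rq);
	void (*check_preempt_curr)(struct toy_rq *rq, struct toy_task *p, int sync);
	struct toy_task *(*pick_next_task)(struct toy_rq *rq);
	void (*set_curr_task)(struct toy_rq *rq);
	void (*task_tick)(struct toy_rq *rq, struct toy_task *p);
	void (*task_new)(struct toy_rq *rq, struct toy_task *p);
};

/* The core scheduler then dispatches through the current class, e.g.
 * "next = class->pick_next_task(rq);", without knowing anything about
 * the policy behind the hook. */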
204
205
2067. GROUP SCHEDULER EXTENSIONS TO CFS
207
208Normally, the scheduler operates on individual tasks and strives to provide
209fair CPU time to each task. Sometimes, it may be desirable to group tasks and
210provide fair CPU time to each such task group. For example, it may be
211desirable to first provide fair CPU time to each user on the system and then to
212each task belonging to a user.
213
214CONFIG_GROUP_SCHED strives to achieve exactly that. It lets tasks to be
215grouped and divides CPU time fairly among such groups.
216
217CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and
218SCHED_RR) tasks.
219
220CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and
221SCHED_BATCH) tasks.
222
223At present, there are two (mutually exclusive) mechanisms to group tasks for
224CPU bandwidth control purposes:
225
226 - Based on user id (CONFIG_USER_SCHED)
227
228 With this option, tasks are grouped according to their user id.
229
230 - Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED)
231
232 This options needs CONFIG_CGROUPS to be defined, and lets the administrator
233 create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See
234 Documentation/cgroups.txt for more information about this filesystem.
235
236Only one of these options to group tasks can be chosen and not both.
237
238When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new
239user and a "cpu_share" file is added in that directory.
 
 	# cd /sys/kernel/uids
 	# cat 512/cpu_share		# Display user 512's CPU share
@@ -155,16 +246,14 @@ each new user and a "cpu_share" file is added in that directory.
 	2048
 	#
 
-CPU bandwidth between two users are divided in the ratio of their CPU shares.
-For ex: if you would like user "root" to get twice the bandwidth of user
-"guest", then set the cpu_share for both the users such that "root"'s
-cpu_share is twice "guest"'s cpu_share
-
+CPU bandwidth between two users is divided in the ratio of their CPU shares.
+For example: if you would like user "root" to get twice the bandwidth of user
+"guest," then set the cpu_share for both the users such that "root"'s cpu_share
+is twice "guest"'s cpu_share.
 
-When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
-for each group created using the pseudo filesystem. See example steps
-below to create task groups and modify their CPU share using the "cgroups"
-pseudo filesystem
+When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each
+group created using the pseudo filesystem. See example steps below to create
+task groups and modify their CPU share using the "cgroups" pseudo filesystem.
 
 	# mkdir /dev/cpuctl
 	# mount -t cgroup -ocpu none /dev/cpuctl
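
The same administration steps can be driven from a program; the sketch below is illustrative only (the /dev/cpuctl mount point matches the example above, while the group name "multimedia" is made up here):

/* Userspace sketch: create a task group and give it a CPU share of 2048.
 * Assumes the "cpu" cgroup hierarchy is already mounted on /dev/cpuctl
 * as in the steps above; error handling is minimal. */
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	FILE *f;

	mkdir("/dev/cpuctl/multimedia", 0755);	/* new task group */

	f = fopen("/dev/cpuctl/multimedia/cpu.shares", "w");
	if (!f) {
		perror("cpu.shares");
		return 1;
	}
	fprintf(f, "2048\n");	/* same effect as echoing 2048 from a shell */
	fclose(f);
	return 0;
}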
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 83df541650fc..06b6fdab639f 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -149,6 +149,9 @@ smp_callin(void)
 	atomic_inc(&init_mm.mm_count);
 	current->active_mm = &init_mm;
 
+	/* inform the notifiers about the new cpu */
+	notify_cpu_starting(cpuid);
+
 	/* Must have completely accurate bogos. */
 	local_irq_enable();
 
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index e9842f6767f9..e42a749a56dd 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -277,6 +277,7 @@ asmlinkage void __cpuinit secondary_start_kernel(void)
 	/*
 	 * Enable local interrupts.
 	 */
+	notify_cpu_starting(cpu);
 	local_irq_enable();
 	local_fiq_enable();
 
diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c
index 952a24b2f5a9..52e16c6436f9 100644
--- a/arch/cris/arch-v32/kernel/smp.c
+++ b/arch/cris/arch-v32/kernel/smp.c
@@ -178,6 +178,7 @@ void __init smp_callin(void)
 	unmask_irq(IPI_INTR_VECT);
 	unmask_irq(TIMER0_INTR_VECT);
 	preempt_disable();
+	notify_cpu_starting(cpu);
 	local_irq_enable();
 
 	cpu_set(cpu, cpu_online_map);
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index d8f05e504fbf..1dcbb85fc4ee 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -401,6 +401,7 @@ smp_callin (void)
 	spin_lock(&vector_lock);
 	/* Setup the per cpu irq handling data structures */
 	__setup_vector_irq(cpuid);
+	notify_cpu_starting(cpuid);
 	cpu_set(cpuid, cpu_online_map);
 	per_cpu(cpu_state, cpuid) = CPU_ONLINE;
 	spin_unlock(&vector_lock);
diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c
index 2c03ac1d005f..fc2994811f15 100644
--- a/arch/m32r/kernel/smpboot.c
+++ b/arch/m32r/kernel/smpboot.c
@@ -498,6 +498,8 @@ static void __init smp_online(void)
 {
 	int cpu_id = smp_processor_id();
 
+	notify_cpu_starting(cpu_id);
+
 	local_irq_enable();
 
 	/* Get our bogomips. */
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 4410f172b8ab..7b59cfb7e602 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -121,6 +121,8 @@ asmlinkage __cpuinit void start_secondary(void)
 	cpu = smp_processor_id();
 	cpu_data[cpu].udelay_val = loops_per_jiffy;
 
+	notify_cpu_starting(cpu);
+
 	mp_ops->smp_finish();
 	set_cpu_sibling_map(cpu);
 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5337ca7bb649..c27b10a1bd79 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -453,6 +453,7 @@ int __devinit start_secondary(void *unused)
 	secondary_cpu_time_init();
 
 	ipi_call_lock();
+	notify_cpu_starting(cpu);
 	cpu_set(cpu, cpu_online_map);
 	/* Update sibling maps */
 	base = cpu_first_thread_in_core(cpu);
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 00b9b4dec5eb..9e8b1f9b8f4d 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -585,6 +585,8 @@ int __cpuinit start_secondary(void *cpuvoid)
 	/* Enable pfault pseudo page faults on this cpu. */
 	pfault_init();
 
+	/* call cpu notifiers */
+	notify_cpu_starting(smp_processor_id());
 	/* Mark this cpu as online */
 	spin_lock(&call_lock);
 	cpu_set(smp_processor_id(), cpu_online_map);
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index 60c50841143e..001778f9adaf 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -82,6 +82,8 @@ asmlinkage void __cpuinit start_secondary(void)
 
 	preempt_disable();
 
+	notify_cpu_starting(smp_processor_id());
+
 	local_irq_enable();
 
 	calibrate_delay();
diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c
index 69596402a500..446767e8f569 100644
--- a/arch/sparc/kernel/sun4d_smp.c
+++ b/arch/sparc/kernel/sun4d_smp.c
@@ -88,6 +88,7 @@ void __init smp4d_callin(void)
 	local_flush_cache_all();
 	local_flush_tlb_all();
 
+	notify_cpu_starting(cpuid);
 	/*
 	 * Unblock the master CPU _only_ when the scheduler state
 	 * of all secondary CPUs will be up-to-date, so after
diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c
index a14a76ac7f36..9964890dc1db 100644
--- a/arch/sparc/kernel/sun4m_smp.c
+++ b/arch/sparc/kernel/sun4m_smp.c
@@ -71,6 +71,8 @@ void __cpuinit smp4m_callin(void)
 	local_flush_cache_all();
 	local_flush_tlb_all();
 
+	notify_cpu_starting(cpuid);
+
 	/* Get our local ticker going. */
 	smp_setup_percpu_timer();
 
diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c
index be2d50c3aa95..045772142844 100644
--- a/arch/um/kernel/smp.c
+++ b/arch/um/kernel/smp.c
@@ -85,6 +85,7 @@ static int idle_proc(void *cpup)
 	while (!cpu_isset(cpu, smp_commenced_mask))
 		cpu_relax();
 
+	notify_cpu_starting(cpu);
 	cpu_set(cpu, cpu_online_map);
 	default_idle();
 	return 0;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 45531e3ba194..4e7ccb0e2a9b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -257,6 +257,7 @@ static void __cpuinit smp_callin(void)
 	end_local_APIC_setup();
 	map_cpu_to_logical_apicid();
 
+	notify_cpu_starting(cpuid);
 	/*
 	 * Get our bogomips.
 	 *
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index ee0fba092157..199a5f4a873c 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -448,6 +448,8 @@ static void __init start_secondary(void *unused)
 
 	VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid));
 
+	notify_cpu_starting(cpuid);
+
 	/* enable interrupts */
 	local_irq_enable();
 
453 455
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 02ef8835999c..4a6b604ef7e4 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -10,6 +10,18 @@
 
 #include <linux/wait.h>
 
+/**
+ * struct completion - structure used to maintain state for a "completion"
+ *
+ * This is the opaque structure used to maintain the state for a "completion".
+ * Completions currently use a FIFO to queue threads that have to wait for
+ * the "completion" event.
+ *
+ * See also: complete(), wait_for_completion() (and friends _timeout,
+ * _interruptible, _interruptible_timeout, and _killable), init_completion(),
+ * and macros DECLARE_COMPLETION(), DECLARE_COMPLETION_ONSTACK(), and
+ * INIT_COMPLETION().
+ */
 struct completion {
 	unsigned int done;
 	wait_queue_head_t wait;
@@ -21,6 +33,14 @@ struct completion {
 #define COMPLETION_INITIALIZER_ONSTACK(work) \
 	({ init_completion(&work); work; })
 
+/**
+ * DECLARE_COMPLETION: - declare and initialize a completion structure
+ * @work: identifier for the completion structure
+ *
+ * This macro declares and initializes a completion structure. Generally used
+ * for static declarations. You should use the _ONSTACK variant for automatic
+ * variables.
+ */
 #define DECLARE_COMPLETION(work) \
 	struct completion work = COMPLETION_INITIALIZER(work)
 
@@ -29,6 +49,13 @@ struct completion {
  * completions - so we use the _ONSTACK() variant for those that
  * are on the kernel stack:
  */
+/**
+ * DECLARE_COMPLETION_ONSTACK: - declare and initialize a completion structure
+ * @work: identifier for the completion structure
+ *
+ * This macro declares and initializes a completion structure on the kernel
+ * stack.
+ */
 #ifdef CONFIG_LOCKDEP
 # define DECLARE_COMPLETION_ONSTACK(work) \
 	struct completion work = COMPLETION_INITIALIZER_ONSTACK(work)
@@ -36,6 +63,13 @@ struct completion {
 # define DECLARE_COMPLETION_ONSTACK(work) DECLARE_COMPLETION(work)
 #endif
 
+/**
+ * init_completion: - Initialize a dynamically allocated completion
+ * @x: completion structure that is to be initialized
+ *
+ * This inline function will initialize a dynamically created completion
+ * structure.
+ */
 static inline void init_completion(struct completion *x)
 {
 	x->done = 0;
@@ -55,6 +89,13 @@ extern bool completion_done(struct completion *x);
 extern void complete(struct completion *);
 extern void complete_all(struct completion *);
 
+/**
+ * INIT_COMPLETION: - reinitialize a completion structure
+ * @x: completion structure to be reinitialized
+ *
+ * This macro should be used to reinitialize a completion structure so it can
+ * be reused. This is especially important after complete_all() is used.
+ */
 #define INIT_COMPLETION(x)	((x).done = 0)
 
 
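
Taken together, the completion API documented above is typically used along these lines; this is an illustrative driver-style sketch, not code from this patch:

/* Illustrative sketch of completion usage (hypothetical driver code). */
#include <linux/completion.h>

struct my_device {
	struct completion done;
};

static void my_device_init(struct my_device *dev)
{
	init_completion(&dev->done);		/* dynamic initialization */
}

static void my_event_handler(struct my_device *dev)
{
	complete(&dev->done);			/* wake one waiter, FIFO order */
}

static void my_wait_for_event(struct my_device *dev)
{
	/* Blocks uninterruptibly with no timeout; see the _timeout,
	 * _interruptible and _killable variants for other semantics. */
	wait_for_completion(&dev->done);
}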
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index d7faf8808497..c2747ac2ae43 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -69,6 +69,7 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb)
 #endif
 
 int cpu_up(unsigned int cpu);
+void notify_cpu_starting(unsigned int cpu);
 extern void cpu_hotplug_init(void);
 extern void cpu_maps_update_begin(void);
 extern void cpu_maps_update_done(void);
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index da2698b0fdd1..b86fa2ffca0c 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -213,9 +213,16 @@ static inline int notifier_to_errno(int ret)
 #define CPU_DOWN_FAILED		0x0006 /* CPU (unsigned)v NOT going down */
 #define CPU_DEAD		0x0007 /* CPU (unsigned)v dead */
 #define CPU_DYING		0x0008 /* CPU (unsigned)v not running any task,
-					* not handling interrupts, soon dead */
+					* not handling interrupts, soon dead.
+					* Called on the dying cpu, interrupts
+					* are already disabled. Must not
+					* sleep, must not fail */
 #define CPU_POST_DEAD		0x0009 /* CPU (unsigned)v dead, cpu_hotplug
 					* lock is dropped */
+#define CPU_STARTING		0x000A /* CPU (unsigned)v soon running.
+					* Called on the new cpu, just before
+					* enabling interrupts. Must not sleep,
+					* must not fail */
 
 /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend
  * operation in progress
@@ -229,6 +236,7 @@ static inline int notifier_to_errno(int ret)
 #define CPU_DOWN_FAILED_FROZEN	(CPU_DOWN_FAILED | CPU_TASKS_FROZEN)
 #define CPU_DEAD_FROZEN		(CPU_DEAD | CPU_TASKS_FROZEN)
 #define CPU_DYING_FROZEN	(CPU_DYING | CPU_TASKS_FROZEN)
+#define CPU_STARTING_FROZEN	(CPU_STARTING | CPU_TASKS_FROZEN)
 
 /* Hibernation and suspend events */
 #define PM_HIBERNATION_PREPARE	0x0001 /* Going to hibernate */
diff --git a/include/linux/proportions.h b/include/linux/proportions.h
index 5afc1b23346d..cf793bbbd05e 100644
--- a/include/linux/proportions.h
+++ b/include/linux/proportions.h
@@ -104,8 +104,8 @@ struct prop_local_single {
 	 * snapshot of the last seen global state
 	 * and a lock protecting this state
 	 */
-	int shift;
 	unsigned long period;
+	int shift;
 	spinlock_t lock;	/* protect the snapshot state */
 };
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3d9120c5ad15..5d0819ee442a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -451,8 +451,8 @@ struct signal_struct {
 	 * - everyone except group_exit_task is stopped during signal delivery
 	 *   of fatal signals, group_exit_task processes the signal.
 	 */
-	struct task_struct	*group_exit_task;
 	int			notify_count;
+	struct task_struct	*group_exit_task;
 
 	/* thread group stop support, overloads group_exit_code too */
 	int			group_stop_count;
@@ -824,6 +824,9 @@ struct sched_domain {
 	unsigned int ttwu_move_affine;
 	unsigned int ttwu_move_balance;
 #endif
+#ifdef CONFIG_SCHED_DEBUG
+	char *name;
+#endif
 };
 
 extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
@@ -897,7 +900,7 @@ struct sched_class {
 	void (*yield_task) (struct rq *rq);
 	int  (*select_task_rq)(struct task_struct *p, int sync);
 
-	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
+	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync);
 
 	struct task_struct * (*pick_next_task) (struct rq *rq);
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
@@ -1010,8 +1013,8 @@ struct sched_entity {
 
 struct sched_rt_entity {
 	struct list_head run_list;
-	unsigned int time_slice;
 	unsigned long timeout;
+	unsigned int time_slice;
 	int nr_cpus_allowed;
 
 	struct sched_rt_entity *back;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f17e9854c246..86d49045daed 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param)
 	struct take_cpu_down_param *param = _param;
 	int err;
 
-	raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
-				param->hcpu);
 	/* Ensure this CPU doesn't handle any more interrupts. */
 	err = __cpu_disable();
 	if (err < 0)
 		return err;
 
+	raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
+				param->hcpu);
+
 	/* Force idle task to run as soon as we yield: it should
 	   immediately notice cpu is offline and die quickly. */
 	sched_idle_next();
@@ -453,6 +454,25 @@ out:
 }
 #endif /* CONFIG_PM_SLEEP_SMP */
 
+/**
+ * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
+ * @cpu: cpu that just started
+ *
+ * This function calls the cpu_chain notifiers with CPU_STARTING.
+ * It must be called by the arch code on the new cpu, before the new cpu
+ * enables interrupts and before the "boot" cpu returns from __cpu_up().
+ */
+void notify_cpu_starting(unsigned int cpu)
+{
+	unsigned long val = CPU_STARTING;
+
+#ifdef CONFIG_PM_SLEEP_SMP
+	if (cpu_isset(cpu, frozen_cpus))
+		val = CPU_STARTING_FROZEN;
+#endif /* CONFIG_PM_SLEEP_SMP */
+	raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
+}
+
 #endif /* CONFIG_SMP */
 
 /*
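
A subsystem that wants to react to the new CPU_STARTING event registers an ordinary CPU notifier; the following is a hypothetical sketch (not part of this patch). Because the callback runs on the starting cpu before interrupts are enabled, it must not sleep, which pairs naturally with the CPU_DYING constraints on the teardown side:

/* Hypothetical sketch: per-cpu setup at CPU_STARTING time. */
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/init.h>

static int __cpuinit my_cpu_callback(struct notifier_block *nb,
				     unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_STARTING:
	case CPU_STARTING_FROZEN:
		/* per-cpu setup for 'cpu', e.g. programming a local timer;
		 * runs on the new cpu with interrupts still disabled */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata my_cpu_notifier = {
	.notifier_call = my_cpu_callback,
};

static int __init my_init(void)
{
	register_cpu_notifier(&my_cpu_notifier);
	return 0;
}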
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 827cd9adccb2..eab7bd6628e0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1921,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  * that has tasks along with an empty 'mems'. But if we did see such
  * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
  */
-static void scan_for_empty_cpusets(const struct cpuset *root)
+static void scan_for_empty_cpusets(struct cpuset *root)
 {
 	LIST_HEAD(queue);
 	struct cpuset *cp;	/* scans cpusets being updated */
diff --git a/kernel/sched.c b/kernel/sched.c
index ad1962dc0aa2..6f230596bd0c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -204,11 +204,16 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
 }
 
+static inline int rt_bandwidth_enabled(void)
+{
+	return sysctl_sched_rt_runtime >= 0;
+}
+
 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
 	ktime_t now;
 
-	if (rt_b->rt_runtime == RUNTIME_INF)
+	if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
 	if (hrtimer_active(&rt_b->rt_period_timer))
@@ -298,9 +303,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
 #endif /* CONFIG_RT_GROUP_SCHED */
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_USER_SCHED */
 #define root_task_group init_task_group
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_USER_SCHED */
 
 /* task_group_lock serializes add/remove of task groups and also changes to
  * a task group's cpu shares.
@@ -604,9 +609,9 @@ struct rq {
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
-static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
+static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
 {
-	rq->curr->sched_class->check_preempt_curr(rq, p);
+	rq->curr->sched_class->check_preempt_curr(rq, p, sync);
 }
 
 static inline int cpu_of(struct rq *rq)
@@ -1102,7 +1107,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
1102 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); 1107 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
1103} 1108}
1104 1109
1105static void init_hrtick(void) 1110static inline void init_hrtick(void)
1106{ 1111{
1107} 1112}
1108#endif /* CONFIG_SMP */ 1113#endif /* CONFIG_SMP */
@@ -1121,7 +1126,7 @@ static void init_rq_hrtick(struct rq *rq)
1121 rq->hrtick_timer.function = hrtick; 1126 rq->hrtick_timer.function = hrtick;
1122 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; 1127 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1123} 1128}
1124#else 1129#else /* CONFIG_SCHED_HRTICK */
1125static inline void hrtick_clear(struct rq *rq) 1130static inline void hrtick_clear(struct rq *rq)
1126{ 1131{
1127} 1132}
@@ -1133,7 +1138,7 @@ static inline void init_rq_hrtick(struct rq *rq)
1133static inline void init_hrtick(void) 1138static inline void init_hrtick(void)
1134{ 1139{
1135} 1140}
1136#endif 1141#endif /* CONFIG_SCHED_HRTICK */
1137 1142
1138/* 1143/*
1139 * resched_task - mark a task 'to be rescheduled now'. 1144 * resched_task - mark a task 'to be rescheduled now'.
@@ -1380,38 +1385,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1380 update_load_sub(&rq->load, load); 1385 update_load_sub(&rq->load, load);
1381} 1386}
1382 1387
1383#ifdef CONFIG_SMP 1388#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1384static unsigned long source_load(int cpu, int type); 1389typedef int (*tg_visitor)(struct task_group *, void *);
1385static unsigned long target_load(int cpu, int type);
1386static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1387
1388static unsigned long cpu_avg_load_per_task(int cpu)
1389{
1390 struct rq *rq = cpu_rq(cpu);
1391
1392 if (rq->nr_running)
1393 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1394
1395 return rq->avg_load_per_task;
1396}
1397
1398#ifdef CONFIG_FAIR_GROUP_SCHED
1399
1400typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
1401 1390
1402/* 1391/*
1403 * Iterate the full tree, calling @down when first entering a node and @up when 1392 * Iterate the full tree, calling @down when first entering a node and @up when
1404 * leaving it for the final time. 1393 * leaving it for the final time.
1405 */ 1394 */
1406static void 1395static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1407walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
1408{ 1396{
1409 struct task_group *parent, *child; 1397 struct task_group *parent, *child;
1398 int ret;
1410 1399
1411 rcu_read_lock(); 1400 rcu_read_lock();
1412 parent = &root_task_group; 1401 parent = &root_task_group;
1413down: 1402down:
1414 (*down)(parent, cpu, sd); 1403 ret = (*down)(parent, data);
1404 if (ret)
1405 goto out_unlock;
1415 list_for_each_entry_rcu(child, &parent->children, siblings) { 1406 list_for_each_entry_rcu(child, &parent->children, siblings) {
1416 parent = child; 1407 parent = child;
1417 goto down; 1408 goto down;
@@ -1419,15 +1410,43 @@ down:
1419up: 1410up:
1420 continue; 1411 continue;
1421 } 1412 }
1422 (*up)(parent, cpu, sd); 1413 ret = (*up)(parent, data);
1414 if (ret)
1415 goto out_unlock;
1423 1416
1424 child = parent; 1417 child = parent;
1425 parent = parent->parent; 1418 parent = parent->parent;
1426 if (parent) 1419 if (parent)
1427 goto up; 1420 goto up;
1421out_unlock:
1428 rcu_read_unlock(); 1422 rcu_read_unlock();
1423
1424 return ret;
1429} 1425}
1430 1426
1427static int tg_nop(struct task_group *tg, void *data)
1428{
1429 return 0;
1430}
1431#endif
1432
1433#ifdef CONFIG_SMP
1434static unsigned long source_load(int cpu, int type);
1435static unsigned long target_load(int cpu, int type);
1436static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1437
1438static unsigned long cpu_avg_load_per_task(int cpu)
1439{
1440 struct rq *rq = cpu_rq(cpu);
1441
1442 if (rq->nr_running)
1443 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1444
1445 return rq->avg_load_per_task;
1446}
1447
1448#ifdef CONFIG_FAIR_GROUP_SCHED
1449
1431static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1450static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1432 1451
1433/* 1452/*
@@ -1486,11 +1505,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
1486 * This needs to be done in a bottom-up fashion because the rq weight of a 1505 * This needs to be done in a bottom-up fashion because the rq weight of a
1487 * parent group depends on the shares of its child groups. 1506 * parent group depends on the shares of its child groups.
1488 */ 1507 */
1489static void 1508static int tg_shares_up(struct task_group *tg, void *data)
1490tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1491{ 1509{
1492 unsigned long rq_weight = 0; 1510 unsigned long rq_weight = 0;
1493 unsigned long shares = 0; 1511 unsigned long shares = 0;
1512 struct sched_domain *sd = data;
1494 int i; 1513 int i;
1495 1514
1496 for_each_cpu_mask(i, sd->span) { 1515 for_each_cpu_mask(i, sd->span) {
@@ -1515,6 +1534,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1515 __update_group_shares_cpu(tg, i, shares, rq_weight); 1534 __update_group_shares_cpu(tg, i, shares, rq_weight);
1516 spin_unlock_irqrestore(&rq->lock, flags); 1535 spin_unlock_irqrestore(&rq->lock, flags);
1517 } 1536 }
1537
1538 return 0;
1518} 1539}
1519 1540
1520/* 1541/*
@@ -1522,10 +1543,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1522 * This needs to be done in a top-down fashion because the load of a child 1543 * This needs to be done in a top-down fashion because the load of a child
1523 * group is a fraction of its parents load. 1544 * group is a fraction of its parents load.
1524 */ 1545 */
1525static void 1546static int tg_load_down(struct task_group *tg, void *data)
1526tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1527{ 1547{
1528 unsigned long load; 1548 unsigned long load;
1549 long cpu = (long)data;
1529 1550
1530 if (!tg->parent) { 1551 if (!tg->parent) {
1531 load = cpu_rq(cpu)->load.weight; 1552 load = cpu_rq(cpu)->load.weight;
@@ -1536,11 +1557,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1536 } 1557 }
1537 1558
1538 tg->cfs_rq[cpu]->h_load = load; 1559 tg->cfs_rq[cpu]->h_load = load;
1539}
1540 1560
1541static void 1561 return 0;
1542tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
1543{
1544} 1562}
1545 1563
1546static void update_shares(struct sched_domain *sd) 1564static void update_shares(struct sched_domain *sd)
@@ -1550,7 +1568,7 @@ static void update_shares(struct sched_domain *sd)
1550 1568
1551 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1569 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1552 sd->last_update = now; 1570 sd->last_update = now;
1553 walk_tg_tree(tg_nop, tg_shares_up, 0, sd); 1571 walk_tg_tree(tg_nop, tg_shares_up, sd);
1554 } 1572 }
1555} 1573}
1556 1574
@@ -1561,9 +1579,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1561 spin_lock(&rq->lock); 1579 spin_lock(&rq->lock);
1562} 1580}
1563 1581
1564static void update_h_load(int cpu) 1582static void update_h_load(long cpu)
1565{ 1583{
1566 walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); 1584 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1567} 1585}
1568 1586
1569#else 1587#else
@@ -1921,11 +1939,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 	running = task_running(rq, p);
 	on_rq = p->se.on_rq;
 	ncsw = 0;
-	if (!match_state || p->state == match_state) {
-		ncsw = p->nivcsw + p->nvcsw;
-		if (unlikely(!ncsw))
-			ncsw = 1;
-	}
+	if (!match_state || p->state == match_state)
+		ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 	task_rq_unlock(rq, &flags);
 
 	/*
@@ -2285,7 +2300,7 @@ out_running:
2285 trace_mark(kernel_sched_wakeup, 2300 trace_mark(kernel_sched_wakeup,
2286 "pid %d state %ld ## rq %p task %p rq->curr %p", 2301 "pid %d state %ld ## rq %p task %p rq->curr %p",
2287 p->pid, p->state, rq, p, rq->curr); 2302 p->pid, p->state, rq, p, rq->curr);
2288 check_preempt_curr(rq, p); 2303 check_preempt_curr(rq, p, sync);
2289 2304
2290 p->state = TASK_RUNNING; 2305 p->state = TASK_RUNNING;
2291#ifdef CONFIG_SMP 2306#ifdef CONFIG_SMP
@@ -2420,7 +2435,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2420 trace_mark(kernel_sched_wakeup_new, 2435 trace_mark(kernel_sched_wakeup_new,
2421 "pid %d state %ld ## rq %p task %p rq->curr %p", 2436 "pid %d state %ld ## rq %p task %p rq->curr %p",
2422 p->pid, p->state, rq, p, rq->curr); 2437 p->pid, p->state, rq, p, rq->curr);
2423 check_preempt_curr(rq, p); 2438 check_preempt_curr(rq, p, 0);
2424#ifdef CONFIG_SMP 2439#ifdef CONFIG_SMP
2425 if (p->sched_class->task_wake_up) 2440 if (p->sched_class->task_wake_up)
2426 p->sched_class->task_wake_up(rq, p); 2441 p->sched_class->task_wake_up(rq, p);
@@ -2880,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2880 * Note that idle threads have a prio of MAX_PRIO, for this test 2895 * Note that idle threads have a prio of MAX_PRIO, for this test
2881 * to be always true for them. 2896 * to be always true for them.
2882 */ 2897 */
2883 check_preempt_curr(this_rq, p); 2898 check_preempt_curr(this_rq, p, 0);
2884} 2899}
2885 2900
2886/* 2901/*
@@ -4627,6 +4642,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4627} 4642}
4628EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 4643EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4629 4644
4645/**
4646 * complete: - signals a single thread waiting on this completion
4647 * @x: holds the state of this particular completion
4648 *
4649 * This will wake up a single thread waiting on this completion. Threads will be
4650 * awakened in the same order in which they were queued.
4651 *
4652 * See also complete_all(), wait_for_completion() and related routines.
4653 */
4630void complete(struct completion *x) 4654void complete(struct completion *x)
4631{ 4655{
4632 unsigned long flags; 4656 unsigned long flags;
@@ -4638,6 +4662,12 @@ void complete(struct completion *x)
4638} 4662}
4639EXPORT_SYMBOL(complete); 4663EXPORT_SYMBOL(complete);
4640 4664
4665/**
4666 * complete_all: - signals all threads waiting on this completion
4667 * @x: holds the state of this particular completion
4668 *
4669 * This will wake up all threads waiting on this particular completion event.
4670 */
4641void complete_all(struct completion *x) 4671void complete_all(struct completion *x)
4642{ 4672{
4643 unsigned long flags; 4673 unsigned long flags;
@@ -4658,10 +4688,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
 	wait.flags |= WQ_FLAG_EXCLUSIVE;
 	__add_wait_queue_tail(&x->wait, &wait);
 	do {
-		if ((state == TASK_INTERRUPTIBLE &&
-		    signal_pending(current)) ||
-		    (state == TASK_KILLABLE &&
-		    fatal_signal_pending(current))) {
+		if (signal_pending_state(state, current)) {
 			timeout = -ERESTARTSYS;
 			break;
 		}
@@ -4689,12 +4716,31 @@ wait_for_common(struct completion *x, long timeout, int state)
4689 return timeout; 4716 return timeout;
4690} 4717}
4691 4718
4719/**
4720 * wait_for_completion: - waits for completion of a task
4721 * @x: holds the state of this particular completion
4722 *
4723 * This waits to be signaled for completion of a specific task. It is NOT
4724 * interruptible and there is no timeout.
4725 *
4726 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
4727 * and interrupt capability. Also see complete().
4728 */
4692void __sched wait_for_completion(struct completion *x) 4729void __sched wait_for_completion(struct completion *x)
4693{ 4730{
4694 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4731 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4695} 4732}
4696EXPORT_SYMBOL(wait_for_completion); 4733EXPORT_SYMBOL(wait_for_completion);
4697 4734
4735/**
4736 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4737 * @x: holds the state of this particular completion
4738 * @timeout: timeout value in jiffies
4739 *
4740 * This waits for either a completion of a specific task to be signaled or for a
4741 * specified timeout to expire. The timeout is in jiffies. It is not
4742 * interruptible.
4743 */
4698unsigned long __sched 4744unsigned long __sched
4699wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4745wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4700{ 4746{
@@ -4702,6 +4748,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4702} 4748}
4703EXPORT_SYMBOL(wait_for_completion_timeout); 4749EXPORT_SYMBOL(wait_for_completion_timeout);
4704 4750
4751/**
4752 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4753 * @x: holds the state of this particular completion
4754 *
4755 * This waits for completion of a specific task to be signaled. It is
4756 * interruptible.
4757 */
4705int __sched wait_for_completion_interruptible(struct completion *x) 4758int __sched wait_for_completion_interruptible(struct completion *x)
4706{ 4759{
4707 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4760 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4711,6 +4764,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
4711} 4764}
4712EXPORT_SYMBOL(wait_for_completion_interruptible); 4765EXPORT_SYMBOL(wait_for_completion_interruptible);
4713 4766
4767/**
4768 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4769 * @x: holds the state of this particular completion
4770 * @timeout: timeout value in jiffies
4771 *
4772 * This waits for either a completion of a specific task to be signaled or for a
4773 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4774 */
4714unsigned long __sched 4775unsigned long __sched
4715wait_for_completion_interruptible_timeout(struct completion *x, 4776wait_for_completion_interruptible_timeout(struct completion *x,
4716 unsigned long timeout) 4777 unsigned long timeout)
@@ -4719,6 +4780,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
4719} 4780}
4720EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4781EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4721 4782
4783/**
4784 * wait_for_completion_killable: - waits for completion of a task (killable)
4785 * @x: holds the state of this particular completion
4786 *
4787 * This waits to be signaled for completion of a specific task. It can be
4788 * interrupted by a kill signal.
4789 */
4722int __sched wait_for_completion_killable(struct completion *x) 4790int __sched wait_for_completion_killable(struct completion *x)
4723{ 4791{
4724 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 4792 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
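The kernel-doc added above documents both halves of the completion API: complete() and complete_all() on the signalling side, and the wait_for_completion*() family (plain, timeout, interruptible, killable) on the waiting side. A minimal in-kernel usage sketch follows; the thread function, the completion name and the "setup-worker" label are made-up illustrations, only the completion and kthread calls themselves are existing interfaces.

    #include <linux/completion.h>
    #include <linux/err.h>
    #include <linux/kthread.h>

    static DECLARE_COMPLETION(setup_done);          /* hypothetical one-shot event */

    static int setup_worker(void *unused)
    {
            /* ... perform the one-off initialisation ... */
            complete(&setup_done);                  /* wakes exactly one waiter, FIFO order */
            return 0;
    }

    static int start_and_wait(void)
    {
            struct task_struct *tsk;

            tsk = kthread_run(setup_worker, NULL, "setup-worker");
            if (IS_ERR(tsk))
                    return PTR_ERR(tsk);

            /* uninterruptible and without a timeout; see the _timeout,
             * _interruptible and _killable variants documented above for
             * the other combinations */
            wait_for_completion(&setup_done);
            return 0;
    }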
@@ -5121,7 +5189,8 @@ recheck:
5121 * Do not allow realtime tasks into groups that have no runtime 5189 * Do not allow realtime tasks into groups that have no runtime
5122 * assigned. 5190 * assigned.
5123 */ 5191 */
5124 if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) 5192 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5193 task_group(p)->rt_bandwidth.rt_runtime == 0)
5125 return -EPERM; 5194 return -EPERM;
5126#endif 5195#endif
5127 5196
@@ -5957,7 +6026,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5957 set_task_cpu(p, dest_cpu); 6026 set_task_cpu(p, dest_cpu);
5958 if (on_rq) { 6027 if (on_rq) {
5959 activate_task(rq_dest, p, 0); 6028 activate_task(rq_dest, p, 0);
5960 check_preempt_curr(rq_dest, p); 6029 check_preempt_curr(rq_dest, p, 0);
5961 } 6030 }
5962done: 6031done:
5963 ret = 1; 6032 ret = 1;
@@ -6282,7 +6351,7 @@ set_table_entry(struct ctl_table *entry,
6282static struct ctl_table * 6351static struct ctl_table *
6283sd_alloc_ctl_domain_table(struct sched_domain *sd) 6352sd_alloc_ctl_domain_table(struct sched_domain *sd)
6284{ 6353{
6285 struct ctl_table *table = sd_alloc_ctl_entry(12); 6354 struct ctl_table *table = sd_alloc_ctl_entry(13);
6286 6355
6287 if (table == NULL) 6356 if (table == NULL)
6288 return NULL; 6357 return NULL;
@@ -6310,7 +6379,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
6310 sizeof(int), 0644, proc_dointvec_minmax); 6379 sizeof(int), 0644, proc_dointvec_minmax);
6311 set_table_entry(&table[10], "flags", &sd->flags, 6380 set_table_entry(&table[10], "flags", &sd->flags,
6312 sizeof(int), 0644, proc_dointvec_minmax); 6381 sizeof(int), 0644, proc_dointvec_minmax);
6313 /* &table[11] is terminator */ 6382 set_table_entry(&table[11], "name", sd->name,
6383 CORENAME_MAX_SIZE, 0444, proc_dostring);
6384 /* &table[12] is terminator */
6314 6385
6315 return table; 6386 return table;
6316} 6387}
@@ -7194,13 +7265,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7194 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 7265 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7195 */ 7266 */
7196 7267
7268#ifdef CONFIG_SCHED_DEBUG
7269# define SD_INIT_NAME(sd, type) sd->name = #type
7270#else
7271# define SD_INIT_NAME(sd, type) do { } while (0)
7272#endif
7273
7197#define SD_INIT(sd, type) sd_init_##type(sd) 7274#define SD_INIT(sd, type) sd_init_##type(sd)
7275
7198#define SD_INIT_FUNC(type) \ 7276#define SD_INIT_FUNC(type) \
7199static noinline void sd_init_##type(struct sched_domain *sd) \ 7277static noinline void sd_init_##type(struct sched_domain *sd) \
7200{ \ 7278{ \
7201 memset(sd, 0, sizeof(*sd)); \ 7279 memset(sd, 0, sizeof(*sd)); \
7202 *sd = SD_##type##_INIT; \ 7280 *sd = SD_##type##_INIT; \
7203 sd->level = SD_LV_##type; \ 7281 sd->level = SD_LV_##type; \
7282 SD_INIT_NAME(sd, type); \
7204} 7283}
7205 7284
7206SD_INIT_FUNC(CPU) 7285SD_INIT_FUNC(CPU)
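SD_INIT_NAME() relies on preprocessor stringification: under CONFIG_SCHED_DEBUG the macro argument becomes the literal string "CPU", "MC", "NODE" and so on, which is what the new "name" sysctl entry exposes. A stand-alone user-space sketch of the same idiom, with a made-up struct in place of struct sched_domain:

    #include <stdio.h>

    struct dom {
            const char *name;
    };

    #define INIT_NAME(d, type)      ((d)->name = #type)     /* #type -> "CPU", "MC", ... */

    int main(void)
    {
            struct dom d;

            INIT_NAME(&d, CPU);
            printf("%s\n", d.name);         /* prints "CPU" */
            return 0;
    }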
@@ -8242,20 +8321,25 @@ void __might_sleep(char *file, int line)
8242#ifdef in_atomic 8321#ifdef in_atomic
8243 static unsigned long prev_jiffy; /* ratelimiting */ 8322 static unsigned long prev_jiffy; /* ratelimiting */
8244 8323
8245 if ((in_atomic() || irqs_disabled()) && 8324 if ((!in_atomic() && !irqs_disabled()) ||
8246 system_state == SYSTEM_RUNNING && !oops_in_progress) { 8325 system_state != SYSTEM_RUNNING || oops_in_progress)
8247 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 8326 return;
8248 return; 8327 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8249 prev_jiffy = jiffies; 8328 return;
8250 printk(KERN_ERR "BUG: sleeping function called from invalid" 8329 prev_jiffy = jiffies;
8251 " context at %s:%d\n", file, line); 8330
8252 printk("in_atomic():%d, irqs_disabled():%d\n", 8331 printk(KERN_ERR
8253 in_atomic(), irqs_disabled()); 8332 "BUG: sleeping function called from invalid context at %s:%d\n",
8254 debug_show_held_locks(current); 8333 file, line);
8255 if (irqs_disabled()) 8334 printk(KERN_ERR
8256 print_irqtrace_events(current); 8335 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8257 dump_stack(); 8336 in_atomic(), irqs_disabled(),
8258 } 8337 current->pid, current->comm);
8338
8339 debug_show_held_locks(current);
8340 if (irqs_disabled())
8341 print_irqtrace_events(current);
8342 dump_stack();
8259#endif 8343#endif
8260} 8344}
8261EXPORT_SYMBOL(__might_sleep); 8345EXPORT_SYMBOL(__might_sleep);
@@ -8753,73 +8837,95 @@ static DEFINE_MUTEX(rt_constraints_mutex);
8753static unsigned long to_ratio(u64 period, u64 runtime) 8837static unsigned long to_ratio(u64 period, u64 runtime)
8754{ 8838{
8755 if (runtime == RUNTIME_INF) 8839 if (runtime == RUNTIME_INF)
8756 return 1ULL << 16; 8840 return 1ULL << 20;
8757 8841
8758 return div64_u64(runtime << 16, period); 8842 return div64_u64(runtime << 20, period);
8759} 8843}
8760 8844
8761#ifdef CONFIG_CGROUP_SCHED 8845/* Must be called with tasklist_lock held */
8762static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8846static inline int tg_has_rt_tasks(struct task_group *tg)
8763{ 8847{
8764 struct task_group *tgi, *parent = tg->parent; 8848 struct task_struct *g, *p;
8765 unsigned long total = 0;
8766 8849
8767 if (!parent) { 8850 do_each_thread(g, p) {
8768 if (global_rt_period() < period) 8851 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8769 return 0; 8852 return 1;
8853 } while_each_thread(g, p);
8770 8854
8771 return to_ratio(period, runtime) < 8855 return 0;
8772 to_ratio(global_rt_period(), global_rt_runtime()); 8856}
8773 }
8774 8857
8775 if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) 8858struct rt_schedulable_data {
8776 return 0; 8859 struct task_group *tg;
8860 u64 rt_period;
8861 u64 rt_runtime;
8862};
8777 8863
8778 rcu_read_lock(); 8864static int tg_schedulable(struct task_group *tg, void *data)
8779 list_for_each_entry_rcu(tgi, &parent->children, siblings) { 8865{
8780 if (tgi == tg) 8866 struct rt_schedulable_data *d = data;
8781 continue; 8867 struct task_group *child;
8868 unsigned long total, sum = 0;
8869 u64 period, runtime;
8782 8870
8783 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8871 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8784 tgi->rt_bandwidth.rt_runtime); 8872 runtime = tg->rt_bandwidth.rt_runtime;
8873
8874 if (tg == d->tg) {
8875 period = d->rt_period;
8876 runtime = d->rt_runtime;
8785 } 8877 }
8786 rcu_read_unlock();
8787 8878
8788 return total + to_ratio(period, runtime) <= 8879 /*
8789 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8880 * Cannot have more runtime than the period.
8790 parent->rt_bandwidth.rt_runtime); 8881 */
8791} 8882 if (runtime > period && runtime != RUNTIME_INF)
8792#elif defined CONFIG_USER_SCHED 8883 return -EINVAL;
8793static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8794{
8795 struct task_group *tgi;
8796 unsigned long total = 0;
8797 unsigned long global_ratio =
8798 to_ratio(global_rt_period(), global_rt_runtime());
8799 8884
8800 rcu_read_lock(); 8885 /*
8801 list_for_each_entry_rcu(tgi, &task_groups, list) { 8886 * Ensure we don't starve existing RT tasks.
8802 if (tgi == tg) 8887 */
8803 continue; 8888 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8889 return -EBUSY;
8890
8891 total = to_ratio(period, runtime);
8892
8893 /*
8894 * Nobody can have more than the global setting allows.
8895 */
8896 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8897 return -EINVAL;
8898
8899 /*
8900 * The sum of our children's runtime should not exceed our own.
8901 */
8902 list_for_each_entry_rcu(child, &tg->children, siblings) {
8903 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8904 runtime = child->rt_bandwidth.rt_runtime;
8905
8906 if (child == d->tg) {
8907 period = d->rt_period;
8908 runtime = d->rt_runtime;
8909 }
8804 8910
8805 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8911 sum += to_ratio(period, runtime);
8806 tgi->rt_bandwidth.rt_runtime);
8807 } 8912 }
8808 rcu_read_unlock();
8809 8913
8810 return total + to_ratio(period, runtime) < global_ratio; 8914 if (sum > total)
8915 return -EINVAL;
8916
8917 return 0;
8811} 8918}
8812#endif
8813 8919
8814/* Must be called with tasklist_lock held */ 8920static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8815static inline int tg_has_rt_tasks(struct task_group *tg)
8816{ 8921{
8817 struct task_struct *g, *p; 8922 struct rt_schedulable_data data = {
8818 do_each_thread(g, p) { 8923 .tg = tg,
8819 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8924 .rt_period = period,
8820 return 1; 8925 .rt_runtime = runtime,
8821 } while_each_thread(g, p); 8926 };
8822 return 0; 8927
8928 return walk_tg_tree(tg_schedulable, tg_nop, &data);
8823} 8929}
8824 8930
8825static int tg_set_bandwidth(struct task_group *tg, 8931static int tg_set_bandwidth(struct task_group *tg,
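The reworked bandwidth code does all of its feasibility checks in 20-bit fixed point: to_ratio() returns runtime<<20 / period, and tg_schedulable() requires that a group's own ratio stays within the global limit and that its children's ratios sum to no more than its own. A user-space sketch of that arithmetic, using illustrative nanosecond values rather than the kernel's defaults:

    #include <stdio.h>
    #include <stdint.h>

    #define RUNTIME_INF     (~0ULL)

    /* 20-bit fixed-point utilisation ratio, mirroring to_ratio() */
    static unsigned long to_ratio(uint64_t period, uint64_t runtime)
    {
            if (runtime == RUNTIME_INF)
                    return 1UL << 20;
            return (unsigned long)((runtime << 20) / period);
    }

    int main(void)
    {
            /* global limit: 950ms of RT runtime per 1s period */
            unsigned long global   = to_ratio(1000000000ULL, 950000000ULL);
            /* a group asking for 400ms/1s, with two children at 100ms/1s each */
            unsigned long group    = to_ratio(1000000000ULL, 400000000ULL);
            unsigned long children = 2 * to_ratio(1000000000ULL, 100000000ULL);

            printf("group fits under global limit: %d\n", group <= global);
            printf("children fit under group:      %d\n", children <= group);
            return 0;
    }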
@@ -8829,14 +8935,9 @@ static int tg_set_bandwidth(struct task_group *tg,
8829 8935
8830 mutex_lock(&rt_constraints_mutex); 8936 mutex_lock(&rt_constraints_mutex);
8831 read_lock(&tasklist_lock); 8937 read_lock(&tasklist_lock);
8832 if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { 8938 err = __rt_schedulable(tg, rt_period, rt_runtime);
8833 err = -EBUSY; 8939 if (err)
8834 goto unlock; 8940 goto unlock;
8835 }
8836 if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
8837 err = -EINVAL;
8838 goto unlock;
8839 }
8840 8941
8841 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8942 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8842 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8943 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8905,19 +9006,25 @@ long sched_group_rt_period(struct task_group *tg)
8905 9006
8906static int sched_rt_global_constraints(void) 9007static int sched_rt_global_constraints(void)
8907{ 9008{
8908 struct task_group *tg = &root_task_group; 9009 u64 runtime, period;
8909 u64 rt_runtime, rt_period;
8910 int ret = 0; 9010 int ret = 0;
8911 9011
8912 if (sysctl_sched_rt_period <= 0) 9012 if (sysctl_sched_rt_period <= 0)
8913 return -EINVAL; 9013 return -EINVAL;
8914 9014
8915 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 9015 runtime = global_rt_runtime();
8916 rt_runtime = tg->rt_bandwidth.rt_runtime; 9016 period = global_rt_period();
9017
9018 /*
9019 * Sanity check on the sysctl variables.
9020 */
9021 if (runtime > period && runtime != RUNTIME_INF)
9022 return -EINVAL;
8917 9023
8918 mutex_lock(&rt_constraints_mutex); 9024 mutex_lock(&rt_constraints_mutex);
8919 if (!__rt_schedulable(tg, rt_period, rt_runtime)) 9025 read_lock(&tasklist_lock);
8920 ret = -EINVAL; 9026 ret = __rt_schedulable(NULL, 0, 0);
9027 read_unlock(&tasklist_lock);
8921 mutex_unlock(&rt_constraints_mutex); 9028 mutex_unlock(&rt_constraints_mutex);
8922 9029
8923 return ret; 9030 return ret;
@@ -8991,7 +9098,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8991 9098
8992 if (!cgrp->parent) { 9099 if (!cgrp->parent) {
8993 /* This is early initialization for the top cgroup */ 9100 /* This is early initialization for the top cgroup */
8994 init_task_group.css.cgroup = cgrp;
8995 return &init_task_group.css; 9101 return &init_task_group.css;
8996 } 9102 }
8997 9103
@@ -9000,9 +9106,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
9000 if (IS_ERR(tg)) 9106 if (IS_ERR(tg))
9001 return ERR_PTR(-ENOMEM); 9107 return ERR_PTR(-ENOMEM);
9002 9108
9003 /* Bind the cgroup to task_group object we just created */
9004 tg->css.cgroup = cgrp;
9005
9006 return &tg->css; 9109 return &tg->css;
9007} 9110}
9008 9111
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fb8994c6d4bb..18fd17172eb6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -409,64 +409,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
409} 409}
410 410
411/* 411/*
412 * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
413 * that it favours >=0 over <0.
414 *
415 * -20 |
416 * |
417 * 0 --------+-------
418 * .'
419 * 19 .'
420 *
421 */
422static unsigned long
423calc_delta_asym(unsigned long delta, struct sched_entity *se)
424{
425 struct load_weight lw = {
426 .weight = NICE_0_LOAD,
427 .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
428 };
429
430 for_each_sched_entity(se) {
431 struct load_weight *se_lw = &se->load;
432 unsigned long rw = cfs_rq_of(se)->load.weight;
433
434#ifdef CONFIG_FAIR_SCHED_GROUP
435 struct cfs_rq *cfs_rq = se->my_q;
436 struct task_group *tg = NULL
437
438 if (cfs_rq)
439 tg = cfs_rq->tg;
440
441 if (tg && tg->shares < NICE_0_LOAD) {
442 /*
443 * scale shares to what it would have been had
444 * tg->weight been NICE_0_LOAD:
445 *
446 * weight = 1024 * shares / tg->weight
447 */
448 lw.weight *= se->load.weight;
449 lw.weight /= tg->shares;
450
451 lw.inv_weight = 0;
452
453 se_lw = &lw;
454 rw += lw.weight - se->load.weight;
455 } else
456#endif
457
458 if (se->load.weight < NICE_0_LOAD) {
459 se_lw = &lw;
460 rw += NICE_0_LOAD - se->load.weight;
461 }
462
463 delta = calc_delta_mine(delta, rw, se_lw);
464 }
465
466 return delta;
467}
468
469/*
470 * Update the current task's runtime statistics. Skip current tasks that 412 * Update the current task's runtime statistics. Skip current tasks that
471 * are not in our scheduling class. 413 * are not in our scheduling class.
472 */ 414 */
@@ -586,11 +528,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
586 update_load_add(&cfs_rq->load, se->load.weight); 528 update_load_add(&cfs_rq->load, se->load.weight);
587 if (!parent_entity(se)) 529 if (!parent_entity(se))
588 inc_cpu_load(rq_of(cfs_rq), se->load.weight); 530 inc_cpu_load(rq_of(cfs_rq), se->load.weight);
589 if (entity_is_task(se)) 531 if (entity_is_task(se)) {
590 add_cfs_task_weight(cfs_rq, se->load.weight); 532 add_cfs_task_weight(cfs_rq, se->load.weight);
533 list_add(&se->group_node, &cfs_rq->tasks);
534 }
591 cfs_rq->nr_running++; 535 cfs_rq->nr_running++;
592 se->on_rq = 1; 536 se->on_rq = 1;
593 list_add(&se->group_node, &cfs_rq->tasks);
594} 537}
595 538
596static void 539static void
@@ -599,11 +542,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
599 update_load_sub(&cfs_rq->load, se->load.weight); 542 update_load_sub(&cfs_rq->load, se->load.weight);
600 if (!parent_entity(se)) 543 if (!parent_entity(se))
601 dec_cpu_load(rq_of(cfs_rq), se->load.weight); 544 dec_cpu_load(rq_of(cfs_rq), se->load.weight);
602 if (entity_is_task(se)) 545 if (entity_is_task(se)) {
603 add_cfs_task_weight(cfs_rq, -se->load.weight); 546 add_cfs_task_weight(cfs_rq, -se->load.weight);
547 list_del_init(&se->group_node);
548 }
604 cfs_rq->nr_running--; 549 cfs_rq->nr_running--;
605 se->on_rq = 0; 550 se->on_rq = 0;
606 list_del_init(&se->group_node);
607} 551}
608 552
609static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 553static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1085,7 +1029,6 @@ static long effective_load(struct task_group *tg, int cpu,
1085 long wl, long wg) 1029 long wl, long wg)
1086{ 1030{
1087 struct sched_entity *se = tg->se[cpu]; 1031 struct sched_entity *se = tg->se[cpu];
1088 long more_w;
1089 1032
1090 if (!tg->parent) 1033 if (!tg->parent)
1091 return wl; 1034 return wl;
@@ -1097,18 +1040,17 @@ static long effective_load(struct task_group *tg, int cpu,
1097 if (!wl && sched_feat(ASYM_EFF_LOAD)) 1040 if (!wl && sched_feat(ASYM_EFF_LOAD))
1098 return wl; 1041 return wl;
1099 1042
1100 /*
1101 * Instead of using this increment, also add the difference
1102 * between when the shares were last updated and now.
1103 */
1104 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1105 wl += more_w;
1106 wg += more_w;
1107
1108 for_each_sched_entity(se) { 1043 for_each_sched_entity(se) {
1109#define D(n) (likely(n) ? (n) : 1)
1110
1111 long S, rw, s, a, b; 1044 long S, rw, s, a, b;
1045 long more_w;
1046
1047 /*
1048 * Instead of using this increment, also add the difference
1049 * between when the shares were last updated and now.
1050 */
1051 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1052 wl += more_w;
1053 wg += more_w;
1112 1054
1113 S = se->my_q->tg->shares; 1055 S = se->my_q->tg->shares;
1114 s = se->my_q->shares; 1056 s = se->my_q->shares;
@@ -1117,7 +1059,11 @@ static long effective_load(struct task_group *tg, int cpu,
1117 a = S*(rw + wl); 1059 a = S*(rw + wl);
1118 b = S*rw + s*wg; 1060 b = S*rw + s*wg;
1119 1061
1120 wl = s*(a-b)/D(b); 1062 wl = s*(a-b);
1063
1064 if (likely(b))
1065 wl /= b;
1066
1121 /* 1067 /*
1122 * Assume the group is already running and will 1068 * Assume the group is already running and will
1123 * thus already be accounted for in the weight. 1069 * thus already be accounted for in the weight.
@@ -1126,7 +1072,6 @@ static long effective_load(struct task_group *tg, int cpu,
1126 * alter the group weight. 1072 * alter the group weight.
1127 */ 1073 */
1128 wg = 0; 1074 wg = 0;
1129#undef D
1130 } 1075 }
1131 1076
1132 return wl; 1077 return wl;
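The removed D(n) macro used to paper over a zero divisor by substituting 1; the replacement computes wl = s*(a-b) first and divides only when b is non-zero. A worked numeric sketch of one level of the effective_load() recurrence, with made-up shares and weights (the variable roles follow the kernel's names only loosely):

    #include <stdio.h>

    int main(void)
    {
            long S  = 1024;         /* group's total shares */
            long s  = 512;          /* this cpu's current share of the group */
            long rw = 2048;         /* runqueue weight on this cpu before the wakeup */
            long wl = 1024;         /* weight being added by the waking task */
            long wg = 1024;         /* weight change seen by the group */

            long a = S * (rw + wl);
            long b = S * rw + s * wg;

            wl = s * (a - b);
            if (b)                  /* the removed D(b) forced a divisor of 1 here */
                    wl /= b;

            printf("extra load propagated to the parent: %ld\n", wl);
            return 0;
    }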
@@ -1143,7 +1088,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1143#endif 1088#endif
1144 1089
1145static int 1090static int
1146wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, 1091wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1147 struct task_struct *p, int prev_cpu, int this_cpu, int sync, 1092 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
1148 int idx, unsigned long load, unsigned long this_load, 1093 int idx, unsigned long load, unsigned long this_load,
1149 unsigned int imbalance) 1094 unsigned int imbalance)
@@ -1158,6 +1103,11 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1158 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1103 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
1159 return 0; 1104 return 0;
1160 1105
1106 if (!sync && sched_feat(SYNC_WAKEUPS) &&
1107 curr->se.avg_overlap < sysctl_sched_migration_cost &&
1108 p->se.avg_overlap < sysctl_sched_migration_cost)
1109 sync = 1;
1110
1161 /* 1111 /*
1162 * If sync wakeup then subtract the (maximum possible) 1112 * If sync wakeup then subtract the (maximum possible)
1163 * effect of the currently running task from the load 1113 * effect of the currently running task from the load
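The added block promotes a wakeup to synchronous when both the current task's and the wakee's avg_overlap fall below the migration-cost threshold, so the sync discount applied below also covers tasks that in practice never run concurrently. A stand-alone sketch of that decision, with a hard-coded constant standing in for sysctl_sched_migration_cost:

    #include <stdio.h>

    #define MIGRATION_COST_NS       500000ULL      /* stand-in for sysctl_sched_migration_cost */

    struct task {
            unsigned long long avg_overlap;
    };

    /* treat the wakeup as sync when waker and wakee barely overlap in runtime */
    static int effective_sync(int sync, const struct task *curr, const struct task *wakee)
    {
            if (!sync &&
                curr->avg_overlap < MIGRATION_COST_NS &&
                wakee->avg_overlap < MIGRATION_COST_NS)
                    sync = 1;
            return sync;
    }

    int main(void)
    {
            struct task waker = { .avg_overlap = 100000 };
            struct task wakee = { .avg_overlap = 200000 };

            printf("sync = %d\n", effective_sync(0, &waker, &wakee));      /* 1 */
            return 0;
    }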
@@ -1182,17 +1132,14 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1182 * a reasonable amount of time then attract this newly 1132 * a reasonable amount of time then attract this newly
1183 * woken task: 1133 * woken task:
1184 */ 1134 */
1185 if (sync && balanced) { 1135 if (sync && balanced)
1186 if (curr->se.avg_overlap < sysctl_sched_migration_cost && 1136 return 1;
1187 p->se.avg_overlap < sysctl_sched_migration_cost)
1188 return 1;
1189 }
1190 1137
1191 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1138 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1192 tl_per_task = cpu_avg_load_per_task(this_cpu); 1139 tl_per_task = cpu_avg_load_per_task(this_cpu);
1193 1140
1194 if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || 1141 if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
1195 balanced) { 1142 tl_per_task)) {
1196 /* 1143 /*
1197 * This domain has SD_WAKE_AFFINE and 1144 * This domain has SD_WAKE_AFFINE and
1198 * p is cache cold in this domain, and 1145 * p is cache cold in this domain, and
@@ -1211,16 +1158,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1211 struct sched_domain *sd, *this_sd = NULL; 1158 struct sched_domain *sd, *this_sd = NULL;
1212 int prev_cpu, this_cpu, new_cpu; 1159 int prev_cpu, this_cpu, new_cpu;
1213 unsigned long load, this_load; 1160 unsigned long load, this_load;
1214 struct rq *rq, *this_rq; 1161 struct rq *this_rq;
1215 unsigned int imbalance; 1162 unsigned int imbalance;
1216 int idx; 1163 int idx;
1217 1164
1218 prev_cpu = task_cpu(p); 1165 prev_cpu = task_cpu(p);
1219 rq = task_rq(p);
1220 this_cpu = smp_processor_id(); 1166 this_cpu = smp_processor_id();
1221 this_rq = cpu_rq(this_cpu); 1167 this_rq = cpu_rq(this_cpu);
1222 new_cpu = prev_cpu; 1168 new_cpu = prev_cpu;
1223 1169
1170 if (prev_cpu == this_cpu)
1171 goto out;
1224 /* 1172 /*
1225 * 'this_sd' is the first domain that both 1173 * 'this_sd' is the first domain that both
1226 * this_cpu and prev_cpu are present in: 1174 * this_cpu and prev_cpu are present in:
@@ -1248,13 +1196,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1248 load = source_load(prev_cpu, idx); 1196 load = source_load(prev_cpu, idx);
1249 this_load = target_load(this_cpu, idx); 1197 this_load = target_load(this_cpu, idx);
1250 1198
1251 if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, 1199 if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
1252 load, this_load, imbalance)) 1200 load, this_load, imbalance))
1253 return this_cpu; 1201 return this_cpu;
1254 1202
1255 if (prev_cpu == this_cpu)
1256 goto out;
1257
1258 /* 1203 /*
1259 * Start passive balancing when half the imbalance_pct 1204 * Start passive balancing when half the imbalance_pct
1260 * limit is reached. 1205 * limit is reached.
@@ -1281,62 +1226,20 @@ static unsigned long wakeup_gran(struct sched_entity *se)
1281 * + nice tasks. 1226 * + nice tasks.
1282 */ 1227 */
1283 if (sched_feat(ASYM_GRAN)) 1228 if (sched_feat(ASYM_GRAN))
1284 gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); 1229 gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
1285 else
1286 gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
1287 1230
1288 return gran; 1231 return gran;
1289} 1232}
1290 1233
1291/* 1234/*
1292 * Should 'se' preempt 'curr'.
1293 *
1294 * |s1
1295 * |s2
1296 * |s3
1297 * g
1298 * |<--->|c
1299 *
1300 * w(c, s1) = -1
1301 * w(c, s2) = 0
1302 * w(c, s3) = 1
1303 *
1304 */
1305static int
1306wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1307{
1308 s64 gran, vdiff = curr->vruntime - se->vruntime;
1309
1310 if (vdiff < 0)
1311 return -1;
1312
1313 gran = wakeup_gran(curr);
1314 if (vdiff > gran)
1315 return 1;
1316
1317 return 0;
1318}
1319
1320/* return depth at which a sched entity is present in the hierarchy */
1321static inline int depth_se(struct sched_entity *se)
1322{
1323 int depth = 0;
1324
1325 for_each_sched_entity(se)
1326 depth++;
1327
1328 return depth;
1329}
1330
1331/*
1332 * Preempt the current task with a newly woken task if needed: 1235 * Preempt the current task with a newly woken task if needed:
1333 */ 1236 */
1334static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) 1237static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1335{ 1238{
1336 struct task_struct *curr = rq->curr; 1239 struct task_struct *curr = rq->curr;
1337 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1240 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1338 struct sched_entity *se = &curr->se, *pse = &p->se; 1241 struct sched_entity *se = &curr->se, *pse = &p->se;
1339 int se_depth, pse_depth; 1242 s64 delta_exec;
1340 1243
1341 if (unlikely(rt_prio(p->prio))) { 1244 if (unlikely(rt_prio(p->prio))) {
1342 update_rq_clock(rq); 1245 update_rq_clock(rq);
@@ -1351,6 +1254,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1351 cfs_rq_of(pse)->next = pse; 1254 cfs_rq_of(pse)->next = pse;
1352 1255
1353 /* 1256 /*
1257 * We can come here with TIF_NEED_RESCHED already set from new task
1258 * wake up path.
1259 */
1260 if (test_tsk_need_resched(curr))
1261 return;
1262
1263 /*
1354 * Batch tasks do not preempt (their preemption is driven by 1264 * Batch tasks do not preempt (their preemption is driven by
1355 * the tick): 1265 * the tick):
1356 */ 1266 */
@@ -1360,33 +1270,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1360 if (!sched_feat(WAKEUP_PREEMPT)) 1270 if (!sched_feat(WAKEUP_PREEMPT))
1361 return; 1271 return;
1362 1272
1363 /* 1273 if (sched_feat(WAKEUP_OVERLAP) && (sync ||
1364 * preemption test can be made between sibling entities who are in the 1274 (se->avg_overlap < sysctl_sched_migration_cost &&
1365 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of 1275 pse->avg_overlap < sysctl_sched_migration_cost))) {
1366 * both tasks until we find their ancestors who are siblings of common 1276 resched_task(curr);
1367 * parent. 1277 return;
1368 */
1369
1370 /* First walk up until both entities are at same depth */
1371 se_depth = depth_se(se);
1372 pse_depth = depth_se(pse);
1373
1374 while (se_depth > pse_depth) {
1375 se_depth--;
1376 se = parent_entity(se);
1377 }
1378
1379 while (pse_depth > se_depth) {
1380 pse_depth--;
1381 pse = parent_entity(pse);
1382 }
1383
1384 while (!is_same_group(se, pse)) {
1385 se = parent_entity(se);
1386 pse = parent_entity(pse);
1387 } 1278 }
1388 1279
1389 if (wakeup_preempt_entity(se, pse) == 1) 1280 delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
1281 if (delta_exec > wakeup_gran(pse))
1390 resched_task(curr); 1282 resched_task(curr);
1391} 1283}
1392 1284
@@ -1445,19 +1337,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
1445 if (next == &cfs_rq->tasks) 1337 if (next == &cfs_rq->tasks)
1446 return NULL; 1338 return NULL;
1447 1339
1448 /* Skip over entities that are not tasks */ 1340 se = list_entry(next, struct sched_entity, group_node);
1449 do { 1341 p = task_of(se);
1450 se = list_entry(next, struct sched_entity, group_node); 1342 cfs_rq->balance_iterator = next->next;
1451 next = next->next;
1452 } while (next != &cfs_rq->tasks && !entity_is_task(se));
1453
1454 if (next == &cfs_rq->tasks)
1455 return NULL;
1456
1457 cfs_rq->balance_iterator = next;
1458
1459 if (entity_is_task(se))
1460 p = task_of(se);
1461 1343
1462 return p; 1344 return p;
1463} 1345}
@@ -1507,7 +1389,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1507 rcu_read_lock(); 1389 rcu_read_lock();
1508 update_h_load(busiest_cpu); 1390 update_h_load(busiest_cpu);
1509 1391
1510 list_for_each_entry(tg, &task_groups, list) { 1392 list_for_each_entry_rcu(tg, &task_groups, list) {
1511 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; 1393 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
1512 unsigned long busiest_h_load = busiest_cfs_rq->h_load; 1394 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
1513 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 1395 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
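The traversal above runs under the rcu_read_lock() taken a few lines earlier, so it has to use the _rcu list primitive to pair correctly with writers that use list_add_rcu()/list_del_rcu(). A minimal sketch of the pattern with a made-up list (task_groups is the real list being walked here):

    #include <linux/list.h>
    #include <linux/rcupdate.h>
    #include <linux/rculist.h>

    struct item {
            int val;
            struct list_head list;
    };

    static LIST_HEAD(items);                /* hypothetical RCU-protected list */

    static int sum_items(void)
    {
            struct item *it;
            int sum = 0;

            rcu_read_lock();
            /* _rcu variant: safe against concurrent list_add_rcu()/list_del_rcu() */
            list_for_each_entry_rcu(it, &items, list)
                    sum += it->val;
            rcu_read_unlock();

            return sum;
    }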
@@ -1620,10 +1502,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1620 * 'current' within the tree based on its new key value. 1502 * 'current' within the tree based on its new key value.
1621 */ 1503 */
1622 swap(curr->vruntime, se->vruntime); 1504 swap(curr->vruntime, se->vruntime);
1505 resched_task(rq->curr);
1623 } 1506 }
1624 1507
1625 enqueue_task_fair(rq, p, 0); 1508 enqueue_task_fair(rq, p, 0);
1626 resched_task(rq->curr);
1627} 1509}
1628 1510
1629/* 1511/*
@@ -1642,7 +1524,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p,
1642 if (p->prio > oldprio) 1524 if (p->prio > oldprio)
1643 resched_task(rq->curr); 1525 resched_task(rq->curr);
1644 } else 1526 } else
1645 check_preempt_curr(rq, p); 1527 check_preempt_curr(rq, p, 0);
1646} 1528}
1647 1529
1648/* 1530/*
@@ -1659,7 +1541,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p,
1659 if (running) 1541 if (running)
1660 resched_task(rq->curr); 1542 resched_task(rq->curr);
1661 else 1543 else
1662 check_preempt_curr(rq, p); 1544 check_preempt_curr(rq, p, 0);
1663} 1545}
1664 1546
1665/* Account for a task changing its policy or group. 1547/* Account for a task changing its policy or group.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 9353ca78154e..7c9e8f4a049f 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -11,3 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1)
11SCHED_FEAT(LB_BIAS, 1) 11SCHED_FEAT(LB_BIAS, 1)
12SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 12SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
13SCHED_FEAT(ASYM_EFF_LOAD, 1) 13SCHED_FEAT(ASYM_EFF_LOAD, 1)
14SCHED_FEAT(WAKEUP_OVERLAP, 0)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 3a4f92dbbe66..dec4ccabe2f5 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
14/* 14/*
15 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
16 */ 16 */
17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) 17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
18{ 18{
19 resched_task(rq->idle); 19 resched_task(rq->idle);
20} 20}
@@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p,
76 if (running) 76 if (running)
77 resched_task(rq->curr); 77 resched_task(rq->curr);
78 else 78 else
79 check_preempt_curr(rq, p); 79 check_preempt_curr(rq, p, 0);
80} 80}
81 81
82static void prio_changed_idle(struct rq *rq, struct task_struct *p, 82static void prio_changed_idle(struct rq *rq, struct task_struct *p,
@@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
93 if (p->prio > oldprio) 93 if (p->prio > oldprio)
94 resched_task(rq->curr); 94 resched_task(rq->curr);
95 } else 95 } else
96 check_preempt_curr(rq, p); 96 check_preempt_curr(rq, p, 0);
97} 97}
98 98
99/* 99/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 1113157b2058..cdf5740ab03e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
102 102
103static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 103static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
104{ 104{
105 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
105 struct sched_rt_entity *rt_se = rt_rq->rt_se; 106 struct sched_rt_entity *rt_se = rt_rq->rt_se;
106 107
107 if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { 108 if (rt_rq->rt_nr_running) {
108 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 109 if (rt_se && !on_rt_rq(rt_se))
109 110 enqueue_rt_entity(rt_se);
110 enqueue_rt_entity(rt_se);
111 if (rt_rq->highest_prio < curr->prio) 111 if (rt_rq->highest_prio < curr->prio)
112 resched_task(curr); 112 resched_task(curr);
113 } 113 }
@@ -231,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
231#endif /* CONFIG_RT_GROUP_SCHED */ 231#endif /* CONFIG_RT_GROUP_SCHED */
232 232
233#ifdef CONFIG_SMP 233#ifdef CONFIG_SMP
234/*
235 * We ran out of runtime, see if we can borrow some from our neighbours.
236 */
234static int do_balance_runtime(struct rt_rq *rt_rq) 237static int do_balance_runtime(struct rt_rq *rt_rq)
235{ 238{
236 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 239 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@ -250,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
250 continue; 253 continue;
251 254
252 spin_lock(&iter->rt_runtime_lock); 255 spin_lock(&iter->rt_runtime_lock);
256 /*
257 * Either all rqs have inf runtime and there's nothing to steal
258 * or __disable_runtime() below sets a specific rq to inf to
259 * indicate its been disabled and disalow stealing.
260 */
253 if (iter->rt_runtime == RUNTIME_INF) 261 if (iter->rt_runtime == RUNTIME_INF)
254 goto next; 262 goto next;
255 263
264 /*
265 * From runqueues with spare time, take 1/n part of their
266 * spare time, but no more than our period.
267 */
256 diff = iter->rt_runtime - iter->rt_time; 268 diff = iter->rt_runtime - iter->rt_time;
257 if (diff > 0) { 269 if (diff > 0) {
258 diff = div_u64((u64)diff, weight); 270 diff = div_u64((u64)diff, weight);
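The new comments spell out the borrowing rule: a starved rt_rq may take up to 1/weight of each neighbour's spare time (runtime minus what the neighbour has already consumed), where weight is the number of CPUs sharing the bandwidth, and it never grows past its own period. A user-space sketch of a single borrow step, with illustrative microsecond values:

    #include <stdio.h>
    #include <stdint.h>

    /* one step of the gist of do_balance_runtime(): take 1/weight of a
     * neighbour's spare time, capped so our runtime never exceeds our period */
    static int64_t borrow(int64_t spare, int weight,
                          uint64_t my_runtime, uint64_t my_period)
    {
            int64_t diff;

            if (spare <= 0)
                    return 0;

            diff = spare / weight;
            if (my_runtime + diff > my_period)
                    diff = my_period - my_runtime;

            return diff;
    }

    int main(void)
    {
            /* neighbour has 30ms spare, 4 CPUs share it, we hold 95ms of a 100ms period */
            printf("borrowed %lld us\n",
                   (long long)borrow(30000, 4, 95000, 100000));     /* 5000, capped */
            return 0;
    }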
@@ -274,6 +286,9 @@ next:
274 return more; 286 return more;
275} 287}
276 288
289/*
290 * Ensure this RQ takes back all the runtime it lent to its neighbours.
291 */
277static void __disable_runtime(struct rq *rq) 292static void __disable_runtime(struct rq *rq)
278{ 293{
279 struct root_domain *rd = rq->rd; 294 struct root_domain *rd = rq->rd;
@@ -289,17 +304,33 @@ static void __disable_runtime(struct rq *rq)
289 304
290 spin_lock(&rt_b->rt_runtime_lock); 305 spin_lock(&rt_b->rt_runtime_lock);
291 spin_lock(&rt_rq->rt_runtime_lock); 306 spin_lock(&rt_rq->rt_runtime_lock);
307 /*
308 * Either we're all inf and nobody needs to borrow, or we're
309 * already disabled and thus have nothing to do, or we have
310 * exactly the right amount of runtime to take out.
311 */
292 if (rt_rq->rt_runtime == RUNTIME_INF || 312 if (rt_rq->rt_runtime == RUNTIME_INF ||
293 rt_rq->rt_runtime == rt_b->rt_runtime) 313 rt_rq->rt_runtime == rt_b->rt_runtime)
294 goto balanced; 314 goto balanced;
295 spin_unlock(&rt_rq->rt_runtime_lock); 315 spin_unlock(&rt_rq->rt_runtime_lock);
296 316
317 /*
318 * Calculate the difference between what we started out with
319 * and what we currently have; that's the amount of runtime
320 * we lent and now have to reclaim.
321 */
297 want = rt_b->rt_runtime - rt_rq->rt_runtime; 322 want = rt_b->rt_runtime - rt_rq->rt_runtime;
298 323
324 /*
325 * Greedy reclaim, take back as much as we can.
326 */
299 for_each_cpu_mask(i, rd->span) { 327 for_each_cpu_mask(i, rd->span) {
300 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
301 s64 diff; 329 s64 diff;
302 330
331 /*
332 * Can't reclaim from ourselves or disabled runqueues.
333 */
303 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) 334 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
304 continue; 335 continue;
305 336
@@ -319,8 +350,16 @@ static void __disable_runtime(struct rq *rq)
319 } 350 }
320 351
321 spin_lock(&rt_rq->rt_runtime_lock); 352 spin_lock(&rt_rq->rt_runtime_lock);
353 /*
354 * We cannot be left wanting - that would mean some runtime
355 * leaked out of the system.
356 */
322 BUG_ON(want); 357 BUG_ON(want);
323balanced: 358balanced:
359 /*
360 * Disable all the borrow logic by pretending we have inf
361 * runtime - in which case borrowing doesn't make sense.
362 */
324 rt_rq->rt_runtime = RUNTIME_INF; 363 rt_rq->rt_runtime = RUNTIME_INF;
325 spin_unlock(&rt_rq->rt_runtime_lock); 364 spin_unlock(&rt_rq->rt_runtime_lock);
326 spin_unlock(&rt_b->rt_runtime_lock); 365 spin_unlock(&rt_b->rt_runtime_lock);
@@ -343,6 +382,9 @@ static void __enable_runtime(struct rq *rq)
343 if (unlikely(!scheduler_running)) 382 if (unlikely(!scheduler_running))
344 return; 383 return;
345 384
385 /*
386 * Reset each runqueue's bandwidth settings
387 */
346 for_each_leaf_rt_rq(rt_rq, rq) { 388 for_each_leaf_rt_rq(rt_rq, rq) {
347 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 389 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
348 390
@@ -389,7 +431,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
389 int i, idle = 1; 431 int i, idle = 1;
390 cpumask_t span; 432 cpumask_t span;
391 433
392 if (rt_b->rt_runtime == RUNTIME_INF) 434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
393 return 1; 435 return 1;
394 436
395 span = sched_rt_period_mask(); 437 span = sched_rt_period_mask();
@@ -487,6 +529,9 @@ static void update_curr_rt(struct rq *rq)
487 curr->se.exec_start = rq->clock; 529 curr->se.exec_start = rq->clock;
488 cpuacct_charge(curr, delta_exec); 530 cpuacct_charge(curr, delta_exec);
489 531
532 if (!rt_bandwidth_enabled())
533 return;
534
490 for_each_sched_rt_entity(rt_se) { 535 for_each_sched_rt_entity(rt_se) {
491 rt_rq = rt_rq_of_se(rt_se); 536 rt_rq = rt_rq_of_se(rt_se);
492 537
@@ -784,7 +829,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
784/* 829/*
785 * Preempt the current task with a newly woken task if needed: 830 * Preempt the current task with a newly woken task if needed:
786 */ 831 */
787static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) 832static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
788{ 833{
789 if (p->prio < rq->curr->prio) { 834 if (p->prio < rq->curr->prio) {
790 resched_task(rq->curr); 835 resched_task(rq->curr);
diff --git a/kernel/user.c b/kernel/user.c
index 865ecf57a096..39d6159fae43 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169{ 169{
170 struct user_struct *up = container_of(kobj, struct user_struct, kobj); 170 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
171 171
172 return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); 172 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
173} 173}
174 174
175static ssize_t cpu_rt_runtime_store(struct kobject *kobj, 175static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
@@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
180 unsigned long rt_runtime; 180 unsigned long rt_runtime;
181 int rc; 181 int rc;
182 182
183 sscanf(buf, "%lu", &rt_runtime); 183 sscanf(buf, "%ld", &rt_runtime);
184 184
185 rc = sched_group_set_rt_runtime(up->tg, rt_runtime); 185 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
186 186
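The %lu -> %ld switch matters because sched_group_rt_runtime() reports an unlimited budget as -1; printed unsigned, that shows up as a huge wrapped value instead. A tiny user-space sketch of the difference:

    #include <stdio.h>

    int main(void)
    {
            long rt_runtime = -1;   /* "unlimited", as reported for RUNTIME_INF */

            printf("%%ld: %ld\n", rt_runtime);                       /* -1 */
            printf("%%lu: %lu\n", (unsigned long)rt_runtime);        /* ULONG_MAX */
            return 0;
    }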