-rw-r--r--  Documentation/scheduler/sched-deadline.txt  340
-rw-r--r--  arch/arm/kernel/topology.c  4
-rw-r--r--  arch/cris/arch-v10/drivers/sync_serial.c  1
-rw-r--r--  arch/cris/arch-v32/drivers/sync_serial.c  1
-rw-r--r--  arch/ia64/include/asm/processor.h  1
-rw-r--r--  arch/mips/include/asm/processor.h  6
-rw-r--r--  arch/powerpc/include/asm/cputime.h  2
-rw-r--r--  arch/powerpc/mm/fault.c  5
-rw-r--r--  arch/s390/include/asm/cputime.h  2
-rw-r--r--  arch/um/drivers/random.c  1
-rw-r--r--  arch/x86/kernel/smpboot.c  55
-rw-r--r--  arch/x86/mm/fault.c  5
-rw-r--r--  drivers/cpuidle/cpuidle.c  15
-rw-r--r--  drivers/gpu/vga/vgaarb.c  1
-rw-r--r--  drivers/md/dm-bufio.c  1
-rw-r--r--  drivers/parisc/power.c  1
-rw-r--r--  drivers/s390/net/claw.c  2
-rw-r--r--  drivers/scsi/fcoe/fcoe.c  1
-rw-r--r--  drivers/scsi/qla2xxx/qla_os.c  1
-rw-r--r--  drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c  3
-rw-r--r--  drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c  1
-rw-r--r--  drivers/staging/lustre/lustre/libcfs/fail.c  1
-rw-r--r--  drivers/tty/bfin_jtag_comm.c  1
-rw-r--r--  fs/afs/vlocation.c  1
-rw-r--r--  fs/jfs/jfs_logmgr.c  2
-rw-r--r--  fs/jfs/jfs_txnmgr.c  3
-rw-r--r--  fs/nfs/blocklayout/rpc_pipefs.c  1
-rw-r--r--  fs/nfsd/nfs4recover.c  1
-rw-r--r--  include/asm-generic/cputime_jiffies.h  2
-rw-r--r--  include/asm-generic/cputime_nsecs.h  2
-rw-r--r--  include/linux/sched.h  6
-rw-r--r--  include/linux/seqlock.h  19
-rw-r--r--  include/linux/smp.h  2
-rw-r--r--  include/linux/wait.h  16
-rw-r--r--  init/main.c  1
-rw-r--r--  kernel/exit.c  47
-rw-r--r--  kernel/fork.c  13
-rw-r--r--  kernel/sched/auto_group.c  5
-rw-r--r--  kernel/sched/core.c  295
-rw-r--r--  kernel/sched/cpudeadline.c  4
-rw-r--r--  kernel/sched/cputime.c  64
-rw-r--r--  kernel/sched/deadline.c  33
-rw-r--r--  kernel/sched/debug.c  13
-rw-r--r--  kernel/sched/fair.c  479
-rw-r--r--  kernel/sched/idle.c  6
-rw-r--r--  kernel/sched/rt.c  21
-rw-r--r--  kernel/sched/sched.h  80
-rw-r--r--  kernel/sched/stop_task.c  2
-rw-r--r--  kernel/smp.c  22
-rw-r--r--  kernel/sys.c  2
-rw-r--r--  kernel/time/hrtimer.c  1
-rw-r--r--  kernel/time/posix-cpu-timers.c  14
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c  3
-rw-r--r--  kernel/trace/trace_stack.c  4
-rw-r--r--  lib/Kconfig.debug  12
55 files changed, 1075 insertions, 552 deletions
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt
index 18adc92a6b3b..21461a0441c1 100644
--- a/Documentation/scheduler/sched-deadline.txt
+++ b/Documentation/scheduler/sched-deadline.txt
@@ -15,6 +15,8 @@ CONTENTS
15 5. Tasks CPU affinity 15 5. Tasks CPU affinity
16 5.1 SCHED_DEADLINE and cpusets HOWTO 16 5.1 SCHED_DEADLINE and cpusets HOWTO
17 6. Future plans 17 6. Future plans
18 A. Test suite
19 B. Minimal main()
18 20
19 21
200. WARNING 220. WARNING
@@ -38,24 +40,25 @@ CONTENTS
38================== 40==================
39 41
40 SCHED_DEADLINE uses three parameters, named "runtime", "period", and 42 SCHED_DEADLINE uses three parameters, named "runtime", "period", and
41 "deadline" to schedule tasks. A SCHED_DEADLINE task is guaranteed to receive 43 "deadline", to schedule tasks. A SCHED_DEADLINE task should receive
42 "runtime" microseconds of execution time every "period" microseconds, and 44 "runtime" microseconds of execution time every "period" microseconds, and
43 these "runtime" microseconds are available within "deadline" microseconds 45 these "runtime" microseconds are available within "deadline" microseconds
44 from the beginning of the period. In order to implement this behaviour, 46 from the beginning of the period. In order to implement this behaviour,
45 every time the task wakes up, the scheduler computes a "scheduling deadline" 47 every time the task wakes up, the scheduler computes a "scheduling deadline"
46 consistent with the guarantee (using the CBS[2,3] algorithm). Tasks are then 48 consistent with the guarantee (using the CBS[2,3] algorithm). Tasks are then
47 scheduled using EDF[1] on these scheduling deadlines (the task with the 49 scheduled using EDF[1] on these scheduling deadlines (the task with the
48 smallest scheduling deadline is selected for execution). Notice that this 50 earliest scheduling deadline is selected for execution). Notice that the
49 guaranteed is respected if a proper "admission control" strategy (see Section 51 task actually receives "runtime" time units within "deadline" if a proper
50 "4. Bandwidth management") is used. 52 "admission control" strategy (see Section "4. Bandwidth management") is used
53 (clearly, if the system is overloaded this guarantee cannot be respected).
51 54
52 Summing up, the CBS[2,3] algorithms assigns scheduling deadlines to tasks so 55 Summing up, the CBS[2,3] algorithms assigns scheduling deadlines to tasks so
53 that each task runs for at most its runtime every period, avoiding any 56 that each task runs for at most its runtime every period, avoiding any
54 interference between different tasks (bandwidth isolation), while the EDF[1] 57 interference between different tasks (bandwidth isolation), while the EDF[1]
55 algorithm selects the task with the smallest scheduling deadline as the one 58 algorithm selects the task with the earliest scheduling deadline as the one
56 to be executed first. Thanks to this feature, also tasks that do not 59 to be executed next. Thanks to this feature, tasks that do not strictly comply
57 strictly comply with the "traditional" real-time task model (see Section 3) 60 with the "traditional" real-time task model (see Section 3) can effectively
58 can effectively use the new policy. 61 use the new policy.
59 62
60 In more details, the CBS algorithm assigns scheduling deadlines to 63 In more details, the CBS algorithm assigns scheduling deadlines to
61 tasks in the following way: 64 tasks in the following way:
@@ -64,45 +67,45 @@ CONTENTS
64 "deadline", and "period" parameters; 67 "deadline", and "period" parameters;
65 68
66 - The state of the task is described by a "scheduling deadline", and 69 - The state of the task is described by a "scheduling deadline", and
67 a "current runtime". These two parameters are initially set to 0; 70 a "remaining runtime". These two parameters are initially set to 0;
68 71
69 - When a SCHED_DEADLINE task wakes up (becomes ready for execution), 72 - When a SCHED_DEADLINE task wakes up (becomes ready for execution),
70 the scheduler checks if 73 the scheduler checks if
71 74
72 current runtime runtime 75 remaining runtime runtime
73 ---------------------------------- > ---------------- 76 ---------------------------------- > ---------
74 scheduling deadline - current time period 77 scheduling deadline - current time period
75 78
76 then, if the scheduling deadline is smaller than the current time, or 79 then, if the scheduling deadline is smaller than the current time, or
77 this condition is verified, the scheduling deadline and the 80 this condition is verified, the scheduling deadline and the
78 current budget are re-initialised as 81 remaining runtime are re-initialised as
79 82
80 scheduling deadline = current time + deadline 83 scheduling deadline = current time + deadline
81 current runtime = runtime 84 remaining runtime = runtime
82 85
83 otherwise, the scheduling deadline and the current runtime are 86 otherwise, the scheduling deadline and the remaining runtime are
84 left unchanged; 87 left unchanged;
85 88
86 - When a SCHED_DEADLINE task executes for an amount of time t, its 89 - When a SCHED_DEADLINE task executes for an amount of time t, its
87 current runtime is decreased as 90 remaining runtime is decreased as
88 91
89 current runtime = current runtime - t 92 remaining runtime = remaining runtime - t
90 93
91 (technically, the runtime is decreased at every tick, or when the 94 (technically, the runtime is decreased at every tick, or when the
92 task is descheduled / preempted); 95 task is descheduled / preempted);
93 96
94 - When the current runtime becomes less or equal than 0, the task is 97 - When the remaining runtime becomes less or equal than 0, the task is
95 said to be "throttled" (also known as "depleted" in real-time literature) 98 said to be "throttled" (also known as "depleted" in real-time literature)
96 and cannot be scheduled until its scheduling deadline. The "replenishment 99 and cannot be scheduled until its scheduling deadline. The "replenishment
97 time" for this task (see next item) is set to be equal to the current 100 time" for this task (see next item) is set to be equal to the current
98 value of the scheduling deadline; 101 value of the scheduling deadline;
99 102
100 - When the current time is equal to the replenishment time of a 103 - When the current time is equal to the replenishment time of a
101 throttled task, the scheduling deadline and the current runtime are 104 throttled task, the scheduling deadline and the remaining runtime are
102 updated as 105 updated as
103 106
104 scheduling deadline = scheduling deadline + period 107 scheduling deadline = scheduling deadline + period
105 current runtime = current runtime + runtime 108 remaining runtime = remaining runtime + runtime
106 109
107 110
1083. Scheduling Real-Time Tasks 1113. Scheduling Real-Time Tasks
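
[For readers who prefer code to prose: the CBS rules listed in the hunk above can be condensed into a short sketch. This is a simplified user-space model of the bookkeeping only; the struct and function names are made up for illustration, and the real implementation in kernel/sched/deadline.c handles many additional details.]

    /* Simplified model of the CBS bookkeeping described above. */
    struct cbs_task {
        long long runtime;      /* reserved runtime per period  */
        long long deadline;     /* relative deadline            */
        long long period;       /* reservation period           */
        long long sched_dl;     /* current scheduling deadline  */
        long long rem_runtime;  /* remaining runtime            */
    };

    /* Rule applied when the task wakes up. The check
     * rem_runtime / (sched_dl - now) > runtime / period
     * is written with a cross-multiplication to avoid divisions. */
    static void cbs_wakeup(struct cbs_task *t, long long now)
    {
        if (t->sched_dl < now ||
            t->rem_runtime * t->period > (t->sched_dl - now) * t->runtime) {
            t->sched_dl = now + t->deadline;
            t->rem_runtime = t->runtime;
        }
        /* otherwise, deadline and remaining runtime are left unchanged */
    }

    /* Rule applied after the task has executed for time 'delta';
     * returns non-zero if the task is now throttled. */
    static int cbs_account(struct cbs_task *t, long long delta)
    {
        t->rem_runtime -= delta;
        return t->rem_runtime <= 0;
    }

    /* Rule applied at the replenishment time (the old scheduling deadline). */
    static void cbs_replenish(struct cbs_task *t)
    {
        t->sched_dl += t->period;
        t->rem_runtime += t->runtime;
    }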
@@ -134,6 +137,50 @@ CONTENTS
134 A real-time task can be periodic with period P if r_{j+1} = r_j + P, or 137 A real-time task can be periodic with period P if r_{j+1} = r_j + P, or
135 sporadic with minimum inter-arrival time P is r_{j+1} >= r_j + P. Finally, 138 sporadic with minimum inter-arrival time P is r_{j+1} >= r_j + P. Finally,
136 d_j = r_j + D, where D is the task's relative deadline. 139 d_j = r_j + D, where D is the task's relative deadline.
140 The utilisation of a real-time task is defined as the ratio between its
141 WCET and its period (or minimum inter-arrival time), and represents
142 the fraction of CPU time needed to execute the task.
143
144 If the total utilisation sum_i(WCET_i/P_i) is larger than M (with M equal
145 to the number of CPUs), then the scheduler is unable to respect all the
146 deadlines.
147 Note that total utilisation is defined as the sum of the utilisations
148 WCET_i/P_i over all the real-time tasks in the system. When considering
149 multiple real-time tasks, the parameters of the i-th task are indicated
150 with the "_i" suffix.
151 Moreover, if the total utilisation is larger than M, then we risk starving
152 non- real-time tasks by real-time tasks.
153 If, instead, the total utilisation is smaller than M, then non real-time
154 tasks will not be starved and the system might be able to respect all the
155 deadlines.
156 As a matter of fact, in this case it is possible to provide an upper bound
157 for tardiness (defined as the maximum between 0 and the difference
158 between the finishing time of a job and its absolute deadline).
159 More precisely, it can be proven that using a global EDF scheduler the
160 maximum tardiness of each task is smaller or equal than
161 ((M − 1) · WCET_max − WCET_min)/(M − (M − 2) · U_max) + WCET_max
162 where WCET_max = max_i{WCET_i} is the maximum WCET, WCET_min=min_i{WCET_i}
163 is the minimum WCET, and U_max = max_i{WCET_i/P_i} is the maximum utilisation.
164
165 If M=1 (uniprocessor system), or in case of partitioned scheduling (each
166 real-time task is statically assigned to one and only one CPU), it is
167 possible to formally check if all the deadlines are respected.
168 If D_i = P_i for all tasks, then EDF is able to respect all the deadlines
169 of all the tasks executing on a CPU if and only if the total utilisation
170 of the tasks running on such a CPU is smaller or equal than 1.
171 If D_i != P_i for some task, then it is possible to define the density of
172 a task as C_i/min{D_i,T_i}, and EDF is able to respect all the deadlines
173 of all the tasks running on a CPU if the sum sum_i C_i/min{D_i,T_i} of the
174 densities of the tasks running on such a CPU is smaller or equal than 1
175 (notice that this condition is only sufficient, and not necessary).
176
177 On multiprocessor systems with global EDF scheduling (non partitioned
178 systems), a sufficient test for schedulability can not be based on the
179 utilisations (it can be shown that task sets with utilisations slightly
180 larger than 1 can miss deadlines regardless of the number of CPUs M).
181 However, as previously stated, enforcing that the total utilisation is smaller
182 than M is enough to guarantee that non real-time tasks are not starved and
183 that the tardiness of real-time tasks has an upper bound.
137 184
138 SCHED_DEADLINE can be used to schedule real-time tasks guaranteeing that 185 SCHED_DEADLINE can be used to schedule real-time tasks guaranteeing that
139 the jobs' deadlines of a task are respected. In order to do this, a task 186 the jobs' deadlines of a task are respected. In order to do this, a task
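
[As a rough illustration of the uniprocessor/partitioned admission tests described in the hunk above, the sketch below checks the exact utilisation test (D_i = P_i) and the sufficient density test (D_i != P_i) for a task set. It is purely illustrative, not kernel code; the types and helper names are invented.]

    struct rt_task {
        double wcet;     /* C_i: worst-case execution time          */
        double deadline; /* D_i: relative deadline                  */
        double period;   /* P_i (a.k.a. T_i): period or min. inter-arrival */
    };

    /* Exact test when D_i == P_i for all tasks on one CPU:
     * EDF meets all deadlines iff sum(C_i / P_i) <= 1. */
    static int edf_utilisation_test(const struct rt_task *ts, int n)
    {
        double u = 0.0;
        int i;

        for (i = 0; i < n; i++)
            u += ts[i].wcet / ts[i].period;
        return u <= 1.0;
    }

    /* Sufficient (not necessary) test when some D_i != P_i:
     * sum(C_i / min(D_i, P_i)) <= 1. */
    static int edf_density_test(const struct rt_task *ts, int n)
    {
        double d = 0.0;
        int i;

        for (i = 0; i < n; i++) {
            double win = ts[i].deadline < ts[i].period ?
                         ts[i].deadline : ts[i].period;
            d += ts[i].wcet / win;
        }
        return d <= 1.0;
    }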
@@ -147,6 +194,8 @@ CONTENTS
147 and the absolute deadlines (d_j) coincide, so a proper admission control 194 and the absolute deadlines (d_j) coincide, so a proper admission control
148 allows to respect the jobs' absolute deadlines for this task (this is what is 195 allows to respect the jobs' absolute deadlines for this task (this is what is
149 called "hard schedulability property" and is an extension of Lemma 1 of [2]). 196 called "hard schedulability property" and is an extension of Lemma 1 of [2]).
197 Notice that if runtime > deadline the admission control will surely reject
198 this task, as it is not possible to respect its temporal constraints.
150 199
151 References: 200 References:
152 1 - C. L. Liu and J. W. Layland. Scheduling algorithms for multiprogram- 201 1 - C. L. Liu and J. W. Layland. Scheduling algorithms for multiprogram-
@@ -156,46 +205,57 @@ CONTENTS
156 Real-Time Systems. Proceedings of the 19th IEEE Real-time Systems 205 Real-Time Systems. Proceedings of the 19th IEEE Real-time Systems
157 Symposium, 1998. http://retis.sssup.it/~giorgio/paps/1998/rtss98-cbs.pdf 206 Symposium, 1998. http://retis.sssup.it/~giorgio/paps/1998/rtss98-cbs.pdf
158 3 - L. Abeni. Server Mechanisms for Multimedia Applications. ReTiS Lab 207 3 - L. Abeni. Server Mechanisms for Multimedia Applications. ReTiS Lab
159 Technical Report. http://xoomer.virgilio.it/lucabe72/pubs/tr-98-01.ps 208 Technical Report. http://disi.unitn.it/~abeni/tr-98-01.pdf
160 209
1614. Bandwidth management 2104. Bandwidth management
162======================= 211=======================
163 212
164 In order for the -deadline scheduling to be effective and useful, it is 213 As previously mentioned, in order for -deadline scheduling to be
165 important to have some method to keep the allocation of the available CPU 214 effective and useful (that is, to be able to provide "runtime" time units
166 bandwidth to the tasks under control. 215 within "deadline"), it is important to have some method to keep the allocation
167 This is usually called "admission control" and if it is not performed at all, 216 of the available fractions of CPU time to the various tasks under control.
217 This is usually called "admission control" and if it is not performed, then
168 no guarantee can be given on the actual scheduling of the -deadline tasks. 218 no guarantee can be given on the actual scheduling of the -deadline tasks.
169 219
170 Since when RT-throttling has been introduced each task group has a bandwidth 220 As already stated in Section 3, a necessary condition to be respected to
171 associated, calculated as a certain amount of runtime over a period. 221 correctly schedule a set of real-time tasks is that the total utilisation
172 Moreover, to make it possible to manipulate such bandwidth, readable/writable 222 is smaller than M. When talking about -deadline tasks, this requires that
173 controls have been added to both procfs (for system wide settings) and cgroupfs 223 the sum of the ratio between runtime and period for all tasks is smaller
174 (for per-group settings). 224 than M. Notice that the ratio runtime/period is equivalent to the utilisation
175 Therefore, the same interface is being used for controlling the bandwidth 225 of a "traditional" real-time task, and is also often referred to as
176 distrubution to -deadline tasks. 226 "bandwidth".
177 227 The interface used to control the CPU bandwidth that can be allocated
178 However, more discussion is needed in order to figure out how we want to manage 228 to -deadline tasks is similar to the one already used for -rt
179 SCHED_DEADLINE bandwidth at the task group level. Therefore, SCHED_DEADLINE 229 tasks with real-time group scheduling (a.k.a. RT-throttling - see
180 uses (for now) a less sophisticated, but actually very sensible, mechanism to 230 Documentation/scheduler/sched-rt-group.txt), and is based on readable/
181 ensure that a certain utilization cap is not overcome per each root_domain. 231 writable control files located in procfs (for system wide settings).
182 232 Notice that per-group settings (controlled through cgroupfs) are still not
183 Another main difference between deadline bandwidth management and RT-throttling 233 defined for -deadline tasks, because more discussion is needed in order to
234 figure out how we want to manage SCHED_DEADLINE bandwidth at the task group
235 level.
236
237 A main difference between deadline bandwidth management and RT-throttling
184 is that -deadline tasks have bandwidth on their own (while -rt ones don't!), 238 is that -deadline tasks have bandwidth on their own (while -rt ones don't!),
185 and thus we don't need an higher level throttling mechanism to enforce the 239 and thus we don't need a higher level throttling mechanism to enforce the
186 desired bandwidth. 240 desired bandwidth. In other words, this means that interface parameters are
241 only used at admission control time (i.e., when the user calls
242 sched_setattr()). Scheduling is then performed considering actual tasks'
243 parameters, so that CPU bandwidth is allocated to SCHED_DEADLINE tasks
244 respecting their needs in terms of granularity. Therefore, using this simple
245 interface we can put a cap on total utilization of -deadline tasks (i.e.,
246 \Sum (runtime_i / period_i) < global_dl_utilization_cap).
187 247
1884.1 System wide settings 2484.1 System wide settings
189------------------------ 249------------------------
190 250
191 The system wide settings are configured under the /proc virtual file system. 251 The system wide settings are configured under the /proc virtual file system.
192 252
193 For now the -rt knobs are used for dl admission control and the -deadline 253 For now the -rt knobs are used for -deadline admission control and the
194 runtime is accounted against the -rt runtime. We realise that this isn't 254 -deadline runtime is accounted against the -rt runtime. We realise that this
195 entirely desirable; however, it is better to have a small interface for now, 255 isn't entirely desirable; however, it is better to have a small interface for
196 and be able to change it easily later. The ideal situation (see 5.) is to run 256 now, and be able to change it easily later. The ideal situation (see 5.) is to
197 -rt tasks from a -deadline server; in which case the -rt bandwidth is a direct 257 run -rt tasks from a -deadline server; in which case the -rt bandwidth is a
198 subset of dl_bw. 258 direct subset of dl_bw.
199 259
200 This means that, for a root_domain comprising M CPUs, -deadline tasks 260 This means that, for a root_domain comprising M CPUs, -deadline tasks
201 can be created while the sum of their bandwidths stays below: 261 can be created while the sum of their bandwidths stays below:
@@ -231,8 +291,16 @@ CONTENTS
231 950000. With rt_period equal to 1000000, by default, it means that -deadline 291 950000. With rt_period equal to 1000000, by default, it means that -deadline
232 tasks can use at most 95%, multiplied by the number of CPUs that compose the 292 tasks can use at most 95%, multiplied by the number of CPUs that compose the
233 root_domain, for each root_domain. 293 root_domain, for each root_domain.
294 This means that non -deadline tasks will receive at least 5% of the CPU time,
295 and that -deadline tasks will receive their runtime with a guaranteed
296 worst-case delay respect to the "deadline" parameter. If "deadline" = "period"
297 and the cpuset mechanism is used to implement partitioned scheduling (see
298 Section 5), then this simple setting of the bandwidth management is able to
299 deterministically guarantee that -deadline tasks will receive their runtime
300 in a period.
234 301
235 A -deadline task cannot fork. 302 Finally, notice that in order not to jeopardize the admission control a
303 -deadline task cannot fork.
236 304
2375. Tasks CPU affinity 3055. Tasks CPU affinity
238===================== 306=====================
@@ -279,3 +347,179 @@ CONTENTS
279 throttling patches [https://lkml.org/lkml/2010/2/23/239] but we still are in 347 throttling patches [https://lkml.org/lkml/2010/2/23/239] but we still are in
280 the preliminary phases of the merge and we really seek feedback that would 348 the preliminary phases of the merge and we really seek feedback that would
281 help us decide on the direction it should take. 349 help us decide on the direction it should take.
350
351Appendix A. Test suite
352======================
353
354 The SCHED_DEADLINE policy can be easily tested using two applications that
355 are part of a wider Linux Scheduler validation suite. The suite is
356 available as a GitHub repository: https://github.com/scheduler-tools.
357
358 The first testing application is called rt-app and can be used to
359 start multiple threads with specific parameters. rt-app supports
360 SCHED_{OTHER,FIFO,RR,DEADLINE} scheduling policies and their related
361 parameters (e.g., niceness, priority, runtime/deadline/period). rt-app
362 is a valuable tool, as it can be used to synthetically recreate certain
363 workloads (maybe mimicking real use-cases) and evaluate how the scheduler
364 behaves under such workloads. In this way, results are easily reproducible.
365 rt-app is available at: https://github.com/scheduler-tools/rt-app.
366
367 Thread parameters can be specified from the command line, with something like
368 this:
369
370 # rt-app -t 100000:10000:d -t 150000:20000:f:10 -D5
371
372 The above creates 2 threads. The first one, scheduled by SCHED_DEADLINE,
373 executes for 10ms every 100ms. The second one, scheduled at SCHED_FIFO
374 priority 10, executes for 20ms every 150ms. The test will run for a total
375 of 5 seconds.
376
377 More interestingly, configurations can be described with a json file that
378 can be passed as input to rt-app with something like this:
379
380 # rt-app my_config.json
381
382 The parameters that can be specified with the second method are a superset
383 of the command line options. Please refer to rt-app documentation for more
384 details (<rt-app-sources>/doc/*.json).
385
386 The second testing application is a modification of schedtool, called
387 schedtool-dl, which can be used to setup SCHED_DEADLINE parameters for a
388 certain pid/application. schedtool-dl is available at:
389 https://github.com/scheduler-tools/schedtool-dl.git.
390
391 The usage is straightforward:
392
393 # schedtool -E -t 10000000:100000000 -e ./my_cpuhog_app
394
395 With this, my_cpuhog_app is put to run inside a SCHED_DEADLINE reservation
396 of 10ms every 100ms (note that parameters are expressed in microseconds).
397 You can also use schedtool to create a reservation for an already running
398 application, given that you know its pid:
399
400 # schedtool -E -t 10000000:100000000 my_app_pid
401
402Appendix B. Minimal main()
403==========================
404
405 We provide in what follows a simple (ugly) self-contained code snippet
406 showing how SCHED_DEADLINE reservations can be created by a real-time
407 application developer.
408
409 #define _GNU_SOURCE
410 #include <unistd.h>
411 #include <stdio.h>
412 #include <stdlib.h>
413 #include <string.h>
414 #include <time.h>
415 #include <linux/unistd.h>
416 #include <linux/kernel.h>
417 #include <linux/types.h>
418 #include <sys/syscall.h>
419 #include <pthread.h>
420
421 #define gettid() syscall(__NR_gettid)
422
423 #define SCHED_DEADLINE 6
424
425 /* XXX use the proper syscall numbers */
426 #ifdef __x86_64__
427 #define __NR_sched_setattr 314
428 #define __NR_sched_getattr 315
429 #endif
430
431 #ifdef __i386__
432 #define __NR_sched_setattr 351
433 #define __NR_sched_getattr 352
434 #endif
435
436 #ifdef __arm__
437 #define __NR_sched_setattr 380
438 #define __NR_sched_getattr 381
439 #endif
440
441 static volatile int done;
442
443 struct sched_attr {
444 __u32 size;
445
446 __u32 sched_policy;
447 __u64 sched_flags;
448
449 /* SCHED_NORMAL, SCHED_BATCH */
450 __s32 sched_nice;
451
452 /* SCHED_FIFO, SCHED_RR */
453 __u32 sched_priority;
454
455 /* SCHED_DEADLINE (nsec) */
456 __u64 sched_runtime;
457 __u64 sched_deadline;
458 __u64 sched_period;
459 };
460
461 int sched_setattr(pid_t pid,
462 const struct sched_attr *attr,
463 unsigned int flags)
464 {
465 return syscall(__NR_sched_setattr, pid, attr, flags);
466 }
467
468 int sched_getattr(pid_t pid,
469 struct sched_attr *attr,
470 unsigned int size,
471 unsigned int flags)
472 {
473 return syscall(__NR_sched_getattr, pid, attr, size, flags);
474 }
475
476 void *run_deadline(void *data)
477 {
478 struct sched_attr attr;
479 int x = 0;
480 int ret;
481 unsigned int flags = 0;
482
483 printf("deadline thread started [%ld]\n", gettid());
484
485 attr.size = sizeof(attr);
486 attr.sched_flags = 0;
487 attr.sched_nice = 0;
488 attr.sched_priority = 0;
489
490 /* This creates a 10ms/30ms reservation */
491 attr.sched_policy = SCHED_DEADLINE;
492 attr.sched_runtime = 10 * 1000 * 1000;
493 attr.sched_period = attr.sched_deadline = 30 * 1000 * 1000;
494
495 ret = sched_setattr(0, &attr, flags);
496 if (ret < 0) {
497 done = 0;
498 perror("sched_setattr");
499 exit(-1);
500 }
501
502 while (!done) {
503 x++;
504 }
505
506 printf("deadline thread dies [%ld]\n", gettid());
507 return NULL;
508 }
509
510 int main (int argc, char **argv)
511 {
512 pthread_t thread;
513
514 printf("main thread [%ld]\n", gettid());
515
516 pthread_create(&thread, NULL, run_deadline, NULL);
517
518 sleep(10);
519
520 done = 1;
521 pthread_join(thread, NULL);
522
523 printf("main dies [%ld]\n", gettid());
524 return 0;
525 }
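
[Since Section 4.1 above ties -deadline admission control to the -rt knobs, a small user-space sketch like the following can be used to see the resulting utilisation cap. It assumes the standard sysctl paths /proc/sys/kernel/sched_rt_runtime_us and /proc/sys/kernel/sched_rt_period_us and, as a simplification, uses the number of online CPUs instead of the CPUs of a specific root_domain. It is not part of this patch.]

    #include <stdio.h>
    #include <unistd.h>

    static long read_knob(const char *path)
    {
        FILE *f = fopen(path, "r");
        long val = -1;

        if (f) {
            if (fscanf(f, "%ld", &val) != 1)
                val = -1;
            fclose(f);
        }
        return val;
    }

    int main(void)
    {
        long runtime = read_knob("/proc/sys/kernel/sched_rt_runtime_us");
        long period  = read_knob("/proc/sys/kernel/sched_rt_period_us");
        long cpus    = sysconf(_SC_NPROCESSORS_ONLN);

        if (period <= 0)
            return 1;

        /* A sched_rt_runtime_us of -1 means no cap is enforced at all. */
        if (runtime < 0) {
            printf("no -deadline utilisation cap (sched_rt_runtime_us = -1)\n");
            return 0;
        }

        printf("-deadline utilisation cap: %ld CPUs * %ld/%ld = %.2f\n",
               cpus, runtime, period, (double)cpus * runtime / period);
        return 0;
    }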
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index e35d880f9773..89cfdd6e50cb 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -42,7 +42,7 @@
42 */ 42 */
43static DEFINE_PER_CPU(unsigned long, cpu_scale); 43static DEFINE_PER_CPU(unsigned long, cpu_scale);
44 44
45unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) 45unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
46{ 46{
47 return per_cpu(cpu_scale, cpu); 47 return per_cpu(cpu_scale, cpu);
48} 48}
@@ -166,7 +166,7 @@ static void update_cpu_capacity(unsigned int cpu)
166 set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity); 166 set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity);
167 167
168 printk(KERN_INFO "CPU%u: update cpu_capacity %lu\n", 168 printk(KERN_INFO "CPU%u: update cpu_capacity %lu\n",
169 cpu, arch_scale_freq_capacity(NULL, cpu)); 169 cpu, arch_scale_cpu_capacity(NULL, cpu));
170} 170}
171 171
172#else 172#else
diff --git a/arch/cris/arch-v10/drivers/sync_serial.c b/arch/cris/arch-v10/drivers/sync_serial.c
index 29eb02ab3f25..0f3983241e60 100644
--- a/arch/cris/arch-v10/drivers/sync_serial.c
+++ b/arch/cris/arch-v10/drivers/sync_serial.c
@@ -1086,7 +1086,6 @@ static ssize_t sync_serial_write(struct file *file, const char *buf,
1086 } 1086 }
1087 local_irq_restore(flags); 1087 local_irq_restore(flags);
1088 schedule(); 1088 schedule();
1089 set_current_state(TASK_RUNNING);
1090 remove_wait_queue(&port->out_wait_q, &wait); 1089 remove_wait_queue(&port->out_wait_q, &wait);
1091 if (signal_pending(current)) 1090 if (signal_pending(current))
1092 return -EINTR; 1091 return -EINTR;
diff --git a/arch/cris/arch-v32/drivers/sync_serial.c b/arch/cris/arch-v32/drivers/sync_serial.c
index bbb806b68838..5a149134cfb5 100644
--- a/arch/cris/arch-v32/drivers/sync_serial.c
+++ b/arch/cris/arch-v32/drivers/sync_serial.c
@@ -1089,7 +1089,6 @@ static ssize_t sync_serial_write(struct file *file, const char *buf,
1089 } 1089 }
1090 1090
1091 schedule(); 1091 schedule();
1092 set_current_state(TASK_RUNNING);
1093 remove_wait_queue(&port->out_wait_q, &wait); 1092 remove_wait_queue(&port->out_wait_q, &wait);
1094 1093
1095 if (signal_pending(current)) 1094 if (signal_pending(current))
diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h
index c7367130ab14..ce53c50d0ba4 100644
--- a/arch/ia64/include/asm/processor.h
+++ b/arch/ia64/include/asm/processor.h
@@ -19,7 +19,6 @@
19#include <asm/ptrace.h> 19#include <asm/ptrace.h>
20#include <asm/ustack.h> 20#include <asm/ustack.h>
21 21
22#define __ARCH_WANT_UNLOCKED_CTXSW
23#define ARCH_HAS_PREFETCH_SWITCH_STACK 22#define ARCH_HAS_PREFETCH_SWITCH_STACK
24 23
25#define IA64_NUM_PHYS_STACK_REG 96 24#define IA64_NUM_PHYS_STACK_REG 96
diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h
index 05f08438a7c4..f1df4cb4a286 100644
--- a/arch/mips/include/asm/processor.h
+++ b/arch/mips/include/asm/processor.h
@@ -397,12 +397,6 @@ unsigned long get_wchan(struct task_struct *p);
397#define ARCH_HAS_PREFETCHW 397#define ARCH_HAS_PREFETCHW
398#define prefetchw(x) __builtin_prefetch((x), 1, 1) 398#define prefetchw(x) __builtin_prefetch((x), 1, 1)
399 399
400/*
401 * See Documentation/scheduler/sched-arch.txt; prevents deadlock on SMP
402 * systems.
403 */
404#define __ARCH_WANT_UNLOCKED_CTXSW
405
406#endif 400#endif
407 401
408#endif /* _ASM_PROCESSOR_H */ 402#endif /* _ASM_PROCESSOR_H */
diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index 607559ab271f..6c840ceab820 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -32,6 +32,8 @@ static inline void setup_cputime_one_jiffy(void) { }
32typedef u64 __nocast cputime_t; 32typedef u64 __nocast cputime_t;
33typedef u64 __nocast cputime64_t; 33typedef u64 __nocast cputime64_t;
34 34
35#define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new)
36
35#ifdef __KERNEL__ 37#ifdef __KERNEL__
36 38
37/* 39/*
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 24b3f4949df4..08d659a9fcdb 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -30,7 +30,6 @@
30#include <linux/kprobes.h> 30#include <linux/kprobes.h>
31#include <linux/kdebug.h> 31#include <linux/kdebug.h>
32#include <linux/perf_event.h> 32#include <linux/perf_event.h>
33#include <linux/magic.h>
34#include <linux/ratelimit.h> 33#include <linux/ratelimit.h>
35#include <linux/context_tracking.h> 34#include <linux/context_tracking.h>
36#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
@@ -521,7 +520,6 @@ bail:
521void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) 520void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
522{ 521{
523 const struct exception_table_entry *entry; 522 const struct exception_table_entry *entry;
524 unsigned long *stackend;
525 523
526 /* Are we prepared to handle this fault? */ 524 /* Are we prepared to handle this fault? */
527 if ((entry = search_exception_tables(regs->nip)) != NULL) { 525 if ((entry = search_exception_tables(regs->nip)) != NULL) {
@@ -550,8 +548,7 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
550 printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n", 548 printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
551 regs->nip); 549 regs->nip);
552 550
553 stackend = end_of_stack(current); 551 if (task_stack_end_corrupted(current))
554 if (current != &init_task && *stackend != STACK_END_MAGIC)
555 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); 552 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
556 553
557 die("Kernel access of bad area", regs, sig); 554 die("Kernel access of bad area", regs, sig);
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index f65bd3634519..3001887f94b7 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
@@ -18,6 +18,8 @@
18typedef unsigned long long __nocast cputime_t; 18typedef unsigned long long __nocast cputime_t;
19typedef unsigned long long __nocast cputime64_t; 19typedef unsigned long long __nocast cputime64_t;
20 20
21#define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new)
22
21static inline unsigned long __div(unsigned long long n, unsigned long base) 23static inline unsigned long __div(unsigned long long n, unsigned long base)
22{ 24{
23#ifndef CONFIG_64BIT 25#ifndef CONFIG_64BIT
diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c
index 9e3a72205827..dd16c902ff70 100644
--- a/arch/um/drivers/random.c
+++ b/arch/um/drivers/random.c
@@ -79,7 +79,6 @@ static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size,
79 set_task_state(current, TASK_INTERRUPTIBLE); 79 set_task_state(current, TASK_INTERRUPTIBLE);
80 80
81 schedule(); 81 schedule();
82 set_task_state(current, TASK_RUNNING);
83 remove_wait_queue(&host_read_wait, &wait); 82 remove_wait_queue(&host_read_wait, &wait);
84 83
85 if (atomic_dec_and_test(&host_sleep_count)) { 84 if (atomic_dec_and_test(&host_sleep_count)) {
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 42a2dca984b3..9b1c0f8f68e6 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -296,11 +296,19 @@ void smp_store_cpu_info(int id)
296} 296}
297 297
298static bool 298static bool
299topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
300{
301 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
302
303 return (cpu_to_node(cpu1) == cpu_to_node(cpu2));
304}
305
306static bool
299topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name) 307topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
300{ 308{
301 int cpu1 = c->cpu_index, cpu2 = o->cpu_index; 309 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
302 310
303 return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2), 311 return !WARN_ONCE(!topology_same_node(c, o),
304 "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! " 312 "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
305 "[node: %d != %d]. Ignoring dependency.\n", 313 "[node: %d != %d]. Ignoring dependency.\n",
306 cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); 314 cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
@@ -341,17 +349,44 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
341 return false; 349 return false;
342} 350}
343 351
344static bool match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) 352/*
353 * Unlike the other levels, we do not enforce keeping a
354 * multicore group inside a NUMA node. If this happens, we will
355 * discard the MC level of the topology later.
356 */
357static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
345{ 358{
346 if (c->phys_proc_id == o->phys_proc_id) { 359 if (c->phys_proc_id == o->phys_proc_id)
347 if (cpu_has(c, X86_FEATURE_AMD_DCM)) 360 return true;
348 return true;
349
350 return topology_sane(c, o, "mc");
351 }
352 return false; 361 return false;
353} 362}
354 363
364static struct sched_domain_topology_level numa_inside_package_topology[] = {
365#ifdef CONFIG_SCHED_SMT
366 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
367#endif
368#ifdef CONFIG_SCHED_MC
369 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
370#endif
371 { NULL, },
372};
373/*
374 * set_sched_topology() sets the topology internal to a CPU. The
375 * NUMA topologies are layered on top of it to build the full
376 * system topology.
377 *
378 * If NUMA nodes are observed to occur within a CPU package, this
379 * function should be called. It forces the sched domain code to
380 * only use the SMT level for the CPU portion of the topology.
381 * This essentially falls back to relying on NUMA information
382 * from the SRAT table to describe the entire system topology
383 * (except for hyperthreads).
384 */
385static void primarily_use_numa_for_topology(void)
386{
387 set_sched_topology(numa_inside_package_topology);
388}
389
355void set_cpu_sibling_map(int cpu) 390void set_cpu_sibling_map(int cpu)
356{ 391{
357 bool has_smt = smp_num_siblings > 1; 392 bool has_smt = smp_num_siblings > 1;
@@ -388,7 +423,7 @@ void set_cpu_sibling_map(int cpu)
388 for_each_cpu(i, cpu_sibling_setup_mask) { 423 for_each_cpu(i, cpu_sibling_setup_mask) {
389 o = &cpu_data(i); 424 o = &cpu_data(i);
390 425
391 if ((i == cpu) || (has_mp && match_mc(c, o))) { 426 if ((i == cpu) || (has_mp && match_die(c, o))) {
392 link_mask(core, cpu, i); 427 link_mask(core, cpu, i);
393 428
394 /* 429 /*
@@ -410,6 +445,8 @@ void set_cpu_sibling_map(int cpu)
410 } else if (i != cpu && !c->booted_cores) 445 } else if (i != cpu && !c->booted_cores)
411 c->booted_cores = cpu_data(i).booted_cores; 446 c->booted_cores = cpu_data(i).booted_cores;
412 } 447 }
448 if (match_die(c, o) && !topology_same_node(c, o))
449 primarily_use_numa_for_topology();
413 } 450 }
414} 451}
415 452
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 83bb03bfa259..9c5b32e2bdc0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -3,7 +3,6 @@
3 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. 3 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
4 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar 4 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
5 */ 5 */
6#include <linux/magic.h> /* STACK_END_MAGIC */
7#include <linux/sched.h> /* test_thread_flag(), ... */ 6#include <linux/sched.h> /* test_thread_flag(), ... */
8#include <linux/kdebug.h> /* oops_begin/end, ... */ 7#include <linux/kdebug.h> /* oops_begin/end, ... */
9#include <linux/module.h> /* search_exception_table */ 8#include <linux/module.h> /* search_exception_table */
@@ -649,7 +648,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
649 unsigned long address, int signal, int si_code) 648 unsigned long address, int signal, int si_code)
650{ 649{
651 struct task_struct *tsk = current; 650 struct task_struct *tsk = current;
652 unsigned long *stackend;
653 unsigned long flags; 651 unsigned long flags;
654 int sig; 652 int sig;
655 653
@@ -709,8 +707,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
709 707
710 show_fault_oops(regs, error_code, address); 708 show_fault_oops(regs, error_code, address);
711 709
712 stackend = end_of_stack(tsk); 710 if (task_stack_end_corrupted(tsk))
713 if (tsk != &init_task && *stackend != STACK_END_MAGIC)
714 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); 711 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
715 712
716 tsk->thread.cr2 = address; 713 tsk->thread.cr2 = address;
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index ee9df5e3f5eb..125150dc6e81 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -223,8 +223,14 @@ void cpuidle_uninstall_idle_handler(void)
223{ 223{
224 if (enabled_devices) { 224 if (enabled_devices) {
225 initialized = 0; 225 initialized = 0;
226 kick_all_cpus_sync(); 226 wake_up_all_idle_cpus();
227 } 227 }
228
229 /*
230 * Make sure external observers (such as the scheduler)
231 * are done looking at pointed idle states.
232 */
233 synchronize_rcu();
228} 234}
229 235
230/** 236/**
@@ -530,11 +536,6 @@ EXPORT_SYMBOL_GPL(cpuidle_register);
530 536
531#ifdef CONFIG_SMP 537#ifdef CONFIG_SMP
532 538
533static void smp_callback(void *v)
534{
535 /* we already woke the CPU up, nothing more to do */
536}
537
538/* 539/*
539 * This function gets called when a part of the kernel has a new latency 540 * This function gets called when a part of the kernel has a new latency
540 * requirement. This means we need to get all processors out of their C-state, 541 * requirement. This means we need to get all processors out of their C-state,
@@ -544,7 +545,7 @@ static void smp_callback(void *v)
544static int cpuidle_latency_notify(struct notifier_block *b, 545static int cpuidle_latency_notify(struct notifier_block *b,
545 unsigned long l, void *v) 546 unsigned long l, void *v)
546{ 547{
547 smp_call_function(smp_callback, NULL, 1); 548 wake_up_all_idle_cpus();
548 return NOTIFY_OK; 549 return NOTIFY_OK;
549} 550}
550 551
diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c
index 77711623b973..7bcbf863656e 100644
--- a/drivers/gpu/vga/vgaarb.c
+++ b/drivers/gpu/vga/vgaarb.c
@@ -400,7 +400,6 @@ int vga_get(struct pci_dev *pdev, unsigned int rsrc, int interruptible)
400 } 400 }
401 schedule(); 401 schedule();
402 remove_wait_queue(&vga_wait_queue, &wait); 402 remove_wait_queue(&vga_wait_queue, &wait);
403 set_current_state(TASK_RUNNING);
404 } 403 }
405 return rc; 404 return rc;
406} 405}
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index ab472c557d18..0505559f0965 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -720,7 +720,6 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c)
720 720
721 io_schedule(); 721 io_schedule();
722 722
723 set_task_state(current, TASK_RUNNING);
724 remove_wait_queue(&c->free_buffer_wait, &wait); 723 remove_wait_queue(&c->free_buffer_wait, &wait);
725 724
726 dm_bufio_lock(c); 725 dm_bufio_lock(c);
diff --git a/drivers/parisc/power.c b/drivers/parisc/power.c
index 90cca5e3805f..ef31b77404ef 100644
--- a/drivers/parisc/power.c
+++ b/drivers/parisc/power.c
@@ -121,7 +121,6 @@ static int kpowerswd(void *param)
121 unsigned long soft_power_reg = (unsigned long) param; 121 unsigned long soft_power_reg = (unsigned long) param;
122 122
123 schedule_timeout_interruptible(pwrsw_enabled ? HZ : HZ/POWERSWITCH_POLL_PER_SEC); 123 schedule_timeout_interruptible(pwrsw_enabled ? HZ : HZ/POWERSWITCH_POLL_PER_SEC);
124 __set_current_state(TASK_RUNNING);
125 124
126 if (unlikely(!pwrsw_enabled)) 125 if (unlikely(!pwrsw_enabled))
127 continue; 126 continue;
diff --git a/drivers/s390/net/claw.c b/drivers/s390/net/claw.c
index fbc6701bef30..213e54ee8a66 100644
--- a/drivers/s390/net/claw.c
+++ b/drivers/s390/net/claw.c
@@ -481,7 +481,6 @@ claw_open(struct net_device *dev)
481 spin_unlock_irqrestore( 481 spin_unlock_irqrestore(
482 get_ccwdev_lock(privptr->channel[i].cdev), saveflags); 482 get_ccwdev_lock(privptr->channel[i].cdev), saveflags);
483 schedule(); 483 schedule();
484 set_current_state(TASK_RUNNING);
485 remove_wait_queue(&privptr->channel[i].wait, &wait); 484 remove_wait_queue(&privptr->channel[i].wait, &wait);
486 if(rc != 0) 485 if(rc != 0)
487 ccw_check_return_code(privptr->channel[i].cdev, rc); 486 ccw_check_return_code(privptr->channel[i].cdev, rc);
@@ -828,7 +827,6 @@ claw_release(struct net_device *dev)
828 spin_unlock_irqrestore( 827 spin_unlock_irqrestore(
829 get_ccwdev_lock(privptr->channel[i].cdev), saveflags); 828 get_ccwdev_lock(privptr->channel[i].cdev), saveflags);
830 schedule(); 829 schedule();
831 set_current_state(TASK_RUNNING);
832 remove_wait_queue(&privptr->channel[i].wait, &wait); 830 remove_wait_queue(&privptr->channel[i].wait, &wait);
833 if (rc != 0) { 831 if (rc != 0) {
834 ccw_check_return_code(privptr->channel[i].cdev, rc); 832 ccw_check_return_code(privptr->channel[i].cdev, rc);
diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
index 00ee0ed642aa..4a8ac7d8c76b 100644
--- a/drivers/scsi/fcoe/fcoe.c
+++ b/drivers/scsi/fcoe/fcoe.c
@@ -1884,7 +1884,6 @@ retry:
1884 set_current_state(TASK_INTERRUPTIBLE); 1884 set_current_state(TASK_INTERRUPTIBLE);
1885 spin_unlock_bh(&p->fcoe_rx_list.lock); 1885 spin_unlock_bh(&p->fcoe_rx_list.lock);
1886 schedule(); 1886 schedule();
1887 set_current_state(TASK_RUNNING);
1888 goto retry; 1887 goto retry;
1889 } 1888 }
1890 1889
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index dabd25429c58..db3dbd999cb6 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -4875,7 +4875,6 @@ qla2x00_do_dpc(void *data)
4875 "DPC handler sleeping.\n"); 4875 "DPC handler sleeping.\n");
4876 4876
4877 schedule(); 4877 schedule();
4878 __set_current_state(TASK_RUNNING);
4879 4878
4880 if (!base_vha->flags.init_done || ha->flags.mbox_busy) 4879 if (!base_vha->flags.init_done || ha->flags.mbox_busy)
4881 goto end_loop; 4880 goto end_loop;
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
index 86f1a91e896f..14c9c8d18d02 100644
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
@@ -3215,7 +3215,6 @@ kiblnd_connd (void *arg)
3215 3215
3216 schedule_timeout(timeout); 3216 schedule_timeout(timeout);
3217 3217
3218 set_current_state(TASK_RUNNING);
3219 remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); 3218 remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
3220 spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); 3219 spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
3221 } 3220 }
@@ -3432,7 +3431,6 @@ kiblnd_scheduler(void *arg)
3432 busy_loops = 0; 3431 busy_loops = 0;
3433 3432
3434 remove_wait_queue(&sched->ibs_waitq, &wait); 3433 remove_wait_queue(&sched->ibs_waitq, &wait);
3435 set_current_state(TASK_RUNNING);
3436 spin_lock_irqsave(&sched->ibs_lock, flags); 3434 spin_lock_irqsave(&sched->ibs_lock, flags);
3437 } 3435 }
3438 3436
@@ -3507,7 +3505,6 @@ kiblnd_failover_thread(void *arg)
3507 3505
3508 rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) : 3506 rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) :
3509 cfs_time_seconds(1)); 3507 cfs_time_seconds(1));
3510 set_current_state(TASK_RUNNING);
3511 remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); 3508 remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
3512 write_lock_irqsave(glock, flags); 3509 write_lock_irqsave(glock, flags);
3513 3510
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
index bcfee7c21942..d29f5f134b89 100644
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
@@ -2232,7 +2232,6 @@ ksocknal_connd (void *arg)
2232 nloops = 0; 2232 nloops = 0;
2233 schedule_timeout(timeout); 2233 schedule_timeout(timeout);
2234 2234
2235 set_current_state(TASK_RUNNING);
2236 remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait); 2235 remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait);
2237 spin_lock_bh(connd_lock); 2236 spin_lock_bh(connd_lock);
2238 } 2237 }
diff --git a/drivers/staging/lustre/lustre/libcfs/fail.c b/drivers/staging/lustre/lustre/libcfs/fail.c
index 1bf9c90b4789..e73ca3df9734 100644
--- a/drivers/staging/lustre/lustre/libcfs/fail.c
+++ b/drivers/staging/lustre/lustre/libcfs/fail.c
@@ -131,7 +131,6 @@ int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set)
131 id, ms); 131 id, ms);
132 set_current_state(TASK_UNINTERRUPTIBLE); 132 set_current_state(TASK_UNINTERRUPTIBLE);
133 schedule_timeout(cfs_time_seconds(ms) / 1000); 133 schedule_timeout(cfs_time_seconds(ms) / 1000);
134 set_current_state(TASK_RUNNING);
135 CERROR("cfs_fail_timeout id %x awake\n", id); 134 CERROR("cfs_fail_timeout id %x awake\n", id);
136 } 135 }
137 return ret; 136 return ret;
diff --git a/drivers/tty/bfin_jtag_comm.c b/drivers/tty/bfin_jtag_comm.c
index 8096fcbe2dc1..d7b198c400c7 100644
--- a/drivers/tty/bfin_jtag_comm.c
+++ b/drivers/tty/bfin_jtag_comm.c
@@ -77,7 +77,6 @@ bfin_jc_emudat_manager(void *arg)
77 pr_debug("waiting for readers\n"); 77 pr_debug("waiting for readers\n");
78 __set_current_state(TASK_UNINTERRUPTIBLE); 78 __set_current_state(TASK_UNINTERRUPTIBLE);
79 schedule(); 79 schedule();
80 __set_current_state(TASK_RUNNING);
81 continue; 80 continue;
82 } 81 }
83 82
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index b6df2e83809f..52976785a32c 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -130,7 +130,6 @@ static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl,
130 /* second+ BUSY - sleep a little bit */ 130 /* second+ BUSY - sleep a little bit */
131 set_current_state(TASK_UNINTERRUPTIBLE); 131 set_current_state(TASK_UNINTERRUPTIBLE);
132 schedule_timeout(1); 132 schedule_timeout(1);
133 __set_current_state(TASK_RUNNING);
134 } 133 }
135 continue; 134 continue;
136 } 135 }
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 0acddf60af55..bc462dcd7a40 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1585,7 +1585,6 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
1585 set_current_state(TASK_UNINTERRUPTIBLE); 1585 set_current_state(TASK_UNINTERRUPTIBLE);
1586 LOGGC_UNLOCK(log); 1586 LOGGC_UNLOCK(log);
1587 schedule(); 1587 schedule();
1588 __set_current_state(TASK_RUNNING);
1589 LOGGC_LOCK(log); 1588 LOGGC_LOCK(log);
1590 remove_wait_queue(&target->gcwait, &__wait); 1589 remove_wait_queue(&target->gcwait, &__wait);
1591 } 1590 }
@@ -2359,7 +2358,6 @@ int jfsIOWait(void *arg)
2359 set_current_state(TASK_INTERRUPTIBLE); 2358 set_current_state(TASK_INTERRUPTIBLE);
2360 spin_unlock_irq(&log_redrive_lock); 2359 spin_unlock_irq(&log_redrive_lock);
2361 schedule(); 2360 schedule();
2362 __set_current_state(TASK_RUNNING);
2363 } 2361 }
2364 } while (!kthread_should_stop()); 2362 } while (!kthread_should_stop());
2365 2363
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 564c4f279ac6..d595856453b2 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -136,7 +136,6 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event)
136 set_current_state(TASK_UNINTERRUPTIBLE); 136 set_current_state(TASK_UNINTERRUPTIBLE);
137 TXN_UNLOCK(); 137 TXN_UNLOCK();
138 io_schedule(); 138 io_schedule();
139 __set_current_state(TASK_RUNNING);
140 remove_wait_queue(event, &wait); 139 remove_wait_queue(event, &wait);
141} 140}
142 141
@@ -2808,7 +2807,6 @@ int jfs_lazycommit(void *arg)
2808 set_current_state(TASK_INTERRUPTIBLE); 2807 set_current_state(TASK_INTERRUPTIBLE);
2809 LAZY_UNLOCK(flags); 2808 LAZY_UNLOCK(flags);
2810 schedule(); 2809 schedule();
2811 __set_current_state(TASK_RUNNING);
2812 remove_wait_queue(&jfs_commit_thread_wait, &wq); 2810 remove_wait_queue(&jfs_commit_thread_wait, &wq);
2813 } 2811 }
2814 } while (!kthread_should_stop()); 2812 } while (!kthread_should_stop());
@@ -2996,7 +2994,6 @@ int jfs_sync(void *arg)
2996 set_current_state(TASK_INTERRUPTIBLE); 2994 set_current_state(TASK_INTERRUPTIBLE);
2997 TXN_UNLOCK(); 2995 TXN_UNLOCK();
2998 schedule(); 2996 schedule();
2999 __set_current_state(TASK_RUNNING);
3000 } 2997 }
3001 } while (!kthread_should_stop()); 2998 } while (!kthread_should_stop());
3002 2999
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index 8d04bda2bd2e..e966c023b1b7 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -92,7 +92,6 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
92 92
93 set_current_state(TASK_UNINTERRUPTIBLE); 93 set_current_state(TASK_UNINTERRUPTIBLE);
94 schedule(); 94 schedule();
95 __set_current_state(TASK_RUNNING);
96 remove_wait_queue(&nn->bl_wq, &wq); 95 remove_wait_queue(&nn->bl_wq, &wq);
97 96
98 if (reply->status != BL_DEVICE_REQUEST_PROC) { 97 if (reply->status != BL_DEVICE_REQUEST_PROC) {
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index ea95a2bc21b5..a25490ae6c62 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -675,7 +675,6 @@ __cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
675 } 675 }
676 676
677 schedule(); 677 schedule();
678 set_current_state(TASK_RUNNING);
679 678
680 if (msg.errno < 0) 679 if (msg.errno < 0)
681 ret = msg.errno; 680 ret = msg.errno;
diff --git a/include/asm-generic/cputime_jiffies.h b/include/asm-generic/cputime_jiffies.h
index d5cb78f53986..fe386fc6e85e 100644
--- a/include/asm-generic/cputime_jiffies.h
+++ b/include/asm-generic/cputime_jiffies.h
@@ -3,6 +3,8 @@
3 3
4typedef unsigned long __nocast cputime_t; 4typedef unsigned long __nocast cputime_t;
5 5
6#define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new)
7
6#define cputime_one_jiffy jiffies_to_cputime(1) 8#define cputime_one_jiffy jiffies_to_cputime(1)
7#define cputime_to_jiffies(__ct) (__force unsigned long)(__ct) 9#define cputime_to_jiffies(__ct) (__force unsigned long)(__ct)
8#define cputime_to_scaled(__ct) (__ct) 10#define cputime_to_scaled(__ct) (__ct)
diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h
index 4e817606c549..0419485891f2 100644
--- a/include/asm-generic/cputime_nsecs.h
+++ b/include/asm-generic/cputime_nsecs.h
@@ -21,6 +21,8 @@
21typedef u64 __nocast cputime_t; 21typedef u64 __nocast cputime_t;
22typedef u64 __nocast cputime64_t; 22typedef u64 __nocast cputime64_t;
23 23
24#define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new)
25
24#define cputime_one_jiffy jiffies_to_cputime(1) 26#define cputime_one_jiffy jiffies_to_cputime(1)
25 27
26#define cputime_div(__ct, divisor) div_u64((__force u64)__ct, divisor) 28#define cputime_div(__ct, divisor) div_u64((__force u64)__ct, divisor)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 05a8c00e8339..5e344bbe63ec 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -57,6 +57,7 @@ struct sched_param {
57#include <linux/llist.h> 57#include <linux/llist.h>
58#include <linux/uidgid.h> 58#include <linux/uidgid.h>
59#include <linux/gfp.h> 59#include <linux/gfp.h>
60#include <linux/magic.h>
60 61
61#include <asm/processor.h> 62#include <asm/processor.h>
62 63
@@ -646,6 +647,7 @@ struct signal_struct {
646 * Live threads maintain their own counters and add to these 647 * Live threads maintain their own counters and add to these
647 * in __exit_signal, except for the group leader. 648 * in __exit_signal, except for the group leader.
648 */ 649 */
650 seqlock_t stats_lock;
649 cputime_t utime, stime, cutime, cstime; 651 cputime_t utime, stime, cutime, cstime;
650 cputime_t gtime; 652 cputime_t gtime;
651 cputime_t cgtime; 653 cputime_t cgtime;
@@ -1024,6 +1026,7 @@ struct sched_domain_topology_level {
1024extern struct sched_domain_topology_level *sched_domain_topology; 1026extern struct sched_domain_topology_level *sched_domain_topology;
1025 1027
1026extern void set_sched_topology(struct sched_domain_topology_level *tl); 1028extern void set_sched_topology(struct sched_domain_topology_level *tl);
1029extern void wake_up_if_idle(int cpu);
1027 1030
1028#ifdef CONFIG_SCHED_DEBUG 1031#ifdef CONFIG_SCHED_DEBUG
1029# define SD_INIT_NAME(type) .name = #type 1032# define SD_INIT_NAME(type) .name = #type
@@ -2647,6 +2650,8 @@ static inline unsigned long *end_of_stack(struct task_struct *p)
2647} 2650}
2648 2651
2649#endif 2652#endif
2653#define task_stack_end_corrupted(task) \
2654 (*(end_of_stack(task)) != STACK_END_MAGIC)
2650 2655
2651static inline int object_is_on_stack(void *obj) 2656static inline int object_is_on_stack(void *obj)
2652{ 2657{
@@ -2669,6 +2674,7 @@ static inline unsigned long stack_not_used(struct task_struct *p)
2669 return (unsigned long)n - (unsigned long)end_of_stack(p); 2674 return (unsigned long)n - (unsigned long)end_of_stack(p);
2670} 2675}
2671#endif 2676#endif
2677extern void set_task_stack_end_magic(struct task_struct *tsk);
2672 2678
2673/* set thread flags in other task's structures 2679/* set thread flags in other task's structures
2674 * - see asm/thread_info.h for TIF_xxxx flags available 2680 * - see asm/thread_info.h for TIF_xxxx flags available
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index cc359636cfa3..f5df8f687b4d 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -456,4 +456,23 @@ read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags)
456 spin_unlock_irqrestore(&sl->lock, flags); 456 spin_unlock_irqrestore(&sl->lock, flags);
457} 457}
458 458
459static inline unsigned long
460read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq)
461{
462 unsigned long flags = 0;
463
464 if (!(*seq & 1)) /* Even */
465 *seq = read_seqbegin(lock);
466 else /* Odd */
467 read_seqlock_excl_irqsave(lock, flags);
468
469 return flags;
470}
471
472static inline void
473done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags)
474{
475 if (seq & 1)
476 read_sequnlock_excl_irqrestore(lock, flags);
477}
459#endif /* __LINUX_SEQLOCK_H */ 478#endif /* __LINUX_SEQLOCK_H */
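
[The two helpers added above are meant to be used in a lockless-first, lock-on-retry read side together with read_seqbegin_or_lock()/need_seqretry(). A minimal sketch of that pattern follows, assuming a pointer 'sig' to the signal_struct whose stats_lock field is added elsewhere in this series; it is an illustrative fragment, not code from this patch.]

    /* Lockless first pass; take the lock on retry. */
    int seq, nextseq = 0;
    unsigned long flags;
    cputime_t utime, stime;

    do {
        seq = nextseq;
        flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);

        utime = sig->utime;
        stime = sig->stime;

        /* If the lockless pass raced with a writer, retry with the lock held. */
        nextseq = 1;
    } while (need_seqretry(&sig->stats_lock, seq));
    done_seqretry_irqrestore(&sig->stats_lock, seq, flags);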
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 34347f26be9b..93dff5fff524 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -100,6 +100,7 @@ int smp_call_function_any(const struct cpumask *mask,
100 smp_call_func_t func, void *info, int wait); 100 smp_call_func_t func, void *info, int wait);
101 101
102void kick_all_cpus_sync(void); 102void kick_all_cpus_sync(void);
103void wake_up_all_idle_cpus(void);
103 104
104/* 105/*
105 * Generic and arch helpers 106 * Generic and arch helpers
@@ -148,6 +149,7 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func,
148} 149}
149 150
150static inline void kick_all_cpus_sync(void) { } 151static inline void kick_all_cpus_sync(void) { }
152static inline void wake_up_all_idle_cpus(void) { }
151 153
152#endif /* !SMP */ 154#endif /* !SMP */
153 155
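
A caller that changes a condition idle CPUs have already factored into their state choice (for instance the cpuidle latency-constraint update listed in the diffstat) is expected to kick every idle CPU so it re-enters its governor. A hedged caller sketch; the notifier name and signature are illustrative only, not taken from this patch:

	static int latency_qos_changed(struct notifier_block *nb,
				       unsigned long new_latency_us, void *unused)
	{
		/* idle CPUs may now sit in a state deeper than the constraint allows */
		wake_up_all_idle_cpus();
		return NOTIFY_OK;
	}
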
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 80115bf88671..e4a8eb9312ea 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -281,9 +281,11 @@ do { \
281 * wake_up() has to be called after changing any variable that could 281 * wake_up() has to be called after changing any variable that could
282 * change the result of the wait condition. 282 * change the result of the wait condition.
283 * 283 *
284 * The function returns 0 if the @timeout elapsed, or the remaining 284 * Returns:
285 * jiffies (at least 1) if the @condition evaluated to %true before 285 * 0 if the @condition evaluated to %false after the @timeout elapsed,
286 * the @timeout elapsed. 286 * 1 if the @condition evaluated to %true after the @timeout elapsed,
287 * or the remaining jiffies (at least 1) if the @condition evaluated
288 * to %true before the @timeout elapsed.
287 */ 289 */
288#define wait_event_timeout(wq, condition, timeout) \ 290#define wait_event_timeout(wq, condition, timeout) \
289({ \ 291({ \
@@ -364,9 +366,11 @@ do { \
364 * change the result of the wait condition. 366 * change the result of the wait condition.
365 * 367 *
366 * Returns: 368 * Returns:
367 * 0 if the @timeout elapsed, -%ERESTARTSYS if it was interrupted by 369 * 0 if the @condition evaluated to %false after the @timeout elapsed,
368 * a signal, or the remaining jiffies (at least 1) if the @condition 370 * 1 if the @condition evaluated to %true after the @timeout elapsed,
369 * evaluated to %true before the @timeout elapsed. 371 * the remaining jiffies (at least 1) if the @condition evaluated
372 * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
373 * interrupted by a signal.
370 */ 374 */
371#define wait_event_interruptible_timeout(wq, condition, timeout) \ 375#define wait_event_interruptible_timeout(wq, condition, timeout) \
372({ \ 376({ \
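
The reworked kerneldoc makes the return values explicit: 0 means the condition was still false when the timeout elapsed, 1 means it became true right as the timeout elapsed, and a larger positive remainder means it became true earlier. A short usage sketch with illustrative names, assuming jiffies-based timeouts:

	long left = wait_event_interruptible_timeout(my_wq, done_flag,
						     msecs_to_jiffies(100));
	if (left == -ERESTARTSYS)	/* interrupted by a signal */
		return left;
	if (left == 0)			/* @condition still false after the timeout */
		return -ETIMEDOUT;
	/* left >= 1: @condition became true, possibly right at the timeout */
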
diff --git a/init/main.c b/init/main.c
index c5c11da6c4e1..89ec862da2d4 100644
--- a/init/main.c
+++ b/init/main.c
@@ -508,6 +508,7 @@ asmlinkage __visible void __init start_kernel(void)
508 * lockdep hash: 508 * lockdep hash:
509 */ 509 */
510 lockdep_init(); 510 lockdep_init();
511 set_task_stack_end_magic(&init_task);
511 smp_setup_processor_id(); 512 smp_setup_processor_id();
512 debug_objects_early_init(); 513 debug_objects_early_init();
513 514
diff --git a/kernel/exit.c b/kernel/exit.c
index d13f2eec4bb8..5d30019ff953 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -115,32 +115,33 @@ static void __exit_signal(struct task_struct *tsk)
115 115
116 if (tsk == sig->curr_target) 116 if (tsk == sig->curr_target)
117 sig->curr_target = next_thread(tsk); 117 sig->curr_target = next_thread(tsk);
118 /*
119 * Accumulate here the counters for all threads but the
120 * group leader as they die, so they can be added into
121 * the process-wide totals when those are taken.
122 * The group leader stays around as a zombie as long
123 * as there are other threads. When it gets reaped,
124 * the exit.c code will add its counts into these totals.
125 * We won't ever get here for the group leader, since it
126 * will have been the last reference on the signal_struct.
127 */
128 task_cputime(tsk, &utime, &stime);
129 sig->utime += utime;
130 sig->stime += stime;
131 sig->gtime += task_gtime(tsk);
132 sig->min_flt += tsk->min_flt;
133 sig->maj_flt += tsk->maj_flt;
134 sig->nvcsw += tsk->nvcsw;
135 sig->nivcsw += tsk->nivcsw;
136 sig->inblock += task_io_get_inblock(tsk);
137 sig->oublock += task_io_get_oublock(tsk);
138 task_io_accounting_add(&sig->ioac, &tsk->ioac);
139 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
140 } 118 }
141 119
120 /*
121 * Accumulate here the counters for all threads but the group leader
122 * as they die, so they can be added into the process-wide totals
123 * when those are taken. The group leader stays around as a zombie as
124 * long as there are other threads. When it gets reaped, the exit.c
125 * code will add its counts into these totals. We won't ever get here
126 * for the group leader, since it will have been the last reference on
127 * the signal_struct.
128 */
129 task_cputime(tsk, &utime, &stime);
130 write_seqlock(&sig->stats_lock);
131 sig->utime += utime;
132 sig->stime += stime;
133 sig->gtime += task_gtime(tsk);
134 sig->min_flt += tsk->min_flt;
135 sig->maj_flt += tsk->maj_flt;
136 sig->nvcsw += tsk->nvcsw;
137 sig->nivcsw += tsk->nivcsw;
138 sig->inblock += task_io_get_inblock(tsk);
139 sig->oublock += task_io_get_oublock(tsk);
140 task_io_accounting_add(&sig->ioac, &tsk->ioac);
141 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
142 sig->nr_threads--; 142 sig->nr_threads--;
143 __unhash_process(tsk, group_dead); 143 __unhash_process(tsk, group_dead);
144 write_sequnlock(&sig->stats_lock);
144 145
145 /* 146 /*
146 * Do this under ->siglock, we can race with another thread 147 * Do this under ->siglock, we can race with another thread
@@ -1046,6 +1047,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1046 spin_lock_irq(&p->real_parent->sighand->siglock); 1047 spin_lock_irq(&p->real_parent->sighand->siglock);
1047 psig = p->real_parent->signal; 1048 psig = p->real_parent->signal;
1048 sig = p->signal; 1049 sig = p->signal;
1050 write_seqlock(&psig->stats_lock);
1049 psig->cutime += tgutime + sig->cutime; 1051 psig->cutime += tgutime + sig->cutime;
1050 psig->cstime += tgstime + sig->cstime; 1052 psig->cstime += tgstime + sig->cstime;
1051 psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; 1053 psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1068,6 +1070,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1068 psig->cmaxrss = maxrss; 1070 psig->cmaxrss = maxrss;
1069 task_io_accounting_add(&psig->ioac, &p->ioac); 1071 task_io_accounting_add(&psig->ioac, &p->ioac);
1070 task_io_accounting_add(&psig->ioac, &sig->ioac); 1072 task_io_accounting_add(&psig->ioac, &sig->ioac);
1073 write_sequnlock(&psig->stats_lock);
1071 spin_unlock_irq(&p->real_parent->sighand->siglock); 1074 spin_unlock_irq(&p->real_parent->sighand->siglock);
1072 } 1075 }
1073 1076
diff --git a/kernel/fork.c b/kernel/fork.c
index 8c162d102740..9b7d746d6d62 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -294,11 +294,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst,
294 return 0; 294 return 0;
295} 295}
296 296
297void set_task_stack_end_magic(struct task_struct *tsk)
298{
299 unsigned long *stackend;
300
301 stackend = end_of_stack(tsk);
302 *stackend = STACK_END_MAGIC; /* for overflow detection */
303}
304
297static struct task_struct *dup_task_struct(struct task_struct *orig) 305static struct task_struct *dup_task_struct(struct task_struct *orig)
298{ 306{
299 struct task_struct *tsk; 307 struct task_struct *tsk;
300 struct thread_info *ti; 308 struct thread_info *ti;
301 unsigned long *stackend;
302 int node = tsk_fork_get_node(orig); 309 int node = tsk_fork_get_node(orig);
303 int err; 310 int err;
304 311
@@ -328,8 +335,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
328 setup_thread_stack(tsk, orig); 335 setup_thread_stack(tsk, orig);
329 clear_user_return_notifier(tsk); 336 clear_user_return_notifier(tsk);
330 clear_tsk_need_resched(tsk); 337 clear_tsk_need_resched(tsk);
331 stackend = end_of_stack(tsk); 338 set_task_stack_end_magic(tsk);
332 *stackend = STACK_END_MAGIC; /* for overflow detection */
333 339
334#ifdef CONFIG_CC_STACKPROTECTOR 340#ifdef CONFIG_CC_STACKPROTECTOR
335 tsk->stack_canary = get_random_int(); 341 tsk->stack_canary = get_random_int();
@@ -1067,6 +1073,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1067 sig->curr_target = tsk; 1073 sig->curr_target = tsk;
1068 init_sigpending(&sig->shared_pending); 1074 init_sigpending(&sig->shared_pending);
1069 INIT_LIST_HEAD(&sig->posix_timers); 1075 INIT_LIST_HEAD(&sig->posix_timers);
1076 seqlock_init(&sig->stats_lock);
1070 1077
1071 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1078 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1072 sig->real_timer.function = it_real_fn; 1079 sig->real_timer.function = it_real_fn;
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index e73efba98301..8a2e230fb86a 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -148,11 +148,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
148 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) 148 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
149 goto out; 149 goto out;
150 150
151 t = p; 151 for_each_thread(p, t)
152 do {
153 sched_move_task(t); 152 sched_move_task(t);
154 } while_each_thread(p, t);
155
156out: 153out:
157 unlock_task_sighand(p, &flags); 154 unlock_task_sighand(p, &flags);
158 autogroup_kref_put(prev); 155 autogroup_kref_put(prev);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f235c41a3532..44999505e1bf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -317,9 +317,12 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
317 for (;;) { 317 for (;;) {
318 rq = task_rq(p); 318 rq = task_rq(p);
319 raw_spin_lock(&rq->lock); 319 raw_spin_lock(&rq->lock);
320 if (likely(rq == task_rq(p))) 320 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
321 return rq; 321 return rq;
322 raw_spin_unlock(&rq->lock); 322 raw_spin_unlock(&rq->lock);
323
324 while (unlikely(task_on_rq_migrating(p)))
325 cpu_relax();
323 } 326 }
324} 327}
325 328
@@ -336,10 +339,13 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
336 raw_spin_lock_irqsave(&p->pi_lock, *flags); 339 raw_spin_lock_irqsave(&p->pi_lock, *flags);
337 rq = task_rq(p); 340 rq = task_rq(p);
338 raw_spin_lock(&rq->lock); 341 raw_spin_lock(&rq->lock);
339 if (likely(rq == task_rq(p))) 342 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
340 return rq; 343 return rq;
341 raw_spin_unlock(&rq->lock); 344 raw_spin_unlock(&rq->lock);
342 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 345 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
346
347 while (unlikely(task_on_rq_migrating(p)))
348 cpu_relax();
343 } 349 }
344} 350}
345 351
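
The extra !task_on_rq_migrating() test and the cpu_relax() spin pair with a migration path that now marks the task before letting go of the old runqueue lock; move_queued_task(), added further down in this file, is that writer. A condensed sketch of the handshake (the TASK_ON_RQ_QUEUED/TASK_ON_RQ_MIGRATING values are assumed to be new definitions in kernel/sched/sched.h, which the diffstat shows is also touched):

	/* migration side, condensed from move_queued_task() below */
	dequeue_task(src_rq, p, 0);
	p->on_rq = TASK_ON_RQ_MIGRATING;	/* task_rq_lock() spins from here on */
	set_task_cpu(p, new_cpu);		/* task_rq(p) changes underneath readers */
	raw_spin_unlock(&src_rq->lock);

	raw_spin_lock(&dst_rq->lock);
	p->on_rq = TASK_ON_RQ_QUEUED;		/* spinning lockers may proceed again */
	enqueue_task(dst_rq, p, 0);
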
@@ -433,7 +439,15 @@ static void __hrtick_start(void *arg)
433void hrtick_start(struct rq *rq, u64 delay) 439void hrtick_start(struct rq *rq, u64 delay)
434{ 440{
435 struct hrtimer *timer = &rq->hrtick_timer; 441 struct hrtimer *timer = &rq->hrtick_timer;
436 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 442 ktime_t time;
443 s64 delta;
444
445 /*
446 * Don't schedule slices shorter than 10000ns, that just
447 * doesn't make sense and can cause timer DoS.
448 */
449 delta = max_t(s64, delay, 10000LL);
450 time = ktime_add_ns(timer->base->get_time(), delta);
437 451
438 hrtimer_set_expires(timer, time); 452 hrtimer_set_expires(timer, time);
439 453
@@ -1027,7 +1041,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1027 * A queue event has occurred, and we're going to schedule. In 1041 * A queue event has occurred, and we're going to schedule. In
1028 * this case, we can save a useless back to back clock update. 1042 * this case, we can save a useless back to back clock update.
1029 */ 1043 */
1030 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) 1044 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
1031 rq->skip_clock_update = 1; 1045 rq->skip_clock_update = 1;
1032} 1046}
1033 1047
@@ -1072,7 +1086,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1072 1086
1073static void __migrate_swap_task(struct task_struct *p, int cpu) 1087static void __migrate_swap_task(struct task_struct *p, int cpu)
1074{ 1088{
1075 if (p->on_rq) { 1089 if (task_on_rq_queued(p)) {
1076 struct rq *src_rq, *dst_rq; 1090 struct rq *src_rq, *dst_rq;
1077 1091
1078 src_rq = task_rq(p); 1092 src_rq = task_rq(p);
@@ -1198,7 +1212,7 @@ static int migration_cpu_stop(void *data);
1198unsigned long wait_task_inactive(struct task_struct *p, long match_state) 1212unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1199{ 1213{
1200 unsigned long flags; 1214 unsigned long flags;
1201 int running, on_rq; 1215 int running, queued;
1202 unsigned long ncsw; 1216 unsigned long ncsw;
1203 struct rq *rq; 1217 struct rq *rq;
1204 1218
@@ -1236,7 +1250,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1236 rq = task_rq_lock(p, &flags); 1250 rq = task_rq_lock(p, &flags);
1237 trace_sched_wait_task(p); 1251 trace_sched_wait_task(p);
1238 running = task_running(rq, p); 1252 running = task_running(rq, p);
1239 on_rq = p->on_rq; 1253 queued = task_on_rq_queued(p);
1240 ncsw = 0; 1254 ncsw = 0;
1241 if (!match_state || p->state == match_state) 1255 if (!match_state || p->state == match_state)
1242 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 1256 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -1268,7 +1282,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1268 * running right now), it's preempted, and we should 1282 * running right now), it's preempted, and we should
1269 * yield - it could be a while. 1283 * yield - it could be a while.
1270 */ 1284 */
1271 if (unlikely(on_rq)) { 1285 if (unlikely(queued)) {
1272 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 1286 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1273 1287
1274 set_current_state(TASK_UNINTERRUPTIBLE); 1288 set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1462,7 +1476,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1462static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1476static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1463{ 1477{
1464 activate_task(rq, p, en_flags); 1478 activate_task(rq, p, en_flags);
1465 p->on_rq = 1; 1479 p->on_rq = TASK_ON_RQ_QUEUED;
1466 1480
1467 /* if a worker is waking up, notify workqueue */ 1481 /* if a worker is waking up, notify workqueue */
1468 if (p->flags & PF_WQ_WORKER) 1482 if (p->flags & PF_WQ_WORKER)
@@ -1521,7 +1535,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
1521 int ret = 0; 1535 int ret = 0;
1522 1536
1523 rq = __task_rq_lock(p); 1537 rq = __task_rq_lock(p);
1524 if (p->on_rq) { 1538 if (task_on_rq_queued(p)) {
1525 /* check_preempt_curr() may use rq clock */ 1539 /* check_preempt_curr() may use rq clock */
1526 update_rq_clock(rq); 1540 update_rq_clock(rq);
1527 ttwu_do_wakeup(rq, p, wake_flags); 1541 ttwu_do_wakeup(rq, p, wake_flags);
@@ -1604,6 +1618,25 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
1604 } 1618 }
1605} 1619}
1606 1620
1621void wake_up_if_idle(int cpu)
1622{
1623 struct rq *rq = cpu_rq(cpu);
1624 unsigned long flags;
1625
1626 if (!is_idle_task(rq->curr))
1627 return;
1628
1629 if (set_nr_if_polling(rq->idle)) {
1630 trace_sched_wake_idle_without_ipi(cpu);
1631 } else {
1632 raw_spin_lock_irqsave(&rq->lock, flags);
1633 if (is_idle_task(rq->curr))
1634 smp_send_reschedule(cpu);
1635 /* Else cpu is not in idle, do nothing here */
1636 raw_spin_unlock_irqrestore(&rq->lock, flags);
1637 }
1638}
1639
1607bool cpus_share_cache(int this_cpu, int that_cpu) 1640bool cpus_share_cache(int this_cpu, int that_cpu)
1608{ 1641{
1609 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1642 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
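
wake_up_if_idle() is the per-CPU building block behind the wake_up_all_idle_cpus() declaration added to smp.h above. The wrapper itself lives in kernel/smp.c (also in the diffstat); a plausible sketch of it, written from the declaration rather than quoted from this patch:

	void wake_up_all_idle_cpus(void)
	{
		int cpu;

		preempt_disable();
		for_each_online_cpu(cpu) {
			if (cpu == smp_processor_id())
				continue;	/* the calling CPU is clearly not idle */
			wake_up_if_idle(cpu);
		}
		preempt_enable();
	}
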
@@ -1726,7 +1759,7 @@ static void try_to_wake_up_local(struct task_struct *p)
1726 if (!(p->state & TASK_NORMAL)) 1759 if (!(p->state & TASK_NORMAL))
1727 goto out; 1760 goto out;
1728 1761
1729 if (!p->on_rq) 1762 if (!task_on_rq_queued(p))
1730 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 1763 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1731 1764
1732 ttwu_do_wakeup(rq, p, 0); 1765 ttwu_do_wakeup(rq, p, 0);
@@ -1760,6 +1793,20 @@ int wake_up_state(struct task_struct *p, unsigned int state)
1760} 1793}
1761 1794
1762/* 1795/*
1796 * This function clears the sched_dl_entity static params.
1797 */
1798void __dl_clear_params(struct task_struct *p)
1799{
1800 struct sched_dl_entity *dl_se = &p->dl;
1801
1802 dl_se->dl_runtime = 0;
1803 dl_se->dl_deadline = 0;
1804 dl_se->dl_period = 0;
1805 dl_se->flags = 0;
1806 dl_se->dl_bw = 0;
1807}
1808
1809/*
1763 * Perform scheduler related setup for a newly forked process p. 1810 * Perform scheduler related setup for a newly forked process p.
1764 * p is forked by current. 1811 * p is forked by current.
1765 * 1812 *
@@ -1783,10 +1830,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1783 1830
1784 RB_CLEAR_NODE(&p->dl.rb_node); 1831 RB_CLEAR_NODE(&p->dl.rb_node);
1785 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1832 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1786 p->dl.dl_runtime = p->dl.runtime = 0; 1833 __dl_clear_params(p);
1787 p->dl.dl_deadline = p->dl.deadline = 0;
1788 p->dl.dl_period = 0;
1789 p->dl.flags = 0;
1790 1834
1791 INIT_LIST_HEAD(&p->rt.run_list); 1835 INIT_LIST_HEAD(&p->rt.run_list);
1792 1836
@@ -1961,6 +2005,8 @@ unsigned long to_ratio(u64 period, u64 runtime)
1961#ifdef CONFIG_SMP 2005#ifdef CONFIG_SMP
1962inline struct dl_bw *dl_bw_of(int i) 2006inline struct dl_bw *dl_bw_of(int i)
1963{ 2007{
2008 rcu_lockdep_assert(rcu_read_lock_sched_held(),
2009 "sched RCU must be held");
1964 return &cpu_rq(i)->rd->dl_bw; 2010 return &cpu_rq(i)->rd->dl_bw;
1965} 2011}
1966 2012
@@ -1969,6 +2015,8 @@ static inline int dl_bw_cpus(int i)
1969 struct root_domain *rd = cpu_rq(i)->rd; 2015 struct root_domain *rd = cpu_rq(i)->rd;
1970 int cpus = 0; 2016 int cpus = 0;
1971 2017
2018 rcu_lockdep_assert(rcu_read_lock_sched_held(),
2019 "sched RCU must be held");
1972 for_each_cpu_and(i, rd->span, cpu_active_mask) 2020 for_each_cpu_and(i, rd->span, cpu_active_mask)
1973 cpus++; 2021 cpus++;
1974 2022
@@ -2079,7 +2127,7 @@ void wake_up_new_task(struct task_struct *p)
2079 init_task_runnable_average(p); 2127 init_task_runnable_average(p);
2080 rq = __task_rq_lock(p); 2128 rq = __task_rq_lock(p);
2081 activate_task(rq, p, 0); 2129 activate_task(rq, p, 0);
2082 p->on_rq = 1; 2130 p->on_rq = TASK_ON_RQ_QUEUED;
2083 trace_sched_wakeup_new(p, true); 2131 trace_sched_wakeup_new(p, true);
2084 check_preempt_curr(rq, p, WF_FORK); 2132 check_preempt_curr(rq, p, WF_FORK);
2085#ifdef CONFIG_SMP 2133#ifdef CONFIG_SMP
@@ -2271,10 +2319,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
2271 */ 2319 */
2272 post_schedule(rq); 2320 post_schedule(rq);
2273 2321
2274#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2275 /* In this case, finish_task_switch does not reenable preemption */
2276 preempt_enable();
2277#endif
2278 if (current->set_child_tid) 2322 if (current->set_child_tid)
2279 put_user(task_pid_vnr(current), current->set_child_tid); 2323 put_user(task_pid_vnr(current), current->set_child_tid);
2280} 2324}
@@ -2317,9 +2361,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2317 * of the scheduler it's an obvious special-case), so we 2361 * of the scheduler it's an obvious special-case), so we
2318 * do an early lockdep release here: 2362 * do an early lockdep release here:
2319 */ 2363 */
2320#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2321 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2364 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2322#endif
2323 2365
2324 context_tracking_task_switch(prev, next); 2366 context_tracking_task_switch(prev, next);
2325 /* Here we just switch the register state and the stack. */ 2367 /* Here we just switch the register state and the stack. */
@@ -2447,7 +2489,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2447 * project cycles that may never be accounted to this 2489 * project cycles that may never be accounted to this
2448 * thread, breaking clock_gettime(). 2490 * thread, breaking clock_gettime().
2449 */ 2491 */
2450 if (task_current(rq, p) && p->on_rq) { 2492 if (task_current(rq, p) && task_on_rq_queued(p)) {
2451 update_rq_clock(rq); 2493 update_rq_clock(rq);
2452 ns = rq_clock_task(rq) - p->se.exec_start; 2494 ns = rq_clock_task(rq) - p->se.exec_start;
2453 if ((s64)ns < 0) 2495 if ((s64)ns < 0)
@@ -2493,7 +2535,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2493 * If we see ->on_cpu without ->on_rq, the task is leaving, and has 2535 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
2494 * been accounted, so we're correct here as well. 2536 * been accounted, so we're correct here as well.
2495 */ 2537 */
2496 if (!p->on_cpu || !p->on_rq) 2538 if (!p->on_cpu || !task_on_rq_queued(p))
2497 return p->se.sum_exec_runtime; 2539 return p->se.sum_exec_runtime;
2498#endif 2540#endif
2499 2541
@@ -2656,6 +2698,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
2656 */ 2698 */
2657static inline void schedule_debug(struct task_struct *prev) 2699static inline void schedule_debug(struct task_struct *prev)
2658{ 2700{
2701#ifdef CONFIG_SCHED_STACK_END_CHECK
2702 BUG_ON(unlikely(task_stack_end_corrupted(prev)));
2703#endif
2659 /* 2704 /*
2660 * Test if we are atomic. Since do_exit() needs to call into 2705 * Test if we are atomic. Since do_exit() needs to call into
2661 * schedule() atomically, we ignore that path. Otherwise whine 2706 * schedule() atomically, we ignore that path. Otherwise whine
@@ -2797,7 +2842,7 @@ need_resched:
2797 switch_count = &prev->nvcsw; 2842 switch_count = &prev->nvcsw;
2798 } 2843 }
2799 2844
2800 if (prev->on_rq || rq->skip_clock_update < 0) 2845 if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
2801 update_rq_clock(rq); 2846 update_rq_clock(rq);
2802 2847
2803 next = pick_next_task(rq, prev); 2848 next = pick_next_task(rq, prev);
@@ -2962,7 +3007,7 @@ EXPORT_SYMBOL(default_wake_function);
2962 */ 3007 */
2963void rt_mutex_setprio(struct task_struct *p, int prio) 3008void rt_mutex_setprio(struct task_struct *p, int prio)
2964{ 3009{
2965 int oldprio, on_rq, running, enqueue_flag = 0; 3010 int oldprio, queued, running, enqueue_flag = 0;
2966 struct rq *rq; 3011 struct rq *rq;
2967 const struct sched_class *prev_class; 3012 const struct sched_class *prev_class;
2968 3013
@@ -2991,12 +3036,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2991 trace_sched_pi_setprio(p, prio); 3036 trace_sched_pi_setprio(p, prio);
2992 oldprio = p->prio; 3037 oldprio = p->prio;
2993 prev_class = p->sched_class; 3038 prev_class = p->sched_class;
2994 on_rq = p->on_rq; 3039 queued = task_on_rq_queued(p);
2995 running = task_current(rq, p); 3040 running = task_current(rq, p);
2996 if (on_rq) 3041 if (queued)
2997 dequeue_task(rq, p, 0); 3042 dequeue_task(rq, p, 0);
2998 if (running) 3043 if (running)
2999 p->sched_class->put_prev_task(rq, p); 3044 put_prev_task(rq, p);
3000 3045
3001 /* 3046 /*
3002 * Boosting condition are: 3047 * Boosting condition are:
@@ -3033,7 +3078,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3033 3078
3034 if (running) 3079 if (running)
3035 p->sched_class->set_curr_task(rq); 3080 p->sched_class->set_curr_task(rq);
3036 if (on_rq) 3081 if (queued)
3037 enqueue_task(rq, p, enqueue_flag); 3082 enqueue_task(rq, p, enqueue_flag);
3038 3083
3039 check_class_changed(rq, p, prev_class, oldprio); 3084 check_class_changed(rq, p, prev_class, oldprio);
@@ -3044,7 +3089,7 @@ out_unlock:
3044 3089
3045void set_user_nice(struct task_struct *p, long nice) 3090void set_user_nice(struct task_struct *p, long nice)
3046{ 3091{
3047 int old_prio, delta, on_rq; 3092 int old_prio, delta, queued;
3048 unsigned long flags; 3093 unsigned long flags;
3049 struct rq *rq; 3094 struct rq *rq;
3050 3095
@@ -3065,8 +3110,8 @@ void set_user_nice(struct task_struct *p, long nice)
3065 p->static_prio = NICE_TO_PRIO(nice); 3110 p->static_prio = NICE_TO_PRIO(nice);
3066 goto out_unlock; 3111 goto out_unlock;
3067 } 3112 }
3068 on_rq = p->on_rq; 3113 queued = task_on_rq_queued(p);
3069 if (on_rq) 3114 if (queued)
3070 dequeue_task(rq, p, 0); 3115 dequeue_task(rq, p, 0);
3071 3116
3072 p->static_prio = NICE_TO_PRIO(nice); 3117 p->static_prio = NICE_TO_PRIO(nice);
@@ -3075,7 +3120,7 @@ void set_user_nice(struct task_struct *p, long nice)
3075 p->prio = effective_prio(p); 3120 p->prio = effective_prio(p);
3076 delta = p->prio - old_prio; 3121 delta = p->prio - old_prio;
3077 3122
3078 if (on_rq) { 3123 if (queued) {
3079 enqueue_task(rq, p, 0); 3124 enqueue_task(rq, p, 0);
3080 /* 3125 /*
3081 * If the task increased its priority or is running and 3126 * If the task increased its priority or is running and
@@ -3347,7 +3392,7 @@ static int __sched_setscheduler(struct task_struct *p,
3347{ 3392{
3348 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : 3393 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
3349 MAX_RT_PRIO - 1 - attr->sched_priority; 3394 MAX_RT_PRIO - 1 - attr->sched_priority;
3350 int retval, oldprio, oldpolicy = -1, on_rq, running; 3395 int retval, oldprio, oldpolicy = -1, queued, running;
3351 int policy = attr->sched_policy; 3396 int policy = attr->sched_policy;
3352 unsigned long flags; 3397 unsigned long flags;
3353 const struct sched_class *prev_class; 3398 const struct sched_class *prev_class;
@@ -3544,19 +3589,19 @@ change:
3544 return 0; 3589 return 0;
3545 } 3590 }
3546 3591
3547 on_rq = p->on_rq; 3592 queued = task_on_rq_queued(p);
3548 running = task_current(rq, p); 3593 running = task_current(rq, p);
3549 if (on_rq) 3594 if (queued)
3550 dequeue_task(rq, p, 0); 3595 dequeue_task(rq, p, 0);
3551 if (running) 3596 if (running)
3552 p->sched_class->put_prev_task(rq, p); 3597 put_prev_task(rq, p);
3553 3598
3554 prev_class = p->sched_class; 3599 prev_class = p->sched_class;
3555 __setscheduler(rq, p, attr); 3600 __setscheduler(rq, p, attr);
3556 3601
3557 if (running) 3602 if (running)
3558 p->sched_class->set_curr_task(rq); 3603 p->sched_class->set_curr_task(rq);
3559 if (on_rq) { 3604 if (queued) {
3560 /* 3605 /*
3561 * We enqueue to tail when the priority of a task is 3606 * We enqueue to tail when the priority of a task is
3562 * increased (user space view). 3607 * increased (user space view).
@@ -3980,14 +4025,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3980 rcu_read_lock(); 4025 rcu_read_lock();
3981 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 4026 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
3982 rcu_read_unlock(); 4027 rcu_read_unlock();
3983 goto out_unlock; 4028 goto out_free_new_mask;
3984 } 4029 }
3985 rcu_read_unlock(); 4030 rcu_read_unlock();
3986 } 4031 }
3987 4032
3988 retval = security_task_setscheduler(p); 4033 retval = security_task_setscheduler(p);
3989 if (retval) 4034 if (retval)
3990 goto out_unlock; 4035 goto out_free_new_mask;
3991 4036
3992 4037
3993 cpuset_cpus_allowed(p, cpus_allowed); 4038 cpuset_cpus_allowed(p, cpus_allowed);
@@ -4000,13 +4045,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4000 * root_domain. 4045 * root_domain.
4001 */ 4046 */
4002#ifdef CONFIG_SMP 4047#ifdef CONFIG_SMP
4003 if (task_has_dl_policy(p)) { 4048 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
4004 const struct cpumask *span = task_rq(p)->rd->span; 4049 rcu_read_lock();
4005 4050 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
4006 if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
4007 retval = -EBUSY; 4051 retval = -EBUSY;
4008 goto out_unlock; 4052 rcu_read_unlock();
4053 goto out_free_new_mask;
4009 } 4054 }
4055 rcu_read_unlock();
4010 } 4056 }
4011#endif 4057#endif
4012again: 4058again:
@@ -4024,7 +4070,7 @@ again:
4024 goto again; 4070 goto again;
4025 } 4071 }
4026 } 4072 }
4027out_unlock: 4073out_free_new_mask:
4028 free_cpumask_var(new_mask); 4074 free_cpumask_var(new_mask);
4029out_free_cpus_allowed: 4075out_free_cpus_allowed:
4030 free_cpumask_var(cpus_allowed); 4076 free_cpumask_var(cpus_allowed);
@@ -4508,7 +4554,7 @@ void show_state_filter(unsigned long state_filter)
4508 " task PC stack pid father\n"); 4554 " task PC stack pid father\n");
4509#endif 4555#endif
4510 rcu_read_lock(); 4556 rcu_read_lock();
4511 do_each_thread(g, p) { 4557 for_each_process_thread(g, p) {
4512 /* 4558 /*
4513 * reset the NMI-timeout, listing all files on a slow 4559 * reset the NMI-timeout, listing all files on a slow
4514 * console might take a lot of time: 4560 * console might take a lot of time:
@@ -4516,7 +4562,7 @@ void show_state_filter(unsigned long state_filter)
4516 touch_nmi_watchdog(); 4562 touch_nmi_watchdog();
4517 if (!state_filter || (p->state & state_filter)) 4563 if (!state_filter || (p->state & state_filter))
4518 sched_show_task(p); 4564 sched_show_task(p);
4519 } while_each_thread(g, p); 4565 }
4520 4566
4521 touch_all_softlockup_watchdogs(); 4567 touch_all_softlockup_watchdogs();
4522 4568
@@ -4571,7 +4617,7 @@ void init_idle(struct task_struct *idle, int cpu)
4571 rcu_read_unlock(); 4617 rcu_read_unlock();
4572 4618
4573 rq->curr = rq->idle = idle; 4619 rq->curr = rq->idle = idle;
4574 idle->on_rq = 1; 4620 idle->on_rq = TASK_ON_RQ_QUEUED;
4575#if defined(CONFIG_SMP) 4621#if defined(CONFIG_SMP)
4576 idle->on_cpu = 1; 4622 idle->on_cpu = 1;
4577#endif 4623#endif
@@ -4592,6 +4638,33 @@ void init_idle(struct task_struct *idle, int cpu)
4592} 4638}
4593 4639
4594#ifdef CONFIG_SMP 4640#ifdef CONFIG_SMP
4641/*
4642 * move_queued_task - move a queued task to new rq.
4643 *
4644 * Returns (locked) new rq. Old rq's lock is released.
4645 */
4646static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
4647{
4648 struct rq *rq = task_rq(p);
4649
4650 lockdep_assert_held(&rq->lock);
4651
4652 dequeue_task(rq, p, 0);
4653 p->on_rq = TASK_ON_RQ_MIGRATING;
4654 set_task_cpu(p, new_cpu);
4655 raw_spin_unlock(&rq->lock);
4656
4657 rq = cpu_rq(new_cpu);
4658
4659 raw_spin_lock(&rq->lock);
4660 BUG_ON(task_cpu(p) != new_cpu);
4661 p->on_rq = TASK_ON_RQ_QUEUED;
4662 enqueue_task(rq, p, 0);
4663 check_preempt_curr(rq, p, 0);
4664
4665 return rq;
4666}
4667
4595void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4668void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4596{ 4669{
4597 if (p->sched_class && p->sched_class->set_cpus_allowed) 4670 if (p->sched_class && p->sched_class->set_cpus_allowed)
@@ -4648,14 +4721,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4648 goto out; 4721 goto out;
4649 4722
4650 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 4723 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4651 if (p->on_rq) { 4724 if (task_running(rq, p) || p->state == TASK_WAKING) {
4652 struct migration_arg arg = { p, dest_cpu }; 4725 struct migration_arg arg = { p, dest_cpu };
4653 /* Need help from migration thread: drop lock and wait. */ 4726 /* Need help from migration thread: drop lock and wait. */
4654 task_rq_unlock(rq, p, &flags); 4727 task_rq_unlock(rq, p, &flags);
4655 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 4728 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4656 tlb_migrate_finish(p->mm); 4729 tlb_migrate_finish(p->mm);
4657 return 0; 4730 return 0;
4658 } 4731 } else if (task_on_rq_queued(p))
4732 rq = move_queued_task(p, dest_cpu);
4659out: 4733out:
4660 task_rq_unlock(rq, p, &flags); 4734 task_rq_unlock(rq, p, &flags);
4661 4735
@@ -4676,20 +4750,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
4676 */ 4750 */
4677static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4751static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4678{ 4752{
4679 struct rq *rq_dest, *rq_src; 4753 struct rq *rq;
4680 int ret = 0; 4754 int ret = 0;
4681 4755
4682 if (unlikely(!cpu_active(dest_cpu))) 4756 if (unlikely(!cpu_active(dest_cpu)))
4683 return ret; 4757 return ret;
4684 4758
4685 rq_src = cpu_rq(src_cpu); 4759 rq = cpu_rq(src_cpu);
4686 rq_dest = cpu_rq(dest_cpu);
4687 4760
4688 raw_spin_lock(&p->pi_lock); 4761 raw_spin_lock(&p->pi_lock);
4689 double_rq_lock(rq_src, rq_dest); 4762 raw_spin_lock(&rq->lock);
4690 /* Already moved. */ 4763 /* Already moved. */
4691 if (task_cpu(p) != src_cpu) 4764 if (task_cpu(p) != src_cpu)
4692 goto done; 4765 goto done;
4766
4693 /* Affinity changed (again). */ 4767 /* Affinity changed (again). */
4694 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 4768 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
4695 goto fail; 4769 goto fail;
@@ -4698,16 +4772,12 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4698 * If we're not on a rq, the next wake-up will ensure we're 4772 * If we're not on a rq, the next wake-up will ensure we're
4699 * placed properly. 4773 * placed properly.
4700 */ 4774 */
4701 if (p->on_rq) { 4775 if (task_on_rq_queued(p))
4702 dequeue_task(rq_src, p, 0); 4776 rq = move_queued_task(p, dest_cpu);
4703 set_task_cpu(p, dest_cpu);
4704 enqueue_task(rq_dest, p, 0);
4705 check_preempt_curr(rq_dest, p, 0);
4706 }
4707done: 4777done:
4708 ret = 1; 4778 ret = 1;
4709fail: 4779fail:
4710 double_rq_unlock(rq_src, rq_dest); 4780 raw_spin_unlock(&rq->lock);
4711 raw_spin_unlock(&p->pi_lock); 4781 raw_spin_unlock(&p->pi_lock);
4712 return ret; 4782 return ret;
4713} 4783}
@@ -4739,22 +4809,22 @@ void sched_setnuma(struct task_struct *p, int nid)
4739{ 4809{
4740 struct rq *rq; 4810 struct rq *rq;
4741 unsigned long flags; 4811 unsigned long flags;
4742 bool on_rq, running; 4812 bool queued, running;
4743 4813
4744 rq = task_rq_lock(p, &flags); 4814 rq = task_rq_lock(p, &flags);
4745 on_rq = p->on_rq; 4815 queued = task_on_rq_queued(p);
4746 running = task_current(rq, p); 4816 running = task_current(rq, p);
4747 4817
4748 if (on_rq) 4818 if (queued)
4749 dequeue_task(rq, p, 0); 4819 dequeue_task(rq, p, 0);
4750 if (running) 4820 if (running)
4751 p->sched_class->put_prev_task(rq, p); 4821 put_prev_task(rq, p);
4752 4822
4753 p->numa_preferred_nid = nid; 4823 p->numa_preferred_nid = nid;
4754 4824
4755 if (running) 4825 if (running)
4756 p->sched_class->set_curr_task(rq); 4826 p->sched_class->set_curr_task(rq);
4757 if (on_rq) 4827 if (queued)
4758 enqueue_task(rq, p, 0); 4828 enqueue_task(rq, p, 0);
4759 task_rq_unlock(rq, p, &flags); 4829 task_rq_unlock(rq, p, &flags);
4760} 4830}
@@ -4774,6 +4844,12 @@ static int migration_cpu_stop(void *data)
4774 * be on another cpu but it doesn't matter. 4844 * be on another cpu but it doesn't matter.
4775 */ 4845 */
4776 local_irq_disable(); 4846 local_irq_disable();
4847 /*
4848 * We need to explicitly wake pending tasks before running
4849 * __migrate_task() such that we will not miss enforcing cpus_allowed
4850 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
4851 */
4852 sched_ttwu_pending();
4777 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 4853 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
4778 local_irq_enable(); 4854 local_irq_enable();
4779 return 0; 4855 return 0;
@@ -5184,6 +5260,7 @@ static int sched_cpu_inactive(struct notifier_block *nfb,
5184{ 5260{
5185 unsigned long flags; 5261 unsigned long flags;
5186 long cpu = (long)hcpu; 5262 long cpu = (long)hcpu;
5263 struct dl_bw *dl_b;
5187 5264
5188 switch (action & ~CPU_TASKS_FROZEN) { 5265 switch (action & ~CPU_TASKS_FROZEN) {
5189 case CPU_DOWN_PREPARE: 5266 case CPU_DOWN_PREPARE:
@@ -5191,15 +5268,19 @@ static int sched_cpu_inactive(struct notifier_block *nfb,
5191 5268
5192 /* explicitly allow suspend */ 5269 /* explicitly allow suspend */
5193 if (!(action & CPU_TASKS_FROZEN)) { 5270 if (!(action & CPU_TASKS_FROZEN)) {
5194 struct dl_bw *dl_b = dl_bw_of(cpu);
5195 bool overflow; 5271 bool overflow;
5196 int cpus; 5272 int cpus;
5197 5273
5274 rcu_read_lock_sched();
5275 dl_b = dl_bw_of(cpu);
5276
5198 raw_spin_lock_irqsave(&dl_b->lock, flags); 5277 raw_spin_lock_irqsave(&dl_b->lock, flags);
5199 cpus = dl_bw_cpus(cpu); 5278 cpus = dl_bw_cpus(cpu);
5200 overflow = __dl_overflow(dl_b, cpus, 0, 0); 5279 overflow = __dl_overflow(dl_b, cpus, 0, 0);
5201 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 5280 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5202 5281
5282 rcu_read_unlock_sched();
5283
5203 if (overflow) 5284 if (overflow)
5204 return notifier_from_errno(-EBUSY); 5285 return notifier_from_errno(-EBUSY);
5205 } 5286 }
@@ -5742,7 +5823,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5742 const struct cpumask *span = sched_domain_span(sd); 5823 const struct cpumask *span = sched_domain_span(sd);
5743 struct cpumask *covered = sched_domains_tmpmask; 5824 struct cpumask *covered = sched_domains_tmpmask;
5744 struct sd_data *sdd = sd->private; 5825 struct sd_data *sdd = sd->private;
5745 struct sched_domain *child; 5826 struct sched_domain *sibling;
5746 int i; 5827 int i;
5747 5828
5748 cpumask_clear(covered); 5829 cpumask_clear(covered);
@@ -5753,10 +5834,10 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5753 if (cpumask_test_cpu(i, covered)) 5834 if (cpumask_test_cpu(i, covered))
5754 continue; 5835 continue;
5755 5836
5756 child = *per_cpu_ptr(sdd->sd, i); 5837 sibling = *per_cpu_ptr(sdd->sd, i);
5757 5838
5758 /* See the comment near build_group_mask(). */ 5839 /* See the comment near build_group_mask(). */
5759 if (!cpumask_test_cpu(i, sched_domain_span(child))) 5840 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5760 continue; 5841 continue;
5761 5842
5762 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5843 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
@@ -5766,10 +5847,9 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5766 goto fail; 5847 goto fail;
5767 5848
5768 sg_span = sched_group_cpus(sg); 5849 sg_span = sched_group_cpus(sg);
5769 if (child->child) { 5850 if (sibling->child)
5770 child = child->child; 5851 cpumask_copy(sg_span, sched_domain_span(sibling->child));
5771 cpumask_copy(sg_span, sched_domain_span(child)); 5852 else
5772 } else
5773 cpumask_set_cpu(i, sg_span); 5853 cpumask_set_cpu(i, sg_span);
5774 5854
5775 cpumask_or(covered, covered, sg_span); 5855 cpumask_or(covered, covered, sg_span);
@@ -7120,13 +7200,13 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7120 .sched_policy = SCHED_NORMAL, 7200 .sched_policy = SCHED_NORMAL,
7121 }; 7201 };
7122 int old_prio = p->prio; 7202 int old_prio = p->prio;
7123 int on_rq; 7203 int queued;
7124 7204
7125 on_rq = p->on_rq; 7205 queued = task_on_rq_queued(p);
7126 if (on_rq) 7206 if (queued)
7127 dequeue_task(rq, p, 0); 7207 dequeue_task(rq, p, 0);
7128 __setscheduler(rq, p, &attr); 7208 __setscheduler(rq, p, &attr);
7129 if (on_rq) { 7209 if (queued) {
7130 enqueue_task(rq, p, 0); 7210 enqueue_task(rq, p, 0);
7131 resched_curr(rq); 7211 resched_curr(rq);
7132 } 7212 }
@@ -7140,12 +7220,12 @@ void normalize_rt_tasks(void)
7140 unsigned long flags; 7220 unsigned long flags;
7141 struct rq *rq; 7221 struct rq *rq;
7142 7222
7143 read_lock_irqsave(&tasklist_lock, flags); 7223 read_lock(&tasklist_lock);
7144 do_each_thread(g, p) { 7224 for_each_process_thread(g, p) {
7145 /* 7225 /*
7146 * Only normalize user tasks: 7226 * Only normalize user tasks:
7147 */ 7227 */
7148 if (!p->mm) 7228 if (p->flags & PF_KTHREAD)
7149 continue; 7229 continue;
7150 7230
7151 p->se.exec_start = 0; 7231 p->se.exec_start = 0;
@@ -7160,21 +7240,16 @@ void normalize_rt_tasks(void)
7160 * Renice negative nice level userspace 7240 * Renice negative nice level userspace
7161 * tasks back to 0: 7241 * tasks back to 0:
7162 */ 7242 */
7163 if (task_nice(p) < 0 && p->mm) 7243 if (task_nice(p) < 0)
7164 set_user_nice(p, 0); 7244 set_user_nice(p, 0);
7165 continue; 7245 continue;
7166 } 7246 }
7167 7247
7168 raw_spin_lock(&p->pi_lock); 7248 rq = task_rq_lock(p, &flags);
7169 rq = __task_rq_lock(p);
7170
7171 normalize_task(rq, p); 7249 normalize_task(rq, p);
7172 7250 task_rq_unlock(rq, p, &flags);
7173 __task_rq_unlock(rq); 7251 }
7174 raw_spin_unlock(&p->pi_lock); 7252 read_unlock(&tasklist_lock);
7175 } while_each_thread(g, p);
7176
7177 read_unlock_irqrestore(&tasklist_lock, flags);
7178} 7253}
7179 7254
7180#endif /* CONFIG_MAGIC_SYSRQ */ 7255#endif /* CONFIG_MAGIC_SYSRQ */
@@ -7314,19 +7389,19 @@ void sched_offline_group(struct task_group *tg)
7314void sched_move_task(struct task_struct *tsk) 7389void sched_move_task(struct task_struct *tsk)
7315{ 7390{
7316 struct task_group *tg; 7391 struct task_group *tg;
7317 int on_rq, running; 7392 int queued, running;
7318 unsigned long flags; 7393 unsigned long flags;
7319 struct rq *rq; 7394 struct rq *rq;
7320 7395
7321 rq = task_rq_lock(tsk, &flags); 7396 rq = task_rq_lock(tsk, &flags);
7322 7397
7323 running = task_current(rq, tsk); 7398 running = task_current(rq, tsk);
7324 on_rq = tsk->on_rq; 7399 queued = task_on_rq_queued(tsk);
7325 7400
7326 if (on_rq) 7401 if (queued)
7327 dequeue_task(rq, tsk, 0); 7402 dequeue_task(rq, tsk, 0);
7328 if (unlikely(running)) 7403 if (unlikely(running))
7329 tsk->sched_class->put_prev_task(rq, tsk); 7404 put_prev_task(rq, tsk);
7330 7405
7331 tg = container_of(task_css_check(tsk, cpu_cgrp_id, 7406 tg = container_of(task_css_check(tsk, cpu_cgrp_id,
7332 lockdep_is_held(&tsk->sighand->siglock)), 7407 lockdep_is_held(&tsk->sighand->siglock)),
@@ -7336,14 +7411,14 @@ void sched_move_task(struct task_struct *tsk)
7336 7411
7337#ifdef CONFIG_FAIR_GROUP_SCHED 7412#ifdef CONFIG_FAIR_GROUP_SCHED
7338 if (tsk->sched_class->task_move_group) 7413 if (tsk->sched_class->task_move_group)
7339 tsk->sched_class->task_move_group(tsk, on_rq); 7414 tsk->sched_class->task_move_group(tsk, queued);
7340 else 7415 else
7341#endif 7416#endif
7342 set_task_rq(tsk, task_cpu(tsk)); 7417 set_task_rq(tsk, task_cpu(tsk));
7343 7418
7344 if (unlikely(running)) 7419 if (unlikely(running))
7345 tsk->sched_class->set_curr_task(rq); 7420 tsk->sched_class->set_curr_task(rq);
7346 if (on_rq) 7421 if (queued)
7347 enqueue_task(rq, tsk, 0); 7422 enqueue_task(rq, tsk, 0);
7348 7423
7349 task_rq_unlock(rq, tsk, &flags); 7424 task_rq_unlock(rq, tsk, &flags);
@@ -7361,10 +7436,10 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
7361{ 7436{
7362 struct task_struct *g, *p; 7437 struct task_struct *g, *p;
7363 7438
7364 do_each_thread(g, p) { 7439 for_each_process_thread(g, p) {
7365 if (rt_task(p) && task_rq(p)->rt.tg == tg) 7440 if (rt_task(p) && task_group(p) == tg)
7366 return 1; 7441 return 1;
7367 } while_each_thread(g, p); 7442 }
7368 7443
7369 return 0; 7444 return 0;
7370} 7445}
@@ -7573,6 +7648,7 @@ static int sched_dl_global_constraints(void)
7573 u64 runtime = global_rt_runtime(); 7648 u64 runtime = global_rt_runtime();
7574 u64 period = global_rt_period(); 7649 u64 period = global_rt_period();
7575 u64 new_bw = to_ratio(period, runtime); 7650 u64 new_bw = to_ratio(period, runtime);
7651 struct dl_bw *dl_b;
7576 int cpu, ret = 0; 7652 int cpu, ret = 0;
7577 unsigned long flags; 7653 unsigned long flags;
7578 7654
@@ -7586,13 +7662,16 @@ static int sched_dl_global_constraints(void)
7586 * solutions is welcome! 7662 * solutions is welcome!
7587 */ 7663 */
7588 for_each_possible_cpu(cpu) { 7664 for_each_possible_cpu(cpu) {
7589 struct dl_bw *dl_b = dl_bw_of(cpu); 7665 rcu_read_lock_sched();
7666 dl_b = dl_bw_of(cpu);
7590 7667
7591 raw_spin_lock_irqsave(&dl_b->lock, flags); 7668 raw_spin_lock_irqsave(&dl_b->lock, flags);
7592 if (new_bw < dl_b->total_bw) 7669 if (new_bw < dl_b->total_bw)
7593 ret = -EBUSY; 7670 ret = -EBUSY;
7594 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7671 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7595 7672
7673 rcu_read_unlock_sched();
7674
7596 if (ret) 7675 if (ret)
7597 break; 7676 break;
7598 } 7677 }
@@ -7603,6 +7682,7 @@ static int sched_dl_global_constraints(void)
7603static void sched_dl_do_global(void) 7682static void sched_dl_do_global(void)
7604{ 7683{
7605 u64 new_bw = -1; 7684 u64 new_bw = -1;
7685 struct dl_bw *dl_b;
7606 int cpu; 7686 int cpu;
7607 unsigned long flags; 7687 unsigned long flags;
7608 7688
@@ -7616,11 +7696,14 @@ static void sched_dl_do_global(void)
7616 * FIXME: As above... 7696 * FIXME: As above...
7617 */ 7697 */
7618 for_each_possible_cpu(cpu) { 7698 for_each_possible_cpu(cpu) {
7619 struct dl_bw *dl_b = dl_bw_of(cpu); 7699 rcu_read_lock_sched();
7700 dl_b = dl_bw_of(cpu);
7620 7701
7621 raw_spin_lock_irqsave(&dl_b->lock, flags); 7702 raw_spin_lock_irqsave(&dl_b->lock, flags);
7622 dl_b->bw = new_bw; 7703 dl_b->bw = new_bw;
7623 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7704 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7705
7706 rcu_read_unlock_sched();
7624 } 7707 }
7625} 7708}
7626 7709
@@ -8001,7 +8084,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
8001 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 8084 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
8002 8085
8003 quota = normalize_cfs_quota(tg, d); 8086 quota = normalize_cfs_quota(tg, d);
8004 parent_quota = parent_b->hierarchal_quota; 8087 parent_quota = parent_b->hierarchical_quota;
8005 8088
8006 /* 8089 /*
8007 * ensure max(child_quota) <= parent_quota, inherit when no 8090 * ensure max(child_quota) <= parent_quota, inherit when no
@@ -8012,7 +8095,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
8012 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 8095 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
8013 return -EINVAL; 8096 return -EINVAL;
8014 } 8097 }
8015 cfs_b->hierarchal_quota = quota; 8098 cfs_b->hierarchical_quota = quota;
8016 8099
8017 return 0; 8100 return 0;
8018} 8101}
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index bd95963dae80..539ca3ce071b 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -107,9 +107,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
107 int best_cpu = -1; 107 int best_cpu = -1;
108 const struct sched_dl_entity *dl_se = &p->dl; 108 const struct sched_dl_entity *dl_se = &p->dl;
109 109
110 if (later_mask && cpumask_and(later_mask, cp->free_cpus, 110 if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) {
111 &p->cpus_allowed) && cpumask_and(later_mask,
112 later_mask, cpu_active_mask)) {
113 best_cpu = cpumask_any(later_mask); 111 best_cpu = cpumask_any(later_mask);
114 goto out; 112 goto out;
115 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && 113 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 72fdf06ef865..8394b1ee600c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -288,24 +288,29 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
288 struct signal_struct *sig = tsk->signal; 288 struct signal_struct *sig = tsk->signal;
289 cputime_t utime, stime; 289 cputime_t utime, stime;
290 struct task_struct *t; 290 struct task_struct *t;
291 291 unsigned int seq, nextseq;
292 times->utime = sig->utime; 292 unsigned long flags;
293 times->stime = sig->stime;
294 times->sum_exec_runtime = sig->sum_sched_runtime;
295 293
296 rcu_read_lock(); 294 rcu_read_lock();
297 /* make sure we can trust tsk->thread_group list */ 295 /* Attempt a lockless read on the first round. */
298 if (!likely(pid_alive(tsk))) 296 nextseq = 0;
299 goto out;
300
301 t = tsk;
302 do { 297 do {
303 task_cputime(t, &utime, &stime); 298 seq = nextseq;
304 times->utime += utime; 299 flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
305 times->stime += stime; 300 times->utime = sig->utime;
306 times->sum_exec_runtime += task_sched_runtime(t); 301 times->stime = sig->stime;
307 } while_each_thread(tsk, t); 302 times->sum_exec_runtime = sig->sum_sched_runtime;
308out: 303
304 for_each_thread(tsk, t) {
305 task_cputime(t, &utime, &stime);
306 times->utime += utime;
307 times->stime += stime;
308 times->sum_exec_runtime += task_sched_runtime(t);
309 }
310 /* If lockless access failed, take the lock. */
311 nextseq = 1;
312 } while (need_seqretry(&sig->stats_lock, seq));
313 done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
309 rcu_read_unlock(); 314 rcu_read_unlock();
310} 315}
311 316
@@ -550,6 +555,23 @@ drop_precision:
550} 555}
551 556
552/* 557/*
558 * Atomically advance counter to the new value. Interrupts, vcpu
559 * scheduling, and scaling inaccuracies can cause cputime_advance
560 * to be occasionally called with a new value smaller than counter.
561 * Let's enforce atomicity.
562 *
563 * Normally a caller will only go through this loop once, or not
564 * at all in case a previous caller updated counter the same jiffy.
565 */
566static void cputime_advance(cputime_t *counter, cputime_t new)
567{
568 cputime_t old;
569
570 while (new > (old = ACCESS_ONCE(*counter)))
571 cmpxchg_cputime(counter, old, new);
572}
573
574/*
553 * Adjust tick based cputime random precision against scheduler 575 * Adjust tick based cputime random precision against scheduler
554 * runtime accounting. 576 * runtime accounting.
555 */ 577 */
@@ -594,13 +616,8 @@ static void cputime_adjust(struct task_cputime *curr,
594 utime = rtime - stime; 616 utime = rtime - stime;
595 } 617 }
596 618
597 /* 619 cputime_advance(&prev->stime, stime);
598 * If the tick based count grows faster than the scheduler one, 620 cputime_advance(&prev->utime, utime);
599 * the result of the scaling may go backward.
600 * Let's enforce monotonicity.
601 */
602 prev->stime = max(prev->stime, stime);
603 prev->utime = max(prev->utime, utime);
604 621
605out: 622out:
606 *ut = prev->utime; 623 *ut = prev->utime;
@@ -617,9 +634,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
617 cputime_adjust(&cputime, &p->prev_cputime, ut, st); 634 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
618} 635}
619 636
620/*
621 * Must be called with siglock held.
622 */
623void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 637void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
624{ 638{
625 struct task_cputime cputime; 639 struct task_cputime cputime;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 255ce138b652..abfaf3d9a29f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -530,7 +530,7 @@ again:
530 update_rq_clock(rq); 530 update_rq_clock(rq);
531 dl_se->dl_throttled = 0; 531 dl_se->dl_throttled = 0;
532 dl_se->dl_yielded = 0; 532 dl_se->dl_yielded = 0;
533 if (p->on_rq) { 533 if (task_on_rq_queued(p)) {
534 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 534 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
535 if (task_has_dl_policy(rq->curr)) 535 if (task_has_dl_policy(rq->curr))
536 check_preempt_curr_dl(rq, p, 0); 536 check_preempt_curr_dl(rq, p, 0);
@@ -997,10 +997,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
997#ifdef CONFIG_SCHED_HRTICK 997#ifdef CONFIG_SCHED_HRTICK
998static void start_hrtick_dl(struct rq *rq, struct task_struct *p) 998static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
999{ 999{
1000 s64 delta = p->dl.dl_runtime - p->dl.runtime; 1000 hrtick_start(rq, p->dl.runtime);
1001
1002 if (delta > 10000)
1003 hrtick_start(rq, p->dl.runtime);
1004} 1001}
1005#endif 1002#endif
1006 1003
@@ -1030,7 +1027,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
1030 * means a stop task can slip in, in which case we need to 1027 * means a stop task can slip in, in which case we need to
1031 * re-start task selection. 1028 * re-start task selection.
1032 */ 1029 */
1033 if (rq->stop && rq->stop->on_rq) 1030 if (rq->stop && task_on_rq_queued(rq->stop))
1034 return RETRY_TASK; 1031 return RETRY_TASK;
1035 } 1032 }
1036 1033
@@ -1124,10 +1121,8 @@ static void set_curr_task_dl(struct rq *rq)
1124static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) 1121static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
1125{ 1122{
1126 if (!task_running(rq, p) && 1123 if (!task_running(rq, p) &&
1127 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && 1124 cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1128 (p->nr_cpus_allowed > 1))
1129 return 1; 1125 return 1;
1130
1131 return 0; 1126 return 0;
1132} 1127}
1133 1128
@@ -1169,6 +1164,13 @@ static int find_later_rq(struct task_struct *task)
1169 if (task->nr_cpus_allowed == 1) 1164 if (task->nr_cpus_allowed == 1)
1170 return -1; 1165 return -1;
1171 1166
1167 /*
1168 * We have to consider system topology and task affinity
1169 * first, then we can look for a suitable cpu.
1170 */
1171 cpumask_copy(later_mask, task_rq(task)->rd->span);
1172 cpumask_and(later_mask, later_mask, cpu_active_mask);
1173 cpumask_and(later_mask, later_mask, &task->cpus_allowed);
1172 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, 1174 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
1173 task, later_mask); 1175 task, later_mask);
1174 if (best_cpu == -1) 1176 if (best_cpu == -1)
@@ -1257,7 +1259,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
1257 if (unlikely(task_rq(task) != rq || 1259 if (unlikely(task_rq(task) != rq ||
1258 !cpumask_test_cpu(later_rq->cpu, 1260 !cpumask_test_cpu(later_rq->cpu,
1259 &task->cpus_allowed) || 1261 &task->cpus_allowed) ||
1260 task_running(rq, task) || !task->on_rq)) { 1262 task_running(rq, task) ||
1263 !task_on_rq_queued(task))) {
1261 double_unlock_balance(rq, later_rq); 1264 double_unlock_balance(rq, later_rq);
1262 later_rq = NULL; 1265 later_rq = NULL;
1263 break; 1266 break;
@@ -1296,7 +1299,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
1296 BUG_ON(task_current(rq, p)); 1299 BUG_ON(task_current(rq, p));
1297 BUG_ON(p->nr_cpus_allowed <= 1); 1300 BUG_ON(p->nr_cpus_allowed <= 1);
1298 1301
1299 BUG_ON(!p->on_rq); 1302 BUG_ON(!task_on_rq_queued(p));
1300 BUG_ON(!dl_task(p)); 1303 BUG_ON(!dl_task(p));
1301 1304
1302 return p; 1305 return p;
@@ -1443,7 +1446,7 @@ static int pull_dl_task(struct rq *this_rq)
1443 dl_time_before(p->dl.deadline, 1446 dl_time_before(p->dl.deadline,
1444 this_rq->dl.earliest_dl.curr))) { 1447 this_rq->dl.earliest_dl.curr))) {
1445 WARN_ON(p == src_rq->curr); 1448 WARN_ON(p == src_rq->curr);
1446 WARN_ON(!p->on_rq); 1449 WARN_ON(!task_on_rq_queued(p));
1447 1450
1448 /* 1451 /*
1449 * Then we pull iff p has actually an earlier 1452 * Then we pull iff p has actually an earlier
@@ -1569,6 +1572,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
1569 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) 1572 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
1570 hrtimer_try_to_cancel(&p->dl.dl_timer); 1573 hrtimer_try_to_cancel(&p->dl.dl_timer);
1571 1574
1575 __dl_clear_params(p);
1576
1572#ifdef CONFIG_SMP 1577#ifdef CONFIG_SMP
1573 /* 1578 /*
1574 * Since this might be the only -deadline task on the rq, 1579 * Since this might be the only -deadline task on the rq,
@@ -1596,7 +1601,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1596 if (unlikely(p->dl.dl_throttled)) 1601 if (unlikely(p->dl.dl_throttled))
1597 return; 1602 return;
1598 1603
1599 if (p->on_rq && rq->curr != p) { 1604 if (task_on_rq_queued(p) && rq->curr != p) {
1600#ifdef CONFIG_SMP 1605#ifdef CONFIG_SMP
1601 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) 1606 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
1602 /* Only reschedule if pushing failed */ 1607 /* Only reschedule if pushing failed */
@@ -1614,7 +1619,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1614static void prio_changed_dl(struct rq *rq, struct task_struct *p, 1619static void prio_changed_dl(struct rq *rq, struct task_struct *p,
1615 int oldprio) 1620 int oldprio)
1616{ 1621{
1617 if (p->on_rq || rq->curr == p) { 1622 if (task_on_rq_queued(p) || rq->curr == p) {
1618#ifdef CONFIG_SMP 1623#ifdef CONFIG_SMP
1619 /* 1624 /*
1620 * This might be too much, but unfortunately 1625 * This might be too much, but unfortunately
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 627b3c34b821..ce33780d8f20 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -150,7 +150,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
150static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) 150static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
151{ 151{
152 struct task_struct *g, *p; 152 struct task_struct *g, *p;
153 unsigned long flags;
154 153
155 SEQ_printf(m, 154 SEQ_printf(m,
156 "\nrunnable tasks:\n" 155 "\nrunnable tasks:\n"
@@ -159,16 +158,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
159 "------------------------------------------------------" 158 "------------------------------------------------------"
160 "----------------------------------------------------\n"); 159 "----------------------------------------------------\n");
161 160
162 read_lock_irqsave(&tasklist_lock, flags); 161 rcu_read_lock();
163 162 for_each_process_thread(g, p) {
164 do_each_thread(g, p) {
165 if (task_cpu(p) != rq_cpu) 163 if (task_cpu(p) != rq_cpu)
166 continue; 164 continue;
167 165
168 print_task(m, rq, p); 166 print_task(m, rq, p);
169 } while_each_thread(g, p); 167 }
170 168 rcu_read_unlock();
171 read_unlock_irqrestore(&tasklist_lock, flags);
172} 169}
173 170
174void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 171void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
@@ -333,9 +330,7 @@ do { \
333 print_cfs_stats(m, cpu); 330 print_cfs_stats(m, cpu);
334 print_rt_stats(m, cpu); 331 print_rt_stats(m, cpu);
335 332
336 rcu_read_lock();
337 print_rq(m, rq, cpu); 333 print_rq(m, rq, cpu);
338 rcu_read_unlock();
339 spin_unlock_irqrestore(&sched_debug_lock, flags); 334 spin_unlock_irqrestore(&sched_debug_lock, flags);
340 SEQ_printf(m, "\n"); 335 SEQ_printf(m, "\n");
341} 336}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 82088b29704e..b78280c59b46 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,7 @@
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/cpuidle.h>
26#include <linux/slab.h> 27#include <linux/slab.h>
27#include <linux/profile.h> 28#include <linux/profile.h>
28#include <linux/interrupt.h> 29#include <linux/interrupt.h>
@@ -665,6 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
665} 666}
666 667
667#ifdef CONFIG_SMP 668#ifdef CONFIG_SMP
669static int select_idle_sibling(struct task_struct *p, int cpu);
668static unsigned long task_h_load(struct task_struct *p); 670static unsigned long task_h_load(struct task_struct *p);
669 671
670static inline void __update_task_entity_contrib(struct sched_entity *se); 672static inline void __update_task_entity_contrib(struct sched_entity *se);
@@ -1038,7 +1040,8 @@ struct numa_stats {
1038 */ 1040 */
1039static void update_numa_stats(struct numa_stats *ns, int nid) 1041static void update_numa_stats(struct numa_stats *ns, int nid)
1040{ 1042{
1041 int cpu, cpus = 0; 1043 int smt, cpu, cpus = 0;
1044 unsigned long capacity;
1042 1045
1043 memset(ns, 0, sizeof(*ns)); 1046 memset(ns, 0, sizeof(*ns));
1044 for_each_cpu(cpu, cpumask_of_node(nid)) { 1047 for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1062,8 +1065,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1062 if (!cpus) 1065 if (!cpus)
1063 return; 1066 return;
1064 1067
1065 ns->task_capacity = 1068 /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1066 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); 1069 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1070 capacity = cpus / smt; /* cores */
1071
1072 ns->task_capacity = min_t(unsigned, capacity,
1073 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1067 ns->has_free_capacity = (ns->nr_running < ns->task_capacity); 1074 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1068} 1075}
1069 1076
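
Worked example of the new task_capacity computation in update_numa_stats(); the 8-sibling node and the per-sibling capacity of 589 are assumptions for illustration, not values taken from this patch:

/* Worked example of the new task_capacity math; the inputs are made up. */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d) (((n) + (d) / 2) / (d))

int main(void)
{
	unsigned long cpus = 8;                   /* 8 SMT siblings on the node */
	unsigned long compute_capacity = 8 * 589; /* ~589 per sibling (assumed) */
	unsigned long smt, capacity, task_capacity;

	/* smt := ceil(cpus / capacity in CPU units); expected to be 1 or 2 */
	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, compute_capacity);
	capacity = cpus / smt;                    /* number of cores */

	task_capacity = capacity;
	if (DIV_ROUND_CLOSEST(compute_capacity, SCHED_CAPACITY_SCALE) < task_capacity)
		task_capacity = DIV_ROUND_CLOSEST(compute_capacity, SCHED_CAPACITY_SCALE);

	/* The old rounding alone would report 5; counting cores gives 4. */
	printf("smt=%lu cores=%lu task_capacity=%lu\n", smt, capacity, task_capacity);
	return 0;
}
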
@@ -1206,7 +1213,7 @@ static void task_numa_compare(struct task_numa_env *env,
1206 1213
1207 if (!cur) { 1214 if (!cur) {
1208 /* Is there capacity at our destination? */ 1215 /* Is there capacity at our destination? */
1209 if (env->src_stats.has_free_capacity && 1216 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1210 !env->dst_stats.has_free_capacity) 1217 !env->dst_stats.has_free_capacity)
1211 goto unlock; 1218 goto unlock;
1212 1219
@@ -1252,6 +1259,13 @@ balance:
1252 if (load_too_imbalanced(src_load, dst_load, env)) 1259 if (load_too_imbalanced(src_load, dst_load, env))
1253 goto unlock; 1260 goto unlock;
1254 1261
1262 /*
1263 * One idle CPU per node is evaluated for a task numa move.
1264 * Call select_idle_sibling to maybe find a better one.
1265 */
1266 if (!cur)
1267 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
1268
1255assign: 1269assign:
1256 task_numa_assign(env, cur, imp); 1270 task_numa_assign(env, cur, imp);
1257unlock: 1271unlock:
@@ -1775,7 +1789,7 @@ void task_numa_free(struct task_struct *p)
1775 list_del(&p->numa_entry); 1789 list_del(&p->numa_entry);
1776 grp->nr_tasks--; 1790 grp->nr_tasks--;
1777 spin_unlock_irqrestore(&grp->lock, flags); 1791 spin_unlock_irqrestore(&grp->lock, flags);
1778 rcu_assign_pointer(p->numa_group, NULL); 1792 RCU_INIT_POINTER(p->numa_group, NULL);
1779 put_numa_group(grp); 1793 put_numa_group(grp);
1780 } 1794 }
1781 1795
@@ -1804,10 +1818,6 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1804 if (!p->mm) 1818 if (!p->mm)
1805 return; 1819 return;
1806 1820
1807 /* Do not worry about placement if exiting */
1808 if (p->state == TASK_DEAD)
1809 return;
1810
1811 /* Allocate buffer to track faults on a per-node basis */ 1821 /* Allocate buffer to track faults on a per-node basis */
1812 if (unlikely(!p->numa_faults_memory)) { 1822 if (unlikely(!p->numa_faults_memory)) {
1813 int size = sizeof(*p->numa_faults_memory) * 1823 int size = sizeof(*p->numa_faults_memory) *
@@ -2211,8 +2221,8 @@ static __always_inline u64 decay_load(u64 val, u64 n)
2211 2221
2212 /* 2222 /*
2213 * As y^PERIOD = 1/2, we can combine 2223 * As y^PERIOD = 1/2, we can combine
2214 * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) 2224 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2215 * With a look-up table which covers k^n (n<PERIOD) 2225 * With a look-up table which covers y^n (n<PERIOD)
2216 * 2226 *
2217 * To achieve constant time decay_load. 2227 * To achieve constant time decay_load.
2218 */ 2228 */
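
A quick numeric check of the corrected identity y^n = (1/2)^(n/PERIOD) * y^(n%PERIOD); PERIOD = 32 is assumed here to stand in for the load-tracking period, and the program is only a sanity check, not kernel code:

/* Numeric check of the corrected decay_load() comment; build with -lm. */
#include <stdio.h>
#include <math.h>

int main(void)
{
	const int PERIOD = 32;                     /* assumed period            */
	const double y = pow(0.5, 1.0 / PERIOD);   /* so that y^PERIOD == 1/2   */
	int n = 77;

	double direct = pow(y, n);
	double split  = pow(0.5, n / PERIOD) * pow(y, n % PERIOD);

	/* The halvings reduce to shifts; only y^(n%PERIOD) needs a table. */
	printf("y^n = %.9f, split form = %.9f\n", direct, split);
	return 0;
}
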
@@ -2377,6 +2387,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2377 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; 2387 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
2378 tg_contrib -= cfs_rq->tg_load_contrib; 2388 tg_contrib -= cfs_rq->tg_load_contrib;
2379 2389
2390 if (!tg_contrib)
2391 return;
2392
2380 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { 2393 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
2381 atomic_long_add(tg_contrib, &tg->load_avg); 2394 atomic_long_add(tg_contrib, &tg->load_avg);
2382 cfs_rq->tg_load_contrib += tg_contrib; 2395 cfs_rq->tg_load_contrib += tg_contrib;
@@ -3892,14 +3905,6 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
3892 resched_curr(rq); 3905 resched_curr(rq);
3893 return; 3906 return;
3894 } 3907 }
3895
3896 /*
3897 * Don't schedule slices shorter than 10000ns, that just
3898 * doesn't make sense. Rely on vruntime for fairness.
3899 */
3900 if (rq->curr != p)
3901 delta = max_t(s64, 10000LL, delta);
3902
3903 hrtick_start(rq, delta); 3908 hrtick_start(rq, delta);
3904 } 3909 }
3905} 3910}
@@ -4087,7 +4092,7 @@ static unsigned long capacity_of(int cpu)
4087static unsigned long cpu_avg_load_per_task(int cpu) 4092static unsigned long cpu_avg_load_per_task(int cpu)
4088{ 4093{
4089 struct rq *rq = cpu_rq(cpu); 4094 struct rq *rq = cpu_rq(cpu);
4090 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 4095 unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
4091 unsigned long load_avg = rq->cfs.runnable_load_avg; 4096 unsigned long load_avg = rq->cfs.runnable_load_avg;
4092 4097
4093 if (nr_running) 4098 if (nr_running)
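
The cpu_avg_load_per_task() change divides the CFS load average by the number of CFS tasks only; with invented numbers, counting an RT task in the divisor understates the per-task load:

/* Made-up numbers showing why only CFS tasks should be counted here. */
#include <stdio.h>

int main(void)
{
	unsigned long runnable_load_avg = 2048;  /* CFS load on this rq (assumed) */
	unsigned long rq_nr_running     = 3;     /* 2 CFS tasks + 1 RT task       */
	unsigned long cfs_h_nr_running  = 2;     /* CFS tasks only                */

	printf("old avg load per task: %lu\n", runnable_load_avg / rq_nr_running);
	printf("new avg load per task: %lu\n", runnable_load_avg / cfs_h_nr_running);
	return 0;
}
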
@@ -4276,8 +4281,8 @@ static int wake_wide(struct task_struct *p)
4276static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 4281static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4277{ 4282{
4278 s64 this_load, load; 4283 s64 this_load, load;
4284 s64 this_eff_load, prev_eff_load;
4279 int idx, this_cpu, prev_cpu; 4285 int idx, this_cpu, prev_cpu;
4280 unsigned long tl_per_task;
4281 struct task_group *tg; 4286 struct task_group *tg;
4282 unsigned long weight; 4287 unsigned long weight;
4283 int balanced; 4288 int balanced;
@@ -4320,47 +4325,30 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4320 * Otherwise check if either cpus are near enough in load to allow this 4325 * Otherwise check if either cpus are near enough in load to allow this
4321 * task to be woken on this_cpu. 4326 * task to be woken on this_cpu.
4322 */ 4327 */
4323 if (this_load > 0) { 4328 this_eff_load = 100;
4324 s64 this_eff_load, prev_eff_load; 4329 this_eff_load *= capacity_of(prev_cpu);
4325 4330
4326 this_eff_load = 100; 4331 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4327 this_eff_load *= capacity_of(prev_cpu); 4332 prev_eff_load *= capacity_of(this_cpu);
4333
4334 if (this_load > 0) {
4328 this_eff_load *= this_load + 4335 this_eff_load *= this_load +
4329 effective_load(tg, this_cpu, weight, weight); 4336 effective_load(tg, this_cpu, weight, weight);
4330 4337
4331 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4332 prev_eff_load *= capacity_of(this_cpu);
4333 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); 4338 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4339 }
4334 4340
4335 balanced = this_eff_load <= prev_eff_load; 4341 balanced = this_eff_load <= prev_eff_load;
4336 } else
4337 balanced = true;
4338
4339 /*
4340 * If the currently running task will sleep within
4341 * a reasonable amount of time then attract this newly
4342 * woken task:
4343 */
4344 if (sync && balanced)
4345 return 1;
4346 4342
4347 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); 4343 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
4348 tl_per_task = cpu_avg_load_per_task(this_cpu);
4349 4344
4350 if (balanced || 4345 if (!balanced)
4351 (this_load <= load && 4346 return 0;
4352 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
4353 /*
4354 * This domain has SD_WAKE_AFFINE and
4355 * p is cache cold in this domain, and
4356 * there is no bad imbalance.
4357 */
4358 schedstat_inc(sd, ttwu_move_affine);
4359 schedstat_inc(p, se.statistics.nr_wakeups_affine);
4360 4347
4361 return 1; 4348 schedstat_inc(sd, ttwu_move_affine);
4362 } 4349 schedstat_inc(p, se.statistics.nr_wakeups_affine);
4363 return 0; 4350
4351 return 1;
4364} 4352}
4365 4353
4366/* 4354/*
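
The rewritten wake_affine() always builds this_eff_load and prev_eff_load and keeps a single balanced test; a sketch of that comparison for the root task group, where effective_load() reduces to the waking task's weight on this_cpu and zero on prev_cpu (all inputs are invented):

/* Shape of the rewritten wake_affine() comparison; all inputs are invented. */
#include <stdio.h>

int main(void)
{
	long long this_load = 512, load = 1024;     /* current loads (assumed)   */
	long long weight = 1024;                    /* waking task's load weight */
	long long cap_this = 1024, cap_prev = 1024; /* capacity_of() (assumed)   */
	int imbalance_pct = 125;                    /* typical sd->imbalance_pct */
	long long this_eff_load, prev_eff_load;
	int balanced;

	this_eff_load = 100 * cap_prev;
	prev_eff_load = (100 + (imbalance_pct - 100) / 2) * cap_this;

	if (this_load > 0) {
		/* Root task group: effective_load() folds into weight / zero. */
		this_eff_load *= this_load + weight;
		prev_eff_load *= load;
	}

	balanced = this_eff_load <= prev_eff_load;
	printf("this_eff_load=%lld prev_eff_load=%lld -> %s affine wakeup\n",
	       this_eff_load, prev_eff_load, balanced ? "allow" : "reject");
	return 0;
}
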
@@ -4428,20 +4416,46 @@ static int
4428find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 4416find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4429{ 4417{
4430 unsigned long load, min_load = ULONG_MAX; 4418 unsigned long load, min_load = ULONG_MAX;
4431 int idlest = -1; 4419 unsigned int min_exit_latency = UINT_MAX;
4420 u64 latest_idle_timestamp = 0;
4421 int least_loaded_cpu = this_cpu;
4422 int shallowest_idle_cpu = -1;
4432 int i; 4423 int i;
4433 4424
4434 /* Traverse only the allowed CPUs */ 4425 /* Traverse only the allowed CPUs */
4435 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { 4426 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
4436 load = weighted_cpuload(i); 4427 if (idle_cpu(i)) {
4437 4428 struct rq *rq = cpu_rq(i);
4438 if (load < min_load || (load == min_load && i == this_cpu)) { 4429 struct cpuidle_state *idle = idle_get_state(rq);
4439 min_load = load; 4430 if (idle && idle->exit_latency < min_exit_latency) {
4440 idlest = i; 4431 /*
4432 * We give priority to a CPU whose idle state
4433 * has the smallest exit latency irrespective
4434 * of any idle timestamp.
4435 */
4436 min_exit_latency = idle->exit_latency;
4437 latest_idle_timestamp = rq->idle_stamp;
4438 shallowest_idle_cpu = i;
4439 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
4440 rq->idle_stamp > latest_idle_timestamp) {
4441 /*
4442 * If equal or no active idle state, then
4443 * the most recently idled CPU might have
4444 * a warmer cache.
4445 */
4446 latest_idle_timestamp = rq->idle_stamp;
4447 shallowest_idle_cpu = i;
4448 }
4449 } else {
4450 load = weighted_cpuload(i);
4451 if (load < min_load || (load == min_load && i == this_cpu)) {
4452 min_load = load;
4453 least_loaded_cpu = i;
4454 }
4441 } 4455 }
4442 } 4456 }
4443 4457
4444 return idlest; 4458 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
4445} 4459}
4446 4460
4447/* 4461/*
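
A standalone model of the new find_idlest_cpu() ordering: prefer the idle CPU in the shallowest C-state, break ties by the most recent idle timestamp, and fall back to the least-loaded CPU when nothing is idle. The data is mock data, and the model is simplified to CPUs that always report an idle state:

/* Standalone model of the new find_idlest_cpu() ordering; data is invented. */
#include <stdio.h>
#include <limits.h>

struct cpu {
	int idle;                      /* is the CPU idle?               */
	unsigned int exit_latency;     /* cpuidle state exit latency, us */
	unsigned long long idle_stamp; /* when it went idle              */
	unsigned long load;            /* weighted_cpuload() if busy     */
};

int main(void)
{
	struct cpu cpus[] = {
		{ .idle = 0, .load = 900 },
		{ .idle = 1, .exit_latency = 200, .idle_stamp = 1000 },
		{ .idle = 1, .exit_latency =  10, .idle_stamp =  500 },
		{ .idle = 1, .exit_latency =  10, .idle_stamp =  800 },
	};
	unsigned int min_exit_latency = UINT_MAX;
	unsigned long long latest_idle_timestamp = 0;
	unsigned long min_load = (unsigned long)-1;
	int shallowest_idle_cpu = -1, least_loaded_cpu = 0;
	int i, n = sizeof(cpus) / sizeof(cpus[0]);

	for (i = 0; i < n; i++) {
		if (cpus[i].idle) {
			if (cpus[i].exit_latency < min_exit_latency) {
				min_exit_latency = cpus[i].exit_latency;
				latest_idle_timestamp = cpus[i].idle_stamp;
				shallowest_idle_cpu = i;
			} else if (cpus[i].exit_latency == min_exit_latency &&
				   cpus[i].idle_stamp > latest_idle_timestamp) {
				/* Same depth: prefer the most recently idled CPU. */
				latest_idle_timestamp = cpus[i].idle_stamp;
				shallowest_idle_cpu = i;
			}
		} else if (cpus[i].load < min_load) {
			min_load = cpus[i].load;
			least_loaded_cpu = i;
		}
	}

	/* Picks cpu 3: shallowest state, most recently idle among the ties. */
	printf("picked cpu %d\n",
	       shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu);
	return 0;
}
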
@@ -4513,11 +4527,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4513 if (p->nr_cpus_allowed == 1) 4527 if (p->nr_cpus_allowed == 1)
4514 return prev_cpu; 4528 return prev_cpu;
4515 4529
4516 if (sd_flag & SD_BALANCE_WAKE) { 4530 if (sd_flag & SD_BALANCE_WAKE)
4517 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) 4531 want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
4518 want_affine = 1;
4519 new_cpu = prev_cpu;
4520 }
4521 4532
4522 rcu_read_lock(); 4533 rcu_read_lock();
4523 for_each_domain(cpu, tmp) { 4534 for_each_domain(cpu, tmp) {
@@ -4704,7 +4715,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
4704 return; 4715 return;
4705 4716
4706 /* 4717 /*
4707 * This is possible from callers such as move_task(), in which we 4718 * This is possible from callers such as attach_tasks(), in which we
4708 * unconditionally check_prempt_curr() after an enqueue (which may have 4719 * unconditionally check_prempt_curr() after an enqueue (which may have
4709 * lead to a throttle). This both saves work and prevents false 4720 * lead to a throttle). This both saves work and prevents false
4710 * next-buddy nomination below. 4721 * next-buddy nomination below.
@@ -5112,27 +5123,18 @@ struct lb_env {
5112 unsigned int loop_max; 5123 unsigned int loop_max;
5113 5124
5114 enum fbq_type fbq_type; 5125 enum fbq_type fbq_type;
5126 struct list_head tasks;
5115}; 5127};
5116 5128
5117/* 5129/*
5118 * move_task - move a task from one runqueue to another runqueue.
5119 * Both runqueues must be locked.
5120 */
5121static void move_task(struct task_struct *p, struct lb_env *env)
5122{
5123 deactivate_task(env->src_rq, p, 0);
5124 set_task_cpu(p, env->dst_cpu);
5125 activate_task(env->dst_rq, p, 0);
5126 check_preempt_curr(env->dst_rq, p, 0);
5127}
5128
5129/*
5130 * Is this task likely cache-hot: 5130 * Is this task likely cache-hot:
5131 */ 5131 */
5132static int task_hot(struct task_struct *p, struct lb_env *env) 5132static int task_hot(struct task_struct *p, struct lb_env *env)
5133{ 5133{
5134 s64 delta; 5134 s64 delta;
5135 5135
5136 lockdep_assert_held(&env->src_rq->lock);
5137
5136 if (p->sched_class != &fair_sched_class) 5138 if (p->sched_class != &fair_sched_class)
5137 return 0; 5139 return 0;
5138 5140
@@ -5252,6 +5254,9 @@ static
5252int can_migrate_task(struct task_struct *p, struct lb_env *env) 5254int can_migrate_task(struct task_struct *p, struct lb_env *env)
5253{ 5255{
5254 int tsk_cache_hot = 0; 5256 int tsk_cache_hot = 0;
5257
5258 lockdep_assert_held(&env->src_rq->lock);
5259
5255 /* 5260 /*
5256 * We do not migrate tasks that are: 5261 * We do not migrate tasks that are:
5257 * 1) throttled_lb_pair, or 5262 * 1) throttled_lb_pair, or
@@ -5310,24 +5315,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
5310 if (!tsk_cache_hot) 5315 if (!tsk_cache_hot)
5311 tsk_cache_hot = migrate_degrades_locality(p, env); 5316 tsk_cache_hot = migrate_degrades_locality(p, env);
5312 5317
5313 if (migrate_improves_locality(p, env)) { 5318 if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
5314#ifdef CONFIG_SCHEDSTATS 5319 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5315 if (tsk_cache_hot) { 5320 if (tsk_cache_hot) {
5316 schedstat_inc(env->sd, lb_hot_gained[env->idle]); 5321 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5317 schedstat_inc(p, se.statistics.nr_forced_migrations); 5322 schedstat_inc(p, se.statistics.nr_forced_migrations);
5318 } 5323 }
5319#endif
5320 return 1;
5321 }
5322
5323 if (!tsk_cache_hot ||
5324 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5325
5326 if (tsk_cache_hot) {
5327 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5328 schedstat_inc(p, se.statistics.nr_forced_migrations);
5329 }
5330
5331 return 1; 5324 return 1;
5332 } 5325 }
5333 5326
@@ -5336,47 +5329,63 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
5336} 5329}
5337 5330
5338/* 5331/*
5339 * move_one_task tries to move exactly one task from busiest to this_rq, as 5332 * detach_task() -- detach the task for the migration specified in env
5333 */
5334static void detach_task(struct task_struct *p, struct lb_env *env)
5335{
5336 lockdep_assert_held(&env->src_rq->lock);
5337
5338 deactivate_task(env->src_rq, p, 0);
5339 p->on_rq = TASK_ON_RQ_MIGRATING;
5340 set_task_cpu(p, env->dst_cpu);
5341}
5342
5343/*
5344 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
5340 * part of active balancing operations within "domain". 5345 * part of active balancing operations within "domain".
5341 * Returns 1 if successful and 0 otherwise.
5342 * 5346 *
5343 * Called with both runqueues locked. 5347 * Returns a task if successful and NULL otherwise.
5344 */ 5348 */
5345static int move_one_task(struct lb_env *env) 5349static struct task_struct *detach_one_task(struct lb_env *env)
5346{ 5350{
5347 struct task_struct *p, *n; 5351 struct task_struct *p, *n;
5348 5352
5353 lockdep_assert_held(&env->src_rq->lock);
5354
5349 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { 5355 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
5350 if (!can_migrate_task(p, env)) 5356 if (!can_migrate_task(p, env))
5351 continue; 5357 continue;
5352 5358
5353 move_task(p, env); 5359 detach_task(p, env);
5360
5354 /* 5361 /*
5355 * Right now, this is only the second place move_task() 5362 * Right now, this is only the second place where
5356 * is called, so we can safely collect move_task() 5363 * lb_gained[env->idle] is updated (other is detach_tasks)
5357 * stats here rather than inside move_task(). 5364 * so we can safely collect stats here rather than
5365 * inside detach_tasks().
5358 */ 5366 */
5359 schedstat_inc(env->sd, lb_gained[env->idle]); 5367 schedstat_inc(env->sd, lb_gained[env->idle]);
5360 return 1; 5368 return p;
5361 } 5369 }
5362 return 0; 5370 return NULL;
5363} 5371}
5364 5372
5365static const unsigned int sched_nr_migrate_break = 32; 5373static const unsigned int sched_nr_migrate_break = 32;
5366 5374
5367/* 5375/*
5368 * move_tasks tries to move up to imbalance weighted load from busiest to 5376 * detach_tasks() -- tries to detach up to imbalance weighted load from
5369 * this_rq, as part of a balancing operation within domain "sd". 5377 * busiest_rq, as part of a balancing operation within domain "sd".
5370 * Returns 1 if successful and 0 otherwise.
5371 * 5378 *
5372 * Called with both runqueues locked. 5379 * Returns number of detached tasks if successful and 0 otherwise.
5373 */ 5380 */
5374static int move_tasks(struct lb_env *env) 5381static int detach_tasks(struct lb_env *env)
5375{ 5382{
5376 struct list_head *tasks = &env->src_rq->cfs_tasks; 5383 struct list_head *tasks = &env->src_rq->cfs_tasks;
5377 struct task_struct *p; 5384 struct task_struct *p;
5378 unsigned long load; 5385 unsigned long load;
5379 int pulled = 0; 5386 int detached = 0;
5387
5388 lockdep_assert_held(&env->src_rq->lock);
5380 5389
5381 if (env->imbalance <= 0) 5390 if (env->imbalance <= 0)
5382 return 0; 5391 return 0;
@@ -5407,14 +5416,16 @@ static int move_tasks(struct lb_env *env)
5407 if ((load / 2) > env->imbalance) 5416 if ((load / 2) > env->imbalance)
5408 goto next; 5417 goto next;
5409 5418
5410 move_task(p, env); 5419 detach_task(p, env);
5411 pulled++; 5420 list_add(&p->se.group_node, &env->tasks);
5421
5422 detached++;
5412 env->imbalance -= load; 5423 env->imbalance -= load;
5413 5424
5414#ifdef CONFIG_PREEMPT 5425#ifdef CONFIG_PREEMPT
5415 /* 5426 /*
5416 * NEWIDLE balancing is a source of latency, so preemptible 5427 * NEWIDLE balancing is a source of latency, so preemptible
5417 * kernels will stop after the first task is pulled to minimize 5428 * kernels will stop after the first task is detached to minimize
5418 * the critical section. 5429 * the critical section.
5419 */ 5430 */
5420 if (env->idle == CPU_NEWLY_IDLE) 5431 if (env->idle == CPU_NEWLY_IDLE)
@@ -5434,13 +5445,58 @@ next:
5434 } 5445 }
5435 5446
5436 /* 5447 /*
5437 * Right now, this is one of only two places move_task() is called, 5448 * Right now, this is one of only two places we collect this stat
5438 * so we can safely collect move_task() stats here rather than 5449 * so we can safely collect detach_one_task() stats here rather
5439 * inside move_task(). 5450 * than inside detach_one_task().
5440 */ 5451 */
5441 schedstat_add(env->sd, lb_gained[env->idle], pulled); 5452 schedstat_add(env->sd, lb_gained[env->idle], detached);
5453
5454 return detached;
5455}
5456
5457/*
5458 * attach_task() -- attach the task detached by detach_task() to its new rq.
5459 */
5460static void attach_task(struct rq *rq, struct task_struct *p)
5461{
5462 lockdep_assert_held(&rq->lock);
5463
5464 BUG_ON(task_rq(p) != rq);
5465 p->on_rq = TASK_ON_RQ_QUEUED;
5466 activate_task(rq, p, 0);
5467 check_preempt_curr(rq, p, 0);
5468}
5469
5470/*
5471 * attach_one_task() -- attaches the task returned from detach_one_task() to
5472 * its new rq.
5473 */
5474static void attach_one_task(struct rq *rq, struct task_struct *p)
5475{
5476 raw_spin_lock(&rq->lock);
5477 attach_task(rq, p);
5478 raw_spin_unlock(&rq->lock);
5479}
5480
5481/*
5482 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
5483 * new rq.
5484 */
5485static void attach_tasks(struct lb_env *env)
5486{
5487 struct list_head *tasks = &env->tasks;
5488 struct task_struct *p;
5489
5490 raw_spin_lock(&env->dst_rq->lock);
5491
5492 while (!list_empty(tasks)) {
5493 p = list_first_entry(tasks, struct task_struct, se.group_node);
5494 list_del_init(&p->se.group_node);
5442 5495
5443 return pulled; 5496 attach_task(env->dst_rq, p);
5497 }
5498
5499 raw_spin_unlock(&env->dst_rq->lock);
5444} 5500}
5445 5501
5446#ifdef CONFIG_FAIR_GROUP_SCHED 5502#ifdef CONFIG_FAIR_GROUP_SCHED
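
move_task()/move_tasks() become a detach/attach pair so that the source and destination runqueue locks never have to be held together; a single-threaded model of the handoff through env->tasks, with mock types standing in for the real runqueues:

/* Single-threaded model of the detach_tasks()/attach_tasks() handoff. */
#include <stdio.h>

#define NTASKS 4

struct task { int id; long load; int on_rq; };
struct rq   { struct task *tasks[NTASKS]; int nr; };

static int detach_tasks(struct rq *src, struct task **buf, long imbalance)
{
	int detached = 0;

	while (src->nr > 0 && imbalance > 0) {
		struct task *p = src->tasks[--src->nr];  /* dequeue from src  */

		p->on_rq = 2;                            /* "migrating"       */
		buf[detached++] = p;                     /* park on env.tasks */
		imbalance -= p->load;
	}
	return detached;
}

static void attach_tasks(struct rq *dst, struct task **buf, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		buf[i]->on_rq = 1;                       /* "queued" again    */
		dst->tasks[dst->nr++] = buf[i];
	}
}

int main(void)
{
	struct task t[NTASKS] = { {1, 300, 1}, {2, 500, 1}, {3, 200, 1}, {4, 400, 1} };
	struct rq src = { { &t[0], &t[1], &t[2], &t[3] }, 4 };
	struct rq dst = { { NULL }, 0 };
	struct task *parked[NTASKS];
	int moved;

	moved = detach_tasks(&src, parked, 600);  /* under the src lock only */
	/* ... src lock dropped, dst lock taken ...                          */
	attach_tasks(&dst, parked, moved);        /* under the dst lock only */

	printf("moved %d tasks, src has %d, dst has %d\n", moved, src.nr, dst.nr);
	return 0;
}

In the real series the parked tasks carry TASK_ON_RQ_MIGRATING between the two locked sections, which is what makes dropping busiest->lock in load_balance() safe.
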
@@ -5559,6 +5615,13 @@ static unsigned long task_h_load(struct task_struct *p)
5559#endif 5615#endif
5560 5616
5561/********** Helpers for find_busiest_group ************************/ 5617/********** Helpers for find_busiest_group ************************/
5618
5619enum group_type {
5620 group_other = 0,
5621 group_imbalanced,
5622 group_overloaded,
5623};
5624
5562/* 5625/*
5563 * sg_lb_stats - stats of a sched_group required for load_balancing 5626 * sg_lb_stats - stats of a sched_group required for load_balancing
5564 */ 5627 */
@@ -5572,7 +5635,7 @@ struct sg_lb_stats {
5572 unsigned int group_capacity_factor; 5635 unsigned int group_capacity_factor;
5573 unsigned int idle_cpus; 5636 unsigned int idle_cpus;
5574 unsigned int group_weight; 5637 unsigned int group_weight;
5575 int group_imb; /* Is there an imbalance in the group ? */ 5638 enum group_type group_type;
5576 int group_has_free_capacity; 5639 int group_has_free_capacity;
5577#ifdef CONFIG_NUMA_BALANCING 5640#ifdef CONFIG_NUMA_BALANCING
5578 unsigned int nr_numa_running; 5641 unsigned int nr_numa_running;
@@ -5610,6 +5673,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
5610 .total_capacity = 0UL, 5673 .total_capacity = 0UL,
5611 .busiest_stat = { 5674 .busiest_stat = {
5612 .avg_load = 0UL, 5675 .avg_load = 0UL,
5676 .sum_nr_running = 0,
5677 .group_type = group_other,
5613 }, 5678 },
5614 }; 5679 };
5615} 5680}
@@ -5652,19 +5717,17 @@ unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
5652 return default_scale_capacity(sd, cpu); 5717 return default_scale_capacity(sd, cpu);
5653} 5718}
5654 5719
5655static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) 5720static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
5656{ 5721{
5657 unsigned long weight = sd->span_weight; 5722 if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
5658 unsigned long smt_gain = sd->smt_gain; 5723 return sd->smt_gain / sd->span_weight;
5659 5724
5660 smt_gain /= weight; 5725 return SCHED_CAPACITY_SCALE;
5661
5662 return smt_gain;
5663} 5726}
5664 5727
5665unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) 5728unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
5666{ 5729{
5667 return default_scale_smt_capacity(sd, cpu); 5730 return default_scale_cpu_capacity(sd, cpu);
5668} 5731}
5669 5732
5670static unsigned long scale_rt_capacity(int cpu) 5733static unsigned long scale_rt_capacity(int cpu)
@@ -5703,18 +5766,15 @@ static unsigned long scale_rt_capacity(int cpu)
5703 5766
5704static void update_cpu_capacity(struct sched_domain *sd, int cpu) 5767static void update_cpu_capacity(struct sched_domain *sd, int cpu)
5705{ 5768{
5706 unsigned long weight = sd->span_weight;
5707 unsigned long capacity = SCHED_CAPACITY_SCALE; 5769 unsigned long capacity = SCHED_CAPACITY_SCALE;
5708 struct sched_group *sdg = sd->groups; 5770 struct sched_group *sdg = sd->groups;
5709 5771
5710 if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { 5772 if (sched_feat(ARCH_CAPACITY))
5711 if (sched_feat(ARCH_CAPACITY)) 5773 capacity *= arch_scale_cpu_capacity(sd, cpu);
5712 capacity *= arch_scale_smt_capacity(sd, cpu); 5774 else
5713 else 5775 capacity *= default_scale_cpu_capacity(sd, cpu);
5714 capacity *= default_scale_smt_capacity(sd, cpu);
5715 5776
5716 capacity >>= SCHED_CAPACITY_SHIFT; 5777 capacity >>= SCHED_CAPACITY_SHIFT;
5717 }
5718 5778
5719 sdg->sgc->capacity_orig = capacity; 5779 sdg->sgc->capacity_orig = capacity;
5720 5780
@@ -5891,6 +5951,18 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
5891 return capacity_factor; 5951 return capacity_factor;
5892} 5952}
5893 5953
5954static enum group_type
5955group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
5956{
5957 if (sgs->sum_nr_running > sgs->group_capacity_factor)
5958 return group_overloaded;
5959
5960 if (sg_imbalanced(group))
5961 return group_imbalanced;
5962
5963 return group_other;
5964}
5965
5894/** 5966/**
5895 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 5967 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
5896 * @env: The load balancing environment. 5968 * @env: The load balancing environment.
@@ -5920,7 +5992,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5920 load = source_load(i, load_idx); 5992 load = source_load(i, load_idx);
5921 5993
5922 sgs->group_load += load; 5994 sgs->group_load += load;
5923 sgs->sum_nr_running += rq->nr_running; 5995 sgs->sum_nr_running += rq->cfs.h_nr_running;
5924 5996
5925 if (rq->nr_running > 1) 5997 if (rq->nr_running > 1)
5926 *overload = true; 5998 *overload = true;
@@ -5942,9 +6014,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5942 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 6014 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
5943 6015
5944 sgs->group_weight = group->group_weight; 6016 sgs->group_weight = group->group_weight;
5945
5946 sgs->group_imb = sg_imbalanced(group);
5947 sgs->group_capacity_factor = sg_capacity_factor(env, group); 6017 sgs->group_capacity_factor = sg_capacity_factor(env, group);
6018 sgs->group_type = group_classify(group, sgs);
5948 6019
5949 if (sgs->group_capacity_factor > sgs->sum_nr_running) 6020 if (sgs->group_capacity_factor > sgs->sum_nr_running)
5950 sgs->group_has_free_capacity = 1; 6021 sgs->group_has_free_capacity = 1;
@@ -5968,13 +6039,19 @@ static bool update_sd_pick_busiest(struct lb_env *env,
5968 struct sched_group *sg, 6039 struct sched_group *sg,
5969 struct sg_lb_stats *sgs) 6040 struct sg_lb_stats *sgs)
5970{ 6041{
5971 if (sgs->avg_load <= sds->busiest_stat.avg_load) 6042 struct sg_lb_stats *busiest = &sds->busiest_stat;
5972 return false;
5973 6043
5974 if (sgs->sum_nr_running > sgs->group_capacity_factor) 6044 if (sgs->group_type > busiest->group_type)
5975 return true; 6045 return true;
5976 6046
5977 if (sgs->group_imb) 6047 if (sgs->group_type < busiest->group_type)
6048 return false;
6049
6050 if (sgs->avg_load <= busiest->avg_load)
6051 return false;
6052
6053 /* This is the busiest node in its class. */
6054 if (!(env->sd->flags & SD_ASYM_PACKING))
5978 return true; 6055 return true;
5979 6056
5980 /* 6057 /*
@@ -5982,8 +6059,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
5982 * numbered CPUs in the group, therefore mark all groups 6059 * numbered CPUs in the group, therefore mark all groups
5983 * higher than ourself as busy. 6060 * higher than ourself as busy.
5984 */ 6061 */
5985 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && 6062 if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
5986 env->dst_cpu < group_first_cpu(sg)) {
5987 if (!sds->busiest) 6063 if (!sds->busiest)
5988 return true; 6064 return true;
5989 6065
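
group_imb and the capacity-overflow test collapse into an ordered enum, so update_sd_pick_busiest() can compare group types first and average load second; a toy version of that ordering with invented stats (the SD_ASYM_PACKING special case is left out):

/* Toy version of the reordered update_sd_pick_busiest() test; stats invented. */
#include <stdio.h>

enum group_type { group_other = 0, group_imbalanced, group_overloaded };

struct sg_stats { enum group_type type; unsigned long avg_load; };

static int pick_busiest(const struct sg_stats *sgs, const struct sg_stats *busiest)
{
	if (sgs->type > busiest->type)
		return 1;                              /* worse class always wins  */
	if (sgs->type < busiest->type)
		return 0;
	return sgs->avg_load > busiest->avg_load;      /* same class: compare load */
}

int main(void)
{
	struct sg_stats busiest = { group_overloaded, 900 };
	struct sg_stats lighter_but_imbalanced = { group_imbalanced, 2000 };
	struct sg_stats heavier_overloaded = { group_overloaded, 950 };

	printf("%d %d\n",
	       pick_busiest(&lighter_but_imbalanced, &busiest),  /* 0: lower class */
	       pick_busiest(&heavier_overloaded, &busiest));     /* 1: more load   */
	return 0;
}
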
@@ -6228,7 +6304,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6228 local = &sds->local_stat; 6304 local = &sds->local_stat;
6229 busiest = &sds->busiest_stat; 6305 busiest = &sds->busiest_stat;
6230 6306
6231 if (busiest->group_imb) { 6307 if (busiest->group_type == group_imbalanced) {
6232 /* 6308 /*
6233 * In the group_imb case we cannot rely on group-wide averages 6309 * In the group_imb case we cannot rely on group-wide averages
6234 * to ensure cpu-load equilibrium, look at wider averages. XXX 6310 * to ensure cpu-load equilibrium, look at wider averages. XXX
@@ -6248,12 +6324,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6248 return fix_small_imbalance(env, sds); 6324 return fix_small_imbalance(env, sds);
6249 } 6325 }
6250 6326
6251 if (!busiest->group_imb) { 6327 /*
6252 /* 6328 * If there aren't any idle cpus, avoid creating some.
6253 * Don't want to pull so many tasks that a group would go idle. 6329 */
6254 * Except of course for the group_imb case, since then we might 6330 if (busiest->group_type == group_overloaded &&
6255 * have to drop below capacity to reach cpu-load equilibrium. 6331 local->group_type == group_overloaded) {
6256 */
6257 load_above_capacity = 6332 load_above_capacity =
6258 (busiest->sum_nr_running - busiest->group_capacity_factor); 6333 (busiest->sum_nr_running - busiest->group_capacity_factor);
6259 6334
@@ -6337,7 +6412,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6337 * work because they assume all things are equal, which typically 6412 * work because they assume all things are equal, which typically
6338 * isn't true due to cpus_allowed constraints and the like. 6413 * isn't true due to cpus_allowed constraints and the like.
6339 */ 6414 */
6340 if (busiest->group_imb) 6415 if (busiest->group_type == group_imbalanced)
6341 goto force_balance; 6416 goto force_balance;
6342 6417
6343 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 6418 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
@@ -6346,7 +6421,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6346 goto force_balance; 6421 goto force_balance;
6347 6422
6348 /* 6423 /*
6349 * If the local group is more busy than the selected busiest group 6424 * If the local group is busier than the selected busiest group
6350 * don't try and pull any tasks. 6425 * don't try and pull any tasks.
6351 */ 6426 */
6352 if (local->avg_load >= busiest->avg_load) 6427 if (local->avg_load >= busiest->avg_load)
@@ -6361,13 +6436,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6361 6436
6362 if (env->idle == CPU_IDLE) { 6437 if (env->idle == CPU_IDLE) {
6363 /* 6438 /*
6364 * This cpu is idle. If the busiest group load doesn't 6439 * This cpu is idle. If the busiest group is not overloaded
6365 * have more tasks than the number of available cpu's and 6440 * and there is no imbalance between this and busiest group
6366 * there is no imbalance between this and busiest group 6441 * wrt idle cpus, it is balanced. The imbalance becomes
6367 * wrt to idle cpu's, it is balanced. 6442 * significant if the diff is greater than 1; otherwise we
 6443 * might end up just moving the imbalance to another group
6368 */ 6444 */
6369 if ((local->idle_cpus < busiest->idle_cpus) && 6445 if ((busiest->group_type != group_overloaded) &&
6370 busiest->sum_nr_running <= busiest->group_weight) 6446 (local->idle_cpus <= (busiest->idle_cpus + 1)))
6371 goto out_balanced; 6447 goto out_balanced;
6372 } else { 6448 } else {
6373 /* 6449 /*
@@ -6550,6 +6626,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
6550 .loop_break = sched_nr_migrate_break, 6626 .loop_break = sched_nr_migrate_break,
6551 .cpus = cpus, 6627 .cpus = cpus,
6552 .fbq_type = all, 6628 .fbq_type = all,
6629 .tasks = LIST_HEAD_INIT(env.tasks),
6553 }; 6630 };
6554 6631
6555 /* 6632 /*
@@ -6599,23 +6676,30 @@ redo:
6599 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 6676 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
6600 6677
6601more_balance: 6678more_balance:
6602 local_irq_save(flags); 6679 raw_spin_lock_irqsave(&busiest->lock, flags);
6603 double_rq_lock(env.dst_rq, busiest);
6604 6680
6605 /* 6681 /*
6606 * cur_ld_moved - load moved in current iteration 6682 * cur_ld_moved - load moved in current iteration
6607 * ld_moved - cumulative load moved across iterations 6683 * ld_moved - cumulative load moved across iterations
6608 */ 6684 */
6609 cur_ld_moved = move_tasks(&env); 6685 cur_ld_moved = detach_tasks(&env);
6610 ld_moved += cur_ld_moved;
6611 double_rq_unlock(env.dst_rq, busiest);
6612 local_irq_restore(flags);
6613 6686
6614 /* 6687 /*
6615 * some other cpu did the load balance for us. 6688 * We've detached some tasks from busiest_rq. Every
6689 * task is marked TASK_ON_RQ_MIGRATING, so we can safely
 6690 * unlock busiest->lock and be sure that nobody can
 6691 * manipulate the tasks in parallel.
6692 * See task_rq_lock() family for the details.
6616 */ 6693 */
6617 if (cur_ld_moved && env.dst_cpu != smp_processor_id()) 6694
6618 resched_cpu(env.dst_cpu); 6695 raw_spin_unlock(&busiest->lock);
6696
6697 if (cur_ld_moved) {
6698 attach_tasks(&env);
6699 ld_moved += cur_ld_moved;
6700 }
6701
6702 local_irq_restore(flags);
6619 6703
6620 if (env.flags & LBF_NEED_BREAK) { 6704 if (env.flags & LBF_NEED_BREAK) {
6621 env.flags &= ~LBF_NEED_BREAK; 6705 env.flags &= ~LBF_NEED_BREAK;
@@ -6665,10 +6749,8 @@ more_balance:
6665 if (sd_parent) { 6749 if (sd_parent) {
6666 int *group_imbalance = &sd_parent->groups->sgc->imbalance; 6750 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
6667 6751
6668 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 6752 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
6669 *group_imbalance = 1; 6753 *group_imbalance = 1;
6670 } else if (*group_imbalance)
6671 *group_imbalance = 0;
6672 } 6754 }
6673 6755
6674 /* All tasks on this runqueue were pinned by CPU affinity */ 6756 /* All tasks on this runqueue were pinned by CPU affinity */
@@ -6679,7 +6761,7 @@ more_balance:
6679 env.loop_break = sched_nr_migrate_break; 6761 env.loop_break = sched_nr_migrate_break;
6680 goto redo; 6762 goto redo;
6681 } 6763 }
6682 goto out_balanced; 6764 goto out_all_pinned;
6683 } 6765 }
6684 } 6766 }
6685 6767
@@ -6744,7 +6826,7 @@ more_balance:
6744 * If we've begun active balancing, start to back off. This 6826 * If we've begun active balancing, start to back off. This
6745 * case may not be covered by the all_pinned logic if there 6827 * case may not be covered by the all_pinned logic if there
6746 * is only 1 task on the busy runqueue (because we don't call 6828 * is only 1 task on the busy runqueue (because we don't call
6747 * move_tasks). 6829 * detach_tasks).
6748 */ 6830 */
6749 if (sd->balance_interval < sd->max_interval) 6831 if (sd->balance_interval < sd->max_interval)
6750 sd->balance_interval *= 2; 6832 sd->balance_interval *= 2;
@@ -6753,6 +6835,23 @@ more_balance:
6753 goto out; 6835 goto out;
6754 6836
6755out_balanced: 6837out_balanced:
6838 /*
6839 * We reach balance although we may have faced some affinity
6840 * constraints. Clear the imbalance flag if it was set.
6841 */
6842 if (sd_parent) {
6843 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
6844
6845 if (*group_imbalance)
6846 *group_imbalance = 0;
6847 }
6848
6849out_all_pinned:
6850 /*
6851 * We reach balance because all tasks are pinned at this level so
6852 * we can't migrate them. Let the imbalance flag set so parent level
6853 * can try to migrate them.
6854 */
6756 schedstat_inc(sd, lb_balanced[idle]); 6855 schedstat_inc(sd, lb_balanced[idle]);
6757 6856
6758 sd->nr_balance_failed = 0; 6857 sd->nr_balance_failed = 0;
@@ -6914,6 +7013,7 @@ static int active_load_balance_cpu_stop(void *data)
6914 int target_cpu = busiest_rq->push_cpu; 7013 int target_cpu = busiest_rq->push_cpu;
6915 struct rq *target_rq = cpu_rq(target_cpu); 7014 struct rq *target_rq = cpu_rq(target_cpu);
6916 struct sched_domain *sd; 7015 struct sched_domain *sd;
7016 struct task_struct *p = NULL;
6917 7017
6918 raw_spin_lock_irq(&busiest_rq->lock); 7018 raw_spin_lock_irq(&busiest_rq->lock);
6919 7019
@@ -6933,9 +7033,6 @@ static int active_load_balance_cpu_stop(void *data)
6933 */ 7033 */
6934 BUG_ON(busiest_rq == target_rq); 7034 BUG_ON(busiest_rq == target_rq);
6935 7035
6936 /* move a task from busiest_rq to target_rq */
6937 double_lock_balance(busiest_rq, target_rq);
6938
6939 /* Search for an sd spanning us and the target CPU. */ 7036 /* Search for an sd spanning us and the target CPU. */
6940 rcu_read_lock(); 7037 rcu_read_lock();
6941 for_each_domain(target_cpu, sd) { 7038 for_each_domain(target_cpu, sd) {
@@ -6956,16 +7053,22 @@ static int active_load_balance_cpu_stop(void *data)
6956 7053
6957 schedstat_inc(sd, alb_count); 7054 schedstat_inc(sd, alb_count);
6958 7055
6959 if (move_one_task(&env)) 7056 p = detach_one_task(&env);
7057 if (p)
6960 schedstat_inc(sd, alb_pushed); 7058 schedstat_inc(sd, alb_pushed);
6961 else 7059 else
6962 schedstat_inc(sd, alb_failed); 7060 schedstat_inc(sd, alb_failed);
6963 } 7061 }
6964 rcu_read_unlock(); 7062 rcu_read_unlock();
6965 double_unlock_balance(busiest_rq, target_rq);
6966out_unlock: 7063out_unlock:
6967 busiest_rq->active_balance = 0; 7064 busiest_rq->active_balance = 0;
6968 raw_spin_unlock_irq(&busiest_rq->lock); 7065 raw_spin_unlock(&busiest_rq->lock);
7066
7067 if (p)
7068 attach_one_task(target_rq, p);
7069
7070 local_irq_enable();
7071
6969 return 0; 7072 return 0;
6970} 7073}
6971 7074
@@ -7465,7 +7568,7 @@ static void task_fork_fair(struct task_struct *p)
7465static void 7568static void
7466prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) 7569prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7467{ 7570{
7468 if (!p->se.on_rq) 7571 if (!task_on_rq_queued(p))
7469 return; 7572 return;
7470 7573
7471 /* 7574 /*
@@ -7490,11 +7593,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7490 * switched back to the fair class the enqueue_entity(.flags=0) will 7593 * switched back to the fair class the enqueue_entity(.flags=0) will
7491 * do the right thing. 7594 * do the right thing.
7492 * 7595 *
7493 * If it's on_rq, then the dequeue_entity(.flags=0) will already 7596 * If it's queued, then the dequeue_entity(.flags=0) will already
7494 * have normalized the vruntime, if it's !on_rq, then only when 7597 * have normalized the vruntime, if it's !queued, then only when
7495 * the task is sleeping will it still have non-normalized vruntime. 7598 * the task is sleeping will it still have non-normalized vruntime.
7496 */ 7599 */
7497 if (!p->on_rq && p->state != TASK_RUNNING) { 7600 if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
7498 /* 7601 /*
7499 * Fix up our vruntime so that the current sleep doesn't 7602 * Fix up our vruntime so that the current sleep doesn't
7500 * cause 'unlimited' sleep bonus. 7603 * cause 'unlimited' sleep bonus.
@@ -7521,15 +7624,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7521 */ 7624 */
7522static void switched_to_fair(struct rq *rq, struct task_struct *p) 7625static void switched_to_fair(struct rq *rq, struct task_struct *p)
7523{ 7626{
7524 struct sched_entity *se = &p->se;
7525#ifdef CONFIG_FAIR_GROUP_SCHED 7627#ifdef CONFIG_FAIR_GROUP_SCHED
7628 struct sched_entity *se = &p->se;
7526 /* 7629 /*
7527 * Since the real-depth could have been changed (only FAIR 7630 * Since the real-depth could have been changed (only FAIR
7528 * class maintain depth value), reset depth properly. 7631 * class maintain depth value), reset depth properly.
7529 */ 7632 */
7530 se->depth = se->parent ? se->parent->depth + 1 : 0; 7633 se->depth = se->parent ? se->parent->depth + 1 : 0;
7531#endif 7634#endif
7532 if (!se->on_rq) 7635 if (!task_on_rq_queued(p))
7533 return; 7636 return;
7534 7637
7535 /* 7638 /*
@@ -7575,7 +7678,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
7575} 7678}
7576 7679
7577#ifdef CONFIG_FAIR_GROUP_SCHED 7680#ifdef CONFIG_FAIR_GROUP_SCHED
7578static void task_move_group_fair(struct task_struct *p, int on_rq) 7681static void task_move_group_fair(struct task_struct *p, int queued)
7579{ 7682{
7580 struct sched_entity *se = &p->se; 7683 struct sched_entity *se = &p->se;
7581 struct cfs_rq *cfs_rq; 7684 struct cfs_rq *cfs_rq;
@@ -7594,7 +7697,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7594 * fair sleeper stuff for the first placement, but who cares. 7697 * fair sleeper stuff for the first placement, but who cares.
7595 */ 7698 */
7596 /* 7699 /*
7597 * When !on_rq, vruntime of the task has usually NOT been normalized. 7700 * When !queued, vruntime of the task has usually NOT been normalized.
7598 * But there are some cases where it has already been normalized: 7701 * But there are some cases where it has already been normalized:
7599 * 7702 *
7600 * - Moving a forked child which is waiting for being woken up by 7703 * - Moving a forked child which is waiting for being woken up by
@@ -7605,14 +7708,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7605 * To prevent boost or penalty in the new cfs_rq caused by delta 7708 * To prevent boost or penalty in the new cfs_rq caused by delta
7606 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 7709 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7607 */ 7710 */
7608 if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) 7711 if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
7609 on_rq = 1; 7712 queued = 1;
7610 7713
7611 if (!on_rq) 7714 if (!queued)
7612 se->vruntime -= cfs_rq_of(se)->min_vruntime; 7715 se->vruntime -= cfs_rq_of(se)->min_vruntime;
7613 set_task_rq(p, task_cpu(p)); 7716 set_task_rq(p, task_cpu(p));
7614 se->depth = se->parent ? se->parent->depth + 1 : 0; 7717 se->depth = se->parent ? se->parent->depth + 1 : 0;
7615 if (!on_rq) { 7718 if (!queued) {
7616 cfs_rq = cfs_rq_of(se); 7719 cfs_rq = cfs_rq_of(se);
7617 se->vruntime += cfs_rq->min_vruntime; 7720 se->vruntime += cfs_rq->min_vruntime;
7618#ifdef CONFIG_SMP 7721#ifdef CONFIG_SMP
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 11e7bc434f43..c47fce75e666 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -147,6 +147,9 @@ use_default:
147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) 147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
148 goto use_default; 148 goto use_default;
149 149
150 /* Take note of the planned idle state. */
151 idle_set_state(this_rq(), &drv->states[next_state]);
152
150 /* 153 /*
151 * Enter the idle state previously returned by the governor decision. 154 * Enter the idle state previously returned by the governor decision.
152 * This function will block until an interrupt occurs and will take 155 * This function will block until an interrupt occurs and will take
@@ -154,6 +157,9 @@ use_default:
154 */ 157 */
155 entered_state = cpuidle_enter(drv, dev, next_state); 158 entered_state = cpuidle_enter(drv, dev, next_state);
156 159
160 /* The cpu is no longer idle or about to enter idle. */
161 idle_set_state(this_rq(), NULL);
162
157 if (broadcast) 163 if (broadcast)
158 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); 164 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
159 165
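
rq->idle_state is only meaningful in the window between idle_set_state(rq, state) before cpuidle_enter() and idle_set_state(rq, NULL) after it; a trivial single-threaded model of that window (the fair-class find_idlest_cpu() above reads it under RCU):

/* Single-threaded model of the idle_state publish window; not kernel code. */
#include <stdio.h>
#include <stddef.h>

struct cpuidle_state { unsigned int exit_latency; };
struct rq { struct cpuidle_state *idle_state; };

static void idle_set_state(struct rq *rq, struct cpuidle_state *s)
{
	rq->idle_state = s;
}

static void report(const struct rq *rq)
{
	if (rq->idle_state)
		printf("cpu looks idle, exit latency %u us\n",
		       rq->idle_state->exit_latency);
	else
		printf("cpu not idle (or already woken)\n");
}

int main(void)
{
	struct cpuidle_state c1 = { .exit_latency = 10 };
	struct rq rq = { NULL };

	report(&rq);                 /* before entering idle: NULL           */
	idle_set_state(&rq, &c1);    /* published just before cpuidle_enter  */
	report(&rq);                 /* a load balancer would see C1 here    */
	idle_set_state(&rq, NULL);   /* cleared as soon as the cpu wakes     */
	report(&rq);
	return 0;
}
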
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5f6edca4fafd..87ea5bf1b87f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1448,7 +1448,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1448 * means a dl or stop task can slip in, in which case we need 1448 * means a dl or stop task can slip in, in which case we need
1449 * to re-start task selection. 1449 * to re-start task selection.
1450 */ 1450 */
1451 if (unlikely((rq->stop && rq->stop->on_rq) || 1451 if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
1452 rq->dl.dl_nr_running)) 1452 rq->dl.dl_nr_running))
1453 return RETRY_TASK; 1453 return RETRY_TASK;
1454 } 1454 }
@@ -1468,8 +1468,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1468 p = _pick_next_task_rt(rq); 1468 p = _pick_next_task_rt(rq);
1469 1469
1470 /* The running task is never eligible for pushing */ 1470 /* The running task is never eligible for pushing */
1471 if (p) 1471 dequeue_pushable_task(rq, p);
1472 dequeue_pushable_task(rq, p);
1473 1472
1474 set_post_schedule(rq); 1473 set_post_schedule(rq);
1475 1474
@@ -1624,7 +1623,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1624 !cpumask_test_cpu(lowest_rq->cpu, 1623 !cpumask_test_cpu(lowest_rq->cpu,
1625 tsk_cpus_allowed(task)) || 1624 tsk_cpus_allowed(task)) ||
1626 task_running(rq, task) || 1625 task_running(rq, task) ||
1627 !task->on_rq)) { 1626 !task_on_rq_queued(task))) {
1628 1627
1629 double_unlock_balance(rq, lowest_rq); 1628 double_unlock_balance(rq, lowest_rq);
1630 lowest_rq = NULL; 1629 lowest_rq = NULL;
@@ -1658,7 +1657,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1658 BUG_ON(task_current(rq, p)); 1657 BUG_ON(task_current(rq, p));
1659 BUG_ON(p->nr_cpus_allowed <= 1); 1658 BUG_ON(p->nr_cpus_allowed <= 1);
1660 1659
1661 BUG_ON(!p->on_rq); 1660 BUG_ON(!task_on_rq_queued(p));
1662 BUG_ON(!rt_task(p)); 1661 BUG_ON(!rt_task(p));
1663 1662
1664 return p; 1663 return p;
@@ -1809,7 +1808,7 @@ static int pull_rt_task(struct rq *this_rq)
1809 */ 1808 */
1810 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 1809 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1811 WARN_ON(p == src_rq->curr); 1810 WARN_ON(p == src_rq->curr);
1812 WARN_ON(!p->on_rq); 1811 WARN_ON(!task_on_rq_queued(p));
1813 1812
1814 /* 1813 /*
1815 * There's a chance that p is higher in priority 1814 * There's a chance that p is higher in priority
@@ -1870,7 +1869,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1870 1869
1871 BUG_ON(!rt_task(p)); 1870 BUG_ON(!rt_task(p));
1872 1871
1873 if (!p->on_rq) 1872 if (!task_on_rq_queued(p))
1874 return; 1873 return;
1875 1874
1876 weight = cpumask_weight(new_mask); 1875 weight = cpumask_weight(new_mask);
@@ -1936,7 +1935,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1936 * we may need to handle the pulling of RT tasks 1935 * we may need to handle the pulling of RT tasks
1937 * now. 1936 * now.
1938 */ 1937 */
1939 if (!p->on_rq || rq->rt.rt_nr_running) 1938 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
1940 return; 1939 return;
1941 1940
1942 if (pull_rt_task(rq)) 1941 if (pull_rt_task(rq))
@@ -1970,7 +1969,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1970 * If that current running task is also an RT task 1969 * If that current running task is also an RT task
1971 * then see if we can move to another run queue. 1970 * then see if we can move to another run queue.
1972 */ 1971 */
1973 if (p->on_rq && rq->curr != p) { 1972 if (task_on_rq_queued(p) && rq->curr != p) {
1974#ifdef CONFIG_SMP 1973#ifdef CONFIG_SMP
1975 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && 1974 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
1976 /* Don't resched if we changed runqueues */ 1975 /* Don't resched if we changed runqueues */
@@ -1989,7 +1988,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1989static void 1988static void
1990prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 1989prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1991{ 1990{
1992 if (!p->on_rq) 1991 if (!task_on_rq_queued(p))
1993 return; 1992 return;
1994 1993
1995 if (rq->curr == p) { 1994 if (rq->curr == p) {
@@ -2073,7 +2072,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2073 for_each_sched_rt_entity(rt_se) { 2072 for_each_sched_rt_entity(rt_se) {
2074 if (rt_se->run_list.prev != rt_se->run_list.next) { 2073 if (rt_se->run_list.prev != rt_se->run_list.next) {
2075 requeue_task_rt(rq, p, 0); 2074 requeue_task_rt(rq, p, 0);
2076 set_tsk_need_resched(p); 2075 resched_curr(rq);
2077 return; 2076 return;
2078 } 2077 }
2079 } 2078 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 579712f4e9d5..6130251de280 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -14,6 +14,11 @@
14#include "cpuacct.h" 14#include "cpuacct.h"
15 15
16struct rq; 16struct rq;
17struct cpuidle_state;
18
19/* task_struct::on_rq states: */
20#define TASK_ON_RQ_QUEUED 1
21#define TASK_ON_RQ_MIGRATING 2
17 22
18extern __read_mostly int scheduler_running; 23extern __read_mostly int scheduler_running;
19 24
@@ -126,6 +131,9 @@ struct rt_bandwidth {
126 u64 rt_runtime; 131 u64 rt_runtime;
127 struct hrtimer rt_period_timer; 132 struct hrtimer rt_period_timer;
128}; 133};
134
135void __dl_clear_params(struct task_struct *p);
136
129/* 137/*
130 * To keep the bandwidth of -deadline tasks and groups under control 138 * To keep the bandwidth of -deadline tasks and groups under control
131 * we need some place where: 139 * we need some place where:
@@ -184,7 +192,7 @@ struct cfs_bandwidth {
184 raw_spinlock_t lock; 192 raw_spinlock_t lock;
185 ktime_t period; 193 ktime_t period;
186 u64 quota, runtime; 194 u64 quota, runtime;
187 s64 hierarchal_quota; 195 s64 hierarchical_quota;
188 u64 runtime_expires; 196 u64 runtime_expires;
189 197
190 int idle, timer_active; 198 int idle, timer_active;
@@ -636,6 +644,11 @@ struct rq {
636#ifdef CONFIG_SMP 644#ifdef CONFIG_SMP
637 struct llist_head wake_list; 645 struct llist_head wake_list;
638#endif 646#endif
647
648#ifdef CONFIG_CPU_IDLE
649 /* Must be inspected within a rcu lock section */
650 struct cpuidle_state *idle_state;
651#endif
639}; 652};
640 653
641static inline int cpu_of(struct rq *rq) 654static inline int cpu_of(struct rq *rq)
@@ -647,7 +660,7 @@ static inline int cpu_of(struct rq *rq)
647#endif 660#endif
648} 661}
649 662
650DECLARE_PER_CPU(struct rq, runqueues); 663DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
651 664
652#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 665#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
653#define this_rq() (&__get_cpu_var(runqueues)) 666#define this_rq() (&__get_cpu_var(runqueues))
@@ -942,6 +955,15 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
942#endif 955#endif
943} 956}
944 957
958static inline int task_on_rq_queued(struct task_struct *p)
959{
960 return p->on_rq == TASK_ON_RQ_QUEUED;
961}
962
963static inline int task_on_rq_migrating(struct task_struct *p)
964{
965 return p->on_rq == TASK_ON_RQ_MIGRATING;
966}
945 967
946#ifndef prepare_arch_switch 968#ifndef prepare_arch_switch
947# define prepare_arch_switch(next) do { } while (0) 969# define prepare_arch_switch(next) do { } while (0)
@@ -953,7 +975,6 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
953# define finish_arch_post_lock_switch() do { } while (0) 975# define finish_arch_post_lock_switch() do { } while (0)
954#endif 976#endif
955 977
956#ifndef __ARCH_WANT_UNLOCKED_CTXSW
957static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 978static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
958{ 979{
959#ifdef CONFIG_SMP 980#ifdef CONFIG_SMP
@@ -991,35 +1012,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
991 raw_spin_unlock_irq(&rq->lock); 1012 raw_spin_unlock_irq(&rq->lock);
992} 1013}
993 1014
994#else /* __ARCH_WANT_UNLOCKED_CTXSW */
995static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
996{
997#ifdef CONFIG_SMP
998 /*
999 * We can optimise this out completely for !SMP, because the
1000 * SMP rebalancing from interrupt is the only thing that cares
1001 * here.
1002 */
1003 next->on_cpu = 1;
1004#endif
1005 raw_spin_unlock(&rq->lock);
1006}
1007
1008static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1009{
1010#ifdef CONFIG_SMP
1011 /*
1012 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1013 * We must ensure this doesn't happen until the switch is completely
1014 * finished.
1015 */
1016 smp_wmb();
1017 prev->on_cpu = 0;
1018#endif
1019 local_irq_enable();
1020}
1021#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1022
1023/* 1015/*
1024 * wake flags 1016 * wake flags
1025 */ 1017 */
@@ -1180,6 +1172,30 @@ static inline void idle_exit_fair(struct rq *rq) { }
1180 1172
1181#endif 1173#endif
1182 1174
1175#ifdef CONFIG_CPU_IDLE
1176static inline void idle_set_state(struct rq *rq,
1177 struct cpuidle_state *idle_state)
1178{
1179 rq->idle_state = idle_state;
1180}
1181
1182static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1183{
1184 WARN_ON(!rcu_read_lock_held());
1185 return rq->idle_state;
1186}
1187#else
1188static inline void idle_set_state(struct rq *rq,
1189 struct cpuidle_state *idle_state)
1190{
1191}
1192
1193static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1194{
1195 return NULL;
1196}
1197#endif
1198
1183extern void sysrq_sched_debug_show(void); 1199extern void sysrq_sched_debug_show(void);
1184extern void sched_init_granularity(void); 1200extern void sched_init_granularity(void);
1185extern void update_max_interval(void); 1201extern void update_max_interval(void);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index bfe0edadbfbb..67426e529f59 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
28{ 28{
29 struct task_struct *stop = rq->stop; 29 struct task_struct *stop = rq->stop;
30 30
31 if (!stop || !stop->on_rq) 31 if (!stop || !task_on_rq_queued(stop))
32 return NULL; 32 return NULL;
33 33
34 put_prev_task(rq, prev); 34 put_prev_task(rq, prev);
diff --git a/kernel/smp.c b/kernel/smp.c
index aff8aa14f547..9e0d0b289118 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,7 @@
13#include <linux/gfp.h> 13#include <linux/gfp.h>
14#include <linux/smp.h> 14#include <linux/smp.h>
15#include <linux/cpu.h> 15#include <linux/cpu.h>
16#include <linux/sched.h>
16 17
17#include "smpboot.h" 18#include "smpboot.h"
18 19
@@ -699,3 +700,24 @@ void kick_all_cpus_sync(void)
699 smp_call_function(do_nothing, NULL, 1); 700 smp_call_function(do_nothing, NULL, 1);
700} 701}
701EXPORT_SYMBOL_GPL(kick_all_cpus_sync); 702EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
703
704/**
705 * wake_up_all_idle_cpus - break all cpus out of idle
706 * wake_up_all_idle_cpus try to break all cpus which is in idle state even
707 * including idle polling cpus, for non-idle cpus, we will do nothing
708 * for them.
709 */
710void wake_up_all_idle_cpus(void)
711{
712 int cpu;
713
714 preempt_disable();
715 for_each_online_cpu(cpu) {
716 if (cpu == smp_processor_id())
717 continue;
718
719 wake_up_if_idle(cpu);
720 }
721 preempt_enable();
722}
723EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
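
A userspace model of the new wake_up_all_idle_cpus() loop: visit every online CPU except the caller and kick only the idle ones. The idle[] array and CPU count are invented, and in the real code the idleness check lives inside wake_up_if_idle() while the preempt_disable()/preempt_enable() pair just keeps the caller from migrating mid-loop:

/* Userspace model of the wake_up_all_idle_cpus() loop; data is invented. */
#include <stdio.h>

#define NR_CPUS 4

int main(void)
{
	int idle[NR_CPUS] = { 0, 1, 1, 0 };  /* which cpus are currently idle */
	int self = 0;                        /* smp_processor_id() stand-in   */
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu == self)
			continue;
		if (idle[cpu])
			printf("kick cpu %d out of idle\n", cpu);
		else
			printf("cpu %d busy, nothing to do\n", cpu);
	}
	return 0;
}
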
diff --git a/kernel/sys.c b/kernel/sys.c
index dfce4debd138..1eaa2f0b0246 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -869,11 +869,9 @@ void do_sys_times(struct tms *tms)
869{ 869{
870 cputime_t tgutime, tgstime, cutime, cstime; 870 cputime_t tgutime, tgstime, cutime, cstime;
871 871
872 spin_lock_irq(&current->sighand->siglock);
873 thread_group_cputime_adjusted(current, &tgutime, &tgstime); 872 thread_group_cputime_adjusted(current, &tgutime, &tgstime);
874 cutime = current->signal->cutime; 873 cutime = current->signal->cutime;
875 cstime = current->signal->cstime; 874 cstime = current->signal->cstime;
876 spin_unlock_irq(&current->sighand->siglock);
877 tms->tms_utime = cputime_to_clock_t(tgutime); 875 tms->tms_utime = cputime_to_clock_t(tgutime);
878 tms->tms_stime = cputime_to_clock_t(tgstime); 876 tms->tms_stime = cputime_to_clock_t(tgstime);
879 tms->tms_cutime = cputime_to_clock_t(cutime); 877 tms->tms_cutime = cputime_to_clock_t(cutime);
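Dropping ->siglock around the thread-group cputime sample only works if thread_group_cputime_adjusted() now produces a consistent result on its own, presumably via dedicated synchronization such as a seqlock around the group statistics. As a reminder, the read side of that pattern looks like this; the lock placement and field pairing are assumptions, not taken from this hunk:

/* Generic seqlock read side: retry the sample if a writer interleaved. */
static void sketch_sample_pair(seqlock_t *lock, const u64 *a, const u64 *b,
                               u64 *out_a, u64 *out_b)
{
        unsigned int seq;

        do {
                seq = read_seqbegin(lock);
                *out_a = *a;
                *out_b = *b;
        } while (read_seqretry(lock, seq));
}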
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 1c2fe7de2842..ab370ffffd53 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1776,7 +1776,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1776 */ 1776 */
1777 if (!expires) { 1777 if (!expires) {
1778 schedule(); 1778 schedule();
1779 __set_current_state(TASK_RUNNING);
1780 return -EINTR; 1779 return -EINTR;
1781 } 1780 }
1782 1781
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 3b8946416a5f..492b986195d5 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
272 if (same_thread_group(tsk, current)) 272 if (same_thread_group(tsk, current))
273 err = cpu_clock_sample(which_clock, tsk, &rtn); 273 err = cpu_clock_sample(which_clock, tsk, &rtn);
274 } else { 274 } else {
275 unsigned long flags;
276 struct sighand_struct *sighand;
277
278 /*
279 * while_each_thread() is not yet entirely RCU safe,
280 * keep locking the group while sampling process
281 * clock for now.
282 */
283 sighand = lock_task_sighand(tsk, &flags);
284 if (!sighand)
285 return err;
286
287 if (tsk == current || thread_group_leader(tsk)) 275 if (tsk == current || thread_group_leader(tsk))
288 err = cpu_clock_sample_group(which_clock, tsk, &rtn); 276 err = cpu_clock_sample_group(which_clock, tsk, &rtn);
289
290 unlock_task_sighand(tsk, &flags);
291 } 277 }
292 278
293 if (!err) 279 if (!err)
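The comment deleted above said the sighand lock was only held because while_each_thread() was not yet RCU safe; removing the lock implies the group clock can now be sampled by walking the threads under RCU alone. A hedged sketch of that kind of summation in the callee, with for_each_thread() assumed rather than shown in this hunk:

/* Sketch: sum per-thread times under RCU instead of taking ->siglock. */
static void sketch_sum_group_times(struct task_struct *tsk,
                                   cputime_t *utime, cputime_t *stime)
{
        struct task_struct *t;

        *utime = *stime = 0;
        rcu_read_lock();
        for_each_thread(tsk, t) {
                *utime += t->utime;
                *stime += t->stime;
        }
        rcu_read_unlock();
}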
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 0434ff1b808e..3f9e328c30b5 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -205,7 +205,6 @@ static void ring_buffer_consumer(void)
205 break; 205 break;
206 206
207 schedule(); 207 schedule();
208 __set_current_state(TASK_RUNNING);
209 } 208 }
210 reader_finish = 0; 209 reader_finish = 0;
211 complete(&read_done); 210 complete(&read_done);
@@ -379,7 +378,6 @@ static int ring_buffer_consumer_thread(void *arg)
379 break; 378 break;
380 379
381 schedule(); 380 schedule();
382 __set_current_state(TASK_RUNNING);
383 } 381 }
384 __set_current_state(TASK_RUNNING); 382 __set_current_state(TASK_RUNNING);
385 383
@@ -407,7 +405,6 @@ static int ring_buffer_producer_thread(void *arg)
407 trace_printk("Sleeping for 10 secs\n"); 405 trace_printk("Sleeping for 10 secs\n");
408 set_current_state(TASK_INTERRUPTIBLE); 406 set_current_state(TASK_INTERRUPTIBLE);
409 schedule_timeout(HZ * SLEEP_TIME); 407 schedule_timeout(HZ * SLEEP_TIME);
410 __set_current_state(TASK_RUNNING);
411 } 408 }
412 409
413 if (kill_test) 410 if (kill_test)
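These removals, like the one in hrtimer.c above, rely on the fact that schedule() only returns once the task has been woken back to TASK_RUNNING, so resetting the state right after it is redundant. The reset kept after the loop is still needed, because the loop can also be left without sleeping. The canonical shape of such a wait loop, for reference (the stop condition is illustrative):

/* Canonical kthread wait loop: */
static int sketch_wait_loop(void *unused)
{
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (kthread_should_stop())      /* may break without sleeping */
                        break;
                schedule();     /* returns with us back in TASK_RUNNING */
        }
        /* Only the break-without-sleeping path above still needs this. */
        __set_current_state(TASK_RUNNING);
        return 0;
}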
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 8a4e5cb66a4c..16eddb308c33 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,7 +13,6 @@
13#include <linux/sysctl.h> 13#include <linux/sysctl.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/magic.h>
17 16
18#include <asm/setup.h> 17#include <asm/setup.h>
19 18
@@ -171,8 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack)
171 i++; 170 i++;
172 } 171 }
173 172
174 if ((current != &init_task && 173 if (task_stack_end_corrupted(current)) {
175 *(end_of_stack(current)) != STACK_END_MAGIC)) {
176 print_max_stack(); 174 print_max_stack();
177 BUG(); 175 BUG();
178 } 176 }
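The open-coded magic check, together with its special case for init_task (whose stack end was historically not poisoned), collapses into task_stack_end_corrupted(); dropping the special case suggests the init task's stack end is now poisoned like everyone else's. The helper is presumably little more than the old expression, along these lines (not quoted from this patch):

/* Presumed shape of the helper; end_of_stack() and STACK_END_MAGIC are the
 * same ones the old open-coded test used. */
#define task_stack_end_corrupted(task) \
        (*(end_of_stack(task)) != STACK_END_MAGIC)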
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 49d5fb754e88..e7ad58c5fbeb 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -824,6 +824,18 @@ config SCHEDSTATS
824 application, you can say N to avoid the very slight overhead 824 application, you can say N to avoid the very slight overhead
825 this adds. 825 this adds.
826 826
827config SCHED_STACK_END_CHECK
828 bool "Detect stack corruption on calls to schedule()"
829 depends on DEBUG_KERNEL
830 default n
831 help
832 This option checks for a stack overrun on calls to schedule().
833	  If the stack end location is found to have been overwritten, the
834	  kernel panics immediately, since the contents of the corrupted
835	  region can no longer be trusted. This prevents data corruption or
836	  a sporadic crash at a later stage, once the corrupted region would
837	  otherwise have been used. The runtime overhead introduced is minimal.
838
827config TIMER_STATS 839config TIMER_STATS
828 bool "Collect kernel timers statistics" 840 bool "Collect kernel timers statistics"
829 depends on DEBUG_KERNEL && PROC_FS 841 depends on DEBUG_KERNEL && PROC_FS
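The new SCHED_STACK_END_CHECK option only buys anything if some code actually performs the check on every schedule(); presumably the scheduler's existing debug path grows a guard of roughly this shape (a sketch, not quoted from this patch):

/* Somewhere in schedule()'s debug checks, gated by the new option: */
static inline void sketch_schedule_debug(struct task_struct *prev)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
        /* Trip a BUG as soon as the poisoned stack end has been overwritten. */
        BUG_ON(unlikely(task_stack_end_corrupted(prev)));
#endif
}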