55 files changed, 1075 insertions, 552 deletions
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt index 18adc92a6b3b..21461a0441c1 100644 --- a/Documentation/scheduler/sched-deadline.txt +++ b/Documentation/scheduler/sched-deadline.txt | |||
@@ -15,6 +15,8 @@ CONTENTS | |||
15 | 5. Tasks CPU affinity | 15 | 5. Tasks CPU affinity |
16 | 5.1 SCHED_DEADLINE and cpusets HOWTO | 16 | 5.1 SCHED_DEADLINE and cpusets HOWTO |
17 | 6. Future plans | 17 | 6. Future plans |
18 | A. Test suite | ||
19 | B. Minimal main() | ||
18 | 20 | ||
19 | 21 | ||
20 | 0. WARNING | 22 | 0. WARNING |
@@ -38,24 +40,25 @@ CONTENTS | |||
38 | ================== | 40 | ================== |
39 | 41 | ||
40 | SCHED_DEADLINE uses three parameters, named "runtime", "period", and | 42 | SCHED_DEADLINE uses three parameters, named "runtime", "period", and |
41 | "deadline" to schedule tasks. A SCHED_DEADLINE task is guaranteed to receive | 43 | "deadline", to schedule tasks. A SCHED_DEADLINE task should receive |
42 | "runtime" microseconds of execution time every "period" microseconds, and | 44 | "runtime" microseconds of execution time every "period" microseconds, and |
43 | these "runtime" microseconds are available within "deadline" microseconds | 45 | these "runtime" microseconds are available within "deadline" microseconds |
44 | from the beginning of the period. In order to implement this behaviour, | 46 | from the beginning of the period. In order to implement this behaviour, |
45 | every time the task wakes up, the scheduler computes a "scheduling deadline" | 47 | every time the task wakes up, the scheduler computes a "scheduling deadline" |
46 | consistent with the guarantee (using the CBS[2,3] algorithm). Tasks are then | 48 | consistent with the guarantee (using the CBS[2,3] algorithm). Tasks are then |
47 | scheduled using EDF[1] on these scheduling deadlines (the task with the | 49 | scheduled using EDF[1] on these scheduling deadlines (the task with the |
48 | smallest scheduling deadline is selected for execution). Notice that this | 50 | earliest scheduling deadline is selected for execution). Notice that the |
49 | guaranteed is respected if a proper "admission control" strategy (see Section | 51 | task actually receives "runtime" time units within "deadline" if a proper |
50 | "4. Bandwidth management") is used. | 52 | "admission control" strategy (see Section "4. Bandwidth management") is used |
53 | (clearly, if the system is overloaded this guarantee cannot be respected). | ||
51 | 54 | ||
52 | Summing up, the CBS[2,3] algorithm assigns scheduling deadlines to tasks so | 55 | Summing up, the CBS[2,3] algorithm assigns scheduling deadlines to tasks so |
53 | that each task runs for at most its runtime every period, avoiding any | 56 | that each task runs for at most its runtime every period, avoiding any |
54 | interference between different tasks (bandwidth isolation), while the EDF[1] | 57 | interference between different tasks (bandwidth isolation), while the EDF[1] |
55 | algorithm selects the task with the smallest scheduling deadline as the one | 58 | algorithm selects the task with the earliest scheduling deadline as the one |
56 | to be executed first. Thanks to this feature, also tasks that do not | 59 | to be executed next. Thanks to this feature, tasks that do not strictly comply |
57 | strictly comply with the "traditional" real-time task model (see Section 3) | 60 | with the "traditional" real-time task model (see Section 3) can effectively |
58 | can effectively use the new policy. | 61 | use the new policy. |
59 | 62 | ||
60 | In more detail, the CBS algorithm assigns scheduling deadlines to | 63 | In more detail, the CBS algorithm assigns scheduling deadlines to |
61 | tasks in the following way: | 64 | tasks in the following way: |
@@ -64,45 +67,45 @@ CONTENTS | |||
64 | "deadline", and "period" parameters; | 67 | "deadline", and "period" parameters; |
65 | 68 | ||
66 | - The state of the task is described by a "scheduling deadline", and | 69 | - The state of the task is described by a "scheduling deadline", and |
67 | a "current runtime". These two parameters are initially set to 0; | 70 | a "remaining runtime". These two parameters are initially set to 0; |
68 | 71 | ||
69 | - When a SCHED_DEADLINE task wakes up (becomes ready for execution), | 72 | - When a SCHED_DEADLINE task wakes up (becomes ready for execution), |
70 | the scheduler checks if | 73 | the scheduler checks if |
71 | 74 | ||
72 | current runtime runtime | 75 | remaining runtime runtime |
73 | ---------------------------------- > ---------------- | 76 | ---------------------------------- > --------- |
74 | scheduling deadline - current time period | 77 | scheduling deadline - current time period |
75 | 78 | ||
76 | then, if the scheduling deadline is smaller than the current time, or | 79 | then, if the scheduling deadline is smaller than the current time, or |
77 | this condition is verified, the scheduling deadline and the | 80 | this condition is verified, the scheduling deadline and the |
78 | current budget are re-initialised as | 81 | remaining runtime are re-initialised as |
79 | 82 | ||
80 | scheduling deadline = current time + deadline | 83 | scheduling deadline = current time + deadline |
81 | current runtime = runtime | 84 | remaining runtime = runtime |
82 | 85 | ||
83 | otherwise, the scheduling deadline and the current runtime are | 86 | otherwise, the scheduling deadline and the remaining runtime are |
84 | left unchanged; | 87 | left unchanged; |
85 | 88 | ||
86 | - When a SCHED_DEADLINE task executes for an amount of time t, its | 89 | - When a SCHED_DEADLINE task executes for an amount of time t, its |
87 | current runtime is decreased as | 90 | remaining runtime is decreased as |
88 | 91 | ||
89 | current runtime = current runtime - t | 92 | remaining runtime = remaining runtime - t |
90 | 93 | ||
91 | (technically, the runtime is decreased at every tick, or when the | 94 | (technically, the runtime is decreased at every tick, or when the |
92 | task is descheduled / preempted); | 95 | task is descheduled / preempted); |
93 | 96 | ||
94 | - When the current runtime becomes less than or equal to 0, the task is | 97 | - When the remaining runtime becomes less than or equal to 0, the task is |
95 | said to be "throttled" (also known as "depleted" in real-time literature) | 98 | said to be "throttled" (also known as "depleted" in real-time literature) |
96 | and cannot be scheduled until its scheduling deadline. The "replenishment | 99 | and cannot be scheduled until its scheduling deadline. The "replenishment |
97 | time" for this task (see next item) is set to be equal to the current | 100 | time" for this task (see next item) is set to be equal to the current |
98 | value of the scheduling deadline; | 101 | value of the scheduling deadline; |
99 | 102 | ||
100 | - When the current time is equal to the replenishment time of a | 103 | - When the current time is equal to the replenishment time of a |
101 | throttled task, the scheduling deadline and the current runtime are | 104 | throttled task, the scheduling deadline and the remaining runtime are |
102 | updated as | 105 | updated as |
103 | 106 | ||
104 | scheduling deadline = scheduling deadline + period | 107 | scheduling deadline = scheduling deadline + period |
105 | current runtime = current runtime + runtime | 108 | remaining runtime = remaining runtime + runtime |
106 | 109 | ||
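
The rules above can be condensed into a short C sketch. This is purely
illustrative (the structure and helper names below are invented for this
document and are not kernel code; the actual implementation lives in
kernel/sched/deadline.c):

  #include <stdint.h>

  /* Illustrative model of the CBS rules described above (not kernel code). */
  struct dl_params {
          int64_t runtime;        /* "runtime" parameter */
          int64_t deadline;       /* relative "deadline" parameter */
          int64_t period;         /* "period" parameter */
  };

  struct dl_state {
          int64_t remaining_runtime;
          int64_t scheduling_deadline;    /* absolute time */
  };

  /* Rule applied when the task wakes up. */
  static void cbs_wakeup(struct dl_state *st, const struct dl_params *p,
                         int64_t now)
  {
          /*
           * Reset the reservation if the old deadline is in the past, or if
           * remaining_runtime / (scheduling_deadline - now) > runtime / period
           * (compared by cross-multiplication to avoid divisions); otherwise
           * the scheduling deadline and remaining runtime are left unchanged.
           */
          if (st->scheduling_deadline < now ||
              st->remaining_runtime * p->period >
              p->runtime * (st->scheduling_deadline - now)) {
                  st->scheduling_deadline = now + p->deadline;
                  st->remaining_runtime = p->runtime;
          }
  }

  /* Rule applied when the replenishment time (the old deadline) is reached. */
  static void cbs_replenish(struct dl_state *st, const struct dl_params *p)
  {
          st->scheduling_deadline += p->period;
          st->remaining_runtime += p->runtime;
  }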
107 | 110 | ||
108 | 3. Scheduling Real-Time Tasks | 111 | 3. Scheduling Real-Time Tasks |
@@ -134,6 +137,50 @@ CONTENTS | |||
134 | A real-time task can be periodic with period P if r_{j+1} = r_j + P, or | 137 | A real-time task can be periodic with period P if r_{j+1} = r_j + P, or |
135 | sporadic with minimum inter-arrival time P if r_{j+1} >= r_j + P. Finally, | 138 | sporadic with minimum inter-arrival time P if r_{j+1} >= r_j + P. Finally, |
136 | d_j = r_j + D, where D is the task's relative deadline. | 139 | d_j = r_j + D, where D is the task's relative deadline. |
140 | The utilisation of a real-time task is defined as the ratio between its | ||
141 | WCET and its period (or minimum inter-arrival time), and represents | ||
142 | the fraction of CPU time needed to execute the task. | ||
143 | |||
144 | If the total utilisation sum_i(WCET_i/P_i) is larger than M (with M equal | ||
145 | to the number of CPUs), then the scheduler is unable to respect all the | ||
146 | deadlines. | ||
147 | Note that total utilisation is defined as the sum of the utilisations | ||
148 | WCET_i/P_i over all the real-time tasks in the system. When considering | ||
149 | multiple real-time tasks, the parameters of the i-th task are indicated | ||
150 | with the "_i" suffix. | ||
151 | Moreover, if the total utilisation is larger than M, then non-real-time | ||
152 | tasks risk being starved by real-time tasks. | ||
153 | If, instead, the total utilisation is smaller than M, then non-real-time | ||
154 | tasks will not be starved and the system might be able to respect all the | ||
155 | deadlines. | ||
156 | As a matter of fact, in this case it is possible to provide an upper bound | ||
157 | for tardiness (defined as the maximum between 0 and the difference | ||
158 | between the finishing time of a job and its absolute deadline). | ||
159 | More precisely, it can be proven that using a global EDF scheduler the | ||
160 | maximum tardiness of each task is smaller than or equal to | ||
161 | ((M − 1) · WCET_max − WCET_min)/(M − (M − 2) · U_max) + WCET_max | ||
162 | where WCET_max = max_i{WCET_i} is the maximum WCET, WCET_min=min_i{WCET_i} | ||
163 | is the minimum WCET, and U_max = max_i{WCET_i/P_i} is the maximum utilisation. | ||
164 | |||
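As a worked example of the bound above, the following small C program (a
sketch written for this document; the function name and the sample numbers
are arbitrary) computes the maximum tardiness for a hypothetical task set:

  #include <stdio.h>

  /*
   * Global-EDF tardiness bound from above:
   *   ((M - 1) * WCET_max - WCET_min) / (M - (M - 2) * U_max) + WCET_max
   * Valid when the total utilisation is smaller than M; all times are in
   * the same unit (microseconds here).
   */
  static double gedf_tardiness_bound(int m, double wcet_max, double wcet_min,
                                     double u_max)
  {
          return ((m - 1) * wcet_max - wcet_min) / (m - (m - 2) * u_max)
                 + wcet_max;
  }

  int main(void)
  {
          /* e.g. M = 4 CPUs, WCETs between 2000us and 10000us, U_max = 0.5 */
          printf("maximum tardiness: %.0f us\n",
                 gedf_tardiness_bound(4, 10000.0, 2000.0, 0.5));
          return 0;
  }
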
165 | If M=1 (uniprocessor system), or in case of partitioned scheduling (each | ||
166 | real-time task is statically assigned to one and only one CPU), it is | ||
167 | possible to formally check if all the deadlines are respected. | ||
168 | If D_i = P_i for all tasks, then EDF is able to respect all the deadlines | ||
169 | of all the tasks executing on a CPU if and only if the total utilisation | ||
170 | of the tasks running on such a CPU is smaller than or equal to 1. | ||
171 | If D_i != P_i for some task, then it is possible to define the density of | ||
172 | a task as WCET_i/min{D_i,P_i}, and EDF is able to respect all the deadlines | ||
173 | of all the tasks running on a CPU if the sum sum_i WCET_i/min{D_i,P_i} of | ||
174 | the densities of the tasks running on such a CPU is smaller than or equal | ||
175 | to 1 (notice that this condition is only sufficient, and not necessary). | ||
176 | |||
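The two per-CPU tests just described can be written down directly; the
following is a minimal sketch (the task structure and function names are
invented for this example):

  #include <stdbool.h>
  #include <stddef.h>

  struct rt_task {
          double wcet;            /* WCET_i */
          double deadline;        /* D_i (relative deadline) */
          double period;          /* P_i (period / min inter-arrival time) */
  };

  /* Exact test for D_i = P_i: schedulable iff sum_i WCET_i/P_i <= 1. */
  static bool edf_utilisation_test(const struct rt_task *t, size_t n)
  {
          double u = 0.0;

          for (size_t i = 0; i < n; i++)
                  u += t[i].wcet / t[i].period;
          return u <= 1.0;
  }

  /*
   * Sufficient (but not necessary) test for D_i != P_i:
   * sum_i WCET_i/min{D_i,P_i} <= 1.
   */
  static bool edf_density_test(const struct rt_task *t, size_t n)
  {
          double d = 0.0;

          for (size_t i = 0; i < n; i++) {
                  double m = t[i].deadline < t[i].period ?
                             t[i].deadline : t[i].period;
                  d += t[i].wcet / m;
          }
          return d <= 1.0;
  }
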
177 | On multiprocessor systems with global EDF scheduling (non partitioned | ||
178 | systems), a sufficient test for schedulability cannot be based on the | ||
179 | utilisations (it can be shown that task sets with utilisations slightly | ||
180 | larger than 1 can miss deadlines regardless of the number of CPUs M). | ||
181 | However, as previously stated, enforcing that the total utilisation is smaller | ||
182 | than M is enough to guarantee that non-real-time tasks are not starved and | ||
183 | that the tardiness of real-time tasks has an upper bound. | ||
137 | 184 | ||
138 | SCHED_DEADLINE can be used to schedule real-time tasks guaranteeing that | 185 | SCHED_DEADLINE can be used to schedule real-time tasks guaranteeing that |
139 | the jobs' deadlines of a task are respected. In order to do this, a task | 186 | the jobs' deadlines of a task are respected. In order to do this, a task |
@@ -147,6 +194,8 @@ CONTENTS | |||
147 | and the absolute deadlines (d_j) coincide, so a proper admission control | 194 | and the absolute deadlines (d_j) coincide, so a proper admission control |
148 | makes it possible to respect the jobs' absolute deadlines for this task (this is what is | 195 | makes it possible to respect the jobs' absolute deadlines for this task (this is what is |
149 | called "hard schedulability property" and is an extension of Lemma 1 of [2]). | 196 | called "hard schedulability property" and is an extension of Lemma 1 of [2]). |
197 | Notice that if runtime > deadline the admission control will surely reject | ||
198 | this task, as it is not possible to respect its temporal constraints. | ||
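For instance, a quick parameter check consistent with this observation could
look like the helper below. This is a sketch written for this document, not a
kernel API: the text above only states the runtime <= deadline part
explicitly, and the full ordering runtime <= deadline <= period is the natural
one for the parameters described in Section 2 (the kernel's real admission
test additionally accounts for the available CPU bandwidth).

  #include <stdbool.h>
  #include <stdint.h>

  /*
   * SCHED_DEADLINE parameters (in nanoseconds, as passed through sched_attr,
   * see Appendix B) are expected to satisfy runtime <= deadline <= period;
   * in particular, runtime > deadline is always rejected.
   */
  static bool dl_params_look_sane(uint64_t runtime, uint64_t deadline,
                                  uint64_t period)
  {
          return runtime <= deadline && deadline <= period;
  }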
150 | 199 | ||
151 | References: | 200 | References: |
152 | 1 - C. L. Liu and J. W. Layland. Scheduling algorithms for multiprogram- | 201 | 1 - C. L. Liu and J. W. Layland. Scheduling algorithms for multiprogram- |
@@ -156,46 +205,57 @@ CONTENTS | |||
156 | Real-Time Systems. Proceedings of the 19th IEEE Real-time Systems | 205 | Real-Time Systems. Proceedings of the 19th IEEE Real-time Systems |
157 | Symposium, 1998. http://retis.sssup.it/~giorgio/paps/1998/rtss98-cbs.pdf | 206 | Symposium, 1998. http://retis.sssup.it/~giorgio/paps/1998/rtss98-cbs.pdf |
158 | 3 - L. Abeni. Server Mechanisms for Multimedia Applications. ReTiS Lab | 207 | 3 - L. Abeni. Server Mechanisms for Multimedia Applications. ReTiS Lab |
159 | Technical Report. http://xoomer.virgilio.it/lucabe72/pubs/tr-98-01.ps | 208 | Technical Report. http://disi.unitn.it/~abeni/tr-98-01.pdf |
160 | 209 | ||
161 | 4. Bandwidth management | 210 | 4. Bandwidth management |
162 | ======================= | 211 | ======================= |
163 | 212 | ||
164 | In order for the -deadline scheduling to be effective and useful, it is | 213 | As previously mentioned, in order for -deadline scheduling to be |
165 | important to have some method to keep the allocation of the available CPU | 214 | effective and useful (that is, to be able to provide "runtime" time units |
166 | bandwidth to the tasks under control. | 215 | within "deadline"), it is important to have some method to keep the allocation |
167 | This is usually called "admission control" and if it is not performed at all, | 216 | of the available fractions of CPU time to the various tasks under control. |
217 | This is usually called "admission control" and if it is not performed, then | ||
168 | no guarantee can be given on the actual scheduling of the -deadline tasks. | 218 | no guarantee can be given on the actual scheduling of the -deadline tasks. |
169 | 219 | ||
170 | Since when RT-throttling has been introduced each task group has a bandwidth | 220 | As already stated in Section 3, a necessary condition for correctly |
171 | associated, calculated as a certain amount of runtime over a period. | 221 | scheduling a set of real-time tasks is that the total utilisation |
172 | Moreover, to make it possible to manipulate such bandwidth, readable/writable | 222 | is smaller than M. When talking about -deadline tasks, this requires that |
173 | controls have been added to both procfs (for system wide settings) and cgroupfs | 223 | the sum of the ratio between runtime and period for all tasks is smaller |
174 | (for per-group settings). | 224 | than M. Notice that the ratio runtime/period is equivalent to the utilisation |
175 | Therefore, the same interface is being used for controlling the bandwidth | 225 | of a "traditional" real-time task, and is also often referred to as |
176 | distrubution to -deadline tasks. | 226 | "bandwidth". |
177 | 227 | The interface used to control the CPU bandwidth that can be allocated | |
178 | However, more discussion is needed in order to figure out how we want to manage | 228 | to -deadline tasks is similar to the one already used for -rt |
179 | SCHED_DEADLINE bandwidth at the task group level. Therefore, SCHED_DEADLINE | 229 | tasks with real-time group scheduling (a.k.a. RT-throttling - see |
180 | uses (for now) a less sophisticated, but actually very sensible, mechanism to | 230 | Documentation/scheduler/sched-rt-group.txt), and is based on readable/ |
181 | ensure that a certain utilization cap is not overcome per each root_domain. | 231 | writable control files located in procfs (for system wide settings). |
182 | 232 | Notice that per-group settings (controlled through cgroupfs) are still not | |
183 | Another main difference between deadline bandwidth management and RT-throttling | 233 | defined for -deadline tasks, because more discussion is needed in order to |
234 | figure out how we want to manage SCHED_DEADLINE bandwidth at the task group | ||
235 | level. | ||
236 | |||
237 | A main difference between deadline bandwidth management and RT-throttling | ||
184 | is that -deadline tasks have bandwidth on their own (while -rt ones don't!), | 238 | is that -deadline tasks have bandwidth on their own (while -rt ones don't!), |
185 | and thus we don't need an higher level throttling mechanism to enforce the | 239 | and thus we don't need a higher level throttling mechanism to enforce the |
186 | desired bandwidth. | 240 | desired bandwidth. In other words, this means that interface parameters are |
241 | only used at admission control time (i.e., when the user calls | ||
242 | sched_setattr()). Scheduling is then performed considering actual tasks' | ||
243 | parameters, so that CPU bandwidth is allocated to SCHED_DEADLINE tasks | ||
244 | respecting their needs in terms of granularity. Therefore, using this simple | ||
245 | interface we can put a cap on total utilization of -deadline tasks (i.e., | ||
246 | \Sum (runtime_i / period_i) < global_dl_utilization_cap). | ||
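A minimal sketch of such an admission test follows (the fixed-point
representation and the names used here are assumptions made for illustration,
not the kernel's internal interface):

  #include <stdbool.h>
  #include <stdint.h>

  /* Bandwidths are kept as fixed-point ratios scaled by 2^20. */
  #define BW_SHIFT        20

  static uint64_t to_bw(uint64_t runtime, uint64_t period)
  {
          return (runtime << BW_SHIFT) / period;
  }

  /*
   * Admit a new -deadline task only if the total utilisation, including the
   * newcomer, stays below the global cap for the root_domain:
   *   \Sum (runtime_i / period_i) < global_dl_utilization_cap
   */
  static bool dl_admission_ok(uint64_t current_total_bw,
                              uint64_t new_runtime, uint64_t new_period,
                              uint64_t global_cap_bw)
  {
          return current_total_bw + to_bw(new_runtime, new_period)
                 < global_cap_bw;
  }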
187 | 247 | ||
188 | 4.1 System wide settings | 248 | 4.1 System wide settings |
189 | ------------------------ | 249 | ------------------------ |
190 | 250 | ||
191 | The system wide settings are configured under the /proc virtual file system. | 251 | The system wide settings are configured under the /proc virtual file system. |
192 | 252 | ||
193 | For now the -rt knobs are used for dl admission control and the -deadline | 253 | For now the -rt knobs are used for -deadline admission control and the |
194 | runtime is accounted against the -rt runtime. We realise that this isn't | 254 | -deadline runtime is accounted against the -rt runtime. We realise that this |
195 | entirely desirable; however, it is better to have a small interface for now, | 255 | isn't entirely desirable; however, it is better to have a small interface for |
196 | and be able to change it easily later. The ideal situation (see 5.) is to run | 256 | now, and be able to change it easily later. The ideal situation (see 5.) is to |
197 | -rt tasks from a -deadline server; in which case the -rt bandwidth is a direct | 257 | run -rt tasks from a -deadline server; in which case the -rt bandwidth is a |
198 | subset of dl_bw. | 258 | direct subset of dl_bw. |
199 | 259 | ||
200 | This means that, for a root_domain comprising M CPUs, -deadline tasks | 260 | This means that, for a root_domain comprising M CPUs, -deadline tasks |
201 | can be created while the sum of their bandwidths stays below: | 261 | can be created while the sum of their bandwidths stays below: |
@@ -231,8 +291,16 @@ CONTENTS | |||
231 | 950000. With rt_period equal to 1000000, by default, it means that -deadline | 291 | 950000. With rt_period equal to 1000000, by default, it means that -deadline |
232 | tasks can use at most 95%, multiplied by the number of CPUs that compose the | 292 | tasks can use at most 95%, multiplied by the number of CPUs that compose the |
233 | root_domain, for each root_domain. | 293 | root_domain, for each root_domain. |
294 | This means that non -deadline tasks will receive at least 5% of the CPU time, | ||
295 | and that -deadline tasks will receive their runtime with a guaranteed | ||
296 | worst-case delay with respect to the "deadline" parameter. If "deadline" = "period" | ||
297 | and the cpuset mechanism is used to implement partitioned scheduling (see | ||
298 | Section 5), then this simple setting of the bandwidth management is able to | ||
299 | deterministically guarantee that -deadline tasks will receive their runtime | ||
300 | within each period. | ||
234 | 301 | ||
235 | A -deadline task cannot fork. | 302 | Finally, notice that in order not to jeopardize the admission control a |
303 | -deadline task cannot fork. | ||
236 | 304 | ||
237 | 5. Tasks CPU affinity | 305 | 5. Tasks CPU affinity |
238 | ===================== | 306 | ===================== |
@@ -279,3 +347,179 @@ CONTENTS | |||
279 | throttling patches [https://lkml.org/lkml/2010/2/23/239] but we still are in | 347 | throttling patches [https://lkml.org/lkml/2010/2/23/239] but we still are in |
280 | the preliminary phases of the merge and we really seek feedback that would | 348 | the preliminary phases of the merge and we really seek feedback that would |
281 | help us decide on the direction it should take. | 349 | help us decide on the direction it should take. |
350 | |||
351 | Appendix A. Test suite | ||
352 | ====================== | ||
353 | |||
354 | The SCHED_DEADLINE policy can be easily tested using two applications that | ||
355 | are part of a wider Linux Scheduler validation suite. The suite is | ||
356 | available as a GitHub repository: https://github.com/scheduler-tools. | ||
357 | |||
358 | The first testing application is called rt-app and can be used to | ||
359 | start multiple threads with specific parameters. rt-app supports | ||
360 | SCHED_{OTHER,FIFO,RR,DEADLINE} scheduling policies and their related | ||
361 | parameters (e.g., niceness, priority, runtime/deadline/period). rt-app | ||
362 | is a valuable tool, as it can be used to synthetically recreate certain | ||
363 | workloads (maybe mimicking real use-cases) and evaluate how the scheduler | ||
364 | behaves under such workloads. In this way, results are easily reproducible. | ||
365 | rt-app is available at: https://github.com/scheduler-tools/rt-app. | ||
366 | |||
367 | Thread parameters can be specified from the command line, with something like | ||
368 | this: | ||
369 | |||
370 | # rt-app -t 100000:10000:d -t 150000:20000:f:10 -D5 | ||
371 | |||
372 | The above creates 2 threads. The first one, scheduled by SCHED_DEADLINE, | ||
373 | executes for 10ms every 100ms. The second one, scheduled at SCHED_FIFO | ||
374 | priority 10, executes for 20ms every 150ms. The test will run for a total | ||
375 | of 5 seconds. | ||
376 | |||
377 | More interestingly, configurations can be described with a json file that | ||
378 | can be passed as input to rt-app with something like this: | ||
379 | |||
380 | # rt-app my_config.json | ||
381 | |||
382 | The parameters that can be specified with the second method are a superset | ||
383 | of the command line options. Please refer to rt-app documentation for more | ||
384 | details (<rt-app-sources>/doc/*.json). | ||
385 | |||
386 | The second testing application is a modification of schedtool, called | ||
387 | schedtool-dl, which can be used to set up SCHED_DEADLINE parameters for a | ||
388 | certain pid/application. schedtool-dl is available at: | ||
389 | https://github.com/scheduler-tools/schedtool-dl.git. | ||
390 | |||
391 | The usage is straightforward: | ||
392 | |||
393 | # schedtool -E -t 10000000:100000000 -e ./my_cpuhog_app | ||
394 | |||
395 | With this, my_cpuhog_app is put to run inside a SCHED_DEADLINE reservation | ||
396 | of 10ms every 100ms (note that parameters are expressed in nanoseconds). | ||
397 | You can also use schedtool to create a reservation for an already running | ||
398 | application, given that you know its pid: | ||
399 | |||
400 | # schedtool -E -t 10000000:100000000 my_app_pid | ||
401 | |||
402 | Appendix B. Minimal main() | ||
403 | ========================== | ||
404 | |||
405 | We provide in what follows a simple (ugly) self-contained code snippet | ||
406 | showing how SCHED_DEADLINE reservations can be created by a real-time | ||
407 | application developer. | ||
408 | |||
409 | #define _GNU_SOURCE | ||
410 | #include <unistd.h> | ||
411 | #include <stdio.h> | ||
412 | #include <stdlib.h> | ||
413 | #include <string.h> | ||
414 | #include <time.h> | ||
415 | #include <linux/unistd.h> | ||
416 | #include <linux/kernel.h> | ||
417 | #include <linux/types.h> | ||
418 | #include <sys/syscall.h> | ||
419 | #include <pthread.h> | ||
420 | |||
421 | #define gettid() syscall(__NR_gettid) | ||
422 | |||
423 | #define SCHED_DEADLINE 6 | ||
424 | |||
425 | /* XXX use the proper syscall numbers */ | ||
426 | #ifdef __x86_64__ | ||
427 | #define __NR_sched_setattr 314 | ||
428 | #define __NR_sched_getattr 315 | ||
429 | #endif | ||
430 | |||
431 | #ifdef __i386__ | ||
432 | #define __NR_sched_setattr 351 | ||
433 | #define __NR_sched_getattr 352 | ||
434 | #endif | ||
435 | |||
436 | #ifdef __arm__ | ||
437 | #define __NR_sched_setattr 380 | ||
438 | #define __NR_sched_getattr 381 | ||
439 | #endif | ||
440 | |||
441 | static volatile int done; | ||
442 | |||
443 | struct sched_attr { | ||
444 | __u32 size; | ||
445 | |||
446 | __u32 sched_policy; | ||
447 | __u64 sched_flags; | ||
448 | |||
449 | /* SCHED_NORMAL, SCHED_BATCH */ | ||
450 | __s32 sched_nice; | ||
451 | |||
452 | /* SCHED_FIFO, SCHED_RR */ | ||
453 | __u32 sched_priority; | ||
454 | |||
455 | /* SCHED_DEADLINE (nsec) */ | ||
456 | __u64 sched_runtime; | ||
457 | __u64 sched_deadline; | ||
458 | __u64 sched_period; | ||
459 | }; | ||
460 | |||
461 | int sched_setattr(pid_t pid, | ||
462 | const struct sched_attr *attr, | ||
463 | unsigned int flags) | ||
464 | { | ||
465 | return syscall(__NR_sched_setattr, pid, attr, flags); | ||
466 | } | ||
467 | |||
468 | int sched_getattr(pid_t pid, | ||
469 | struct sched_attr *attr, | ||
470 | unsigned int size, | ||
471 | unsigned int flags) | ||
472 | { | ||
473 | return syscall(__NR_sched_getattr, pid, attr, size, flags); | ||
474 | } | ||
475 | |||
476 | void *run_deadline(void *data) | ||
477 | { | ||
478 | struct sched_attr attr; | ||
479 | int x = 0; | ||
480 | int ret; | ||
481 | unsigned int flags = 0; | ||
482 | |||
483 | printf("deadline thread started [%ld]\n", gettid()); | ||
484 | |||
485 | attr.size = sizeof(attr); | ||
486 | attr.sched_flags = 0; | ||
487 | attr.sched_nice = 0; | ||
488 | attr.sched_priority = 0; | ||
489 | |||
490 | /* This creates a 10ms/30ms reservation */ | ||
491 | attr.sched_policy = SCHED_DEADLINE; | ||
492 | attr.sched_runtime = 10 * 1000 * 1000; | ||
493 | attr.sched_period = attr.sched_deadline = 30 * 1000 * 1000; | ||
494 | |||
495 | ret = sched_setattr(0, &attr, flags); | ||
496 | if (ret < 0) { | ||
497 | done = 0; | ||
498 | perror("sched_setattr"); | ||
499 | exit(-1); | ||
500 | } | ||
501 | |||
502 | while (!done) { | ||
503 | x++; | ||
504 | } | ||
505 | |||
506 | printf("deadline thread dies [%ld]\n", gettid()); | ||
507 | return NULL; | ||
508 | } | ||
509 | |||
510 | int main (int argc, char **argv) | ||
511 | { | ||
512 | pthread_t thread; | ||
513 | |||
514 | printf("main thread [%ld]\n", gettid()); | ||
515 | |||
516 | pthread_create(&thread, NULL, run_deadline, NULL); | ||
517 | |||
518 | sleep(10); | ||
519 | |||
520 | done = 1; | ||
521 | pthread_join(thread, NULL); | ||
522 | |||
523 | printf("main dies [%ld]\n", gettid()); | ||
524 | return 0; | ||
525 | } | ||
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index e35d880f9773..89cfdd6e50cb 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c | |||
@@ -42,7 +42,7 @@ | |||
42 | */ | 42 | */ |
43 | static DEFINE_PER_CPU(unsigned long, cpu_scale); | 43 | static DEFINE_PER_CPU(unsigned long, cpu_scale); |
44 | 44 | ||
45 | unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | 45 | unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) |
46 | { | 46 | { |
47 | return per_cpu(cpu_scale, cpu); | 47 | return per_cpu(cpu_scale, cpu); |
48 | } | 48 | } |
@@ -166,7 +166,7 @@ static void update_cpu_capacity(unsigned int cpu) | |||
166 | set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity); | 166 | set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity); |
167 | 167 | ||
168 | printk(KERN_INFO "CPU%u: update cpu_capacity %lu\n", | 168 | printk(KERN_INFO "CPU%u: update cpu_capacity %lu\n", |
169 | cpu, arch_scale_freq_capacity(NULL, cpu)); | 169 | cpu, arch_scale_cpu_capacity(NULL, cpu)); |
170 | } | 170 | } |
171 | 171 | ||
172 | #else | 172 | #else |
diff --git a/arch/cris/arch-v10/drivers/sync_serial.c b/arch/cris/arch-v10/drivers/sync_serial.c index 29eb02ab3f25..0f3983241e60 100644 --- a/arch/cris/arch-v10/drivers/sync_serial.c +++ b/arch/cris/arch-v10/drivers/sync_serial.c | |||
@@ -1086,7 +1086,6 @@ static ssize_t sync_serial_write(struct file *file, const char *buf, | |||
1086 | } | 1086 | } |
1087 | local_irq_restore(flags); | 1087 | local_irq_restore(flags); |
1088 | schedule(); | 1088 | schedule(); |
1089 | set_current_state(TASK_RUNNING); | ||
1090 | remove_wait_queue(&port->out_wait_q, &wait); | 1089 | remove_wait_queue(&port->out_wait_q, &wait); |
1091 | if (signal_pending(current)) | 1090 | if (signal_pending(current)) |
1092 | return -EINTR; | 1091 | return -EINTR; |
diff --git a/arch/cris/arch-v32/drivers/sync_serial.c b/arch/cris/arch-v32/drivers/sync_serial.c index bbb806b68838..5a149134cfb5 100644 --- a/arch/cris/arch-v32/drivers/sync_serial.c +++ b/arch/cris/arch-v32/drivers/sync_serial.c | |||
@@ -1089,7 +1089,6 @@ static ssize_t sync_serial_write(struct file *file, const char *buf, | |||
1089 | } | 1089 | } |
1090 | 1090 | ||
1091 | schedule(); | 1091 | schedule(); |
1092 | set_current_state(TASK_RUNNING); | ||
1093 | remove_wait_queue(&port->out_wait_q, &wait); | 1092 | remove_wait_queue(&port->out_wait_q, &wait); |
1094 | 1093 | ||
1095 | if (signal_pending(current)) | 1094 | if (signal_pending(current)) |
diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index c7367130ab14..ce53c50d0ba4 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h | |||
@@ -19,7 +19,6 @@ | |||
19 | #include <asm/ptrace.h> | 19 | #include <asm/ptrace.h> |
20 | #include <asm/ustack.h> | 20 | #include <asm/ustack.h> |
21 | 21 | ||
22 | #define __ARCH_WANT_UNLOCKED_CTXSW | ||
23 | #define ARCH_HAS_PREFETCH_SWITCH_STACK | 22 | #define ARCH_HAS_PREFETCH_SWITCH_STACK |
24 | 23 | ||
25 | #define IA64_NUM_PHYS_STACK_REG 96 | 24 | #define IA64_NUM_PHYS_STACK_REG 96 |
diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h index 05f08438a7c4..f1df4cb4a286 100644 --- a/arch/mips/include/asm/processor.h +++ b/arch/mips/include/asm/processor.h | |||
@@ -397,12 +397,6 @@ unsigned long get_wchan(struct task_struct *p); | |||
397 | #define ARCH_HAS_PREFETCHW | 397 | #define ARCH_HAS_PREFETCHW |
398 | #define prefetchw(x) __builtin_prefetch((x), 1, 1) | 398 | #define prefetchw(x) __builtin_prefetch((x), 1, 1) |
399 | 399 | ||
400 | /* | ||
401 | * See Documentation/scheduler/sched-arch.txt; prevents deadlock on SMP | ||
402 | * systems. | ||
403 | */ | ||
404 | #define __ARCH_WANT_UNLOCKED_CTXSW | ||
405 | |||
406 | #endif | 400 | #endif |
407 | 401 | ||
408 | #endif /* _ASM_PROCESSOR_H */ | 402 | #endif /* _ASM_PROCESSOR_H */ |
diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 607559ab271f..6c840ceab820 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h | |||
@@ -32,6 +32,8 @@ static inline void setup_cputime_one_jiffy(void) { } | |||
32 | typedef u64 __nocast cputime_t; | 32 | typedef u64 __nocast cputime_t; |
33 | typedef u64 __nocast cputime64_t; | 33 | typedef u64 __nocast cputime64_t; |
34 | 34 | ||
35 | #define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new) | ||
36 | |||
35 | #ifdef __KERNEL__ | 37 | #ifdef __KERNEL__ |
36 | 38 | ||
37 | /* | 39 | /* |
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 24b3f4949df4..08d659a9fcdb 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c | |||
@@ -30,7 +30,6 @@ | |||
30 | #include <linux/kprobes.h> | 30 | #include <linux/kprobes.h> |
31 | #include <linux/kdebug.h> | 31 | #include <linux/kdebug.h> |
32 | #include <linux/perf_event.h> | 32 | #include <linux/perf_event.h> |
33 | #include <linux/magic.h> | ||
34 | #include <linux/ratelimit.h> | 33 | #include <linux/ratelimit.h> |
35 | #include <linux/context_tracking.h> | 34 | #include <linux/context_tracking.h> |
36 | #include <linux/hugetlb.h> | 35 | #include <linux/hugetlb.h> |
@@ -521,7 +520,6 @@ bail: | |||
521 | void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) | 520 | void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) |
522 | { | 521 | { |
523 | const struct exception_table_entry *entry; | 522 | const struct exception_table_entry *entry; |
524 | unsigned long *stackend; | ||
525 | 523 | ||
526 | /* Are we prepared to handle this fault? */ | 524 | /* Are we prepared to handle this fault? */ |
527 | if ((entry = search_exception_tables(regs->nip)) != NULL) { | 525 | if ((entry = search_exception_tables(regs->nip)) != NULL) { |
@@ -550,8 +548,7 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) | |||
550 | printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n", | 548 | printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n", |
551 | regs->nip); | 549 | regs->nip); |
552 | 550 | ||
553 | stackend = end_of_stack(current); | 551 | if (task_stack_end_corrupted(current)) |
554 | if (current != &init_task && *stackend != STACK_END_MAGIC) | ||
555 | printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); | 552 | printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); |
556 | 553 | ||
557 | die("Kernel access of bad area", regs, sig); | 554 | die("Kernel access of bad area", regs, sig); |
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index f65bd3634519..3001887f94b7 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h | |||
@@ -18,6 +18,8 @@ | |||
18 | typedef unsigned long long __nocast cputime_t; | 18 | typedef unsigned long long __nocast cputime_t; |
19 | typedef unsigned long long __nocast cputime64_t; | 19 | typedef unsigned long long __nocast cputime64_t; |
20 | 20 | ||
21 | #define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new) | ||
22 | |||
21 | static inline unsigned long __div(unsigned long long n, unsigned long base) | 23 | static inline unsigned long __div(unsigned long long n, unsigned long base) |
22 | { | 24 | { |
23 | #ifndef CONFIG_64BIT | 25 | #ifndef CONFIG_64BIT |
diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c index 9e3a72205827..dd16c902ff70 100644 --- a/arch/um/drivers/random.c +++ b/arch/um/drivers/random.c | |||
@@ -79,7 +79,6 @@ static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size, | |||
79 | set_task_state(current, TASK_INTERRUPTIBLE); | 79 | set_task_state(current, TASK_INTERRUPTIBLE); |
80 | 80 | ||
81 | schedule(); | 81 | schedule(); |
82 | set_task_state(current, TASK_RUNNING); | ||
83 | remove_wait_queue(&host_read_wait, &wait); | 82 | remove_wait_queue(&host_read_wait, &wait); |
84 | 83 | ||
85 | if (atomic_dec_and_test(&host_sleep_count)) { | 84 | if (atomic_dec_and_test(&host_sleep_count)) { |
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 42a2dca984b3..9b1c0f8f68e6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -296,11 +296,19 @@ void smp_store_cpu_info(int id) | |||
296 | } | 296 | } |
297 | 297 | ||
298 | static bool | 298 | static bool |
299 | topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) | ||
300 | { | ||
301 | int cpu1 = c->cpu_index, cpu2 = o->cpu_index; | ||
302 | |||
303 | return (cpu_to_node(cpu1) == cpu_to_node(cpu2)); | ||
304 | } | ||
305 | |||
306 | static bool | ||
299 | topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name) | 307 | topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name) |
300 | { | 308 | { |
301 | int cpu1 = c->cpu_index, cpu2 = o->cpu_index; | 309 | int cpu1 = c->cpu_index, cpu2 = o->cpu_index; |
302 | 310 | ||
303 | return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2), | 311 | return !WARN_ONCE(!topology_same_node(c, o), |
304 | "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! " | 312 | "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! " |
305 | "[node: %d != %d]. Ignoring dependency.\n", | 313 | "[node: %d != %d]. Ignoring dependency.\n", |
306 | cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); | 314 | cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); |
@@ -341,17 +349,44 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) | |||
341 | return false; | 349 | return false; |
342 | } | 350 | } |
343 | 351 | ||
344 | static bool match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) | 352 | /* |
353 | * Unlike the other levels, we do not enforce keeping a | ||
354 | * multicore group inside a NUMA node. If this happens, we will | ||
355 | * discard the MC level of the topology later. | ||
356 | */ | ||
357 | static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) | ||
345 | { | 358 | { |
346 | if (c->phys_proc_id == o->phys_proc_id) { | 359 | if (c->phys_proc_id == o->phys_proc_id) |
347 | if (cpu_has(c, X86_FEATURE_AMD_DCM)) | 360 | return true; |
348 | return true; | ||
349 | |||
350 | return topology_sane(c, o, "mc"); | ||
351 | } | ||
352 | return false; | 361 | return false; |
353 | } | 362 | } |
354 | 363 | ||
364 | static struct sched_domain_topology_level numa_inside_package_topology[] = { | ||
365 | #ifdef CONFIG_SCHED_SMT | ||
366 | { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, | ||
367 | #endif | ||
368 | #ifdef CONFIG_SCHED_MC | ||
369 | { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, | ||
370 | #endif | ||
371 | { NULL, }, | ||
372 | }; | ||
373 | /* | ||
374 | * set_sched_topology() sets the topology internal to a CPU. The | ||
375 | * NUMA topologies are layered on top of it to build the full | ||
376 | * system topology. | ||
377 | * | ||
378 | * If NUMA nodes are observed to occur within a CPU package, this | ||
379 | * function should be called. It forces the sched domain code to | ||
380 | * only use the SMT level for the CPU portion of the topology. | ||
381 | * This essentially falls back to relying on NUMA information | ||
382 | * from the SRAT table to describe the entire system topology | ||
383 | * (except for hyperthreads). | ||
384 | */ | ||
385 | static void primarily_use_numa_for_topology(void) | ||
386 | { | ||
387 | set_sched_topology(numa_inside_package_topology); | ||
388 | } | ||
389 | |||
355 | void set_cpu_sibling_map(int cpu) | 390 | void set_cpu_sibling_map(int cpu) |
356 | { | 391 | { |
357 | bool has_smt = smp_num_siblings > 1; | 392 | bool has_smt = smp_num_siblings > 1; |
@@ -388,7 +423,7 @@ void set_cpu_sibling_map(int cpu) | |||
388 | for_each_cpu(i, cpu_sibling_setup_mask) { | 423 | for_each_cpu(i, cpu_sibling_setup_mask) { |
389 | o = &cpu_data(i); | 424 | o = &cpu_data(i); |
390 | 425 | ||
391 | if ((i == cpu) || (has_mp && match_mc(c, o))) { | 426 | if ((i == cpu) || (has_mp && match_die(c, o))) { |
392 | link_mask(core, cpu, i); | 427 | link_mask(core, cpu, i); |
393 | 428 | ||
394 | /* | 429 | /* |
@@ -410,6 +445,8 @@ void set_cpu_sibling_map(int cpu) | |||
410 | } else if (i != cpu && !c->booted_cores) | 445 | } else if (i != cpu && !c->booted_cores) |
411 | c->booted_cores = cpu_data(i).booted_cores; | 446 | c->booted_cores = cpu_data(i).booted_cores; |
412 | } | 447 | } |
448 | if (match_die(c, o) && !topology_same_node(c, o)) | ||
449 | primarily_use_numa_for_topology(); | ||
413 | } | 450 | } |
414 | } | 451 | } |
415 | 452 | ||
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 83bb03bfa259..9c5b32e2bdc0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
@@ -3,7 +3,6 @@ | |||
3 | * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. | 3 | * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. |
4 | * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar | 4 | * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar |
5 | */ | 5 | */ |
6 | #include <linux/magic.h> /* STACK_END_MAGIC */ | ||
7 | #include <linux/sched.h> /* test_thread_flag(), ... */ | 6 | #include <linux/sched.h> /* test_thread_flag(), ... */ |
8 | #include <linux/kdebug.h> /* oops_begin/end, ... */ | 7 | #include <linux/kdebug.h> /* oops_begin/end, ... */ |
9 | #include <linux/module.h> /* search_exception_table */ | 8 | #include <linux/module.h> /* search_exception_table */ |
@@ -649,7 +648,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, | |||
649 | unsigned long address, int signal, int si_code) | 648 | unsigned long address, int signal, int si_code) |
650 | { | 649 | { |
651 | struct task_struct *tsk = current; | 650 | struct task_struct *tsk = current; |
652 | unsigned long *stackend; | ||
653 | unsigned long flags; | 651 | unsigned long flags; |
654 | int sig; | 652 | int sig; |
655 | 653 | ||
@@ -709,8 +707,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, | |||
709 | 707 | ||
710 | show_fault_oops(regs, error_code, address); | 708 | show_fault_oops(regs, error_code, address); |
711 | 709 | ||
712 | stackend = end_of_stack(tsk); | 710 | if (task_stack_end_corrupted(tsk)) |
713 | if (tsk != &init_task && *stackend != STACK_END_MAGIC) | ||
714 | printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); | 711 | printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); |
715 | 712 | ||
716 | tsk->thread.cr2 = address; | 713 | tsk->thread.cr2 = address; |
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index ee9df5e3f5eb..125150dc6e81 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c | |||
@@ -223,8 +223,14 @@ void cpuidle_uninstall_idle_handler(void) | |||
223 | { | 223 | { |
224 | if (enabled_devices) { | 224 | if (enabled_devices) { |
225 | initialized = 0; | 225 | initialized = 0; |
226 | kick_all_cpus_sync(); | 226 | wake_up_all_idle_cpus(); |
227 | } | 227 | } |
228 | |||
229 | /* | ||
230 | * Make sure external observers (such as the scheduler) | ||
231 | * are done looking at pointed idle states. | ||
232 | */ | ||
233 | synchronize_rcu(); | ||
228 | } | 234 | } |
229 | 235 | ||
230 | /** | 236 | /** |
@@ -530,11 +536,6 @@ EXPORT_SYMBOL_GPL(cpuidle_register); | |||
530 | 536 | ||
531 | #ifdef CONFIG_SMP | 537 | #ifdef CONFIG_SMP |
532 | 538 | ||
533 | static void smp_callback(void *v) | ||
534 | { | ||
535 | /* we already woke the CPU up, nothing more to do */ | ||
536 | } | ||
537 | |||
538 | /* | 539 | /* |
539 | * This function gets called when a part of the kernel has a new latency | 540 | * This function gets called when a part of the kernel has a new latency |
540 | * requirement. This means we need to get all processors out of their C-state, | 541 | * requirement. This means we need to get all processors out of their C-state, |
@@ -544,7 +545,7 @@ static void smp_callback(void *v) | |||
544 | static int cpuidle_latency_notify(struct notifier_block *b, | 545 | static int cpuidle_latency_notify(struct notifier_block *b, |
545 | unsigned long l, void *v) | 546 | unsigned long l, void *v) |
546 | { | 547 | { |
547 | smp_call_function(smp_callback, NULL, 1); | 548 | wake_up_all_idle_cpus(); |
548 | return NOTIFY_OK; | 549 | return NOTIFY_OK; |
549 | } | 550 | } |
550 | 551 | ||
diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c index 77711623b973..7bcbf863656e 100644 --- a/drivers/gpu/vga/vgaarb.c +++ b/drivers/gpu/vga/vgaarb.c | |||
@@ -400,7 +400,6 @@ int vga_get(struct pci_dev *pdev, unsigned int rsrc, int interruptible) | |||
400 | } | 400 | } |
401 | schedule(); | 401 | schedule(); |
402 | remove_wait_queue(&vga_wait_queue, &wait); | 402 | remove_wait_queue(&vga_wait_queue, &wait); |
403 | set_current_state(TASK_RUNNING); | ||
404 | } | 403 | } |
405 | return rc; | 404 | return rc; |
406 | } | 405 | } |
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index ab472c557d18..0505559f0965 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c | |||
@@ -720,7 +720,6 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c) | |||
720 | 720 | ||
721 | io_schedule(); | 721 | io_schedule(); |
722 | 722 | ||
723 | set_task_state(current, TASK_RUNNING); | ||
724 | remove_wait_queue(&c->free_buffer_wait, &wait); | 723 | remove_wait_queue(&c->free_buffer_wait, &wait); |
725 | 724 | ||
726 | dm_bufio_lock(c); | 725 | dm_bufio_lock(c); |
diff --git a/drivers/parisc/power.c b/drivers/parisc/power.c index 90cca5e3805f..ef31b77404ef 100644 --- a/drivers/parisc/power.c +++ b/drivers/parisc/power.c | |||
@@ -121,7 +121,6 @@ static int kpowerswd(void *param) | |||
121 | unsigned long soft_power_reg = (unsigned long) param; | 121 | unsigned long soft_power_reg = (unsigned long) param; |
122 | 122 | ||
123 | schedule_timeout_interruptible(pwrsw_enabled ? HZ : HZ/POWERSWITCH_POLL_PER_SEC); | 123 | schedule_timeout_interruptible(pwrsw_enabled ? HZ : HZ/POWERSWITCH_POLL_PER_SEC); |
124 | __set_current_state(TASK_RUNNING); | ||
125 | 124 | ||
126 | if (unlikely(!pwrsw_enabled)) | 125 | if (unlikely(!pwrsw_enabled)) |
127 | continue; | 126 | continue; |
diff --git a/drivers/s390/net/claw.c b/drivers/s390/net/claw.c index fbc6701bef30..213e54ee8a66 100644 --- a/drivers/s390/net/claw.c +++ b/drivers/s390/net/claw.c | |||
@@ -481,7 +481,6 @@ claw_open(struct net_device *dev) | |||
481 | spin_unlock_irqrestore( | 481 | spin_unlock_irqrestore( |
482 | get_ccwdev_lock(privptr->channel[i].cdev), saveflags); | 482 | get_ccwdev_lock(privptr->channel[i].cdev), saveflags); |
483 | schedule(); | 483 | schedule(); |
484 | set_current_state(TASK_RUNNING); | ||
485 | remove_wait_queue(&privptr->channel[i].wait, &wait); | 484 | remove_wait_queue(&privptr->channel[i].wait, &wait); |
486 | if(rc != 0) | 485 | if(rc != 0) |
487 | ccw_check_return_code(privptr->channel[i].cdev, rc); | 486 | ccw_check_return_code(privptr->channel[i].cdev, rc); |
@@ -828,7 +827,6 @@ claw_release(struct net_device *dev) | |||
828 | spin_unlock_irqrestore( | 827 | spin_unlock_irqrestore( |
829 | get_ccwdev_lock(privptr->channel[i].cdev), saveflags); | 828 | get_ccwdev_lock(privptr->channel[i].cdev), saveflags); |
830 | schedule(); | 829 | schedule(); |
831 | set_current_state(TASK_RUNNING); | ||
832 | remove_wait_queue(&privptr->channel[i].wait, &wait); | 830 | remove_wait_queue(&privptr->channel[i].wait, &wait); |
833 | if (rc != 0) { | 831 | if (rc != 0) { |
834 | ccw_check_return_code(privptr->channel[i].cdev, rc); | 832 | ccw_check_return_code(privptr->channel[i].cdev, rc); |
diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index 00ee0ed642aa..4a8ac7d8c76b 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c | |||
@@ -1884,7 +1884,6 @@ retry: | |||
1884 | set_current_state(TASK_INTERRUPTIBLE); | 1884 | set_current_state(TASK_INTERRUPTIBLE); |
1885 | spin_unlock_bh(&p->fcoe_rx_list.lock); | 1885 | spin_unlock_bh(&p->fcoe_rx_list.lock); |
1886 | schedule(); | 1886 | schedule(); |
1887 | set_current_state(TASK_RUNNING); | ||
1888 | goto retry; | 1887 | goto retry; |
1889 | } | 1888 | } |
1890 | 1889 | ||
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index dabd25429c58..db3dbd999cb6 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c | |||
@@ -4875,7 +4875,6 @@ qla2x00_do_dpc(void *data) | |||
4875 | "DPC handler sleeping.\n"); | 4875 | "DPC handler sleeping.\n"); |
4876 | 4876 | ||
4877 | schedule(); | 4877 | schedule(); |
4878 | __set_current_state(TASK_RUNNING); | ||
4879 | 4878 | ||
4880 | if (!base_vha->flags.init_done || ha->flags.mbox_busy) | 4879 | if (!base_vha->flags.init_done || ha->flags.mbox_busy) |
4881 | goto end_loop; | 4880 | goto end_loop; |
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c index 86f1a91e896f..14c9c8d18d02 100644 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c | |||
@@ -3215,7 +3215,6 @@ kiblnd_connd (void *arg) | |||
3215 | 3215 | ||
3216 | schedule_timeout(timeout); | 3216 | schedule_timeout(timeout); |
3217 | 3217 | ||
3218 | set_current_state(TASK_RUNNING); | ||
3219 | remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); | 3218 | remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); |
3220 | spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); | 3219 | spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); |
3221 | } | 3220 | } |
@@ -3432,7 +3431,6 @@ kiblnd_scheduler(void *arg) | |||
3432 | busy_loops = 0; | 3431 | busy_loops = 0; |
3433 | 3432 | ||
3434 | remove_wait_queue(&sched->ibs_waitq, &wait); | 3433 | remove_wait_queue(&sched->ibs_waitq, &wait); |
3435 | set_current_state(TASK_RUNNING); | ||
3436 | spin_lock_irqsave(&sched->ibs_lock, flags); | 3434 | spin_lock_irqsave(&sched->ibs_lock, flags); |
3437 | } | 3435 | } |
3438 | 3436 | ||
@@ -3507,7 +3505,6 @@ kiblnd_failover_thread(void *arg) | |||
3507 | 3505 | ||
3508 | rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) : | 3506 | rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) : |
3509 | cfs_time_seconds(1)); | 3507 | cfs_time_seconds(1)); |
3510 | set_current_state(TASK_RUNNING); | ||
3511 | remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); | 3508 | remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); |
3512 | write_lock_irqsave(glock, flags); | 3509 | write_lock_irqsave(glock, flags); |
3513 | 3510 | ||
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c index bcfee7c21942..d29f5f134b89 100644 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c | |||
@@ -2232,7 +2232,6 @@ ksocknal_connd (void *arg) | |||
2232 | nloops = 0; | 2232 | nloops = 0; |
2233 | schedule_timeout(timeout); | 2233 | schedule_timeout(timeout); |
2234 | 2234 | ||
2235 | set_current_state(TASK_RUNNING); | ||
2236 | remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait); | 2235 | remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait); |
2237 | spin_lock_bh(connd_lock); | 2236 | spin_lock_bh(connd_lock); |
2238 | } | 2237 | } |
diff --git a/drivers/staging/lustre/lustre/libcfs/fail.c b/drivers/staging/lustre/lustre/libcfs/fail.c index 1bf9c90b4789..e73ca3df9734 100644 --- a/drivers/staging/lustre/lustre/libcfs/fail.c +++ b/drivers/staging/lustre/lustre/libcfs/fail.c | |||
@@ -131,7 +131,6 @@ int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set) | |||
131 | id, ms); | 131 | id, ms); |
132 | set_current_state(TASK_UNINTERRUPTIBLE); | 132 | set_current_state(TASK_UNINTERRUPTIBLE); |
133 | schedule_timeout(cfs_time_seconds(ms) / 1000); | 133 | schedule_timeout(cfs_time_seconds(ms) / 1000); |
134 | set_current_state(TASK_RUNNING); | ||
135 | CERROR("cfs_fail_timeout id %x awake\n", id); | 134 | CERROR("cfs_fail_timeout id %x awake\n", id); |
136 | } | 135 | } |
137 | return ret; | 136 | return ret; |
diff --git a/drivers/tty/bfin_jtag_comm.c b/drivers/tty/bfin_jtag_comm.c index 8096fcbe2dc1..d7b198c400c7 100644 --- a/drivers/tty/bfin_jtag_comm.c +++ b/drivers/tty/bfin_jtag_comm.c | |||
@@ -77,7 +77,6 @@ bfin_jc_emudat_manager(void *arg) | |||
77 | pr_debug("waiting for readers\n"); | 77 | pr_debug("waiting for readers\n"); |
78 | __set_current_state(TASK_UNINTERRUPTIBLE); | 78 | __set_current_state(TASK_UNINTERRUPTIBLE); |
79 | schedule(); | 79 | schedule(); |
80 | __set_current_state(TASK_RUNNING); | ||
81 | continue; | 80 | continue; |
82 | } | 81 | } |
83 | 82 | ||
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index b6df2e83809f..52976785a32c 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c | |||
@@ -130,7 +130,6 @@ static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl, | |||
130 | /* second+ BUSY - sleep a little bit */ | 130 | /* second+ BUSY - sleep a little bit */ |
131 | set_current_state(TASK_UNINTERRUPTIBLE); | 131 | set_current_state(TASK_UNINTERRUPTIBLE); |
132 | schedule_timeout(1); | 132 | schedule_timeout(1); |
133 | __set_current_state(TASK_RUNNING); | ||
134 | } | 133 | } |
135 | continue; | 134 | continue; |
136 | } | 135 | } |
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 0acddf60af55..bc462dcd7a40 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c | |||
@@ -1585,7 +1585,6 @@ void jfs_flush_journal(struct jfs_log *log, int wait) | |||
1585 | set_current_state(TASK_UNINTERRUPTIBLE); | 1585 | set_current_state(TASK_UNINTERRUPTIBLE); |
1586 | LOGGC_UNLOCK(log); | 1586 | LOGGC_UNLOCK(log); |
1587 | schedule(); | 1587 | schedule(); |
1588 | __set_current_state(TASK_RUNNING); | ||
1589 | LOGGC_LOCK(log); | 1588 | LOGGC_LOCK(log); |
1590 | remove_wait_queue(&target->gcwait, &__wait); | 1589 | remove_wait_queue(&target->gcwait, &__wait); |
1591 | } | 1590 | } |
@@ -2359,7 +2358,6 @@ int jfsIOWait(void *arg) | |||
2359 | set_current_state(TASK_INTERRUPTIBLE); | 2358 | set_current_state(TASK_INTERRUPTIBLE); |
2360 | spin_unlock_irq(&log_redrive_lock); | 2359 | spin_unlock_irq(&log_redrive_lock); |
2361 | schedule(); | 2360 | schedule(); |
2362 | __set_current_state(TASK_RUNNING); | ||
2363 | } | 2361 | } |
2364 | } while (!kthread_should_stop()); | 2362 | } while (!kthread_should_stop()); |
2365 | 2363 | ||
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 564c4f279ac6..d595856453b2 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c | |||
@@ -136,7 +136,6 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event) | |||
136 | set_current_state(TASK_UNINTERRUPTIBLE); | 136 | set_current_state(TASK_UNINTERRUPTIBLE); |
137 | TXN_UNLOCK(); | 137 | TXN_UNLOCK(); |
138 | io_schedule(); | 138 | io_schedule(); |
139 | __set_current_state(TASK_RUNNING); | ||
140 | remove_wait_queue(event, &wait); | 139 | remove_wait_queue(event, &wait); |
141 | } | 140 | } |
142 | 141 | ||
@@ -2808,7 +2807,6 @@ int jfs_lazycommit(void *arg) | |||
2808 | set_current_state(TASK_INTERRUPTIBLE); | 2807 | set_current_state(TASK_INTERRUPTIBLE); |
2809 | LAZY_UNLOCK(flags); | 2808 | LAZY_UNLOCK(flags); |
2810 | schedule(); | 2809 | schedule(); |
2811 | __set_current_state(TASK_RUNNING); | ||
2812 | remove_wait_queue(&jfs_commit_thread_wait, &wq); | 2810 | remove_wait_queue(&jfs_commit_thread_wait, &wq); |
2813 | } | 2811 | } |
2814 | } while (!kthread_should_stop()); | 2812 | } while (!kthread_should_stop()); |
@@ -2996,7 +2994,6 @@ int jfs_sync(void *arg) | |||
2996 | set_current_state(TASK_INTERRUPTIBLE); | 2994 | set_current_state(TASK_INTERRUPTIBLE); |
2997 | TXN_UNLOCK(); | 2995 | TXN_UNLOCK(); |
2998 | schedule(); | 2996 | schedule(); |
2999 | __set_current_state(TASK_RUNNING); | ||
3000 | } | 2997 | } |
3001 | } while (!kthread_should_stop()); | 2998 | } while (!kthread_should_stop()); |
3002 | 2999 | ||
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c index 8d04bda2bd2e..e966c023b1b7 100644 --- a/fs/nfs/blocklayout/rpc_pipefs.c +++ b/fs/nfs/blocklayout/rpc_pipefs.c | |||
@@ -92,7 +92,6 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, | |||
92 | 92 | ||
93 | set_current_state(TASK_UNINTERRUPTIBLE); | 93 | set_current_state(TASK_UNINTERRUPTIBLE); |
94 | schedule(); | 94 | schedule(); |
95 | __set_current_state(TASK_RUNNING); | ||
96 | remove_wait_queue(&nn->bl_wq, &wq); | 95 | remove_wait_queue(&nn->bl_wq, &wq); |
97 | 96 | ||
98 | if (reply->status != BL_DEVICE_REQUEST_PROC) { | 97 | if (reply->status != BL_DEVICE_REQUEST_PROC) { |
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index ea95a2bc21b5..a25490ae6c62 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c | |||
@@ -675,7 +675,6 @@ __cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) | |||
675 | } | 675 | } |
676 | 676 | ||
677 | schedule(); | 677 | schedule(); |
678 | set_current_state(TASK_RUNNING); | ||
679 | 678 | ||
680 | if (msg.errno < 0) | 679 | if (msg.errno < 0) |
681 | ret = msg.errno; | 680 | ret = msg.errno; |
diff --git a/include/asm-generic/cputime_jiffies.h b/include/asm-generic/cputime_jiffies.h index d5cb78f53986..fe386fc6e85e 100644 --- a/include/asm-generic/cputime_jiffies.h +++ b/include/asm-generic/cputime_jiffies.h | |||
@@ -3,6 +3,8 @@ | |||
3 | 3 | ||
4 | typedef unsigned long __nocast cputime_t; | 4 | typedef unsigned long __nocast cputime_t; |
5 | 5 | ||
6 | #define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new) | ||
7 | |||
6 | #define cputime_one_jiffy jiffies_to_cputime(1) | 8 | #define cputime_one_jiffy jiffies_to_cputime(1) |
7 | #define cputime_to_jiffies(__ct) (__force unsigned long)(__ct) | 9 | #define cputime_to_jiffies(__ct) (__force unsigned long)(__ct) |
8 | #define cputime_to_scaled(__ct) (__ct) | 10 | #define cputime_to_scaled(__ct) (__ct) |
diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h index 4e817606c549..0419485891f2 100644 --- a/include/asm-generic/cputime_nsecs.h +++ b/include/asm-generic/cputime_nsecs.h | |||
@@ -21,6 +21,8 @@ | |||
21 | typedef u64 __nocast cputime_t; | 21 | typedef u64 __nocast cputime_t; |
22 | typedef u64 __nocast cputime64_t; | 22 | typedef u64 __nocast cputime64_t; |
23 | 23 | ||
24 | #define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new) | ||
25 | |||
24 | #define cputime_one_jiffy jiffies_to_cputime(1) | 26 | #define cputime_one_jiffy jiffies_to_cputime(1) |
25 | 27 | ||
26 | #define cputime_div(__ct, divisor) div_u64((__force u64)__ct, divisor) | 28 | #define cputime_div(__ct, divisor) div_u64((__force u64)__ct, divisor) |
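
Both flavours of cmpxchg_cputime() exist so that generic accounting code can compare-and-swap a cputime_t without caring whether it is an unsigned long jiffies count or a 64-bit nanosecond count. The consumer is not visible in the hunks quoted here; it is expected to be a small lockless "advance monotonically" helper along these lines (a sketch, not a definitive copy of the in-tree code):

  static void cputime_advance(cputime_t *counter, cputime_t new)
  {
          cputime_t old;

          /*
           * Raise *counter to at least @new without taking a lock.  If
           * the cmpxchg loses a race, the winner advanced *counter, so
           * the loop condition re-checks against the fresh value and
           * exits once *counter >= new.
           */
          while (new > (old = ACCESS_ONCE(*counter)))
                  cmpxchg_cputime(counter, old, new);
  }
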
diff --git a/include/linux/sched.h b/include/linux/sched.h index 05a8c00e8339..5e344bbe63ec 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -57,6 +57,7 @@ struct sched_param { | |||
57 | #include <linux/llist.h> | 57 | #include <linux/llist.h> |
58 | #include <linux/uidgid.h> | 58 | #include <linux/uidgid.h> |
59 | #include <linux/gfp.h> | 59 | #include <linux/gfp.h> |
60 | #include <linux/magic.h> | ||
60 | 61 | ||
61 | #include <asm/processor.h> | 62 | #include <asm/processor.h> |
62 | 63 | ||
@@ -646,6 +647,7 @@ struct signal_struct { | |||
646 | * Live threads maintain their own counters and add to these | 647 | * Live threads maintain their own counters and add to these |
647 | * in __exit_signal, except for the group leader. | 648 | * in __exit_signal, except for the group leader. |
648 | */ | 649 | */ |
650 | seqlock_t stats_lock; | ||
649 | cputime_t utime, stime, cutime, cstime; | 651 | cputime_t utime, stime, cutime, cstime; |
650 | cputime_t gtime; | 652 | cputime_t gtime; |
651 | cputime_t cgtime; | 653 | cputime_t cgtime; |
@@ -1024,6 +1026,7 @@ struct sched_domain_topology_level { | |||
1024 | extern struct sched_domain_topology_level *sched_domain_topology; | 1026 | extern struct sched_domain_topology_level *sched_domain_topology; |
1025 | 1027 | ||
1026 | extern void set_sched_topology(struct sched_domain_topology_level *tl); | 1028 | extern void set_sched_topology(struct sched_domain_topology_level *tl); |
1029 | extern void wake_up_if_idle(int cpu); | ||
1027 | 1030 | ||
1028 | #ifdef CONFIG_SCHED_DEBUG | 1031 | #ifdef CONFIG_SCHED_DEBUG |
1029 | # define SD_INIT_NAME(type) .name = #type | 1032 | # define SD_INIT_NAME(type) .name = #type |
@@ -2647,6 +2650,8 @@ static inline unsigned long *end_of_stack(struct task_struct *p) | |||
2647 | } | 2650 | } |
2648 | 2651 | ||
2649 | #endif | 2652 | #endif |
2653 | #define task_stack_end_corrupted(task) \ | ||
2654 | (*(end_of_stack(task)) != STACK_END_MAGIC) | ||
2650 | 2655 | ||
2651 | static inline int object_is_on_stack(void *obj) | 2656 | static inline int object_is_on_stack(void *obj) |
2652 | { | 2657 | { |
@@ -2669,6 +2674,7 @@ static inline unsigned long stack_not_used(struct task_struct *p) | |||
2669 | return (unsigned long)n - (unsigned long)end_of_stack(p); | 2674 | return (unsigned long)n - (unsigned long)end_of_stack(p); |
2670 | } | 2675 | } |
2671 | #endif | 2676 | #endif |
2677 | extern void set_task_stack_end_magic(struct task_struct *tsk); | ||
2672 | 2678 | ||
2673 | /* set thread flags in other task's structures | 2679 | /* set thread flags in other task's structures |
2674 | * - see asm/thread_info.h for TIF_xxxx flags available | 2680 | * - see asm/thread_info.h for TIF_xxxx flags available |
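
The two additions pair up: set_task_stack_end_magic() (defined in kernel/fork.c below) plants STACK_END_MAGIC in the last usable word of a task's stack, and task_stack_end_corrupted() reports whether that sentinel has since been overwritten. The actual in-tree user is the CONFIG_SCHED_STACK_END_CHECK test added to schedule_debug() later in this diff; the helper below is only an illustrative sketch of a consumer:

  static inline void report_stack_overflow(struct task_struct *tsk)
  {
          /*
           * The sentinel is written at fork time (and for init_task in
           * start_kernel()); any other value means the stack grew into it.
           */
          if (task_stack_end_corrupted(tsk))
                  pr_emerg("stack sentinel of %s/%d overwritten\n",
                           tsk->comm, task_pid_nr(tsk));
  }
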
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index cc359636cfa3..f5df8f687b4d 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h | |||
@@ -456,4 +456,23 @@ read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags) | |||
456 | spin_unlock_irqrestore(&sl->lock, flags); | 456 | spin_unlock_irqrestore(&sl->lock, flags); |
457 | } | 457 | } |
458 | 458 | ||
459 | static inline unsigned long | ||
460 | read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq) | ||
461 | { | ||
462 | unsigned long flags = 0; | ||
463 | |||
464 | if (!(*seq & 1)) /* Even */ | ||
465 | *seq = read_seqbegin(lock); | ||
466 | else /* Odd */ | ||
467 | read_seqlock_excl_irqsave(lock, flags); | ||
468 | |||
469 | return flags; | ||
470 | } | ||
471 | |||
472 | static inline void | ||
473 | done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags) | ||
474 | { | ||
475 | if (seq & 1) | ||
476 | read_sequnlock_excl_irqrestore(lock, flags); | ||
477 | } | ||
459 | #endif /* __LINUX_SEQLOCK_H */ | 478 | #endif /* __LINUX_SEQLOCK_H */ |
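
These two helpers extend the existing read_seqbegin_or_lock()/done_seqretry() pattern with an irqsave variant: on the first pass the sequence number is even and the read is lockless; a retry flips it odd and takes the lock with interrupts disabled, and done_seqretry_irqrestore() only unlocks in that second case. A reader sketch under those assumptions (names are illustrative; the reworked thread_group_cputime() at the end of this diff follows the same shape):

  static u64 read_counter(seqlock_t *lock, const u64 *counter)
  {
          unsigned long flags;
          int seq, nextseq = 0;           /* 0: try a lockless read first */
          u64 val;

          do {
                  seq = nextseq;
                  flags = read_seqbegin_or_lock_irqsave(lock, &seq);
                  val = *counter;
                  nextseq = 1;            /* a retry must take the lock */
          } while (need_seqretry(lock, seq));
          done_seqretry_irqrestore(lock, seq, flags);

          return val;
  }
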
diff --git a/include/linux/smp.h b/include/linux/smp.h index 34347f26be9b..93dff5fff524 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h | |||
@@ -100,6 +100,7 @@ int smp_call_function_any(const struct cpumask *mask, | |||
100 | smp_call_func_t func, void *info, int wait); | 100 | smp_call_func_t func, void *info, int wait); |
101 | 101 | ||
102 | void kick_all_cpus_sync(void); | 102 | void kick_all_cpus_sync(void); |
103 | void wake_up_all_idle_cpus(void); | ||
103 | 104 | ||
104 | /* | 105 | /* |
105 | * Generic and arch helpers | 106 | * Generic and arch helpers |
@@ -148,6 +149,7 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, | |||
148 | } | 149 | } |
149 | 150 | ||
150 | static inline void kick_all_cpus_sync(void) { } | 151 | static inline void kick_all_cpus_sync(void) { } |
152 | static inline void wake_up_all_idle_cpus(void) { } | ||
151 | 153 | ||
152 | #endif /* !SMP */ | 154 | #endif /* !SMP */ |
153 | 155 | ||
diff --git a/include/linux/wait.h b/include/linux/wait.h index 80115bf88671..e4a8eb9312ea 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h | |||
@@ -281,9 +281,11 @@ do { \ | |||
281 | * wake_up() has to be called after changing any variable that could | 281 | * wake_up() has to be called after changing any variable that could |
282 | * change the result of the wait condition. | 282 | * change the result of the wait condition. |
283 | * | 283 | * |
284 | * The function returns 0 if the @timeout elapsed, or the remaining | 284 | * Returns: |
285 | * jiffies (at least 1) if the @condition evaluated to %true before | 285 | * 0 if the @condition evaluated to %false after the @timeout elapsed, |
286 | * the @timeout elapsed. | 286 | * 1 if the @condition evaluated to %true after the @timeout elapsed, |
287 | * or the remaining jiffies (at least 1) if the @condition evaluated | ||
288 | * to %true before the @timeout elapsed. | ||
287 | */ | 289 | */ |
288 | #define wait_event_timeout(wq, condition, timeout) \ | 290 | #define wait_event_timeout(wq, condition, timeout) \ |
289 | ({ \ | 291 | ({ \ |
@@ -364,9 +366,11 @@ do { \ | |||
364 | * change the result of the wait condition. | 366 | * change the result of the wait condition. |
365 | * | 367 | * |
366 | * Returns: | 368 | * Returns: |
367 | * 0 if the @timeout elapsed, -%ERESTARTSYS if it was interrupted by | 369 | * 0 if the @condition evaluated to %false after the @timeout elapsed, |
368 | * a signal, or the remaining jiffies (at least 1) if the @condition | 370 | * 1 if the @condition evaluated to %true after the @timeout elapsed, |
369 | * evaluated to %true before the @timeout elapsed. | 371 | * the remaining jiffies (at least 1) if the @condition evaluated |
372 | * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was | ||
373 | * interrupted by a signal. | ||
370 | */ | 374 | */ |
371 | #define wait_event_interruptible_timeout(wq, condition, timeout) \ | 375 | #define wait_event_interruptible_timeout(wq, condition, timeout) \ |
372 | ({ \ | 376 | ({ \ |
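
With the clarified wording, a return of 0 from either macro means the condition was still false when the timeout expired, while 1 means the condition turned true right as the timeout elapsed. A caller sketch under those semantics (the wait queue, flag and 500 ms budget are made up for illustration):

  static int wait_for_flag(wait_queue_head_t *wq, bool *done)
  {
          long ret;

          ret = wait_event_interruptible_timeout(*wq, *done,
                                                 msecs_to_jiffies(500));
          if (ret == 0)
                  return -ETIMEDOUT;      /* timed out, *done still false */
          if (ret < 0)
                  return ret;             /* -ERESTARTSYS: signal arrived */
          return 0;                       /* ret >= 1: *done became true  */
  }
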
diff --git a/init/main.c b/init/main.c index c5c11da6c4e1..89ec862da2d4 100644 --- a/init/main.c +++ b/init/main.c | |||
@@ -508,6 +508,7 @@ asmlinkage __visible void __init start_kernel(void) | |||
508 | * lockdep hash: | 508 | * lockdep hash: |
509 | */ | 509 | */ |
510 | lockdep_init(); | 510 | lockdep_init(); |
511 | set_task_stack_end_magic(&init_task); | ||
511 | smp_setup_processor_id(); | 512 | smp_setup_processor_id(); |
512 | debug_objects_early_init(); | 513 | debug_objects_early_init(); |
513 | 514 | ||
diff --git a/kernel/exit.c b/kernel/exit.c index d13f2eec4bb8..5d30019ff953 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -115,32 +115,33 @@ static void __exit_signal(struct task_struct *tsk) | |||
115 | 115 | ||
116 | if (tsk == sig->curr_target) | 116 | if (tsk == sig->curr_target) |
117 | sig->curr_target = next_thread(tsk); | 117 | sig->curr_target = next_thread(tsk); |
118 | /* | ||
119 | * Accumulate here the counters for all threads but the | ||
120 | * group leader as they die, so they can be added into | ||
121 | * the process-wide totals when those are taken. | ||
122 | * The group leader stays around as a zombie as long | ||
123 | * as there are other threads. When it gets reaped, | ||
124 | * the exit.c code will add its counts into these totals. | ||
125 | * We won't ever get here for the group leader, since it | ||
126 | * will have been the last reference on the signal_struct. | ||
127 | */ | ||
128 | task_cputime(tsk, &utime, &stime); | ||
129 | sig->utime += utime; | ||
130 | sig->stime += stime; | ||
131 | sig->gtime += task_gtime(tsk); | ||
132 | sig->min_flt += tsk->min_flt; | ||
133 | sig->maj_flt += tsk->maj_flt; | ||
134 | sig->nvcsw += tsk->nvcsw; | ||
135 | sig->nivcsw += tsk->nivcsw; | ||
136 | sig->inblock += task_io_get_inblock(tsk); | ||
137 | sig->oublock += task_io_get_oublock(tsk); | ||
138 | task_io_accounting_add(&sig->ioac, &tsk->ioac); | ||
139 | sig->sum_sched_runtime += tsk->se.sum_exec_runtime; | ||
140 | } | 118 | } |
141 | 119 | ||
120 | /* | ||
121 | * Accumulate here the counters for all threads but the group leader | ||
122 | * as they die, so they can be added into the process-wide totals | ||
123 | * when those are taken. The group leader stays around as a zombie as | ||
124 | * long as there are other threads. When it gets reaped, the exit.c | ||
125 | * code will add its counts into these totals. We won't ever get here | ||
126 | * for the group leader, since it will have been the last reference on | ||
127 | * the signal_struct. | ||
128 | */ | ||
129 | task_cputime(tsk, &utime, &stime); | ||
130 | write_seqlock(&sig->stats_lock); | ||
131 | sig->utime += utime; | ||
132 | sig->stime += stime; | ||
133 | sig->gtime += task_gtime(tsk); | ||
134 | sig->min_flt += tsk->min_flt; | ||
135 | sig->maj_flt += tsk->maj_flt; | ||
136 | sig->nvcsw += tsk->nvcsw; | ||
137 | sig->nivcsw += tsk->nivcsw; | ||
138 | sig->inblock += task_io_get_inblock(tsk); | ||
139 | sig->oublock += task_io_get_oublock(tsk); | ||
140 | task_io_accounting_add(&sig->ioac, &tsk->ioac); | ||
141 | sig->sum_sched_runtime += tsk->se.sum_exec_runtime; | ||
142 | sig->nr_threads--; | 142 | sig->nr_threads--; |
143 | __unhash_process(tsk, group_dead); | 143 | __unhash_process(tsk, group_dead); |
144 | write_sequnlock(&sig->stats_lock); | ||
144 | 145 | ||
145 | /* | 146 | /* |
146 | * Do this under ->siglock, we can race with another thread | 147 | * Do this under ->siglock, we can race with another thread |
@@ -1046,6 +1047,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1046 | spin_lock_irq(&p->real_parent->sighand->siglock); | 1047 | spin_lock_irq(&p->real_parent->sighand->siglock); |
1047 | psig = p->real_parent->signal; | 1048 | psig = p->real_parent->signal; |
1048 | sig = p->signal; | 1049 | sig = p->signal; |
1050 | write_seqlock(&psig->stats_lock); | ||
1049 | psig->cutime += tgutime + sig->cutime; | 1051 | psig->cutime += tgutime + sig->cutime; |
1050 | psig->cstime += tgstime + sig->cstime; | 1052 | psig->cstime += tgstime + sig->cstime; |
1051 | psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; | 1053 | psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; |
@@ -1068,6 +1070,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1068 | psig->cmaxrss = maxrss; | 1070 | psig->cmaxrss = maxrss; |
1069 | task_io_accounting_add(&psig->ioac, &p->ioac); | 1071 | task_io_accounting_add(&psig->ioac, &p->ioac); |
1070 | task_io_accounting_add(&psig->ioac, &sig->ioac); | 1072 | task_io_accounting_add(&psig->ioac, &sig->ioac); |
1073 | write_sequnlock(&psig->stats_lock); | ||
1071 | spin_unlock_irq(&p->real_parent->sighand->siglock); | 1074 | spin_unlock_irq(&p->real_parent->sighand->siglock); |
1072 | } | 1075 | } |
1073 | 1076 | ||
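
Every writer touched in this file already runs under the relevant ->siglock; sig->stats_lock is nested inside it purely so that readers such as the reworked thread_group_cputime() (end of this diff) can make a lockless first pass. A sketch of the writer discipline the patch establishes (the helper is hypothetical; cutime is a real signal_struct field):

  static void add_reaped_cutime(struct signal_struct *sig, cputime_t delta)
  {
          /*
           * The caller already holds the appropriate ->siglock, as in
           * wait_task_zombie() above; the seqlock only orders us against
           * lockless readers.
           */
          write_seqlock(&sig->stats_lock);
          sig->cutime += delta;
          write_sequnlock(&sig->stats_lock);
  }
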
diff --git a/kernel/fork.c b/kernel/fork.c index 8c162d102740..9b7d746d6d62 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -294,11 +294,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst, | |||
294 | return 0; | 294 | return 0; |
295 | } | 295 | } |
296 | 296 | ||
297 | void set_task_stack_end_magic(struct task_struct *tsk) | ||
298 | { | ||
299 | unsigned long *stackend; | ||
300 | |||
301 | stackend = end_of_stack(tsk); | ||
302 | *stackend = STACK_END_MAGIC; /* for overflow detection */ | ||
303 | } | ||
304 | |||
297 | static struct task_struct *dup_task_struct(struct task_struct *orig) | 305 | static struct task_struct *dup_task_struct(struct task_struct *orig) |
298 | { | 306 | { |
299 | struct task_struct *tsk; | 307 | struct task_struct *tsk; |
300 | struct thread_info *ti; | 308 | struct thread_info *ti; |
301 | unsigned long *stackend; | ||
302 | int node = tsk_fork_get_node(orig); | 309 | int node = tsk_fork_get_node(orig); |
303 | int err; | 310 | int err; |
304 | 311 | ||
@@ -328,8 +335,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
328 | setup_thread_stack(tsk, orig); | 335 | setup_thread_stack(tsk, orig); |
329 | clear_user_return_notifier(tsk); | 336 | clear_user_return_notifier(tsk); |
330 | clear_tsk_need_resched(tsk); | 337 | clear_tsk_need_resched(tsk); |
331 | stackend = end_of_stack(tsk); | 338 | set_task_stack_end_magic(tsk); |
332 | *stackend = STACK_END_MAGIC; /* for overflow detection */ | ||
333 | 339 | ||
334 | #ifdef CONFIG_CC_STACKPROTECTOR | 340 | #ifdef CONFIG_CC_STACKPROTECTOR |
335 | tsk->stack_canary = get_random_int(); | 341 | tsk->stack_canary = get_random_int(); |
@@ -1067,6 +1073,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
1067 | sig->curr_target = tsk; | 1073 | sig->curr_target = tsk; |
1068 | init_sigpending(&sig->shared_pending); | 1074 | init_sigpending(&sig->shared_pending); |
1069 | INIT_LIST_HEAD(&sig->posix_timers); | 1075 | INIT_LIST_HEAD(&sig->posix_timers); |
1076 | seqlock_init(&sig->stats_lock); | ||
1070 | 1077 | ||
1071 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 1078 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
1072 | sig->real_timer.function = it_real_fn; | 1079 | sig->real_timer.function = it_real_fn; |
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index e73efba98301..8a2e230fb86a 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c | |||
@@ -148,11 +148,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) | |||
148 | if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) | 148 | if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) |
149 | goto out; | 149 | goto out; |
150 | 150 | ||
151 | t = p; | 151 | for_each_thread(p, t) |
152 | do { | ||
153 | sched_move_task(t); | 152 | sched_move_task(t); |
154 | } while_each_thread(p, t); | ||
155 | |||
156 | out: | 153 | out: |
157 | unlock_task_sighand(p, &flags); | 154 | unlock_task_sighand(p, &flags); |
158 | autogroup_kref_put(prev); | 155 | autogroup_kref_put(prev); |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f235c41a3532..44999505e1bf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -317,9 +317,12 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) | |||
317 | for (;;) { | 317 | for (;;) { |
318 | rq = task_rq(p); | 318 | rq = task_rq(p); |
319 | raw_spin_lock(&rq->lock); | 319 | raw_spin_lock(&rq->lock); |
320 | if (likely(rq == task_rq(p))) | 320 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) |
321 | return rq; | 321 | return rq; |
322 | raw_spin_unlock(&rq->lock); | 322 | raw_spin_unlock(&rq->lock); |
323 | |||
324 | while (unlikely(task_on_rq_migrating(p))) | ||
325 | cpu_relax(); | ||
323 | } | 326 | } |
324 | } | 327 | } |
325 | 328 | ||
@@ -336,10 +339,13 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | |||
336 | raw_spin_lock_irqsave(&p->pi_lock, *flags); | 339 | raw_spin_lock_irqsave(&p->pi_lock, *flags); |
337 | rq = task_rq(p); | 340 | rq = task_rq(p); |
338 | raw_spin_lock(&rq->lock); | 341 | raw_spin_lock(&rq->lock); |
339 | if (likely(rq == task_rq(p))) | 342 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) |
340 | return rq; | 343 | return rq; |
341 | raw_spin_unlock(&rq->lock); | 344 | raw_spin_unlock(&rq->lock); |
342 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | 345 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); |
346 | |||
347 | while (unlikely(task_on_rq_migrating(p))) | ||
348 | cpu_relax(); | ||
343 | } | 349 | } |
344 | } | 350 | } |
345 | 351 | ||
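
Both runqueue-lock helpers now back off when they catch the task mid-migration: the rq lock is dropped first, and only then do they spin on TASK_ON_RQ_MIGRATING, so the CPU performing the move is never blocked from finishing it. The writer side of this handshake is the move_queued_task() helper added further down in this file; the waiting side, isolated as a hypothetical helper purely to name the protocol:

  static inline void wait_until_requeued(struct task_struct *p)
  {
          /*
           * p->on_rq stays TASK_ON_RQ_MIGRATING only while the task is
           * detached from the old runqueue and not yet enqueued on the
           * new one; spin outside any rq lock until that window closes.
           */
          while (unlikely(task_on_rq_migrating(p)))
                  cpu_relax();
  }
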
@@ -433,7 +439,15 @@ static void __hrtick_start(void *arg) | |||
433 | void hrtick_start(struct rq *rq, u64 delay) | 439 | void hrtick_start(struct rq *rq, u64 delay) |
434 | { | 440 | { |
435 | struct hrtimer *timer = &rq->hrtick_timer; | 441 | struct hrtimer *timer = &rq->hrtick_timer; |
436 | ktime_t time = ktime_add_ns(timer->base->get_time(), delay); | 442 | ktime_t time; |
443 | s64 delta; | ||
444 | |||
445 | /* | ||
446 | * Don't schedule slices shorter than 10000ns, that just | ||
447 | * doesn't make sense and can cause timer DoS. | ||
448 | */ | ||
449 | delta = max_t(s64, delay, 10000LL); | ||
450 | time = ktime_add_ns(timer->base->get_time(), delta); | ||
437 | 451 | ||
438 | hrtimer_set_expires(timer, time); | 452 | hrtimer_set_expires(timer, time); |
439 | 453 | ||
@@ -1027,7 +1041,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
1027 | * A queue event has occurred, and we're going to schedule. In | 1041 | * A queue event has occurred, and we're going to schedule. In |
1028 | * this case, we can save a useless back to back clock update. | 1042 | * this case, we can save a useless back to back clock update. |
1029 | */ | 1043 | */ |
1030 | if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) | 1044 | if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) |
1031 | rq->skip_clock_update = 1; | 1045 | rq->skip_clock_update = 1; |
1032 | } | 1046 | } |
1033 | 1047 | ||
@@ -1072,7 +1086,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1072 | 1086 | ||
1073 | static void __migrate_swap_task(struct task_struct *p, int cpu) | 1087 | static void __migrate_swap_task(struct task_struct *p, int cpu) |
1074 | { | 1088 | { |
1075 | if (p->on_rq) { | 1089 | if (task_on_rq_queued(p)) { |
1076 | struct rq *src_rq, *dst_rq; | 1090 | struct rq *src_rq, *dst_rq; |
1077 | 1091 | ||
1078 | src_rq = task_rq(p); | 1092 | src_rq = task_rq(p); |
@@ -1198,7 +1212,7 @@ static int migration_cpu_stop(void *data); | |||
1198 | unsigned long wait_task_inactive(struct task_struct *p, long match_state) | 1212 | unsigned long wait_task_inactive(struct task_struct *p, long match_state) |
1199 | { | 1213 | { |
1200 | unsigned long flags; | 1214 | unsigned long flags; |
1201 | int running, on_rq; | 1215 | int running, queued; |
1202 | unsigned long ncsw; | 1216 | unsigned long ncsw; |
1203 | struct rq *rq; | 1217 | struct rq *rq; |
1204 | 1218 | ||
@@ -1236,7 +1250,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
1236 | rq = task_rq_lock(p, &flags); | 1250 | rq = task_rq_lock(p, &flags); |
1237 | trace_sched_wait_task(p); | 1251 | trace_sched_wait_task(p); |
1238 | running = task_running(rq, p); | 1252 | running = task_running(rq, p); |
1239 | on_rq = p->on_rq; | 1253 | queued = task_on_rq_queued(p); |
1240 | ncsw = 0; | 1254 | ncsw = 0; |
1241 | if (!match_state || p->state == match_state) | 1255 | if (!match_state || p->state == match_state) |
1242 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ | 1256 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ |
@@ -1268,7 +1282,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
1268 | * running right now), it's preempted, and we should | 1282 | * running right now), it's preempted, and we should |
1269 | * yield - it could be a while. | 1283 | * yield - it could be a while. |
1270 | */ | 1284 | */ |
1271 | if (unlikely(on_rq)) { | 1285 | if (unlikely(queued)) { |
1272 | ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); | 1286 | ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); |
1273 | 1287 | ||
1274 | set_current_state(TASK_UNINTERRUPTIBLE); | 1288 | set_current_state(TASK_UNINTERRUPTIBLE); |
@@ -1462,7 +1476,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) | |||
1462 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | 1476 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) |
1463 | { | 1477 | { |
1464 | activate_task(rq, p, en_flags); | 1478 | activate_task(rq, p, en_flags); |
1465 | p->on_rq = 1; | 1479 | p->on_rq = TASK_ON_RQ_QUEUED; |
1466 | 1480 | ||
1467 | /* if a worker is waking up, notify workqueue */ | 1481 | /* if a worker is waking up, notify workqueue */ |
1468 | if (p->flags & PF_WQ_WORKER) | 1482 | if (p->flags & PF_WQ_WORKER) |
@@ -1521,7 +1535,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) | |||
1521 | int ret = 0; | 1535 | int ret = 0; |
1522 | 1536 | ||
1523 | rq = __task_rq_lock(p); | 1537 | rq = __task_rq_lock(p); |
1524 | if (p->on_rq) { | 1538 | if (task_on_rq_queued(p)) { |
1525 | /* check_preempt_curr() may use rq clock */ | 1539 | /* check_preempt_curr() may use rq clock */ |
1526 | update_rq_clock(rq); | 1540 | update_rq_clock(rq); |
1527 | ttwu_do_wakeup(rq, p, wake_flags); | 1541 | ttwu_do_wakeup(rq, p, wake_flags); |
@@ -1604,6 +1618,25 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) | |||
1604 | } | 1618 | } |
1605 | } | 1619 | } |
1606 | 1620 | ||
1621 | void wake_up_if_idle(int cpu) | ||
1622 | { | ||
1623 | struct rq *rq = cpu_rq(cpu); | ||
1624 | unsigned long flags; | ||
1625 | |||
1626 | if (!is_idle_task(rq->curr)) | ||
1627 | return; | ||
1628 | |||
1629 | if (set_nr_if_polling(rq->idle)) { | ||
1630 | trace_sched_wake_idle_without_ipi(cpu); | ||
1631 | } else { | ||
1632 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
1633 | if (is_idle_task(rq->curr)) | ||
1634 | smp_send_reschedule(cpu); | ||
1635 | /* Else cpu is not in idle, do nothing here */ | ||
1636 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
1637 | } | ||
1638 | } | ||
1639 | |||
1607 | bool cpus_share_cache(int this_cpu, int that_cpu) | 1640 | bool cpus_share_cache(int this_cpu, int that_cpu) |
1608 | { | 1641 | { |
1609 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); | 1642 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); |
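
wake_up_if_idle() either flips the polling idle task's need-resched bit (no IPI required) or sends a reschedule IPI, and deliberately does nothing when the CPU is busy. The wake_up_all_idle_cpus() declaration added to <linux/smp.h> above is expected to be little more than a preemption-safe loop over this per-cpu helper; a sketch of that shape (the real body lives in kernel/smp.c, outside the hunks shown here):

  void wake_up_all_idle_cpus(void)
  {
          int cpu;

          preempt_disable();
          for_each_online_cpu(cpu) {
                  if (cpu == smp_processor_id())
                          continue;       /* the local CPU is clearly not idle */

                  wake_up_if_idle(cpu);
          }
          preempt_enable();
  }
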
@@ -1726,7 +1759,7 @@ static void try_to_wake_up_local(struct task_struct *p) | |||
1726 | if (!(p->state & TASK_NORMAL)) | 1759 | if (!(p->state & TASK_NORMAL)) |
1727 | goto out; | 1760 | goto out; |
1728 | 1761 | ||
1729 | if (!p->on_rq) | 1762 | if (!task_on_rq_queued(p)) |
1730 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | 1763 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
1731 | 1764 | ||
1732 | ttwu_do_wakeup(rq, p, 0); | 1765 | ttwu_do_wakeup(rq, p, 0); |
@@ -1760,6 +1793,20 @@ int wake_up_state(struct task_struct *p, unsigned int state) | |||
1760 | } | 1793 | } |
1761 | 1794 | ||
1762 | /* | 1795 | /* |
1796 | * This function clears the sched_dl_entity static params. | ||
1797 | */ | ||
1798 | void __dl_clear_params(struct task_struct *p) | ||
1799 | { | ||
1800 | struct sched_dl_entity *dl_se = &p->dl; | ||
1801 | |||
1802 | dl_se->dl_runtime = 0; | ||
1803 | dl_se->dl_deadline = 0; | ||
1804 | dl_se->dl_period = 0; | ||
1805 | dl_se->flags = 0; | ||
1806 | dl_se->dl_bw = 0; | ||
1807 | } | ||
1808 | |||
1809 | /* | ||
1763 | * Perform scheduler related setup for a newly forked process p. | 1810 | * Perform scheduler related setup for a newly forked process p. |
1764 | * p is forked by current. | 1811 | * p is forked by current. |
1765 | * | 1812 | * |
@@ -1783,10 +1830,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
1783 | 1830 | ||
1784 | RB_CLEAR_NODE(&p->dl.rb_node); | 1831 | RB_CLEAR_NODE(&p->dl.rb_node); |
1785 | hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 1832 | hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
1786 | p->dl.dl_runtime = p->dl.runtime = 0; | 1833 | __dl_clear_params(p); |
1787 | p->dl.dl_deadline = p->dl.deadline = 0; | ||
1788 | p->dl.dl_period = 0; | ||
1789 | p->dl.flags = 0; | ||
1790 | 1834 | ||
1791 | INIT_LIST_HEAD(&p->rt.run_list); | 1835 | INIT_LIST_HEAD(&p->rt.run_list); |
1792 | 1836 | ||
@@ -1961,6 +2005,8 @@ unsigned long to_ratio(u64 period, u64 runtime) | |||
1961 | #ifdef CONFIG_SMP | 2005 | #ifdef CONFIG_SMP |
1962 | inline struct dl_bw *dl_bw_of(int i) | 2006 | inline struct dl_bw *dl_bw_of(int i) |
1963 | { | 2007 | { |
2008 | rcu_lockdep_assert(rcu_read_lock_sched_held(), | ||
2009 | "sched RCU must be held"); | ||
1964 | return &cpu_rq(i)->rd->dl_bw; | 2010 | return &cpu_rq(i)->rd->dl_bw; |
1965 | } | 2011 | } |
1966 | 2012 | ||
@@ -1969,6 +2015,8 @@ static inline int dl_bw_cpus(int i) | |||
1969 | struct root_domain *rd = cpu_rq(i)->rd; | 2015 | struct root_domain *rd = cpu_rq(i)->rd; |
1970 | int cpus = 0; | 2016 | int cpus = 0; |
1971 | 2017 | ||
2018 | rcu_lockdep_assert(rcu_read_lock_sched_held(), | ||
2019 | "sched RCU must be held"); | ||
1972 | for_each_cpu_and(i, rd->span, cpu_active_mask) | 2020 | for_each_cpu_and(i, rd->span, cpu_active_mask) |
1973 | cpus++; | 2021 | cpus++; |
1974 | 2022 | ||
@@ -2079,7 +2127,7 @@ void wake_up_new_task(struct task_struct *p) | |||
2079 | init_task_runnable_average(p); | 2127 | init_task_runnable_average(p); |
2080 | rq = __task_rq_lock(p); | 2128 | rq = __task_rq_lock(p); |
2081 | activate_task(rq, p, 0); | 2129 | activate_task(rq, p, 0); |
2082 | p->on_rq = 1; | 2130 | p->on_rq = TASK_ON_RQ_QUEUED; |
2083 | trace_sched_wakeup_new(p, true); | 2131 | trace_sched_wakeup_new(p, true); |
2084 | check_preempt_curr(rq, p, WF_FORK); | 2132 | check_preempt_curr(rq, p, WF_FORK); |
2085 | #ifdef CONFIG_SMP | 2133 | #ifdef CONFIG_SMP |
@@ -2271,10 +2319,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) | |||
2271 | */ | 2319 | */ |
2272 | post_schedule(rq); | 2320 | post_schedule(rq); |
2273 | 2321 | ||
2274 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | ||
2275 | /* In this case, finish_task_switch does not reenable preemption */ | ||
2276 | preempt_enable(); | ||
2277 | #endif | ||
2278 | if (current->set_child_tid) | 2322 | if (current->set_child_tid) |
2279 | put_user(task_pid_vnr(current), current->set_child_tid); | 2323 | put_user(task_pid_vnr(current), current->set_child_tid); |
2280 | } | 2324 | } |
@@ -2317,9 +2361,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2317 | * of the scheduler it's an obvious special-case), so we | 2361 | * of the scheduler it's an obvious special-case), so we |
2318 | * do an early lockdep release here: | 2362 | * do an early lockdep release here: |
2319 | */ | 2363 | */ |
2320 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
2321 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 2364 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
2322 | #endif | ||
2323 | 2365 | ||
2324 | context_tracking_task_switch(prev, next); | 2366 | context_tracking_task_switch(prev, next); |
2325 | /* Here we just switch the register state and the stack. */ | 2367 | /* Here we just switch the register state and the stack. */ |
@@ -2447,7 +2489,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) | |||
2447 | * project cycles that may never be accounted to this | 2489 | * project cycles that may never be accounted to this |
2448 | * thread, breaking clock_gettime(). | 2490 | * thread, breaking clock_gettime(). |
2449 | */ | 2491 | */ |
2450 | if (task_current(rq, p) && p->on_rq) { | 2492 | if (task_current(rq, p) && task_on_rq_queued(p)) { |
2451 | update_rq_clock(rq); | 2493 | update_rq_clock(rq); |
2452 | ns = rq_clock_task(rq) - p->se.exec_start; | 2494 | ns = rq_clock_task(rq) - p->se.exec_start; |
2453 | if ((s64)ns < 0) | 2495 | if ((s64)ns < 0) |
@@ -2493,7 +2535,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
2493 | * If we see ->on_cpu without ->on_rq, the task is leaving, and has | 2535 | * If we see ->on_cpu without ->on_rq, the task is leaving, and has |
2494 | * been accounted, so we're correct here as well. | 2536 | * been accounted, so we're correct here as well. |
2495 | */ | 2537 | */ |
2496 | if (!p->on_cpu || !p->on_rq) | 2538 | if (!p->on_cpu || !task_on_rq_queued(p)) |
2497 | return p->se.sum_exec_runtime; | 2539 | return p->se.sum_exec_runtime; |
2498 | #endif | 2540 | #endif |
2499 | 2541 | ||
@@ -2656,6 +2698,9 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
2656 | */ | 2698 | */ |
2657 | static inline void schedule_debug(struct task_struct *prev) | 2699 | static inline void schedule_debug(struct task_struct *prev) |
2658 | { | 2700 | { |
2701 | #ifdef CONFIG_SCHED_STACK_END_CHECK | ||
2702 | BUG_ON(unlikely(task_stack_end_corrupted(prev))); | ||
2703 | #endif | ||
2659 | /* | 2704 | /* |
2660 | * Test if we are atomic. Since do_exit() needs to call into | 2705 | * Test if we are atomic. Since do_exit() needs to call into |
2661 | * schedule() atomically, we ignore that path. Otherwise whine | 2706 | * schedule() atomically, we ignore that path. Otherwise whine |
@@ -2797,7 +2842,7 @@ need_resched: | |||
2797 | switch_count = &prev->nvcsw; | 2842 | switch_count = &prev->nvcsw; |
2798 | } | 2843 | } |
2799 | 2844 | ||
2800 | if (prev->on_rq || rq->skip_clock_update < 0) | 2845 | if (task_on_rq_queued(prev) || rq->skip_clock_update < 0) |
2801 | update_rq_clock(rq); | 2846 | update_rq_clock(rq); |
2802 | 2847 | ||
2803 | next = pick_next_task(rq, prev); | 2848 | next = pick_next_task(rq, prev); |
@@ -2962,7 +3007,7 @@ EXPORT_SYMBOL(default_wake_function); | |||
2962 | */ | 3007 | */ |
2963 | void rt_mutex_setprio(struct task_struct *p, int prio) | 3008 | void rt_mutex_setprio(struct task_struct *p, int prio) |
2964 | { | 3009 | { |
2965 | int oldprio, on_rq, running, enqueue_flag = 0; | 3010 | int oldprio, queued, running, enqueue_flag = 0; |
2966 | struct rq *rq; | 3011 | struct rq *rq; |
2967 | const struct sched_class *prev_class; | 3012 | const struct sched_class *prev_class; |
2968 | 3013 | ||
@@ -2991,12 +3036,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
2991 | trace_sched_pi_setprio(p, prio); | 3036 | trace_sched_pi_setprio(p, prio); |
2992 | oldprio = p->prio; | 3037 | oldprio = p->prio; |
2993 | prev_class = p->sched_class; | 3038 | prev_class = p->sched_class; |
2994 | on_rq = p->on_rq; | 3039 | queued = task_on_rq_queued(p); |
2995 | running = task_current(rq, p); | 3040 | running = task_current(rq, p); |
2996 | if (on_rq) | 3041 | if (queued) |
2997 | dequeue_task(rq, p, 0); | 3042 | dequeue_task(rq, p, 0); |
2998 | if (running) | 3043 | if (running) |
2999 | p->sched_class->put_prev_task(rq, p); | 3044 | put_prev_task(rq, p); |
3000 | 3045 | ||
3001 | /* | 3046 | /* |
3002 | * Boosting condition are: | 3047 | * Boosting condition are: |
@@ -3033,7 +3078,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3033 | 3078 | ||
3034 | if (running) | 3079 | if (running) |
3035 | p->sched_class->set_curr_task(rq); | 3080 | p->sched_class->set_curr_task(rq); |
3036 | if (on_rq) | 3081 | if (queued) |
3037 | enqueue_task(rq, p, enqueue_flag); | 3082 | enqueue_task(rq, p, enqueue_flag); |
3038 | 3083 | ||
3039 | check_class_changed(rq, p, prev_class, oldprio); | 3084 | check_class_changed(rq, p, prev_class, oldprio); |
@@ -3044,7 +3089,7 @@ out_unlock: | |||
3044 | 3089 | ||
3045 | void set_user_nice(struct task_struct *p, long nice) | 3090 | void set_user_nice(struct task_struct *p, long nice) |
3046 | { | 3091 | { |
3047 | int old_prio, delta, on_rq; | 3092 | int old_prio, delta, queued; |
3048 | unsigned long flags; | 3093 | unsigned long flags; |
3049 | struct rq *rq; | 3094 | struct rq *rq; |
3050 | 3095 | ||
@@ -3065,8 +3110,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3065 | p->static_prio = NICE_TO_PRIO(nice); | 3110 | p->static_prio = NICE_TO_PRIO(nice); |
3066 | goto out_unlock; | 3111 | goto out_unlock; |
3067 | } | 3112 | } |
3068 | on_rq = p->on_rq; | 3113 | queued = task_on_rq_queued(p); |
3069 | if (on_rq) | 3114 | if (queued) |
3070 | dequeue_task(rq, p, 0); | 3115 | dequeue_task(rq, p, 0); |
3071 | 3116 | ||
3072 | p->static_prio = NICE_TO_PRIO(nice); | 3117 | p->static_prio = NICE_TO_PRIO(nice); |
@@ -3075,7 +3120,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3075 | p->prio = effective_prio(p); | 3120 | p->prio = effective_prio(p); |
3076 | delta = p->prio - old_prio; | 3121 | delta = p->prio - old_prio; |
3077 | 3122 | ||
3078 | if (on_rq) { | 3123 | if (queued) { |
3079 | enqueue_task(rq, p, 0); | 3124 | enqueue_task(rq, p, 0); |
3080 | /* | 3125 | /* |
3081 | * If the task increased its priority or is running and | 3126 | * If the task increased its priority or is running and |
@@ -3347,7 +3392,7 @@ static int __sched_setscheduler(struct task_struct *p, | |||
3347 | { | 3392 | { |
3348 | int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : | 3393 | int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : |
3349 | MAX_RT_PRIO - 1 - attr->sched_priority; | 3394 | MAX_RT_PRIO - 1 - attr->sched_priority; |
3350 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 3395 | int retval, oldprio, oldpolicy = -1, queued, running; |
3351 | int policy = attr->sched_policy; | 3396 | int policy = attr->sched_policy; |
3352 | unsigned long flags; | 3397 | unsigned long flags; |
3353 | const struct sched_class *prev_class; | 3398 | const struct sched_class *prev_class; |
@@ -3544,19 +3589,19 @@ change: | |||
3544 | return 0; | 3589 | return 0; |
3545 | } | 3590 | } |
3546 | 3591 | ||
3547 | on_rq = p->on_rq; | 3592 | queued = task_on_rq_queued(p); |
3548 | running = task_current(rq, p); | 3593 | running = task_current(rq, p); |
3549 | if (on_rq) | 3594 | if (queued) |
3550 | dequeue_task(rq, p, 0); | 3595 | dequeue_task(rq, p, 0); |
3551 | if (running) | 3596 | if (running) |
3552 | p->sched_class->put_prev_task(rq, p); | 3597 | put_prev_task(rq, p); |
3553 | 3598 | ||
3554 | prev_class = p->sched_class; | 3599 | prev_class = p->sched_class; |
3555 | __setscheduler(rq, p, attr); | 3600 | __setscheduler(rq, p, attr); |
3556 | 3601 | ||
3557 | if (running) | 3602 | if (running) |
3558 | p->sched_class->set_curr_task(rq); | 3603 | p->sched_class->set_curr_task(rq); |
3559 | if (on_rq) { | 3604 | if (queued) { |
3560 | /* | 3605 | /* |
3561 | * We enqueue to tail when the priority of a task is | 3606 | * We enqueue to tail when the priority of a task is |
3562 | * increased (user space view). | 3607 | * increased (user space view). |
@@ -3980,14 +4025,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
3980 | rcu_read_lock(); | 4025 | rcu_read_lock(); |
3981 | if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { | 4026 | if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { |
3982 | rcu_read_unlock(); | 4027 | rcu_read_unlock(); |
3983 | goto out_unlock; | 4028 | goto out_free_new_mask; |
3984 | } | 4029 | } |
3985 | rcu_read_unlock(); | 4030 | rcu_read_unlock(); |
3986 | } | 4031 | } |
3987 | 4032 | ||
3988 | retval = security_task_setscheduler(p); | 4033 | retval = security_task_setscheduler(p); |
3989 | if (retval) | 4034 | if (retval) |
3990 | goto out_unlock; | 4035 | goto out_free_new_mask; |
3991 | 4036 | ||
3992 | 4037 | ||
3993 | cpuset_cpus_allowed(p, cpus_allowed); | 4038 | cpuset_cpus_allowed(p, cpus_allowed); |
@@ -4000,13 +4045,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
4000 | * root_domain. | 4045 | * root_domain. |
4001 | */ | 4046 | */ |
4002 | #ifdef CONFIG_SMP | 4047 | #ifdef CONFIG_SMP |
4003 | if (task_has_dl_policy(p)) { | 4048 | if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { |
4004 | const struct cpumask *span = task_rq(p)->rd->span; | 4049 | rcu_read_lock(); |
4005 | 4050 | if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { | |
4006 | if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { | ||
4007 | retval = -EBUSY; | 4051 | retval = -EBUSY; |
4008 | goto out_unlock; | 4052 | rcu_read_unlock(); |
4053 | goto out_free_new_mask; | ||
4009 | } | 4054 | } |
4055 | rcu_read_unlock(); | ||
4010 | } | 4056 | } |
4011 | #endif | 4057 | #endif |
4012 | again: | 4058 | again: |
@@ -4024,7 +4070,7 @@ again: | |||
4024 | goto again; | 4070 | goto again; |
4025 | } | 4071 | } |
4026 | } | 4072 | } |
4027 | out_unlock: | 4073 | out_free_new_mask: |
4028 | free_cpumask_var(new_mask); | 4074 | free_cpumask_var(new_mask); |
4029 | out_free_cpus_allowed: | 4075 | out_free_cpus_allowed: |
4030 | free_cpumask_var(cpus_allowed); | 4076 | free_cpumask_var(cpus_allowed); |
@@ -4508,7 +4554,7 @@ void show_state_filter(unsigned long state_filter) | |||
4508 | " task PC stack pid father\n"); | 4554 | " task PC stack pid father\n"); |
4509 | #endif | 4555 | #endif |
4510 | rcu_read_lock(); | 4556 | rcu_read_lock(); |
4511 | do_each_thread(g, p) { | 4557 | for_each_process_thread(g, p) { |
4512 | /* | 4558 | /* |
4513 | * reset the NMI-timeout, listing all files on a slow | 4559 | * reset the NMI-timeout, listing all files on a slow |
4514 | * console might take a lot of time: | 4560 | * console might take a lot of time: |
@@ -4516,7 +4562,7 @@ void show_state_filter(unsigned long state_filter) | |||
4516 | touch_nmi_watchdog(); | 4562 | touch_nmi_watchdog(); |
4517 | if (!state_filter || (p->state & state_filter)) | 4563 | if (!state_filter || (p->state & state_filter)) |
4518 | sched_show_task(p); | 4564 | sched_show_task(p); |
4519 | } while_each_thread(g, p); | 4565 | } |
4520 | 4566 | ||
4521 | touch_all_softlockup_watchdogs(); | 4567 | touch_all_softlockup_watchdogs(); |
4522 | 4568 | ||
@@ -4571,7 +4617,7 @@ void init_idle(struct task_struct *idle, int cpu) | |||
4571 | rcu_read_unlock(); | 4617 | rcu_read_unlock(); |
4572 | 4618 | ||
4573 | rq->curr = rq->idle = idle; | 4619 | rq->curr = rq->idle = idle; |
4574 | idle->on_rq = 1; | 4620 | idle->on_rq = TASK_ON_RQ_QUEUED; |
4575 | #if defined(CONFIG_SMP) | 4621 | #if defined(CONFIG_SMP) |
4576 | idle->on_cpu = 1; | 4622 | idle->on_cpu = 1; |
4577 | #endif | 4623 | #endif |
@@ -4592,6 +4638,33 @@ void init_idle(struct task_struct *idle, int cpu) | |||
4592 | } | 4638 | } |
4593 | 4639 | ||
4594 | #ifdef CONFIG_SMP | 4640 | #ifdef CONFIG_SMP |
4641 | /* | ||
4642 | * move_queued_task - move a queued task to new rq. | ||
4643 | * | ||
4644 | * Returns (locked) new rq. Old rq's lock is released. | ||
4645 | */ | ||
4646 | static struct rq *move_queued_task(struct task_struct *p, int new_cpu) | ||
4647 | { | ||
4648 | struct rq *rq = task_rq(p); | ||
4649 | |||
4650 | lockdep_assert_held(&rq->lock); | ||
4651 | |||
4652 | dequeue_task(rq, p, 0); | ||
4653 | p->on_rq = TASK_ON_RQ_MIGRATING; | ||
4654 | set_task_cpu(p, new_cpu); | ||
4655 | raw_spin_unlock(&rq->lock); | ||
4656 | |||
4657 | rq = cpu_rq(new_cpu); | ||
4658 | |||
4659 | raw_spin_lock(&rq->lock); | ||
4660 | BUG_ON(task_cpu(p) != new_cpu); | ||
4661 | p->on_rq = TASK_ON_RQ_QUEUED; | ||
4662 | enqueue_task(rq, p, 0); | ||
4663 | check_preempt_curr(rq, p, 0); | ||
4664 | |||
4665 | return rq; | ||
4666 | } | ||
4667 | |||
4595 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | 4668 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) |
4596 | { | 4669 | { |
4597 | if (p->sched_class && p->sched_class->set_cpus_allowed) | 4670 | if (p->sched_class && p->sched_class->set_cpus_allowed) |
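
move_queued_task() hands back a different, already locked runqueue from the one the caller locked, which is why the callers below (set_cpus_allowed_ptr() and the rewritten __migrate_task()) reassign their local rq to its return value before unlocking. A condensed caller sketch (hypothetical helper that would sit next to move_queued_task() in kernel/sched/core.c; real callers additionally hold p->pi_lock, as __migrate_task() shows):

  static void migrate_if_allowed(struct task_struct *p, int dest_cpu)
  {
          struct rq *rq = task_rq(p);

          raw_spin_lock(&rq->lock);
          if (task_on_rq_queued(p) &&
              cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
                  rq = move_queued_task(p, dest_cpu); /* old rq now unlocked */
          raw_spin_unlock(&rq->lock);                 /* unlock whichever rq we hold */
  }
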
@@ -4648,14 +4721,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
4648 | goto out; | 4721 | goto out; |
4649 | 4722 | ||
4650 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); | 4723 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); |
4651 | if (p->on_rq) { | 4724 | if (task_running(rq, p) || p->state == TASK_WAKING) { |
4652 | struct migration_arg arg = { p, dest_cpu }; | 4725 | struct migration_arg arg = { p, dest_cpu }; |
4653 | /* Need help from migration thread: drop lock and wait. */ | 4726 | /* Need help from migration thread: drop lock and wait. */ |
4654 | task_rq_unlock(rq, p, &flags); | 4727 | task_rq_unlock(rq, p, &flags); |
4655 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 4728 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); |
4656 | tlb_migrate_finish(p->mm); | 4729 | tlb_migrate_finish(p->mm); |
4657 | return 0; | 4730 | return 0; |
4658 | } | 4731 | } else if (task_on_rq_queued(p)) |
4732 | rq = move_queued_task(p, dest_cpu); | ||
4659 | out: | 4733 | out: |
4660 | task_rq_unlock(rq, p, &flags); | 4734 | task_rq_unlock(rq, p, &flags); |
4661 | 4735 | ||
@@ -4676,20 +4750,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); | |||
4676 | */ | 4750 | */ |
4677 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 4751 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
4678 | { | 4752 | { |
4679 | struct rq *rq_dest, *rq_src; | 4753 | struct rq *rq; |
4680 | int ret = 0; | 4754 | int ret = 0; |
4681 | 4755 | ||
4682 | if (unlikely(!cpu_active(dest_cpu))) | 4756 | if (unlikely(!cpu_active(dest_cpu))) |
4683 | return ret; | 4757 | return ret; |
4684 | 4758 | ||
4685 | rq_src = cpu_rq(src_cpu); | 4759 | rq = cpu_rq(src_cpu); |
4686 | rq_dest = cpu_rq(dest_cpu); | ||
4687 | 4760 | ||
4688 | raw_spin_lock(&p->pi_lock); | 4761 | raw_spin_lock(&p->pi_lock); |
4689 | double_rq_lock(rq_src, rq_dest); | 4762 | raw_spin_lock(&rq->lock); |
4690 | /* Already moved. */ | 4763 | /* Already moved. */ |
4691 | if (task_cpu(p) != src_cpu) | 4764 | if (task_cpu(p) != src_cpu) |
4692 | goto done; | 4765 | goto done; |
4766 | |||
4693 | /* Affinity changed (again). */ | 4767 | /* Affinity changed (again). */ |
4694 | if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) | 4768 | if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) |
4695 | goto fail; | 4769 | goto fail; |
@@ -4698,16 +4772,12 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
4698 | * If we're not on a rq, the next wake-up will ensure we're | 4772 | * If we're not on a rq, the next wake-up will ensure we're |
4699 | * placed properly. | 4773 | * placed properly. |
4700 | */ | 4774 | */ |
4701 | if (p->on_rq) { | 4775 | if (task_on_rq_queued(p)) |
4702 | dequeue_task(rq_src, p, 0); | 4776 | rq = move_queued_task(p, dest_cpu); |
4703 | set_task_cpu(p, dest_cpu); | ||
4704 | enqueue_task(rq_dest, p, 0); | ||
4705 | check_preempt_curr(rq_dest, p, 0); | ||
4706 | } | ||
4707 | done: | 4777 | done: |
4708 | ret = 1; | 4778 | ret = 1; |
4709 | fail: | 4779 | fail: |
4710 | double_rq_unlock(rq_src, rq_dest); | 4780 | raw_spin_unlock(&rq->lock); |
4711 | raw_spin_unlock(&p->pi_lock); | 4781 | raw_spin_unlock(&p->pi_lock); |
4712 | return ret; | 4782 | return ret; |
4713 | } | 4783 | } |
@@ -4739,22 +4809,22 @@ void sched_setnuma(struct task_struct *p, int nid) | |||
4739 | { | 4809 | { |
4740 | struct rq *rq; | 4810 | struct rq *rq; |
4741 | unsigned long flags; | 4811 | unsigned long flags; |
4742 | bool on_rq, running; | 4812 | bool queued, running; |
4743 | 4813 | ||
4744 | rq = task_rq_lock(p, &flags); | 4814 | rq = task_rq_lock(p, &flags); |
4745 | on_rq = p->on_rq; | 4815 | queued = task_on_rq_queued(p); |
4746 | running = task_current(rq, p); | 4816 | running = task_current(rq, p); |
4747 | 4817 | ||
4748 | if (on_rq) | 4818 | if (queued) |
4749 | dequeue_task(rq, p, 0); | 4819 | dequeue_task(rq, p, 0); |
4750 | if (running) | 4820 | if (running) |
4751 | p->sched_class->put_prev_task(rq, p); | 4821 | put_prev_task(rq, p); |
4752 | 4822 | ||
4753 | p->numa_preferred_nid = nid; | 4823 | p->numa_preferred_nid = nid; |
4754 | 4824 | ||
4755 | if (running) | 4825 | if (running) |
4756 | p->sched_class->set_curr_task(rq); | 4826 | p->sched_class->set_curr_task(rq); |
4757 | if (on_rq) | 4827 | if (queued) |
4758 | enqueue_task(rq, p, 0); | 4828 | enqueue_task(rq, p, 0); |
4759 | task_rq_unlock(rq, p, &flags); | 4829 | task_rq_unlock(rq, p, &flags); |
4760 | } | 4830 | } |
@@ -4774,6 +4844,12 @@ static int migration_cpu_stop(void *data) | |||
4774 | * be on another cpu but it doesn't matter. | 4844 | * be on another cpu but it doesn't matter. |
4775 | */ | 4845 | */ |
4776 | local_irq_disable(); | 4846 | local_irq_disable(); |
4847 | /* | ||
4848 | * We need to explicitly wake pending tasks before running | ||
4849 | * __migrate_task() such that we will not miss enforcing cpus_allowed | ||
4850 | * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. | ||
4851 | */ | ||
4852 | sched_ttwu_pending(); | ||
4777 | __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); | 4853 | __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); |
4778 | local_irq_enable(); | 4854 | local_irq_enable(); |
4779 | return 0; | 4855 | return 0; |
@@ -5184,6 +5260,7 @@ static int sched_cpu_inactive(struct notifier_block *nfb, | |||
5184 | { | 5260 | { |
5185 | unsigned long flags; | 5261 | unsigned long flags; |
5186 | long cpu = (long)hcpu; | 5262 | long cpu = (long)hcpu; |
5263 | struct dl_bw *dl_b; | ||
5187 | 5264 | ||
5188 | switch (action & ~CPU_TASKS_FROZEN) { | 5265 | switch (action & ~CPU_TASKS_FROZEN) { |
5189 | case CPU_DOWN_PREPARE: | 5266 | case CPU_DOWN_PREPARE: |
@@ -5191,15 +5268,19 @@ static int sched_cpu_inactive(struct notifier_block *nfb, | |||
5191 | 5268 | ||
5192 | /* explicitly allow suspend */ | 5269 | /* explicitly allow suspend */ |
5193 | if (!(action & CPU_TASKS_FROZEN)) { | 5270 | if (!(action & CPU_TASKS_FROZEN)) { |
5194 | struct dl_bw *dl_b = dl_bw_of(cpu); | ||
5195 | bool overflow; | 5271 | bool overflow; |
5196 | int cpus; | 5272 | int cpus; |
5197 | 5273 | ||
5274 | rcu_read_lock_sched(); | ||
5275 | dl_b = dl_bw_of(cpu); | ||
5276 | |||
5198 | raw_spin_lock_irqsave(&dl_b->lock, flags); | 5277 | raw_spin_lock_irqsave(&dl_b->lock, flags); |
5199 | cpus = dl_bw_cpus(cpu); | 5278 | cpus = dl_bw_cpus(cpu); |
5200 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | 5279 | overflow = __dl_overflow(dl_b, cpus, 0, 0); |
5201 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 5280 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
5202 | 5281 | ||
5282 | rcu_read_unlock_sched(); | ||
5283 | |||
5203 | if (overflow) | 5284 | if (overflow) |
5204 | return notifier_from_errno(-EBUSY); | 5285 | return notifier_from_errno(-EBUSY); |
5205 | } | 5286 | } |
@@ -5742,7 +5823,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
5742 | const struct cpumask *span = sched_domain_span(sd); | 5823 | const struct cpumask *span = sched_domain_span(sd); |
5743 | struct cpumask *covered = sched_domains_tmpmask; | 5824 | struct cpumask *covered = sched_domains_tmpmask; |
5744 | struct sd_data *sdd = sd->private; | 5825 | struct sd_data *sdd = sd->private; |
5745 | struct sched_domain *child; | 5826 | struct sched_domain *sibling; |
5746 | int i; | 5827 | int i; |
5747 | 5828 | ||
5748 | cpumask_clear(covered); | 5829 | cpumask_clear(covered); |
@@ -5753,10 +5834,10 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
5753 | if (cpumask_test_cpu(i, covered)) | 5834 | if (cpumask_test_cpu(i, covered)) |
5754 | continue; | 5835 | continue; |
5755 | 5836 | ||
5756 | child = *per_cpu_ptr(sdd->sd, i); | 5837 | sibling = *per_cpu_ptr(sdd->sd, i); |
5757 | 5838 | ||
5758 | /* See the comment near build_group_mask(). */ | 5839 | /* See the comment near build_group_mask(). */ |
5759 | if (!cpumask_test_cpu(i, sched_domain_span(child))) | 5840 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) |
5760 | continue; | 5841 | continue; |
5761 | 5842 | ||
5762 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 5843 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
@@ -5766,10 +5847,9 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
5766 | goto fail; | 5847 | goto fail; |
5767 | 5848 | ||
5768 | sg_span = sched_group_cpus(sg); | 5849 | sg_span = sched_group_cpus(sg); |
5769 | if (child->child) { | 5850 | if (sibling->child) |
5770 | child = child->child; | 5851 | cpumask_copy(sg_span, sched_domain_span(sibling->child)); |
5771 | cpumask_copy(sg_span, sched_domain_span(child)); | 5852 | else |
5772 | } else | ||
5773 | cpumask_set_cpu(i, sg_span); | 5853 | cpumask_set_cpu(i, sg_span); |
5774 | 5854 | ||
5775 | cpumask_or(covered, covered, sg_span); | 5855 | cpumask_or(covered, covered, sg_span); |
@@ -7120,13 +7200,13 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
7120 | .sched_policy = SCHED_NORMAL, | 7200 | .sched_policy = SCHED_NORMAL, |
7121 | }; | 7201 | }; |
7122 | int old_prio = p->prio; | 7202 | int old_prio = p->prio; |
7123 | int on_rq; | 7203 | int queued; |
7124 | 7204 | ||
7125 | on_rq = p->on_rq; | 7205 | queued = task_on_rq_queued(p); |
7126 | if (on_rq) | 7206 | if (queued) |
7127 | dequeue_task(rq, p, 0); | 7207 | dequeue_task(rq, p, 0); |
7128 | __setscheduler(rq, p, &attr); | 7208 | __setscheduler(rq, p, &attr); |
7129 | if (on_rq) { | 7209 | if (queued) { |
7130 | enqueue_task(rq, p, 0); | 7210 | enqueue_task(rq, p, 0); |
7131 | resched_curr(rq); | 7211 | resched_curr(rq); |
7132 | } | 7212 | } |
@@ -7140,12 +7220,12 @@ void normalize_rt_tasks(void) | |||
7140 | unsigned long flags; | 7220 | unsigned long flags; |
7141 | struct rq *rq; | 7221 | struct rq *rq; |
7142 | 7222 | ||
7143 | read_lock_irqsave(&tasklist_lock, flags); | 7223 | read_lock(&tasklist_lock); |
7144 | do_each_thread(g, p) { | 7224 | for_each_process_thread(g, p) { |
7145 | /* | 7225 | /* |
7146 | * Only normalize user tasks: | 7226 | * Only normalize user tasks: |
7147 | */ | 7227 | */ |
7148 | if (!p->mm) | 7228 | if (p->flags & PF_KTHREAD) |
7149 | continue; | 7229 | continue; |
7150 | 7230 | ||
7151 | p->se.exec_start = 0; | 7231 | p->se.exec_start = 0; |
@@ -7160,21 +7240,16 @@ void normalize_rt_tasks(void) | |||
7160 | * Renice negative nice level userspace | 7240 | * Renice negative nice level userspace |
7161 | * tasks back to 0: | 7241 | * tasks back to 0: |
7162 | */ | 7242 | */ |
7163 | if (task_nice(p) < 0 && p->mm) | 7243 | if (task_nice(p) < 0) |
7164 | set_user_nice(p, 0); | 7244 | set_user_nice(p, 0); |
7165 | continue; | 7245 | continue; |
7166 | } | 7246 | } |
7167 | 7247 | ||
7168 | raw_spin_lock(&p->pi_lock); | 7248 | rq = task_rq_lock(p, &flags); |
7169 | rq = __task_rq_lock(p); | ||
7170 | |||
7171 | normalize_task(rq, p); | 7249 | normalize_task(rq, p); |
7172 | 7250 | task_rq_unlock(rq, p, &flags); | |
7173 | __task_rq_unlock(rq); | 7251 | } |
7174 | raw_spin_unlock(&p->pi_lock); | 7252 | read_unlock(&tasklist_lock); |
7175 | } while_each_thread(g, p); | ||
7176 | |||
7177 | read_unlock_irqrestore(&tasklist_lock, flags); | ||
7178 | } | 7253 | } |
7179 | 7254 | ||
7180 | #endif /* CONFIG_MAGIC_SYSRQ */ | 7255 | #endif /* CONFIG_MAGIC_SYSRQ */ |
@@ -7314,19 +7389,19 @@ void sched_offline_group(struct task_group *tg) | |||
7314 | void sched_move_task(struct task_struct *tsk) | 7389 | void sched_move_task(struct task_struct *tsk) |
7315 | { | 7390 | { |
7316 | struct task_group *tg; | 7391 | struct task_group *tg; |
7317 | int on_rq, running; | 7392 | int queued, running; |
7318 | unsigned long flags; | 7393 | unsigned long flags; |
7319 | struct rq *rq; | 7394 | struct rq *rq; |
7320 | 7395 | ||
7321 | rq = task_rq_lock(tsk, &flags); | 7396 | rq = task_rq_lock(tsk, &flags); |
7322 | 7397 | ||
7323 | running = task_current(rq, tsk); | 7398 | running = task_current(rq, tsk); |
7324 | on_rq = tsk->on_rq; | 7399 | queued = task_on_rq_queued(tsk); |
7325 | 7400 | ||
7326 | if (on_rq) | 7401 | if (queued) |
7327 | dequeue_task(rq, tsk, 0); | 7402 | dequeue_task(rq, tsk, 0); |
7328 | if (unlikely(running)) | 7403 | if (unlikely(running)) |
7329 | tsk->sched_class->put_prev_task(rq, tsk); | 7404 | put_prev_task(rq, tsk); |
7330 | 7405 | ||
7331 | tg = container_of(task_css_check(tsk, cpu_cgrp_id, | 7406 | tg = container_of(task_css_check(tsk, cpu_cgrp_id, |
7332 | lockdep_is_held(&tsk->sighand->siglock)), | 7407 | lockdep_is_held(&tsk->sighand->siglock)), |
@@ -7336,14 +7411,14 @@ void sched_move_task(struct task_struct *tsk) | |||
7336 | 7411 | ||
7337 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7412 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7338 | if (tsk->sched_class->task_move_group) | 7413 | if (tsk->sched_class->task_move_group) |
7339 | tsk->sched_class->task_move_group(tsk, on_rq); | 7414 | tsk->sched_class->task_move_group(tsk, queued); |
7340 | else | 7415 | else |
7341 | #endif | 7416 | #endif |
7342 | set_task_rq(tsk, task_cpu(tsk)); | 7417 | set_task_rq(tsk, task_cpu(tsk)); |
7343 | 7418 | ||
7344 | if (unlikely(running)) | 7419 | if (unlikely(running)) |
7345 | tsk->sched_class->set_curr_task(rq); | 7420 | tsk->sched_class->set_curr_task(rq); |
7346 | if (on_rq) | 7421 | if (queued) |
7347 | enqueue_task(rq, tsk, 0); | 7422 | enqueue_task(rq, tsk, 0); |
7348 | 7423 | ||
7349 | task_rq_unlock(rq, tsk, &flags); | 7424 | task_rq_unlock(rq, tsk, &flags); |
@@ -7361,10 +7436,10 @@ static inline int tg_has_rt_tasks(struct task_group *tg) | |||
7361 | { | 7436 | { |
7362 | struct task_struct *g, *p; | 7437 | struct task_struct *g, *p; |
7363 | 7438 | ||
7364 | do_each_thread(g, p) { | 7439 | for_each_process_thread(g, p) { |
7365 | if (rt_task(p) && task_rq(p)->rt.tg == tg) | 7440 | if (rt_task(p) && task_group(p) == tg) |
7366 | return 1; | 7441 | return 1; |
7367 | } while_each_thread(g, p); | 7442 | } |
7368 | 7443 | ||
7369 | return 0; | 7444 | return 0; |
7370 | } | 7445 | } |
@@ -7573,6 +7648,7 @@ static int sched_dl_global_constraints(void) | |||
7573 | u64 runtime = global_rt_runtime(); | 7648 | u64 runtime = global_rt_runtime(); |
7574 | u64 period = global_rt_period(); | 7649 | u64 period = global_rt_period(); |
7575 | u64 new_bw = to_ratio(period, runtime); | 7650 | u64 new_bw = to_ratio(period, runtime); |
7651 | struct dl_bw *dl_b; | ||
7576 | int cpu, ret = 0; | 7652 | int cpu, ret = 0; |
7577 | unsigned long flags; | 7653 | unsigned long flags; |
7578 | 7654 | ||
@@ -7586,13 +7662,16 @@ static int sched_dl_global_constraints(void) | |||
7586 | * solutions is welcome! | 7662 | * solutions is welcome! |
7587 | */ | 7663 | */ |
7588 | for_each_possible_cpu(cpu) { | 7664 | for_each_possible_cpu(cpu) { |
7589 | struct dl_bw *dl_b = dl_bw_of(cpu); | 7665 | rcu_read_lock_sched(); |
7666 | dl_b = dl_bw_of(cpu); | ||
7590 | 7667 | ||
7591 | raw_spin_lock_irqsave(&dl_b->lock, flags); | 7668 | raw_spin_lock_irqsave(&dl_b->lock, flags); |
7592 | if (new_bw < dl_b->total_bw) | 7669 | if (new_bw < dl_b->total_bw) |
7593 | ret = -EBUSY; | 7670 | ret = -EBUSY; |
7594 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 7671 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
7595 | 7672 | ||
7673 | rcu_read_unlock_sched(); | ||
7674 | |||
7596 | if (ret) | 7675 | if (ret) |
7597 | break; | 7676 | break; |
7598 | } | 7677 | } |
@@ -7603,6 +7682,7 @@ static int sched_dl_global_constraints(void) | |||
7603 | static void sched_dl_do_global(void) | 7682 | static void sched_dl_do_global(void) |
7604 | { | 7683 | { |
7605 | u64 new_bw = -1; | 7684 | u64 new_bw = -1; |
7685 | struct dl_bw *dl_b; | ||
7606 | int cpu; | 7686 | int cpu; |
7607 | unsigned long flags; | 7687 | unsigned long flags; |
7608 | 7688 | ||
@@ -7616,11 +7696,14 @@ static void sched_dl_do_global(void) | |||
7616 | * FIXME: As above... | 7696 | * FIXME: As above... |
7617 | */ | 7697 | */ |
7618 | for_each_possible_cpu(cpu) { | 7698 | for_each_possible_cpu(cpu) { |
7619 | struct dl_bw *dl_b = dl_bw_of(cpu); | 7699 | rcu_read_lock_sched(); |
7700 | dl_b = dl_bw_of(cpu); | ||
7620 | 7701 | ||
7621 | raw_spin_lock_irqsave(&dl_b->lock, flags); | 7702 | raw_spin_lock_irqsave(&dl_b->lock, flags); |
7622 | dl_b->bw = new_bw; | 7703 | dl_b->bw = new_bw; |
7623 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 7704 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
7705 | |||
7706 | rcu_read_unlock_sched(); | ||
7624 | } | 7707 | } |
7625 | } | 7708 | } |
7626 | 7709 | ||
@@ -8001,7 +8084,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | |||
8001 | struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; | 8084 | struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; |
8002 | 8085 | ||
8003 | quota = normalize_cfs_quota(tg, d); | 8086 | quota = normalize_cfs_quota(tg, d); |
8004 | parent_quota = parent_b->hierarchal_quota; | 8087 | parent_quota = parent_b->hierarchical_quota; |
8005 | 8088 | ||
8006 | /* | 8089 | /* |
8007 | * ensure max(child_quota) <= parent_quota, inherit when no | 8090 | * ensure max(child_quota) <= parent_quota, inherit when no |
@@ -8012,7 +8095,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | |||
8012 | else if (parent_quota != RUNTIME_INF && quota > parent_quota) | 8095 | else if (parent_quota != RUNTIME_INF && quota > parent_quota) |
8013 | return -EINVAL; | 8096 | return -EINVAL; |
8014 | } | 8097 | } |
8015 | cfs_b->hierarchal_quota = quota; | 8098 | cfs_b->hierarchical_quota = quota; |
8016 | 8099 | ||
8017 | return 0; | 8100 | return 0; |
8018 | } | 8101 | } |
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index bd95963dae80..539ca3ce071b 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c | |||
@@ -107,9 +107,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
107 | int best_cpu = -1; | 107 | int best_cpu = -1; |
108 | const struct sched_dl_entity *dl_se = &p->dl; | 108 | const struct sched_dl_entity *dl_se = &p->dl; |
109 | 109 | ||
110 | if (later_mask && cpumask_and(later_mask, cp->free_cpus, | 110 | if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) { |
111 | &p->cpus_allowed) && cpumask_and(later_mask, | ||
112 | later_mask, cpu_active_mask)) { | ||
113 | best_cpu = cpumask_any(later_mask); | 111 | best_cpu = cpumask_any(later_mask); |
114 | goto out; | 112 | goto out; |
115 | } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && | 113 | } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 72fdf06ef865..8394b1ee600c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -288,24 +288,29 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
288 | struct signal_struct *sig = tsk->signal; | 288 | struct signal_struct *sig = tsk->signal; |
289 | cputime_t utime, stime; | 289 | cputime_t utime, stime; |
290 | struct task_struct *t; | 290 | struct task_struct *t; |
291 | 291 | unsigned int seq, nextseq; | |
292 | times->utime = sig->utime; | 292 | unsigned long flags; |
293 | times->stime = sig->stime; | ||
294 | times->sum_exec_runtime = sig->sum_sched_runtime; | ||
295 | 293 | ||
296 | rcu_read_lock(); | 294 | rcu_read_lock(); |
297 | /* make sure we can trust tsk->thread_group list */ | 295 | /* Attempt a lockless read on the first round. */ |
298 | if (!likely(pid_alive(tsk))) | 296 | nextseq = 0; |
299 | goto out; | ||
300 | |||
301 | t = tsk; | ||
302 | do { | 297 | do { |
303 | task_cputime(t, &utime, &stime); | 298 | seq = nextseq; |
304 | times->utime += utime; | 299 | flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); |
305 | times->stime += stime; | 300 | times->utime = sig->utime; |
306 | times->sum_exec_runtime += task_sched_runtime(t); | 301 | times->stime = sig->stime; |
307 | } while_each_thread(tsk, t); | 302 | times->sum_exec_runtime = sig->sum_sched_runtime; |
308 | out: | 303 | |
304 | for_each_thread(tsk, t) { | ||
305 | task_cputime(t, &utime, &stime); | ||
306 | times->utime += utime; | ||
307 | times->stime += stime; | ||
308 | times->sum_exec_runtime += task_sched_runtime(t); | ||
309 | } | ||
310 | /* If lockless access failed, take the lock. */ | ||
311 | nextseq = 1; | ||
312 | } while (need_seqretry(&sig->stats_lock, seq)); | ||
313 | done_seqretry_irqrestore(&sig->stats_lock, seq, flags); | ||
309 | rcu_read_unlock(); | 314 | rcu_read_unlock(); |
310 | } | 315 | } |
311 | 316 | ||
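The rewritten thread_group_cputime() samples sig->utime/stime/sum_sched_runtime and walks the threads inside a read_seqbegin_or_lock_irqsave() loop: the first pass is lockless and is only repeated, this time holding sig->stats_lock, if a writer raced with it. A compressed userspace model of that two-pass pattern, with a C11 seqcount plus a mutex standing in for the kernel primitives and the per-thread walk folded into three counters:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Stand-ins for sig->utime / sig->stime / sig->sum_sched_runtime. */
static _Atomic unsigned long long utime, stime, runtime;

static atomic_uint seq;			/* even: stable, odd: update in flight */
static pthread_mutex_t stats_lock = PTHREAD_MUTEX_INITIALIZER;

/* Writer side: bump the sequence around the update, under the lock. */
static void account(unsigned long long u, unsigned long long s, unsigned long long r)
{
	pthread_mutex_lock(&stats_lock);
	atomic_fetch_add(&seq, 1);
	utime += u; stime += s; runtime += r;
	atomic_fetch_add(&seq, 1);
	pthread_mutex_unlock(&stats_lock);
}

/* Reader: lockless snapshot first; if it raced with a writer, retry under the lock. */
static void group_cputime(unsigned long long *u, unsigned long long *s,
			  unsigned long long *r)
{
	unsigned int begin = atomic_load(&seq);

	*u = utime; *s = stime; *r = runtime;
	if (!(begin & 1) && atomic_load(&seq) == begin)
		return;			/* consistent lockless read */

	pthread_mutex_lock(&stats_lock);	/* slow path: no writer can be inside */
	*u = utime; *s = stime; *r = runtime;
	pthread_mutex_unlock(&stats_lock);
}

int main(void)
{
	unsigned long long u, s, r;

	account(10, 5, 2);
	group_cputime(&u, &s, &r);
	printf("utime=%llu stime=%llu runtime=%llu\n", u, s, r);
	return 0;
}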
@@ -550,6 +555,23 @@ drop_precision: | |||
550 | } | 555 | } |
551 | 556 | ||
552 | /* | 557 | /* |
558 | * Atomically advance counter to the new value. Interrupts, vcpu | ||
559 | * scheduling, and scaling inaccuracies can cause cputime_advance | ||
560 | * to be occasionally called with a new value smaller than counter. | ||
561 | * Let's enforce atomicity. | ||
562 | * | ||
563 | * Normally a caller will only go through this loop once, or not | ||
564 | * at all in case a previous caller updated counter the same jiffy. | ||
565 | */ | ||
566 | static void cputime_advance(cputime_t *counter, cputime_t new) | ||
567 | { | ||
568 | cputime_t old; | ||
569 | |||
570 | while (new > (old = ACCESS_ONCE(*counter))) | ||
571 | cmpxchg_cputime(counter, old, new); | ||
572 | } | ||
573 | |||
574 | /* | ||
553 | * Adjust tick based cputime random precision against scheduler | 575 | * Adjust tick based cputime random precision against scheduler |
554 | * runtime accounting. | 576 | * runtime accounting. |
555 | */ | 577 | */ |
@@ -594,13 +616,8 @@ static void cputime_adjust(struct task_cputime *curr, | |||
594 | utime = rtime - stime; | 616 | utime = rtime - stime; |
595 | } | 617 | } |
596 | 618 | ||
597 | /* | 619 | cputime_advance(&prev->stime, stime); |
598 | * If the tick based count grows faster than the scheduler one, | 620 | cputime_advance(&prev->utime, utime); |
599 | * the result of the scaling may go backward. | ||
600 | * Let's enforce monotonicity. | ||
601 | */ | ||
602 | prev->stime = max(prev->stime, stime); | ||
603 | prev->utime = max(prev->utime, utime); | ||
604 | 621 | ||
605 | out: | 622 | out: |
606 | *ut = prev->utime; | 623 | *ut = prev->utime; |
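cputime_advance() replaces the two max() updates with a compare-and-swap loop, so concurrent callers can only ever move prev->stime/prev->utime forward. A standalone C11 version of the same loop, with cputime_t reduced to a plain integer for the sketch:

#include <stdatomic.h>
#include <stdio.h>

typedef unsigned long long cputime_t;

/*
 * Monotonically advance *counter to "new": retry the compare-and-swap until
 * either we install "new" or somebody else already stored something >= new,
 * so a racing, smaller update can never drag the counter backwards.
 */
static void cputime_advance(_Atomic cputime_t *counter, cputime_t new)
{
	cputime_t old = atomic_load(counter);

	while (new > old &&
	       !atomic_compare_exchange_weak(counter, &old, new))
		;	/* a failed CAS reloads "old"; re-check the condition */
}

int main(void)
{
	_Atomic cputime_t prev_stime = 100;

	cputime_advance(&prev_stime, 150);	/* advances                     */
	cputime_advance(&prev_stime, 120);	/* ignored: would go backwards  */
	printf("%llu\n", (unsigned long long)atomic_load(&prev_stime));	/* 150 */
	return 0;
}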
@@ -617,9 +634,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
617 | cputime_adjust(&cputime, &p->prev_cputime, ut, st); | 634 | cputime_adjust(&cputime, &p->prev_cputime, ut, st); |
618 | } | 635 | } |
619 | 636 | ||
620 | /* | ||
621 | * Must be called with siglock held. | ||
622 | */ | ||
623 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | 637 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) |
624 | { | 638 | { |
625 | struct task_cputime cputime; | 639 | struct task_cputime cputime; |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 255ce138b652..abfaf3d9a29f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -530,7 +530,7 @@ again: | |||
530 | update_rq_clock(rq); | 530 | update_rq_clock(rq); |
531 | dl_se->dl_throttled = 0; | 531 | dl_se->dl_throttled = 0; |
532 | dl_se->dl_yielded = 0; | 532 | dl_se->dl_yielded = 0; |
533 | if (p->on_rq) { | 533 | if (task_on_rq_queued(p)) { |
534 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | 534 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); |
535 | if (task_has_dl_policy(rq->curr)) | 535 | if (task_has_dl_policy(rq->curr)) |
536 | check_preempt_curr_dl(rq, p, 0); | 536 | check_preempt_curr_dl(rq, p, 0); |
@@ -997,10 +997,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | |||
997 | #ifdef CONFIG_SCHED_HRTICK | 997 | #ifdef CONFIG_SCHED_HRTICK |
998 | static void start_hrtick_dl(struct rq *rq, struct task_struct *p) | 998 | static void start_hrtick_dl(struct rq *rq, struct task_struct *p) |
999 | { | 999 | { |
1000 | s64 delta = p->dl.dl_runtime - p->dl.runtime; | 1000 | hrtick_start(rq, p->dl.runtime); |
1001 | |||
1002 | if (delta > 10000) | ||
1003 | hrtick_start(rq, p->dl.runtime); | ||
1004 | } | 1001 | } |
1005 | #endif | 1002 | #endif |
1006 | 1003 | ||
@@ -1030,7 +1027,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) | |||
1030 | * means a stop task can slip in, in which case we need to | 1027 | * means a stop task can slip in, in which case we need to |
1031 | * re-start task selection. | 1028 | * re-start task selection. |
1032 | */ | 1029 | */ |
1033 | if (rq->stop && rq->stop->on_rq) | 1030 | if (rq->stop && task_on_rq_queued(rq->stop)) |
1034 | return RETRY_TASK; | 1031 | return RETRY_TASK; |
1035 | } | 1032 | } |
1036 | 1033 | ||
@@ -1124,10 +1121,8 @@ static void set_curr_task_dl(struct rq *rq) | |||
1124 | static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) | 1121 | static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) |
1125 | { | 1122 | { |
1126 | if (!task_running(rq, p) && | 1123 | if (!task_running(rq, p) && |
1127 | (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && | 1124 | cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) |
1128 | (p->nr_cpus_allowed > 1)) | ||
1129 | return 1; | 1125 | return 1; |
1130 | |||
1131 | return 0; | 1126 | return 0; |
1132 | } | 1127 | } |
1133 | 1128 | ||
@@ -1169,6 +1164,13 @@ static int find_later_rq(struct task_struct *task) | |||
1169 | if (task->nr_cpus_allowed == 1) | 1164 | if (task->nr_cpus_allowed == 1) |
1170 | return -1; | 1165 | return -1; |
1171 | 1166 | ||
1167 | /* | ||
1168 | * We have to consider system topology and task affinity | ||
1169 | * first, then we can look for a suitable cpu. | ||
1170 | */ | ||
1171 | cpumask_copy(later_mask, task_rq(task)->rd->span); | ||
1172 | cpumask_and(later_mask, later_mask, cpu_active_mask); | ||
1173 | cpumask_and(later_mask, later_mask, &task->cpus_allowed); | ||
1172 | best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, | 1174 | best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, |
1173 | task, later_mask); | 1175 | task, later_mask); |
1174 | if (best_cpu == -1) | 1176 | if (best_cpu == -1) |
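find_later_rq() now seeds later_mask itself from the root-domain span, the active CPUs and the task's affinity before calling cpudl_find(), which is why the cpudeadline.c hunk above could drop its own cpus_allowed and cpu_active_mask intersections. A toy version of that mask pipeline, with 64-bit integers standing in for struct cpumask and made-up CPU sets:

#include <stdint.h>
#include <stdio.h>

/* Pick any set bit, or -1 if the mask is empty (stand-in for cpumask_any()). */
static int mask_any(uint64_t mask)
{
	return mask ? __builtin_ctzll(mask) : -1;
}

int main(void)
{
	uint64_t rd_span      = 0x0f;	/* CPUs 0-3 in this root domain    */
	uint64_t cpu_active   = 0x0e;	/* CPU 0 is going offline          */
	uint64_t cpus_allowed = 0x0a;	/* task may run on CPUs 1 and 3    */
	uint64_t free_cpus    = 0x08;	/* CPU 3 currently has no -dl task */

	/* Caller: topology and affinity first ... */
	uint64_t later_mask = rd_span & cpu_active & cpus_allowed;

	/* ... then cpudl_find() only has to intersect with its free-CPU set. */
	uint64_t candidates = later_mask & free_cpus;

	printf("best_cpu = %d\n", mask_any(candidates));	/* 3 */
	return 0;
}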
@@ -1257,7 +1259,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) | |||
1257 | if (unlikely(task_rq(task) != rq || | 1259 | if (unlikely(task_rq(task) != rq || |
1258 | !cpumask_test_cpu(later_rq->cpu, | 1260 | !cpumask_test_cpu(later_rq->cpu, |
1259 | &task->cpus_allowed) || | 1261 | &task->cpus_allowed) || |
1260 | task_running(rq, task) || !task->on_rq)) { | 1262 | task_running(rq, task) || |
1263 | !task_on_rq_queued(task))) { | ||
1261 | double_unlock_balance(rq, later_rq); | 1264 | double_unlock_balance(rq, later_rq); |
1262 | later_rq = NULL; | 1265 | later_rq = NULL; |
1263 | break; | 1266 | break; |
@@ -1296,7 +1299,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) | |||
1296 | BUG_ON(task_current(rq, p)); | 1299 | BUG_ON(task_current(rq, p)); |
1297 | BUG_ON(p->nr_cpus_allowed <= 1); | 1300 | BUG_ON(p->nr_cpus_allowed <= 1); |
1298 | 1301 | ||
1299 | BUG_ON(!p->on_rq); | 1302 | BUG_ON(!task_on_rq_queued(p)); |
1300 | BUG_ON(!dl_task(p)); | 1303 | BUG_ON(!dl_task(p)); |
1301 | 1304 | ||
1302 | return p; | 1305 | return p; |
@@ -1443,7 +1446,7 @@ static int pull_dl_task(struct rq *this_rq) | |||
1443 | dl_time_before(p->dl.deadline, | 1446 | dl_time_before(p->dl.deadline, |
1444 | this_rq->dl.earliest_dl.curr))) { | 1447 | this_rq->dl.earliest_dl.curr))) { |
1445 | WARN_ON(p == src_rq->curr); | 1448 | WARN_ON(p == src_rq->curr); |
1446 | WARN_ON(!p->on_rq); | 1449 | WARN_ON(!task_on_rq_queued(p)); |
1447 | 1450 | ||
1448 | /* | 1451 | /* |
1449 | * Then we pull iff p has actually an earlier | 1452 | * Then we pull iff p has actually an earlier |
@@ -1569,6 +1572,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) | |||
1569 | if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) | 1572 | if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) |
1570 | hrtimer_try_to_cancel(&p->dl.dl_timer); | 1573 | hrtimer_try_to_cancel(&p->dl.dl_timer); |
1571 | 1574 | ||
1575 | __dl_clear_params(p); | ||
1576 | |||
1572 | #ifdef CONFIG_SMP | 1577 | #ifdef CONFIG_SMP |
1573 | /* | 1578 | /* |
1574 | * Since this might be the only -deadline task on the rq, | 1579 | * Since this might be the only -deadline task on the rq, |
@@ -1596,7 +1601,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
1596 | if (unlikely(p->dl.dl_throttled)) | 1601 | if (unlikely(p->dl.dl_throttled)) |
1597 | return; | 1602 | return; |
1598 | 1603 | ||
1599 | if (p->on_rq && rq->curr != p) { | 1604 | if (task_on_rq_queued(p) && rq->curr != p) { |
1600 | #ifdef CONFIG_SMP | 1605 | #ifdef CONFIG_SMP |
1601 | if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) | 1606 | if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) |
1602 | /* Only reschedule if pushing failed */ | 1607 | /* Only reschedule if pushing failed */ |
@@ -1614,7 +1619,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
1614 | static void prio_changed_dl(struct rq *rq, struct task_struct *p, | 1619 | static void prio_changed_dl(struct rq *rq, struct task_struct *p, |
1615 | int oldprio) | 1620 | int oldprio) |
1616 | { | 1621 | { |
1617 | if (p->on_rq || rq->curr == p) { | 1622 | if (task_on_rq_queued(p) || rq->curr == p) { |
1618 | #ifdef CONFIG_SMP | 1623 | #ifdef CONFIG_SMP |
1619 | /* | 1624 | /* |
1620 | * This might be too much, but unfortunately | 1625 | * This might be too much, but unfortunately |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 627b3c34b821..ce33780d8f20 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -150,7 +150,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
150 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | 150 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) |
151 | { | 151 | { |
152 | struct task_struct *g, *p; | 152 | struct task_struct *g, *p; |
153 | unsigned long flags; | ||
154 | 153 | ||
155 | SEQ_printf(m, | 154 | SEQ_printf(m, |
156 | "\nrunnable tasks:\n" | 155 | "\nrunnable tasks:\n" |
@@ -159,16 +158,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
159 | "------------------------------------------------------" | 158 | "------------------------------------------------------" |
160 | "----------------------------------------------------\n"); | 159 | "----------------------------------------------------\n"); |
161 | 160 | ||
162 | read_lock_irqsave(&tasklist_lock, flags); | 161 | rcu_read_lock(); |
163 | 162 | for_each_process_thread(g, p) { | |
164 | do_each_thread(g, p) { | ||
165 | if (task_cpu(p) != rq_cpu) | 163 | if (task_cpu(p) != rq_cpu) |
166 | continue; | 164 | continue; |
167 | 165 | ||
168 | print_task(m, rq, p); | 166 | print_task(m, rq, p); |
169 | } while_each_thread(g, p); | 167 | } |
170 | 168 | rcu_read_unlock(); | |
171 | read_unlock_irqrestore(&tasklist_lock, flags); | ||
172 | } | 169 | } |
173 | 170 | ||
174 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | 171 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) |
@@ -333,9 +330,7 @@ do { \ | |||
333 | print_cfs_stats(m, cpu); | 330 | print_cfs_stats(m, cpu); |
334 | print_rt_stats(m, cpu); | 331 | print_rt_stats(m, cpu); |
335 | 332 | ||
336 | rcu_read_lock(); | ||
337 | print_rq(m, rq, cpu); | 333 | print_rq(m, rq, cpu); |
338 | rcu_read_unlock(); | ||
339 | spin_unlock_irqrestore(&sched_debug_lock, flags); | 334 | spin_unlock_irqrestore(&sched_debug_lock, flags); |
340 | SEQ_printf(m, "\n"); | 335 | SEQ_printf(m, "\n"); |
341 | } | 336 | } |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 82088b29704e..b78280c59b46 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/latencytop.h> | 23 | #include <linux/latencytop.h> |
24 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
25 | #include <linux/cpumask.h> | 25 | #include <linux/cpumask.h> |
26 | #include <linux/cpuidle.h> | ||
26 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
27 | #include <linux/profile.h> | 28 | #include <linux/profile.h> |
28 | #include <linux/interrupt.h> | 29 | #include <linux/interrupt.h> |
@@ -665,6 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
665 | } | 666 | } |
666 | 667 | ||
667 | #ifdef CONFIG_SMP | 668 | #ifdef CONFIG_SMP |
669 | static int select_idle_sibling(struct task_struct *p, int cpu); | ||
668 | static unsigned long task_h_load(struct task_struct *p); | 670 | static unsigned long task_h_load(struct task_struct *p); |
669 | 671 | ||
670 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 672 | static inline void __update_task_entity_contrib(struct sched_entity *se); |
@@ -1038,7 +1040,8 @@ struct numa_stats { | |||
1038 | */ | 1040 | */ |
1039 | static void update_numa_stats(struct numa_stats *ns, int nid) | 1041 | static void update_numa_stats(struct numa_stats *ns, int nid) |
1040 | { | 1042 | { |
1041 | int cpu, cpus = 0; | 1043 | int smt, cpu, cpus = 0; |
1044 | unsigned long capacity; | ||
1042 | 1045 | ||
1043 | memset(ns, 0, sizeof(*ns)); | 1046 | memset(ns, 0, sizeof(*ns)); |
1044 | for_each_cpu(cpu, cpumask_of_node(nid)) { | 1047 | for_each_cpu(cpu, cpumask_of_node(nid)) { |
@@ -1062,8 +1065,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid) | |||
1062 | if (!cpus) | 1065 | if (!cpus) |
1063 | return; | 1066 | return; |
1064 | 1067 | ||
1065 | ns->task_capacity = | 1068 | /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ |
1066 | DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); | 1069 | smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); |
1070 | capacity = cpus / smt; /* cores */ | ||
1071 | |||
1072 | ns->task_capacity = min_t(unsigned, capacity, | ||
1073 | DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); | ||
1067 | ns->has_free_capacity = (ns->nr_running < ns->task_capacity); | 1074 | ns->has_free_capacity = (ns->nr_running < ns->task_capacity); |
1068 | } | 1075 | } |
1069 | 1076 | ||
@@ -1206,7 +1213,7 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1206 | 1213 | ||
1207 | if (!cur) { | 1214 | if (!cur) { |
1208 | /* Is there capacity at our destination? */ | 1215 | /* Is there capacity at our destination? */ |
1209 | if (env->src_stats.has_free_capacity && | 1216 | if (env->src_stats.nr_running <= env->src_stats.task_capacity && |
1210 | !env->dst_stats.has_free_capacity) | 1217 | !env->dst_stats.has_free_capacity) |
1211 | goto unlock; | 1218 | goto unlock; |
1212 | 1219 | ||
@@ -1252,6 +1259,13 @@ balance: | |||
1252 | if (load_too_imbalanced(src_load, dst_load, env)) | 1259 | if (load_too_imbalanced(src_load, dst_load, env)) |
1253 | goto unlock; | 1260 | goto unlock; |
1254 | 1261 | ||
1262 | /* | ||
1263 | * One idle CPU per node is evaluated for a task numa move. | ||
1264 | * Call select_idle_sibling to maybe find a better one. | ||
1265 | */ | ||
1266 | if (!cur) | ||
1267 | env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); | ||
1268 | |||
1255 | assign: | 1269 | assign: |
1256 | task_numa_assign(env, cur, imp); | 1270 | task_numa_assign(env, cur, imp); |
1257 | unlock: | 1271 | unlock: |
@@ -1775,7 +1789,7 @@ void task_numa_free(struct task_struct *p) | |||
1775 | list_del(&p->numa_entry); | 1789 | list_del(&p->numa_entry); |
1776 | grp->nr_tasks--; | 1790 | grp->nr_tasks--; |
1777 | spin_unlock_irqrestore(&grp->lock, flags); | 1791 | spin_unlock_irqrestore(&grp->lock, flags); |
1778 | rcu_assign_pointer(p->numa_group, NULL); | 1792 | RCU_INIT_POINTER(p->numa_group, NULL); |
1779 | put_numa_group(grp); | 1793 | put_numa_group(grp); |
1780 | } | 1794 | } |
1781 | 1795 | ||
@@ -1804,10 +1818,6 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
1804 | if (!p->mm) | 1818 | if (!p->mm) |
1805 | return; | 1819 | return; |
1806 | 1820 | ||
1807 | /* Do not worry about placement if exiting */ | ||
1808 | if (p->state == TASK_DEAD) | ||
1809 | return; | ||
1810 | |||
1811 | /* Allocate buffer to track faults on a per-node basis */ | 1821 | /* Allocate buffer to track faults on a per-node basis */ |
1812 | if (unlikely(!p->numa_faults_memory)) { | 1822 | if (unlikely(!p->numa_faults_memory)) { |
1813 | int size = sizeof(*p->numa_faults_memory) * | 1823 | int size = sizeof(*p->numa_faults_memory) * |
@@ -2211,8 +2221,8 @@ static __always_inline u64 decay_load(u64 val, u64 n) | |||
2211 | 2221 | ||
2212 | /* | 2222 | /* |
2213 | * As y^PERIOD = 1/2, we can combine | 2223 | * As y^PERIOD = 1/2, we can combine |
2214 | * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) | 2224 | * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) |
2215 | * With a look-up table which covers k^n (n<PERIOD) | 2225 | * With a look-up table which covers y^n (n<PERIOD) |
2216 | * | 2226 | * |
2217 | * To achieve constant time decay_load. | 2227 | * To achieve constant time decay_load. |
2218 | */ | 2228 | */ |
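The corrected comment states the identity the load tracking relies on: choosing y so that y^PERIOD = 1/2 gives y^n = (1/2)^(n/PERIOD) * y^(n%PERIOD), so one shift plus a PERIOD-entry table yields constant-time decay. A floating-point check of that identity (the kernel uses a precomputed 32-bit fixed-point table rather than libm):

#include <math.h>
#include <stdio.h>

#define PERIOD 32

int main(void)
{
	double y = pow(0.5, 1.0 / PERIOD);	/* y^PERIOD == 1/2 */
	double ytab[PERIOD];
	int k, n = 100;

	for (k = 0; k < PERIOD; k++)		/* look-up table of y^k, k < PERIOD */
		ytab[k] = pow(y, k);

	/* y^n = 1/2^(n/PERIOD) * y^(n%PERIOD): one shift plus one table lookup */
	double fast = ldexp(ytab[n % PERIOD], -(n / PERIOD));
	double slow = pow(y, n);

	printf("fast=%.12f slow=%.12f\n", fast, slow);	/* the two values agree */
	return 0;
}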
@@ -2377,6 +2387,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, | |||
2377 | tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; | 2387 | tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; |
2378 | tg_contrib -= cfs_rq->tg_load_contrib; | 2388 | tg_contrib -= cfs_rq->tg_load_contrib; |
2379 | 2389 | ||
2390 | if (!tg_contrib) | ||
2391 | return; | ||
2392 | |||
2380 | if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { | 2393 | if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { |
2381 | atomic_long_add(tg_contrib, &tg->load_avg); | 2394 | atomic_long_add(tg_contrib, &tg->load_avg); |
2382 | cfs_rq->tg_load_contrib += tg_contrib; | 2395 | cfs_rq->tg_load_contrib += tg_contrib; |
@@ -3892,14 +3905,6 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
3892 | resched_curr(rq); | 3905 | resched_curr(rq); |
3893 | return; | 3906 | return; |
3894 | } | 3907 | } |
3895 | |||
3896 | /* | ||
3897 | * Don't schedule slices shorter than 10000ns, that just | ||
3898 | * doesn't make sense. Rely on vruntime for fairness. | ||
3899 | */ | ||
3900 | if (rq->curr != p) | ||
3901 | delta = max_t(s64, 10000LL, delta); | ||
3902 | |||
3903 | hrtick_start(rq, delta); | 3908 | hrtick_start(rq, delta); |
3904 | } | 3909 | } |
3905 | } | 3910 | } |
@@ -4087,7 +4092,7 @@ static unsigned long capacity_of(int cpu) | |||
4087 | static unsigned long cpu_avg_load_per_task(int cpu) | 4092 | static unsigned long cpu_avg_load_per_task(int cpu) |
4088 | { | 4093 | { |
4089 | struct rq *rq = cpu_rq(cpu); | 4094 | struct rq *rq = cpu_rq(cpu); |
4090 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | 4095 | unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); |
4091 | unsigned long load_avg = rq->cfs.runnable_load_avg; | 4096 | unsigned long load_avg = rq->cfs.runnable_load_avg; |
4092 | 4097 | ||
4093 | if (nr_running) | 4098 | if (nr_running) |
@@ -4276,8 +4281,8 @@ static int wake_wide(struct task_struct *p) | |||
4276 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 4281 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) |
4277 | { | 4282 | { |
4278 | s64 this_load, load; | 4283 | s64 this_load, load; |
4284 | s64 this_eff_load, prev_eff_load; | ||
4279 | int idx, this_cpu, prev_cpu; | 4285 | int idx, this_cpu, prev_cpu; |
4280 | unsigned long tl_per_task; | ||
4281 | struct task_group *tg; | 4286 | struct task_group *tg; |
4282 | unsigned long weight; | 4287 | unsigned long weight; |
4283 | int balanced; | 4288 | int balanced; |
@@ -4320,47 +4325,30 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
4320 | * Otherwise check if either cpus are near enough in load to allow this | 4325 | * Otherwise check if either cpus are near enough in load to allow this |
4321 | * task to be woken on this_cpu. | 4326 | * task to be woken on this_cpu. |
4322 | */ | 4327 | */ |
4323 | if (this_load > 0) { | 4328 | this_eff_load = 100; |
4324 | s64 this_eff_load, prev_eff_load; | 4329 | this_eff_load *= capacity_of(prev_cpu); |
4325 | 4330 | ||
4326 | this_eff_load = 100; | 4331 | prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; |
4327 | this_eff_load *= capacity_of(prev_cpu); | 4332 | prev_eff_load *= capacity_of(this_cpu); |
4333 | |||
4334 | if (this_load > 0) { | ||
4328 | this_eff_load *= this_load + | 4335 | this_eff_load *= this_load + |
4329 | effective_load(tg, this_cpu, weight, weight); | 4336 | effective_load(tg, this_cpu, weight, weight); |
4330 | 4337 | ||
4331 | prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; | ||
4332 | prev_eff_load *= capacity_of(this_cpu); | ||
4333 | prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); | 4338 | prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); |
4339 | } | ||
4334 | 4340 | ||
4335 | balanced = this_eff_load <= prev_eff_load; | 4341 | balanced = this_eff_load <= prev_eff_load; |
4336 | } else | ||
4337 | balanced = true; | ||
4338 | |||
4339 | /* | ||
4340 | * If the currently running task will sleep within | ||
4341 | * a reasonable amount of time then attract this newly | ||
4342 | * woken task: | ||
4343 | */ | ||
4344 | if (sync && balanced) | ||
4345 | return 1; | ||
4346 | 4342 | ||
4347 | schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); | 4343 | schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); |
4348 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
4349 | 4344 | ||
4350 | if (balanced || | 4345 | if (!balanced) |
4351 | (this_load <= load && | 4346 | return 0; |
4352 | this_load + target_load(prev_cpu, idx) <= tl_per_task)) { | ||
4353 | /* | ||
4354 | * This domain has SD_WAKE_AFFINE and | ||
4355 | * p is cache cold in this domain, and | ||
4356 | * there is no bad imbalance. | ||
4357 | */ | ||
4358 | schedstat_inc(sd, ttwu_move_affine); | ||
4359 | schedstat_inc(p, se.statistics.nr_wakeups_affine); | ||
4360 | 4347 | ||
4361 | return 1; | 4348 | schedstat_inc(sd, ttwu_move_affine); |
4362 | } | 4349 | schedstat_inc(p, se.statistics.nr_wakeups_affine); |
4363 | return 0; | 4350 | |
4351 | return 1; | ||
4364 | } | 4352 | } |
4365 | 4353 | ||
4366 | /* | 4354 | /* |
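wake_affine() now always forms the two effective-load figures, scaling each side's load by the other CPU's capacity and giving the previous CPU a cushion of half the domain's imbalance_pct, and accepts the wake-affine placement only when this_eff_load <= prev_eff_load. A stripped-down version of just that comparison; the effective_load() group-weight terms are omitted and the numbers are made up:

#include <stdio.h>

/* Returns 1 if waking on this_cpu is acceptable, 0 to stay on prev_cpu. */
static int wake_affine_balanced(long this_load, long prev_load,
				unsigned long this_capacity,
				unsigned long prev_capacity,
				unsigned int imbalance_pct)
{
	long long this_eff_load = 100;
	long long prev_eff_load = 100 + (imbalance_pct - 100) / 2;

	this_eff_load *= prev_capacity;		/* cross-scale by the other CPU */
	prev_eff_load *= this_capacity;

	if (this_load > 0) {
		this_eff_load *= this_load;	/* effective_load() terms omitted */
		prev_eff_load *= prev_load;
	}

	return this_eff_load <= prev_eff_load;
}

int main(void)
{
	/* imbalance_pct=125 gives the previous CPU a 12.5% cushion. */
	printf("%d\n", wake_affine_balanced(900, 1000, 1024, 1024, 125));	/* 1 */
	printf("%d\n", wake_affine_balanced(1200, 1000, 1024, 1024, 125));	/* 0 */
	return 0;
}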
@@ -4428,20 +4416,46 @@ static int | |||
4428 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | 4416 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) |
4429 | { | 4417 | { |
4430 | unsigned long load, min_load = ULONG_MAX; | 4418 | unsigned long load, min_load = ULONG_MAX; |
4431 | int idlest = -1; | 4419 | unsigned int min_exit_latency = UINT_MAX; |
4420 | u64 latest_idle_timestamp = 0; | ||
4421 | int least_loaded_cpu = this_cpu; | ||
4422 | int shallowest_idle_cpu = -1; | ||
4432 | int i; | 4423 | int i; |
4433 | 4424 | ||
4434 | /* Traverse only the allowed CPUs */ | 4425 | /* Traverse only the allowed CPUs */ |
4435 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { | 4426 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { |
4436 | load = weighted_cpuload(i); | 4427 | if (idle_cpu(i)) { |
4437 | 4428 | struct rq *rq = cpu_rq(i); | |
4438 | if (load < min_load || (load == min_load && i == this_cpu)) { | 4429 | struct cpuidle_state *idle = idle_get_state(rq); |
4439 | min_load = load; | 4430 | if (idle && idle->exit_latency < min_exit_latency) { |
4440 | idlest = i; | 4431 | /* |
4432 | * We give priority to a CPU whose idle state | ||
4433 | * has the smallest exit latency irrespective | ||
4434 | * of any idle timestamp. | ||
4435 | */ | ||
4436 | min_exit_latency = idle->exit_latency; | ||
4437 | latest_idle_timestamp = rq->idle_stamp; | ||
4438 | shallowest_idle_cpu = i; | ||
4439 | } else if ((!idle || idle->exit_latency == min_exit_latency) && | ||
4440 | rq->idle_stamp > latest_idle_timestamp) { | ||
4441 | /* | ||
4442 | * If equal or no active idle state, then | ||
4443 | * the most recently idled CPU might have | ||
4444 | * a warmer cache. | ||
4445 | */ | ||
4446 | latest_idle_timestamp = rq->idle_stamp; | ||
4447 | shallowest_idle_cpu = i; | ||
4448 | } | ||
4449 | } else { | ||
4450 | load = weighted_cpuload(i); | ||
4451 | if (load < min_load || (load == min_load && i == this_cpu)) { | ||
4452 | min_load = load; | ||
4453 | least_loaded_cpu = i; | ||
4454 | } | ||
4441 | } | 4455 | } |
4442 | } | 4456 | } |
4443 | 4457 | ||
4444 | return idlest; | 4458 | return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; |
4445 | } | 4459 | } |
4446 | 4460 | ||
4447 | /* | 4461 | /* |
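The new find_idlest_cpu() prefers an idle CPU in the shallowest idle state (smallest exit_latency), breaks ties toward the most recently idled CPU (likely warmer cache), and only falls back to the least-loaded CPU when nothing allowed is idle. A self-contained model of that selection order; the struct below merely mimics the rq/cpuidle fields that the loop consults:

#include <limits.h>
#include <stdio.h>

struct cpu_model {
	int			idle;		/* is the CPU idle right now?      */
	unsigned int		exit_latency;	/* of its current idle state       */
	unsigned long long	idle_stamp;	/* when it went idle               */
	unsigned long		load;		/* weighted load if it is busy     */
};

static int find_idlest_cpu(const struct cpu_model *cpu, int nr, int this_cpu)
{
	unsigned int min_exit_latency = UINT_MAX;
	unsigned long long latest_idle_timestamp = 0;
	unsigned long min_load = ULONG_MAX;
	int least_loaded_cpu = this_cpu;
	int shallowest_idle_cpu = -1;
	int i;

	for (i = 0; i < nr; i++) {
		if (cpu[i].idle) {
			if (cpu[i].exit_latency < min_exit_latency) {
				/* shallowest idle state wins outright */
				min_exit_latency = cpu[i].exit_latency;
				latest_idle_timestamp = cpu[i].idle_stamp;
				shallowest_idle_cpu = i;
			} else if (cpu[i].exit_latency == min_exit_latency &&
				   cpu[i].idle_stamp > latest_idle_timestamp) {
				/* equally shallow: prefer the warmer cache */
				latest_idle_timestamp = cpu[i].idle_stamp;
				shallowest_idle_cpu = i;
			}
		} else if (cpu[i].load < min_load ||
			   (cpu[i].load == min_load && i == this_cpu)) {
			min_load = cpu[i].load;
			least_loaded_cpu = i;
		}
	}

	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}

int main(void)
{
	struct cpu_model cpus[] = {
		{ .idle = 0, .load = 400 },
		{ .idle = 1, .exit_latency = 10,  .idle_stamp = 100 },
		{ .idle = 1, .exit_latency = 10,  .idle_stamp = 250 },
		{ .idle = 1, .exit_latency = 200, .idle_stamp = 900 },
	};

	printf("picked CPU %d\n", find_idlest_cpu(cpus, 4, 0));	/* 2 */
	return 0;
}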
@@ -4513,11 +4527,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
4513 | if (p->nr_cpus_allowed == 1) | 4527 | if (p->nr_cpus_allowed == 1) |
4514 | return prev_cpu; | 4528 | return prev_cpu; |
4515 | 4529 | ||
4516 | if (sd_flag & SD_BALANCE_WAKE) { | 4530 | if (sd_flag & SD_BALANCE_WAKE) |
4517 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) | 4531 | want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); |
4518 | want_affine = 1; | ||
4519 | new_cpu = prev_cpu; | ||
4520 | } | ||
4521 | 4532 | ||
4522 | rcu_read_lock(); | 4533 | rcu_read_lock(); |
4523 | for_each_domain(cpu, tmp) { | 4534 | for_each_domain(cpu, tmp) { |
@@ -4704,7 +4715,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
4704 | return; | 4715 | return; |
4705 | 4716 | ||
4706 | /* | 4717 | /* |
4707 | * This is possible from callers such as move_task(), in which we | 4718 | * This is possible from callers such as attach_tasks(), in which we |
4708 | * unconditionally check_prempt_curr() after an enqueue (which may have | 4719 | * unconditionally check_prempt_curr() after an enqueue (which may have |
4709 | * lead to a throttle). This both saves work and prevents false | 4720 | * lead to a throttle). This both saves work and prevents false |
4710 | * next-buddy nomination below. | 4721 | * next-buddy nomination below. |
@@ -5112,27 +5123,18 @@ struct lb_env { | |||
5112 | unsigned int loop_max; | 5123 | unsigned int loop_max; |
5113 | 5124 | ||
5114 | enum fbq_type fbq_type; | 5125 | enum fbq_type fbq_type; |
5126 | struct list_head tasks; | ||
5115 | }; | 5127 | }; |
5116 | 5128 | ||
5117 | /* | 5129 | /* |
5118 | * move_task - move a task from one runqueue to another runqueue. | ||
5119 | * Both runqueues must be locked. | ||
5120 | */ | ||
5121 | static void move_task(struct task_struct *p, struct lb_env *env) | ||
5122 | { | ||
5123 | deactivate_task(env->src_rq, p, 0); | ||
5124 | set_task_cpu(p, env->dst_cpu); | ||
5125 | activate_task(env->dst_rq, p, 0); | ||
5126 | check_preempt_curr(env->dst_rq, p, 0); | ||
5127 | } | ||
5128 | |||
5129 | /* | ||
5130 | * Is this task likely cache-hot: | 5130 | * Is this task likely cache-hot: |
5131 | */ | 5131 | */ |
5132 | static int task_hot(struct task_struct *p, struct lb_env *env) | 5132 | static int task_hot(struct task_struct *p, struct lb_env *env) |
5133 | { | 5133 | { |
5134 | s64 delta; | 5134 | s64 delta; |
5135 | 5135 | ||
5136 | lockdep_assert_held(&env->src_rq->lock); | ||
5137 | |||
5136 | if (p->sched_class != &fair_sched_class) | 5138 | if (p->sched_class != &fair_sched_class) |
5137 | return 0; | 5139 | return 0; |
5138 | 5140 | ||
@@ -5252,6 +5254,9 @@ static | |||
5252 | int can_migrate_task(struct task_struct *p, struct lb_env *env) | 5254 | int can_migrate_task(struct task_struct *p, struct lb_env *env) |
5253 | { | 5255 | { |
5254 | int tsk_cache_hot = 0; | 5256 | int tsk_cache_hot = 0; |
5257 | |||
5258 | lockdep_assert_held(&env->src_rq->lock); | ||
5259 | |||
5255 | /* | 5260 | /* |
5256 | * We do not migrate tasks that are: | 5261 | * We do not migrate tasks that are: |
5257 | * 1) throttled_lb_pair, or | 5262 | * 1) throttled_lb_pair, or |
@@ -5310,24 +5315,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
5310 | if (!tsk_cache_hot) | 5315 | if (!tsk_cache_hot) |
5311 | tsk_cache_hot = migrate_degrades_locality(p, env); | 5316 | tsk_cache_hot = migrate_degrades_locality(p, env); |
5312 | 5317 | ||
5313 | if (migrate_improves_locality(p, env)) { | 5318 | if (migrate_improves_locality(p, env) || !tsk_cache_hot || |
5314 | #ifdef CONFIG_SCHEDSTATS | 5319 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
5315 | if (tsk_cache_hot) { | 5320 | if (tsk_cache_hot) { |
5316 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | 5321 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); |
5317 | schedstat_inc(p, se.statistics.nr_forced_migrations); | 5322 | schedstat_inc(p, se.statistics.nr_forced_migrations); |
5318 | } | 5323 | } |
5319 | #endif | ||
5320 | return 1; | ||
5321 | } | ||
5322 | |||
5323 | if (!tsk_cache_hot || | ||
5324 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | ||
5325 | |||
5326 | if (tsk_cache_hot) { | ||
5327 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | ||
5328 | schedstat_inc(p, se.statistics.nr_forced_migrations); | ||
5329 | } | ||
5330 | |||
5331 | return 1; | 5324 | return 1; |
5332 | } | 5325 | } |
5333 | 5326 | ||
@@ -5336,47 +5329,63 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
5336 | } | 5329 | } |
5337 | 5330 | ||
5338 | /* | 5331 | /* |
5339 | * move_one_task tries to move exactly one task from busiest to this_rq, as | 5332 | * detach_task() -- detach the task for the migration specified in env |
5333 | */ | ||
5334 | static void detach_task(struct task_struct *p, struct lb_env *env) | ||
5335 | { | ||
5336 | lockdep_assert_held(&env->src_rq->lock); | ||
5337 | |||
5338 | deactivate_task(env->src_rq, p, 0); | ||
5339 | p->on_rq = TASK_ON_RQ_MIGRATING; | ||
5340 | set_task_cpu(p, env->dst_cpu); | ||
5341 | } | ||
5342 | |||
5343 | /* | ||
5344 | * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as | ||
5340 | * part of active balancing operations within "domain". | 5345 | * part of active balancing operations within "domain". |
5341 | * Returns 1 if successful and 0 otherwise. | ||
5342 | * | 5346 | * |
5343 | * Called with both runqueues locked. | 5347 | * Returns a task if successful and NULL otherwise. |
5344 | */ | 5348 | */ |
5345 | static int move_one_task(struct lb_env *env) | 5349 | static struct task_struct *detach_one_task(struct lb_env *env) |
5346 | { | 5350 | { |
5347 | struct task_struct *p, *n; | 5351 | struct task_struct *p, *n; |
5348 | 5352 | ||
5353 | lockdep_assert_held(&env->src_rq->lock); | ||
5354 | |||
5349 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { | 5355 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { |
5350 | if (!can_migrate_task(p, env)) | 5356 | if (!can_migrate_task(p, env)) |
5351 | continue; | 5357 | continue; |
5352 | 5358 | ||
5353 | move_task(p, env); | 5359 | detach_task(p, env); |
5360 | |||
5354 | /* | 5361 | /* |
5355 | * Right now, this is only the second place move_task() | 5362 | * Right now, this is only the second place where |
5356 | * is called, so we can safely collect move_task() | 5363 | * lb_gained[env->idle] is updated (other is detach_tasks) |
5357 | * stats here rather than inside move_task(). | 5364 | * so we can safely collect stats here rather than |
5365 | * inside detach_tasks(). | ||
5358 | */ | 5366 | */ |
5359 | schedstat_inc(env->sd, lb_gained[env->idle]); | 5367 | schedstat_inc(env->sd, lb_gained[env->idle]); |
5360 | return 1; | 5368 | return p; |
5361 | } | 5369 | } |
5362 | return 0; | 5370 | return NULL; |
5363 | } | 5371 | } |
5364 | 5372 | ||
5365 | static const unsigned int sched_nr_migrate_break = 32; | 5373 | static const unsigned int sched_nr_migrate_break = 32; |
5366 | 5374 | ||
5367 | /* | 5375 | /* |
5368 | * move_tasks tries to move up to imbalance weighted load from busiest to | 5376 | * detach_tasks() -- tries to detach up to imbalance weighted load from |
5369 | * this_rq, as part of a balancing operation within domain "sd". | 5377 | * busiest_rq, as part of a balancing operation within domain "sd". |
5370 | * Returns 1 if successful and 0 otherwise. | ||
5371 | * | 5378 | * |
5372 | * Called with both runqueues locked. | 5379 | * Returns number of detached tasks if successful and 0 otherwise. |
5373 | */ | 5380 | */ |
5374 | static int move_tasks(struct lb_env *env) | 5381 | static int detach_tasks(struct lb_env *env) |
5375 | { | 5382 | { |
5376 | struct list_head *tasks = &env->src_rq->cfs_tasks; | 5383 | struct list_head *tasks = &env->src_rq->cfs_tasks; |
5377 | struct task_struct *p; | 5384 | struct task_struct *p; |
5378 | unsigned long load; | 5385 | unsigned long load; |
5379 | int pulled = 0; | 5386 | int detached = 0; |
5387 | |||
5388 | lockdep_assert_held(&env->src_rq->lock); | ||
5380 | 5389 | ||
5381 | if (env->imbalance <= 0) | 5390 | if (env->imbalance <= 0) |
5382 | return 0; | 5391 | return 0; |
@@ -5407,14 +5416,16 @@ static int move_tasks(struct lb_env *env) | |||
5407 | if ((load / 2) > env->imbalance) | 5416 | if ((load / 2) > env->imbalance) |
5408 | goto next; | 5417 | goto next; |
5409 | 5418 | ||
5410 | move_task(p, env); | 5419 | detach_task(p, env); |
5411 | pulled++; | 5420 | list_add(&p->se.group_node, &env->tasks); |
5421 | |||
5422 | detached++; | ||
5412 | env->imbalance -= load; | 5423 | env->imbalance -= load; |
5413 | 5424 | ||
5414 | #ifdef CONFIG_PREEMPT | 5425 | #ifdef CONFIG_PREEMPT |
5415 | /* | 5426 | /* |
5416 | * NEWIDLE balancing is a source of latency, so preemptible | 5427 | * NEWIDLE balancing is a source of latency, so preemptible |
5417 | * kernels will stop after the first task is pulled to minimize | 5428 | * kernels will stop after the first task is detached to minimize |
5418 | * the critical section. | 5429 | * the critical section. |
5419 | */ | 5430 | */ |
5420 | if (env->idle == CPU_NEWLY_IDLE) | 5431 | if (env->idle == CPU_NEWLY_IDLE) |
@@ -5434,13 +5445,58 @@ next: | |||
5434 | } | 5445 | } |
5435 | 5446 | ||
5436 | /* | 5447 | /* |
5437 | * Right now, this is one of only two places move_task() is called, | 5448 | * Right now, this is one of only two places we collect this stat |
5438 | * so we can safely collect move_task() stats here rather than | 5449 | * so we can safely collect detach_one_task() stats here rather |
5439 | * inside move_task(). | 5450 | * than inside detach_one_task(). |
5440 | */ | 5451 | */ |
5441 | schedstat_add(env->sd, lb_gained[env->idle], pulled); | 5452 | schedstat_add(env->sd, lb_gained[env->idle], detached); |
5453 | |||
5454 | return detached; | ||
5455 | } | ||
5456 | |||
5457 | /* | ||
5458 | * attach_task() -- attach the task detached by detach_task() to its new rq. | ||
5459 | */ | ||
5460 | static void attach_task(struct rq *rq, struct task_struct *p) | ||
5461 | { | ||
5462 | lockdep_assert_held(&rq->lock); | ||
5463 | |||
5464 | BUG_ON(task_rq(p) != rq); | ||
5465 | p->on_rq = TASK_ON_RQ_QUEUED; | ||
5466 | activate_task(rq, p, 0); | ||
5467 | check_preempt_curr(rq, p, 0); | ||
5468 | } | ||
5469 | |||
5470 | /* | ||
5471 | * attach_one_task() -- attaches the task returned from detach_one_task() to | ||
5472 | * its new rq. | ||
5473 | */ | ||
5474 | static void attach_one_task(struct rq *rq, struct task_struct *p) | ||
5475 | { | ||
5476 | raw_spin_lock(&rq->lock); | ||
5477 | attach_task(rq, p); | ||
5478 | raw_spin_unlock(&rq->lock); | ||
5479 | } | ||
5480 | |||
5481 | /* | ||
5482 | * attach_tasks() -- attaches all tasks detached by detach_tasks() to their | ||
5483 | * new rq. | ||
5484 | */ | ||
5485 | static void attach_tasks(struct lb_env *env) | ||
5486 | { | ||
5487 | struct list_head *tasks = &env->tasks; | ||
5488 | struct task_struct *p; | ||
5489 | |||
5490 | raw_spin_lock(&env->dst_rq->lock); | ||
5491 | |||
5492 | while (!list_empty(tasks)) { | ||
5493 | p = list_first_entry(tasks, struct task_struct, se.group_node); | ||
5494 | list_del_init(&p->se.group_node); | ||
5442 | 5495 | ||
5443 | return pulled; | 5496 | attach_task(env->dst_rq, p); |
5497 | } | ||
5498 | |||
5499 | raw_spin_unlock(&env->dst_rq->lock); | ||
5444 | } | 5500 | } |
5445 | 5501 | ||
5446 | #ifdef CONFIG_FAIR_GROUP_SCHED | 5502 | #ifdef CONFIG_FAIR_GROUP_SCHED |
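move_task(), which needed both runqueue locks at once, is split into detach_task()/detach_tasks() under the source lock and attach_task()/attach_tasks() under the destination lock, with the detached tasks parked on env->tasks in between (and flagged TASK_ON_RQ_MIGRATING so nobody else touches them while no lock is held). A toy userspace model of the two-phase move, with pthread mutexes standing in for rq->lock and the migrating-flag handling left out:

#include <pthread.h>
#include <stdio.h>

struct task { int id; struct task *next; };

struct runq {
	pthread_mutex_t lock;
	struct task *head;
};

/* detach_tasks(): pop up to "nr" tasks while holding only the source lock. */
static struct task *detach_tasks(struct runq *src, int nr)
{
	struct task *detached = NULL;

	pthread_mutex_lock(&src->lock);
	while (nr-- && src->head) {
		struct task *p = src->head;

		src->head = p->next;		/* deactivate from src  */
		p->next = detached;		/* park on env->tasks   */
		detached = p;
	}
	pthread_mutex_unlock(&src->lock);

	return detached;
}

/* attach_tasks(): enqueue the parked tasks while holding only the destination lock. */
static void attach_tasks(struct runq *dst, struct task *detached)
{
	pthread_mutex_lock(&dst->lock);
	while (detached) {
		struct task *p = detached;

		detached = p->next;
		p->next = dst->head;		/* activate on dst */
		dst->head = p;
	}
	pthread_mutex_unlock(&dst->lock);
}

int main(void)
{
	struct task t1 = { 1, NULL }, t2 = { 2, &t1 };
	struct runq busiest = { PTHREAD_MUTEX_INITIALIZER, &t2 };
	struct runq this_rq = { PTHREAD_MUTEX_INITIALIZER, NULL };

	attach_tasks(&this_rq, detach_tasks(&busiest, 1));

	printf("migrated task %d\n", this_rq.head->id);	/* 2 */
	return 0;
}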
@@ -5559,6 +5615,13 @@ static unsigned long task_h_load(struct task_struct *p) | |||
5559 | #endif | 5615 | #endif |
5560 | 5616 | ||
5561 | /********** Helpers for find_busiest_group ************************/ | 5617 | /********** Helpers for find_busiest_group ************************/ |
5618 | |||
5619 | enum group_type { | ||
5620 | group_other = 0, | ||
5621 | group_imbalanced, | ||
5622 | group_overloaded, | ||
5623 | }; | ||
5624 | |||
5562 | /* | 5625 | /* |
5563 | * sg_lb_stats - stats of a sched_group required for load_balancing | 5626 | * sg_lb_stats - stats of a sched_group required for load_balancing |
5564 | */ | 5627 | */ |
@@ -5572,7 +5635,7 @@ struct sg_lb_stats { | |||
5572 | unsigned int group_capacity_factor; | 5635 | unsigned int group_capacity_factor; |
5573 | unsigned int idle_cpus; | 5636 | unsigned int idle_cpus; |
5574 | unsigned int group_weight; | 5637 | unsigned int group_weight; |
5575 | int group_imb; /* Is there an imbalance in the group ? */ | 5638 | enum group_type group_type; |
5576 | int group_has_free_capacity; | 5639 | int group_has_free_capacity; |
5577 | #ifdef CONFIG_NUMA_BALANCING | 5640 | #ifdef CONFIG_NUMA_BALANCING |
5578 | unsigned int nr_numa_running; | 5641 | unsigned int nr_numa_running; |
@@ -5610,6 +5673,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) | |||
5610 | .total_capacity = 0UL, | 5673 | .total_capacity = 0UL, |
5611 | .busiest_stat = { | 5674 | .busiest_stat = { |
5612 | .avg_load = 0UL, | 5675 | .avg_load = 0UL, |
5676 | .sum_nr_running = 0, | ||
5677 | .group_type = group_other, | ||
5613 | }, | 5678 | }, |
5614 | }; | 5679 | }; |
5615 | } | 5680 | } |
@@ -5652,19 +5717,17 @@ unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | |||
5652 | return default_scale_capacity(sd, cpu); | 5717 | return default_scale_capacity(sd, cpu); |
5653 | } | 5718 | } |
5654 | 5719 | ||
5655 | static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) | 5720 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) |
5656 | { | 5721 | { |
5657 | unsigned long weight = sd->span_weight; | 5722 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) |
5658 | unsigned long smt_gain = sd->smt_gain; | 5723 | return sd->smt_gain / sd->span_weight; |
5659 | 5724 | ||
5660 | smt_gain /= weight; | 5725 | return SCHED_CAPACITY_SCALE; |
5661 | |||
5662 | return smt_gain; | ||
5663 | } | 5726 | } |
5664 | 5727 | ||
5665 | unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) | 5728 | unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) |
5666 | { | 5729 | { |
5667 | return default_scale_smt_capacity(sd, cpu); | 5730 | return default_scale_cpu_capacity(sd, cpu); |
5668 | } | 5731 | } |
5669 | 5732 | ||
5670 | static unsigned long scale_rt_capacity(int cpu) | 5733 | static unsigned long scale_rt_capacity(int cpu) |
@@ -5703,18 +5766,15 @@ static unsigned long scale_rt_capacity(int cpu) | |||
5703 | 5766 | ||
5704 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) | 5767 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) |
5705 | { | 5768 | { |
5706 | unsigned long weight = sd->span_weight; | ||
5707 | unsigned long capacity = SCHED_CAPACITY_SCALE; | 5769 | unsigned long capacity = SCHED_CAPACITY_SCALE; |
5708 | struct sched_group *sdg = sd->groups; | 5770 | struct sched_group *sdg = sd->groups; |
5709 | 5771 | ||
5710 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { | 5772 | if (sched_feat(ARCH_CAPACITY)) |
5711 | if (sched_feat(ARCH_CAPACITY)) | 5773 | capacity *= arch_scale_cpu_capacity(sd, cpu); |
5712 | capacity *= arch_scale_smt_capacity(sd, cpu); | 5774 | else |
5713 | else | 5775 | capacity *= default_scale_cpu_capacity(sd, cpu); |
5714 | capacity *= default_scale_smt_capacity(sd, cpu); | ||
5715 | 5776 | ||
5716 | capacity >>= SCHED_CAPACITY_SHIFT; | 5777 | capacity >>= SCHED_CAPACITY_SHIFT; |
5717 | } | ||
5718 | 5778 | ||
5719 | sdg->sgc->capacity_orig = capacity; | 5779 | sdg->sgc->capacity_orig = capacity; |
5720 | 5780 | ||
@@ -5891,6 +5951,18 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro | |||
5891 | return capacity_factor; | 5951 | return capacity_factor; |
5892 | } | 5952 | } |
5893 | 5953 | ||
5954 | static enum group_type | ||
5955 | group_classify(struct sched_group *group, struct sg_lb_stats *sgs) | ||
5956 | { | ||
5957 | if (sgs->sum_nr_running > sgs->group_capacity_factor) | ||
5958 | return group_overloaded; | ||
5959 | |||
5960 | if (sg_imbalanced(group)) | ||
5961 | return group_imbalanced; | ||
5962 | |||
5963 | return group_other; | ||
5964 | } | ||
5965 | |||
5894 | /** | 5966 | /** |
5895 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 5967 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
5896 | * @env: The load balancing environment. | 5968 | * @env: The load balancing environment. |
@@ -5920,7 +5992,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
5920 | load = source_load(i, load_idx); | 5992 | load = source_load(i, load_idx); |
5921 | 5993 | ||
5922 | sgs->group_load += load; | 5994 | sgs->group_load += load; |
5923 | sgs->sum_nr_running += rq->nr_running; | 5995 | sgs->sum_nr_running += rq->cfs.h_nr_running; |
5924 | 5996 | ||
5925 | if (rq->nr_running > 1) | 5997 | if (rq->nr_running > 1) |
5926 | *overload = true; | 5998 | *overload = true; |
@@ -5942,9 +6014,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
5942 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 6014 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
5943 | 6015 | ||
5944 | sgs->group_weight = group->group_weight; | 6016 | sgs->group_weight = group->group_weight; |
5945 | |||
5946 | sgs->group_imb = sg_imbalanced(group); | ||
5947 | sgs->group_capacity_factor = sg_capacity_factor(env, group); | 6017 | sgs->group_capacity_factor = sg_capacity_factor(env, group); |
6018 | sgs->group_type = group_classify(group, sgs); | ||
5948 | 6019 | ||
5949 | if (sgs->group_capacity_factor > sgs->sum_nr_running) | 6020 | if (sgs->group_capacity_factor > sgs->sum_nr_running) |
5950 | sgs->group_has_free_capacity = 1; | 6021 | sgs->group_has_free_capacity = 1; |
@@ -5968,13 +6039,19 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
5968 | struct sched_group *sg, | 6039 | struct sched_group *sg, |
5969 | struct sg_lb_stats *sgs) | 6040 | struct sg_lb_stats *sgs) |
5970 | { | 6041 | { |
5971 | if (sgs->avg_load <= sds->busiest_stat.avg_load) | 6042 | struct sg_lb_stats *busiest = &sds->busiest_stat; |
5972 | return false; | ||
5973 | 6043 | ||
5974 | if (sgs->sum_nr_running > sgs->group_capacity_factor) | 6044 | if (sgs->group_type > busiest->group_type) |
5975 | return true; | 6045 | return true; |
5976 | 6046 | ||
5977 | if (sgs->group_imb) | 6047 | if (sgs->group_type < busiest->group_type) |
6048 | return false; | ||
6049 | |||
6050 | if (sgs->avg_load <= busiest->avg_load) | ||
6051 | return false; | ||
6052 | |||
6053 | /* This is the busiest node in its class. */ | ||
6054 | if (!(env->sd->flags & SD_ASYM_PACKING)) | ||
5978 | return true; | 6055 | return true; |
5979 | 6056 | ||
5980 | /* | 6057 | /* |
@@ -5982,8 +6059,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
5982 | * numbered CPUs in the group, therefore mark all groups | 6059 | * numbered CPUs in the group, therefore mark all groups |
5983 | * higher than ourself as busy. | 6060 | * higher than ourself as busy. |
5984 | */ | 6061 | */ |
5985 | if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && | 6062 | if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) { |
5986 | env->dst_cpu < group_first_cpu(sg)) { | ||
5987 | if (!sds->busiest) | 6063 | if (!sds->busiest) |
5988 | return true; | 6064 | return true; |
5989 | 6065 | ||
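With the enum ordered that way, update_sd_pick_busiest() can compare group_type first and only consult avg_load between groups of the same class. A tiny comparator in the same spirit; the SD_ASYM_PACKING special case is left out:

#include <stdio.h>

enum group_type { group_other = 0, group_imbalanced, group_overloaded };

struct sg_pick { enum group_type type; unsigned long avg_load; };

/* Returns 1 if "sg" should replace the current busiest candidate. */
static int pick_busiest(const struct sg_pick *sg, const struct sg_pick *busiest)
{
	if (sg->type > busiest->type)		/* a worse class always wins */
		return 1;
	if (sg->type < busiest->type)
		return 0;
	return sg->avg_load > busiest->avg_load;	/* same class: heavier load */
}

int main(void)
{
	struct sg_pick imb  = { group_imbalanced, 300 };
	struct sg_pick over = { group_overloaded, 200 };

	/* An overloaded group beats a merely imbalanced one despite lower load. */
	printf("%d\n", pick_busiest(&over, &imb));	/* 1 */
	return 0;
}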
@@ -6228,7 +6304,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
6228 | local = &sds->local_stat; | 6304 | local = &sds->local_stat; |
6229 | busiest = &sds->busiest_stat; | 6305 | busiest = &sds->busiest_stat; |
6230 | 6306 | ||
6231 | if (busiest->group_imb) { | 6307 | if (busiest->group_type == group_imbalanced) { |
6232 | /* | 6308 | /* |
6233 | * In the group_imb case we cannot rely on group-wide averages | 6309 | * In the group_imb case we cannot rely on group-wide averages |
6234 | * to ensure cpu-load equilibrium, look at wider averages. XXX | 6310 | * to ensure cpu-load equilibrium, look at wider averages. XXX |
@@ -6248,12 +6324,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
6248 | return fix_small_imbalance(env, sds); | 6324 | return fix_small_imbalance(env, sds); |
6249 | } | 6325 | } |
6250 | 6326 | ||
6251 | if (!busiest->group_imb) { | 6327 | /* |
6252 | /* | 6328 | * If there aren't any idle cpus, avoid creating some. |
6253 | * Don't want to pull so many tasks that a group would go idle. | 6329 | */ |
6254 | * Except of course for the group_imb case, since then we might | 6330 | if (busiest->group_type == group_overloaded && |
6255 | * have to drop below capacity to reach cpu-load equilibrium. | 6331 | local->group_type == group_overloaded) { |
6256 | */ | ||
6257 | load_above_capacity = | 6332 | load_above_capacity = |
6258 | (busiest->sum_nr_running - busiest->group_capacity_factor); | 6333 | (busiest->sum_nr_running - busiest->group_capacity_factor); |
6259 | 6334 | ||
@@ -6337,7 +6412,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
6337 | * work because they assume all things are equal, which typically | 6412 | * work because they assume all things are equal, which typically |
6338 | * isn't true due to cpus_allowed constraints and the like. | 6413 | * isn't true due to cpus_allowed constraints and the like. |
6339 | */ | 6414 | */ |
6340 | if (busiest->group_imb) | 6415 | if (busiest->group_type == group_imbalanced) |
6341 | goto force_balance; | 6416 | goto force_balance; |
6342 | 6417 | ||
6343 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 6418 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
@@ -6346,7 +6421,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
6346 | goto force_balance; | 6421 | goto force_balance; |
6347 | 6422 | ||
6348 | /* | 6423 | /* |
6349 | * If the local group is more busy than the selected busiest group | 6424 | * If the local group is busier than the selected busiest group |
6350 | * don't try and pull any tasks. | 6425 | * don't try and pull any tasks. |
6351 | */ | 6426 | */ |
6352 | if (local->avg_load >= busiest->avg_load) | 6427 | if (local->avg_load >= busiest->avg_load) |
@@ -6361,13 +6436,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
6361 | 6436 | ||
6362 | if (env->idle == CPU_IDLE) { | 6437 | if (env->idle == CPU_IDLE) { |
6363 | /* | 6438 | /* |
6364 | * This cpu is idle. If the busiest group load doesn't | 6439 | * This cpu is idle. If the busiest group is not overloaded |
6365 | * have more tasks than the number of available cpu's and | 6440 | * and there is no imbalance between this and busiest group |
6366 | * there is no imbalance between this and busiest group | 6441 | * wrt idle cpus, it is balanced. The imbalance becomes |
6367 | * wrt to idle cpu's, it is balanced. | 6442 | * significant if the diff is greater than 1 otherwise we |
6443 | * might end up to just move the imbalance on another group | ||
6368 | */ | 6444 | */ |
6369 | if ((local->idle_cpus < busiest->idle_cpus) && | 6445 | if ((busiest->group_type != group_overloaded) && |
6370 | busiest->sum_nr_running <= busiest->group_weight) | 6446 | (local->idle_cpus <= (busiest->idle_cpus + 1))) |
6371 | goto out_balanced; | 6447 | goto out_balanced; |
6372 | } else { | 6448 | } else { |
6373 | /* | 6449 | /* |
@@ -6550,6 +6626,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
6550 | .loop_break = sched_nr_migrate_break, | 6626 | .loop_break = sched_nr_migrate_break, |
6551 | .cpus = cpus, | 6627 | .cpus = cpus, |
6552 | .fbq_type = all, | 6628 | .fbq_type = all, |
6629 | .tasks = LIST_HEAD_INIT(env.tasks), | ||
6553 | }; | 6630 | }; |
6554 | 6631 | ||
6555 | /* | 6632 | /* |
@@ -6599,23 +6676,30 @@ redo: | |||
6599 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 6676 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
6600 | 6677 | ||
6601 | more_balance: | 6678 | more_balance: |
6602 | local_irq_save(flags); | 6679 | raw_spin_lock_irqsave(&busiest->lock, flags); |
6603 | double_rq_lock(env.dst_rq, busiest); | ||
6604 | 6680 | ||
6605 | /* | 6681 | /* |
6606 | * cur_ld_moved - load moved in current iteration | 6682 | * cur_ld_moved - load moved in current iteration |
6607 | * ld_moved - cumulative load moved across iterations | 6683 | * ld_moved - cumulative load moved across iterations |
6608 | */ | 6684 | */ |
6609 | cur_ld_moved = move_tasks(&env); | 6685 | cur_ld_moved = detach_tasks(&env); |
6610 | ld_moved += cur_ld_moved; | ||
6611 | double_rq_unlock(env.dst_rq, busiest); | ||
6612 | local_irq_restore(flags); | ||
6613 | 6686 | ||
6614 | /* | 6687 | /* |
6615 | * some other cpu did the load balance for us. | 6688 | * We've detached some tasks from busiest_rq. Every |
6689 | * task is masked "TASK_ON_RQ_MIGRATING", so we can safely | ||
6690 | * unlock busiest->lock, and we are able to be sure | ||
6691 | * that nobody can manipulate the tasks in parallel. | ||
6692 | * See task_rq_lock() family for the details. | ||
6616 | */ | 6693 | */ |
6617 | if (cur_ld_moved && env.dst_cpu != smp_processor_id()) | 6694 | |
6618 | resched_cpu(env.dst_cpu); | 6695 | raw_spin_unlock(&busiest->lock); |
6696 | |||
6697 | if (cur_ld_moved) { | ||
6698 | attach_tasks(&env); | ||
6699 | ld_moved += cur_ld_moved; | ||
6700 | } | ||
6701 | |||
6702 | local_irq_restore(flags); | ||
6619 | 6703 | ||
6620 | if (env.flags & LBF_NEED_BREAK) { | 6704 | if (env.flags & LBF_NEED_BREAK) { |
6621 | env.flags &= ~LBF_NEED_BREAK; | 6705 | env.flags &= ~LBF_NEED_BREAK; |
@@ -6665,10 +6749,8 @@ more_balance: | |||
6665 | if (sd_parent) { | 6749 | if (sd_parent) { |
6666 | int *group_imbalance = &sd_parent->groups->sgc->imbalance; | 6750 | int *group_imbalance = &sd_parent->groups->sgc->imbalance; |
6667 | 6751 | ||
6668 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { | 6752 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) |
6669 | *group_imbalance = 1; | 6753 | *group_imbalance = 1; |
6670 | } else if (*group_imbalance) | ||
6671 | *group_imbalance = 0; | ||
6672 | } | 6754 | } |
6673 | 6755 | ||
6674 | /* All tasks on this runqueue were pinned by CPU affinity */ | 6756 | /* All tasks on this runqueue were pinned by CPU affinity */ |
@@ -6679,7 +6761,7 @@ more_balance: | |||
6679 | env.loop_break = sched_nr_migrate_break; | 6761 | env.loop_break = sched_nr_migrate_break; |
6680 | goto redo; | 6762 | goto redo; |
6681 | } | 6763 | } |
6682 | goto out_balanced; | 6764 | goto out_all_pinned; |
6683 | } | 6765 | } |
6684 | } | 6766 | } |
6685 | 6767 | ||
@@ -6744,7 +6826,7 @@ more_balance: | |||
6744 | * If we've begun active balancing, start to back off. This | 6826 | * If we've begun active balancing, start to back off. This |
6745 | * case may not be covered by the all_pinned logic if there | 6827 | * case may not be covered by the all_pinned logic if there |
6746 | * is only 1 task on the busy runqueue (because we don't call | 6828 | * is only 1 task on the busy runqueue (because we don't call |
6747 | * move_tasks). | 6829 | * detach_tasks). |
6748 | */ | 6830 | */ |
6749 | if (sd->balance_interval < sd->max_interval) | 6831 | if (sd->balance_interval < sd->max_interval) |
6750 | sd->balance_interval *= 2; | 6832 | sd->balance_interval *= 2; |
@@ -6753,6 +6835,23 @@ more_balance: | |||
6753 | goto out; | 6835 | goto out; |
6754 | 6836 | ||
6755 | out_balanced: | 6837 | out_balanced: |
6838 | /* | ||
6839 | * We reach balance although we may have faced some affinity | ||
6840 | * constraints. Clear the imbalance flag if it was set. | ||
6841 | */ | ||
6842 | if (sd_parent) { | ||
6843 | int *group_imbalance = &sd_parent->groups->sgc->imbalance; | ||
6844 | |||
6845 | if (*group_imbalance) | ||
6846 | *group_imbalance = 0; | ||
6847 | } | ||
6848 | |||
6849 | out_all_pinned: | ||
6850 | /* | ||
6851 | * We reach balance because all tasks are pinned at this level so | ||
6852 | * we can't migrate them. Let the imbalance flag set so parent level | ||
6853 | * can try to migrate them. | ||
6854 | */ | ||
6756 | schedstat_inc(sd, lb_balanced[idle]); | 6855 | schedstat_inc(sd, lb_balanced[idle]); |
6757 | 6856 | ||
6758 | sd->nr_balance_failed = 0; | 6857 | sd->nr_balance_failed = 0; |
@@ -6914,6 +7013,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
6914 | int target_cpu = busiest_rq->push_cpu; | 7013 | int target_cpu = busiest_rq->push_cpu; |
6915 | struct rq *target_rq = cpu_rq(target_cpu); | 7014 | struct rq *target_rq = cpu_rq(target_cpu); |
6916 | struct sched_domain *sd; | 7015 | struct sched_domain *sd; |
7016 | struct task_struct *p = NULL; | ||
6917 | 7017 | ||
6918 | raw_spin_lock_irq(&busiest_rq->lock); | 7018 | raw_spin_lock_irq(&busiest_rq->lock); |
6919 | 7019 | ||
@@ -6933,9 +7033,6 @@ static int active_load_balance_cpu_stop(void *data) | |||
6933 | */ | 7033 | */ |
6934 | BUG_ON(busiest_rq == target_rq); | 7034 | BUG_ON(busiest_rq == target_rq); |
6935 | 7035 | ||
6936 | /* move a task from busiest_rq to target_rq */ | ||
6937 | double_lock_balance(busiest_rq, target_rq); | ||
6938 | |||
6939 | /* Search for an sd spanning us and the target CPU. */ | 7036 | /* Search for an sd spanning us and the target CPU. */ |
6940 | rcu_read_lock(); | 7037 | rcu_read_lock(); |
6941 | for_each_domain(target_cpu, sd) { | 7038 | for_each_domain(target_cpu, sd) { |
@@ -6956,16 +7053,22 @@ static int active_load_balance_cpu_stop(void *data) | |||
6956 | 7053 | ||
6957 | schedstat_inc(sd, alb_count); | 7054 | schedstat_inc(sd, alb_count); |
6958 | 7055 | ||
6959 | if (move_one_task(&env)) | 7056 | p = detach_one_task(&env); |
7057 | if (p) | ||
6960 | schedstat_inc(sd, alb_pushed); | 7058 | schedstat_inc(sd, alb_pushed); |
6961 | else | 7059 | else |
6962 | schedstat_inc(sd, alb_failed); | 7060 | schedstat_inc(sd, alb_failed); |
6963 | } | 7061 | } |
6964 | rcu_read_unlock(); | 7062 | rcu_read_unlock(); |
6965 | double_unlock_balance(busiest_rq, target_rq); | ||
6966 | out_unlock: | 7063 | out_unlock: |
6967 | busiest_rq->active_balance = 0; | 7064 | busiest_rq->active_balance = 0; |
6968 | raw_spin_unlock_irq(&busiest_rq->lock); | 7065 | raw_spin_unlock(&busiest_rq->lock); |
7066 | |||
7067 | if (p) | ||
7068 | attach_one_task(target_rq, p); | ||
7069 | |||
7070 | local_irq_enable(); | ||
7071 | |||
6969 | return 0; | 7072 | return 0; |
6970 | } | 7073 | } |
6971 | 7074 | ||
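The hunk above replaces move_one_task() under double_lock_balance() with a detach/attach pair, so the busiest and the target runqueue locks are never held at the same time. A minimal sketch of that pattern, assuming the detach_one_task()/attach_one_task() helpers added earlier in this series; the wrapper name active_balance_sketch() is purely illustrative and not part of the patch.

	static int active_balance_sketch(struct rq *busiest_rq, struct rq *target_rq,
					 struct lb_env *env)
	{
		struct task_struct *p;

		raw_spin_lock_irq(&busiest_rq->lock);
		p = detach_one_task(env);	/* dequeue p from busiest_rq */
		raw_spin_unlock(&busiest_rq->lock);

		if (p)
			attach_one_task(target_rq, p);	/* takes target_rq->lock */

		local_irq_enable();
		return p != NULL;
	}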
@@ -7465,7 +7568,7 @@ static void task_fork_fair(struct task_struct *p) | |||
7465 | static void | 7568 | static void |
7466 | prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) | 7569 | prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) |
7467 | { | 7570 | { |
7468 | if (!p->se.on_rq) | 7571 | if (!task_on_rq_queued(p)) |
7469 | return; | 7572 | return; |
7470 | 7573 | ||
7471 | /* | 7574 | /* |
@@ -7490,11 +7593,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
7490 | * switched back to the fair class the enqueue_entity(.flags=0) will | 7593 | * switched back to the fair class the enqueue_entity(.flags=0) will |
7491 | * do the right thing. | 7594 | * do the right thing. |
7492 | * | 7595 | * |
7493 | * If it's on_rq, then the dequeue_entity(.flags=0) will already | 7596 | * If it's queued, then the dequeue_entity(.flags=0) will already |
7494 | * have normalized the vruntime, if it's !on_rq, then only when | 7597 | * have normalized the vruntime, if it's !queued, then only when |
7495 | * the task is sleeping will it still have non-normalized vruntime. | 7598 | * the task is sleeping will it still have non-normalized vruntime. |
7496 | */ | 7599 | */ |
7497 | if (!p->on_rq && p->state != TASK_RUNNING) { | 7600 | if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { |
7498 | /* | 7601 | /* |
7499 | * Fix up our vruntime so that the current sleep doesn't | 7602 | * Fix up our vruntime so that the current sleep doesn't |
7500 | * cause 'unlimited' sleep bonus. | 7603 | * cause 'unlimited' sleep bonus. |
@@ -7521,15 +7624,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
7521 | */ | 7624 | */ |
7522 | static void switched_to_fair(struct rq *rq, struct task_struct *p) | 7625 | static void switched_to_fair(struct rq *rq, struct task_struct *p) |
7523 | { | 7626 | { |
7524 | struct sched_entity *se = &p->se; | ||
7525 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7627 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7628 | struct sched_entity *se = &p->se; | ||
7526 | /* | 7629 | /* |
7527 | * Since the real-depth could have been changed (only FAIR | 7630 | * Since the real-depth could have been changed (only FAIR |
7528 | * class maintain depth value), reset depth properly. | 7631 | * class maintain depth value), reset depth properly. |
7529 | */ | 7632 | */ |
7530 | se->depth = se->parent ? se->parent->depth + 1 : 0; | 7633 | se->depth = se->parent ? se->parent->depth + 1 : 0; |
7531 | #endif | 7634 | #endif |
7532 | if (!se->on_rq) | 7635 | if (!task_on_rq_queued(p)) |
7533 | return; | 7636 | return; |
7534 | 7637 | ||
7535 | /* | 7638 | /* |
@@ -7575,7 +7678,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
7575 | } | 7678 | } |
7576 | 7679 | ||
7577 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7680 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7578 | static void task_move_group_fair(struct task_struct *p, int on_rq) | 7681 | static void task_move_group_fair(struct task_struct *p, int queued) |
7579 | { | 7682 | { |
7580 | struct sched_entity *se = &p->se; | 7683 | struct sched_entity *se = &p->se; |
7581 | struct cfs_rq *cfs_rq; | 7684 | struct cfs_rq *cfs_rq; |
@@ -7594,7 +7697,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) | |||
7594 | * fair sleeper stuff for the first placement, but who cares. | 7697 | * fair sleeper stuff for the first placement, but who cares. |
7595 | */ | 7698 | */ |
7596 | /* | 7699 | /* |
7597 | * When !on_rq, vruntime of the task has usually NOT been normalized. | 7700 | * When !queued, vruntime of the task has usually NOT been normalized. |
7598 | * But there are some cases where it has already been normalized: | 7701 | * But there are some cases where it has already been normalized: |
7599 | * | 7702 | * |
7600 | * - Moving a forked child which is waiting for being woken up by | 7703 | * - Moving a forked child which is waiting for being woken up by |
@@ -7605,14 +7708,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) | |||
7605 | * To prevent boost or penalty in the new cfs_rq caused by delta | 7708 | * To prevent boost or penalty in the new cfs_rq caused by delta |
7606 | * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. | 7709 | * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. |
7607 | */ | 7710 | */ |
7608 | if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) | 7711 | if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) |
7609 | on_rq = 1; | 7712 | queued = 1; |
7610 | 7713 | ||
7611 | if (!on_rq) | 7714 | if (!queued) |
7612 | se->vruntime -= cfs_rq_of(se)->min_vruntime; | 7715 | se->vruntime -= cfs_rq_of(se)->min_vruntime; |
7613 | set_task_rq(p, task_cpu(p)); | 7716 | set_task_rq(p, task_cpu(p)); |
7614 | se->depth = se->parent ? se->parent->depth + 1 : 0; | 7717 | se->depth = se->parent ? se->parent->depth + 1 : 0; |
7615 | if (!on_rq) { | 7718 | if (!queued) { |
7616 | cfs_rq = cfs_rq_of(se); | 7719 | cfs_rq = cfs_rq_of(se); |
7617 | se->vruntime += cfs_rq->min_vruntime; | 7720 | se->vruntime += cfs_rq->min_vruntime; |
7618 | #ifdef CONFIG_SMP | 7721 | #ifdef CONFIG_SMP |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 11e7bc434f43..c47fce75e666 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -147,6 +147,9 @@ use_default: | |||
147 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) | 147 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) |
148 | goto use_default; | 148 | goto use_default; |
149 | 149 | ||
150 | /* Take note of the planned idle state. */ | ||
151 | idle_set_state(this_rq(), &drv->states[next_state]); | ||
152 | |||
150 | /* | 153 | /* |
151 | * Enter the idle state previously returned by the governor decision. | 154 | * Enter the idle state previously returned by the governor decision. |
152 | * This function will block until an interrupt occurs and will take | 155 | * This function will block until an interrupt occurs and will take |
@@ -154,6 +157,9 @@ use_default: | |||
154 | */ | 157 | */ |
155 | entered_state = cpuidle_enter(drv, dev, next_state); | 158 | entered_state = cpuidle_enter(drv, dev, next_state); |
156 | 159 | ||
160 | /* The cpu is no longer idle or about to enter idle. */ | ||
161 | idle_set_state(this_rq(), NULL); | ||
162 | |||
157 | if (broadcast) | 163 | if (broadcast) |
158 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); | 164 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); |
159 | 165 | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5f6edca4fafd..87ea5bf1b87f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -1448,7 +1448,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) | |||
1448 | * means a dl or stop task can slip in, in which case we need | 1448 | * means a dl or stop task can slip in, in which case we need |
1449 | * to re-start task selection. | 1449 | * to re-start task selection. |
1450 | */ | 1450 | */ |
1451 | if (unlikely((rq->stop && rq->stop->on_rq) || | 1451 | if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) || |
1452 | rq->dl.dl_nr_running)) | 1452 | rq->dl.dl_nr_running)) |
1453 | return RETRY_TASK; | 1453 | return RETRY_TASK; |
1454 | } | 1454 | } |
@@ -1468,8 +1468,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) | |||
1468 | p = _pick_next_task_rt(rq); | 1468 | p = _pick_next_task_rt(rq); |
1469 | 1469 | ||
1470 | /* The running task is never eligible for pushing */ | 1470 | /* The running task is never eligible for pushing */ |
1471 | if (p) | 1471 | dequeue_pushable_task(rq, p); |
1472 | dequeue_pushable_task(rq, p); | ||
1473 | 1472 | ||
1474 | set_post_schedule(rq); | 1473 | set_post_schedule(rq); |
1475 | 1474 | ||
@@ -1624,7 +1623,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
1624 | !cpumask_test_cpu(lowest_rq->cpu, | 1623 | !cpumask_test_cpu(lowest_rq->cpu, |
1625 | tsk_cpus_allowed(task)) || | 1624 | tsk_cpus_allowed(task)) || |
1626 | task_running(rq, task) || | 1625 | task_running(rq, task) || |
1627 | !task->on_rq)) { | 1626 | !task_on_rq_queued(task))) { |
1628 | 1627 | ||
1629 | double_unlock_balance(rq, lowest_rq); | 1628 | double_unlock_balance(rq, lowest_rq); |
1630 | lowest_rq = NULL; | 1629 | lowest_rq = NULL; |
@@ -1658,7 +1657,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) | |||
1658 | BUG_ON(task_current(rq, p)); | 1657 | BUG_ON(task_current(rq, p)); |
1659 | BUG_ON(p->nr_cpus_allowed <= 1); | 1658 | BUG_ON(p->nr_cpus_allowed <= 1); |
1660 | 1659 | ||
1661 | BUG_ON(!p->on_rq); | 1660 | BUG_ON(!task_on_rq_queued(p)); |
1662 | BUG_ON(!rt_task(p)); | 1661 | BUG_ON(!rt_task(p)); |
1663 | 1662 | ||
1664 | return p; | 1663 | return p; |
@@ -1809,7 +1808,7 @@ static int pull_rt_task(struct rq *this_rq) | |||
1809 | */ | 1808 | */ |
1810 | if (p && (p->prio < this_rq->rt.highest_prio.curr)) { | 1809 | if (p && (p->prio < this_rq->rt.highest_prio.curr)) { |
1811 | WARN_ON(p == src_rq->curr); | 1810 | WARN_ON(p == src_rq->curr); |
1812 | WARN_ON(!p->on_rq); | 1811 | WARN_ON(!task_on_rq_queued(p)); |
1813 | 1812 | ||
1814 | /* | 1813 | /* |
1815 | * There's a chance that p is higher in priority | 1814 | * There's a chance that p is higher in priority |
@@ -1870,7 +1869,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, | |||
1870 | 1869 | ||
1871 | BUG_ON(!rt_task(p)); | 1870 | BUG_ON(!rt_task(p)); |
1872 | 1871 | ||
1873 | if (!p->on_rq) | 1872 | if (!task_on_rq_queued(p)) |
1874 | return; | 1873 | return; |
1875 | 1874 | ||
1876 | weight = cpumask_weight(new_mask); | 1875 | weight = cpumask_weight(new_mask); |
@@ -1936,7 +1935,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
1936 | * we may need to handle the pulling of RT tasks | 1935 | * we may need to handle the pulling of RT tasks |
1937 | * now. | 1936 | * now. |
1938 | */ | 1937 | */ |
1939 | if (!p->on_rq || rq->rt.rt_nr_running) | 1938 | if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) |
1940 | return; | 1939 | return; |
1941 | 1940 | ||
1942 | if (pull_rt_task(rq)) | 1941 | if (pull_rt_task(rq)) |
@@ -1970,7 +1969,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
1970 | * If that current running task is also an RT task | 1969 | * If that current running task is also an RT task |
1971 | * then see if we can move to another run queue. | 1970 | * then see if we can move to another run queue. |
1972 | */ | 1971 | */ |
1973 | if (p->on_rq && rq->curr != p) { | 1972 | if (task_on_rq_queued(p) && rq->curr != p) { |
1974 | #ifdef CONFIG_SMP | 1973 | #ifdef CONFIG_SMP |
1975 | if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && | 1974 | if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && |
1976 | /* Don't resched if we changed runqueues */ | 1975 | /* Don't resched if we changed runqueues */ |
@@ -1989,7 +1988,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
1989 | static void | 1988 | static void |
1990 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) | 1989 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) |
1991 | { | 1990 | { |
1992 | if (!p->on_rq) | 1991 | if (!task_on_rq_queued(p)) |
1993 | return; | 1992 | return; |
1994 | 1993 | ||
1995 | if (rq->curr == p) { | 1994 | if (rq->curr == p) { |
@@ -2073,7 +2072,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | |||
2073 | for_each_sched_rt_entity(rt_se) { | 2072 | for_each_sched_rt_entity(rt_se) { |
2074 | if (rt_se->run_list.prev != rt_se->run_list.next) { | 2073 | if (rt_se->run_list.prev != rt_se->run_list.next) { |
2075 | requeue_task_rt(rq, p, 0); | 2074 | requeue_task_rt(rq, p, 0); |
2076 | set_tsk_need_resched(p); | 2075 | resched_curr(rq); |
2077 | return; | 2076 | return; |
2078 | } | 2077 | } |
2079 | } | 2078 | } |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 579712f4e9d5..6130251de280 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -14,6 +14,11 @@ | |||
14 | #include "cpuacct.h" | 14 | #include "cpuacct.h" |
15 | 15 | ||
16 | struct rq; | 16 | struct rq; |
17 | struct cpuidle_state; | ||
18 | |||
19 | /* task_struct::on_rq states: */ | ||
20 | #define TASK_ON_RQ_QUEUED 1 | ||
21 | #define TASK_ON_RQ_MIGRATING 2 | ||
17 | 22 | ||
18 | extern __read_mostly int scheduler_running; | 23 | extern __read_mostly int scheduler_running; |
19 | 24 | ||
@@ -126,6 +131,9 @@ struct rt_bandwidth { | |||
126 | u64 rt_runtime; | 131 | u64 rt_runtime; |
127 | struct hrtimer rt_period_timer; | 132 | struct hrtimer rt_period_timer; |
128 | }; | 133 | }; |
134 | |||
135 | void __dl_clear_params(struct task_struct *p); | ||
136 | |||
129 | /* | 137 | /* |
130 | * To keep the bandwidth of -deadline tasks and groups under control | 138 | * To keep the bandwidth of -deadline tasks and groups under control |
131 | * we need some place where: | 139 | * we need some place where: |
@@ -184,7 +192,7 @@ struct cfs_bandwidth { | |||
184 | raw_spinlock_t lock; | 192 | raw_spinlock_t lock; |
185 | ktime_t period; | 193 | ktime_t period; |
186 | u64 quota, runtime; | 194 | u64 quota, runtime; |
187 | s64 hierarchal_quota; | 195 | s64 hierarchical_quota; |
188 | u64 runtime_expires; | 196 | u64 runtime_expires; |
189 | 197 | ||
190 | int idle, timer_active; | 198 | int idle, timer_active; |
@@ -636,6 +644,11 @@ struct rq { | |||
636 | #ifdef CONFIG_SMP | 644 | #ifdef CONFIG_SMP |
637 | struct llist_head wake_list; | 645 | struct llist_head wake_list; |
638 | #endif | 646 | #endif |
647 | |||
648 | #ifdef CONFIG_CPU_IDLE | ||
649 | /* Must be inspected within an RCU read-side section */ | ||
650 | struct cpuidle_state *idle_state; | ||
651 | #endif | ||
639 | }; | 652 | }; |
640 | 653 | ||
641 | static inline int cpu_of(struct rq *rq) | 654 | static inline int cpu_of(struct rq *rq) |
@@ -647,7 +660,7 @@ static inline int cpu_of(struct rq *rq) | |||
647 | #endif | 660 | #endif |
648 | } | 661 | } |
649 | 662 | ||
650 | DECLARE_PER_CPU(struct rq, runqueues); | 663 | DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
651 | 664 | ||
652 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 665 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
653 | #define this_rq() (&__get_cpu_var(runqueues)) | 666 | #define this_rq() (&__get_cpu_var(runqueues)) |
@@ -942,6 +955,15 @@ static inline int task_running(struct rq *rq, struct task_struct *p) | |||
942 | #endif | 955 | #endif |
943 | } | 956 | } |
944 | 957 | ||
958 | static inline int task_on_rq_queued(struct task_struct *p) | ||
959 | { | ||
960 | return p->on_rq == TASK_ON_RQ_QUEUED; | ||
961 | } | ||
962 | |||
963 | static inline int task_on_rq_migrating(struct task_struct *p) | ||
964 | { | ||
965 | return p->on_rq == TASK_ON_RQ_MIGRATING; | ||
966 | } | ||
945 | 967 | ||
946 | #ifndef prepare_arch_switch | 968 | #ifndef prepare_arch_switch |
947 | # define prepare_arch_switch(next) do { } while (0) | 969 | # define prepare_arch_switch(next) do { } while (0) |
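A sketch of how these states are meant to be used (an assumption about the rest of the series; the actual transitions live in kernel/sched/core.c, which is not part of this hunk): while a task is moved between runqueues, its ->on_rq is set to TASK_ON_RQ_MIGRATING, so task_on_rq_queued() stops reporting it as queued even though ->on_rq is still non-zero.

	static void move_queued_task_sketch(struct rq *src_rq, struct rq *dst_rq,
					    struct task_struct *p)
	{
		lockdep_assert_held(&src_rq->lock);

		p->on_rq = TASK_ON_RQ_MIGRATING;
		dequeue_task(src_rq, p, 0);
		set_task_cpu(p, cpu_of(dst_rq));
		raw_spin_unlock(&src_rq->lock);

		/* Neither rq lock is held here; p is "in flight", not queued. */

		raw_spin_lock(&dst_rq->lock);
		enqueue_task(dst_rq, p, 0);
		p->on_rq = TASK_ON_RQ_QUEUED;
		check_preempt_curr(dst_rq, p, 0);
		raw_spin_unlock(&dst_rq->lock);
	}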
@@ -953,7 +975,6 @@ static inline int task_running(struct rq *rq, struct task_struct *p) | |||
953 | # define finish_arch_post_lock_switch() do { } while (0) | 975 | # define finish_arch_post_lock_switch() do { } while (0) |
954 | #endif | 976 | #endif |
955 | 977 | ||
956 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
957 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 978 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
958 | { | 979 | { |
959 | #ifdef CONFIG_SMP | 980 | #ifdef CONFIG_SMP |
@@ -991,35 +1012,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
991 | raw_spin_unlock_irq(&rq->lock); | 1012 | raw_spin_unlock_irq(&rq->lock); |
992 | } | 1013 | } |
993 | 1014 | ||
994 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
995 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | ||
996 | { | ||
997 | #ifdef CONFIG_SMP | ||
998 | /* | ||
999 | * We can optimise this out completely for !SMP, because the | ||
1000 | * SMP rebalancing from interrupt is the only thing that cares | ||
1001 | * here. | ||
1002 | */ | ||
1003 | next->on_cpu = 1; | ||
1004 | #endif | ||
1005 | raw_spin_unlock(&rq->lock); | ||
1006 | } | ||
1007 | |||
1008 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | ||
1009 | { | ||
1010 | #ifdef CONFIG_SMP | ||
1011 | /* | ||
1012 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
1013 | * We must ensure this doesn't happen until the switch is completely | ||
1014 | * finished. | ||
1015 | */ | ||
1016 | smp_wmb(); | ||
1017 | prev->on_cpu = 0; | ||
1018 | #endif | ||
1019 | local_irq_enable(); | ||
1020 | } | ||
1021 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
1022 | |||
1023 | /* | 1015 | /* |
1024 | * wake flags | 1016 | * wake flags |
1025 | */ | 1017 | */ |
@@ -1180,6 +1172,30 @@ static inline void idle_exit_fair(struct rq *rq) { } | |||
1180 | 1172 | ||
1181 | #endif | 1173 | #endif |
1182 | 1174 | ||
1175 | #ifdef CONFIG_CPU_IDLE | ||
1176 | static inline void idle_set_state(struct rq *rq, | ||
1177 | struct cpuidle_state *idle_state) | ||
1178 | { | ||
1179 | rq->idle_state = idle_state; | ||
1180 | } | ||
1181 | |||
1182 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) | ||
1183 | { | ||
1184 | WARN_ON(!rcu_read_lock_held()); | ||
1185 | return rq->idle_state; | ||
1186 | } | ||
1187 | #else | ||
1188 | static inline void idle_set_state(struct rq *rq, | ||
1189 | struct cpuidle_state *idle_state) | ||
1190 | { | ||
1191 | } | ||
1192 | |||
1193 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) | ||
1194 | { | ||
1195 | return NULL; | ||
1196 | } | ||
1197 | #endif | ||
1198 | |||
1183 | extern void sysrq_sched_debug_show(void); | 1199 | extern void sysrq_sched_debug_show(void); |
1184 | extern void sched_init_granularity(void); | 1200 | extern void sched_init_granularity(void); |
1185 | extern void update_max_interval(void); | 1201 | extern void update_max_interval(void); |
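With idle_set_state()/idle_get_state() in place, the idle state a CPU is about to enter becomes visible to the scheduler. A hypothetical consumer (not part of this diff) could pick the idle CPU with the cheapest wakeup; this sketch assumes struct cpuidle_state exposes an exit_latency field and that the caller can use cpu_rq() from this header.

	static int cheapest_idle_cpu_sketch(const struct cpumask *mask)
	{
		unsigned int best_latency = UINT_MAX;
		int cpu, best_cpu = -1;

		rcu_read_lock();	/* idle_get_state() must be called under RCU */
		for_each_cpu(cpu, mask) {
			struct cpuidle_state *state = idle_get_state(cpu_rq(cpu));

			/* NULL means the cpu is not (about to go) idle */
			if (state && state->exit_latency < best_latency) {
				best_latency = state->exit_latency;
				best_cpu = cpu;
			}
		}
		rcu_read_unlock();

		return best_cpu;
	}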
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index bfe0edadbfbb..67426e529f59 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
@@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev) | |||
28 | { | 28 | { |
29 | struct task_struct *stop = rq->stop; | 29 | struct task_struct *stop = rq->stop; |
30 | 30 | ||
31 | if (!stop || !stop->on_rq) | 31 | if (!stop || !task_on_rq_queued(stop)) |
32 | return NULL; | 32 | return NULL; |
33 | 33 | ||
34 | put_prev_task(rq, prev); | 34 | put_prev_task(rq, prev); |
diff --git a/kernel/smp.c b/kernel/smp.c index aff8aa14f547..9e0d0b289118 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/gfp.h> | 13 | #include <linux/gfp.h> |
14 | #include <linux/smp.h> | 14 | #include <linux/smp.h> |
15 | #include <linux/cpu.h> | 15 | #include <linux/cpu.h> |
16 | #include <linux/sched.h> | ||
16 | 17 | ||
17 | #include "smpboot.h" | 18 | #include "smpboot.h" |
18 | 19 | ||
@@ -699,3 +700,24 @@ void kick_all_cpus_sync(void) | |||
699 | smp_call_function(do_nothing, NULL, 1); | 700 | smp_call_function(do_nothing, NULL, 1); |
700 | } | 701 | } |
701 | EXPORT_SYMBOL_GPL(kick_all_cpus_sync); | 702 | EXPORT_SYMBOL_GPL(kick_all_cpus_sync); |
703 | |||
704 | /** | ||
705 | * wake_up_all_idle_cpus - break all cpus out of idle | ||
706 | * wake_up_all_idle_cpus tries to break all cpus that are in an idle | ||
707 | * state, including idle polling cpus; nothing is done for cpus that | ||
708 | * are not idle. | ||
709 | */ | ||
710 | void wake_up_all_idle_cpus(void) | ||
711 | { | ||
712 | int cpu; | ||
713 | |||
714 | preempt_disable(); | ||
715 | for_each_online_cpu(cpu) { | ||
716 | if (cpu == smp_processor_id()) | ||
717 | continue; | ||
718 | |||
719 | wake_up_if_idle(cpu); | ||
720 | } | ||
721 | preempt_enable(); | ||
722 | } | ||
723 | EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); | ||
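One plausible use of the new helper (a sketch; the actual callers are outside this section): after a system-wide constraint that idle CPUs took into account has changed, kick them out of idle so they re-run their idle-state selection. The declaration is assumed to land in <linux/smp.h>.

	#include <linux/smp.h>

	/* Called after e.g. a latency constraint has been updated and published. */
	static void constraint_changed_sketch(void)
	{
		/*
		 * Idle cpus (including polling ones) are woken and will pick a
		 * new idle state; busy cpus are left alone.
		 */
		wake_up_all_idle_cpus();
	}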
diff --git a/kernel/sys.c b/kernel/sys.c index dfce4debd138..1eaa2f0b0246 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -869,11 +869,9 @@ void do_sys_times(struct tms *tms) | |||
869 | { | 869 | { |
870 | cputime_t tgutime, tgstime, cutime, cstime; | 870 | cputime_t tgutime, tgstime, cutime, cstime; |
871 | 871 | ||
872 | spin_lock_irq(¤t->sighand->siglock); | ||
873 | thread_group_cputime_adjusted(current, &tgutime, &tgstime); | 872 | thread_group_cputime_adjusted(current, &tgutime, &tgstime); |
874 | cutime = current->signal->cutime; | 873 | cutime = current->signal->cutime; |
875 | cstime = current->signal->cstime; | 874 | cstime = current->signal->cstime; |
876 | spin_unlock_irq(¤t->sighand->siglock); | ||
877 | tms->tms_utime = cputime_to_clock_t(tgutime); | 875 | tms->tms_utime = cputime_to_clock_t(tgutime); |
878 | tms->tms_stime = cputime_to_clock_t(tgstime); | 876 | tms->tms_stime = cputime_to_clock_t(tgstime); |
879 | tms->tms_cutime = cputime_to_clock_t(cutime); | 877 | tms->tms_cutime = cputime_to_clock_t(cutime); |
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1c2fe7de2842..ab370ffffd53 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
@@ -1776,7 +1776,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, | |||
1776 | */ | 1776 | */ |
1777 | if (!expires) { | 1777 | if (!expires) { |
1778 | schedule(); | 1778 | schedule(); |
1779 | __set_current_state(TASK_RUNNING); | ||
1780 | return -EINTR; | 1779 | return -EINTR; |
1781 | } | 1780 | } |
1782 | 1781 | ||
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3b8946416a5f..492b986195d5 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c | |||
@@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, | |||
272 | if (same_thread_group(tsk, current)) | 272 | if (same_thread_group(tsk, current)) |
273 | err = cpu_clock_sample(which_clock, tsk, &rtn); | 273 | err = cpu_clock_sample(which_clock, tsk, &rtn); |
274 | } else { | 274 | } else { |
275 | unsigned long flags; | ||
276 | struct sighand_struct *sighand; | ||
277 | |||
278 | /* | ||
279 | * while_each_thread() is not yet entirely RCU safe, | ||
280 | * keep locking the group while sampling process | ||
281 | * clock for now. | ||
282 | */ | ||
283 | sighand = lock_task_sighand(tsk, &flags); | ||
284 | if (!sighand) | ||
285 | return err; | ||
286 | |||
287 | if (tsk == current || thread_group_leader(tsk)) | 275 | if (tsk == current || thread_group_leader(tsk)) |
288 | err = cpu_clock_sample_group(which_clock, tsk, &rtn); | 276 | err = cpu_clock_sample_group(which_clock, tsk, &rtn); |
289 | |||
290 | unlock_task_sighand(tsk, &flags); | ||
291 | } | 277 | } |
292 | 278 | ||
293 | if (!err) | 279 | if (!err) |
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 0434ff1b808e..3f9e328c30b5 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c | |||
@@ -205,7 +205,6 @@ static void ring_buffer_consumer(void) | |||
205 | break; | 205 | break; |
206 | 206 | ||
207 | schedule(); | 207 | schedule(); |
208 | __set_current_state(TASK_RUNNING); | ||
209 | } | 208 | } |
210 | reader_finish = 0; | 209 | reader_finish = 0; |
211 | complete(&read_done); | 210 | complete(&read_done); |
@@ -379,7 +378,6 @@ static int ring_buffer_consumer_thread(void *arg) | |||
379 | break; | 378 | break; |
380 | 379 | ||
381 | schedule(); | 380 | schedule(); |
382 | __set_current_state(TASK_RUNNING); | ||
383 | } | 381 | } |
384 | __set_current_state(TASK_RUNNING); | 382 | __set_current_state(TASK_RUNNING); |
385 | 383 | ||
@@ -407,7 +405,6 @@ static int ring_buffer_producer_thread(void *arg) | |||
407 | trace_printk("Sleeping for 10 secs\n"); | 405 | trace_printk("Sleeping for 10 secs\n"); |
408 | set_current_state(TASK_INTERRUPTIBLE); | 406 | set_current_state(TASK_INTERRUPTIBLE); |
409 | schedule_timeout(HZ * SLEEP_TIME); | 407 | schedule_timeout(HZ * SLEEP_TIME); |
410 | __set_current_state(TASK_RUNNING); | ||
411 | } | 408 | } |
412 | 409 | ||
413 | if (kill_test) | 410 | if (kill_test) |
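The ring-buffer (and hrtimer) hunks above drop __set_current_state(TASK_RUNNING) calls that directly follow schedule(): when a sleeping task returns from schedule(), the wakeup path (or the pending-signal check inside __schedule()) has already set it back to TASK_RUNNING. A sketch of the wait loop these sites reduce to; wait_condition is a placeholder, not a symbol from the patch.

	static bool wait_condition;	/* placeholder, set by some waker */

	static void wait_for_condition_sketch(void)
	{
		for (;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (wait_condition)
				break;
			schedule();
			/* woken up: we are already TASK_RUNNING again */
		}
		/* only the break path still needs an explicit reset */
		__set_current_state(TASK_RUNNING);
	}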
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 8a4e5cb66a4c..16eddb308c33 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -13,7 +13,6 @@ | |||
13 | #include <linux/sysctl.h> | 13 | #include <linux/sysctl.h> |
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/fs.h> | 15 | #include <linux/fs.h> |
16 | #include <linux/magic.h> | ||
17 | 16 | ||
18 | #include <asm/setup.h> | 17 | #include <asm/setup.h> |
19 | 18 | ||
@@ -171,8 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
171 | i++; | 170 | i++; |
172 | } | 171 | } |
173 | 172 | ||
174 | if ((current != &init_task && | 173 | if (task_stack_end_corrupted(current)) { |
175 | *(end_of_stack(current)) != STACK_END_MAGIC)) { | ||
176 | print_max_stack(); | 174 | print_max_stack(); |
177 | BUG(); | 175 | BUG(); |
178 | } | 176 | } |
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 49d5fb754e88..e7ad58c5fbeb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug | |||
@@ -824,6 +824,18 @@ config SCHEDSTATS | |||
824 | application, you can say N to avoid the very slight overhead | 824 | application, you can say N to avoid the very slight overhead |
825 | this adds. | 825 | this adds. |
826 | 826 | ||
827 | config SCHED_STACK_END_CHECK | ||
828 | bool "Detect stack corruption on calls to schedule()" | ||
829 | depends on DEBUG_KERNEL | ||
830 | default n | ||
831 | help | ||
832 | This option checks for a stack overrun on calls to schedule(). | ||
833 | If the stack end location is found to be overwritten, always panic, | ||
834 | as the content of the corrupted region can no longer be trusted. | ||
835 | This ensures that no erroneous behaviour occurs which could result | ||
836 | in data corruption or a sporadic crash at a later stage once the | ||
837 | region is examined. The runtime overhead introduced is minimal. | ||
838 | |||
827 | config TIMER_STATS | 839 | config TIMER_STATS |
828 | bool "Collect kernel timers statistics" | 840 | bool "Collect kernel timers statistics" |
829 | depends on DEBUG_KERNEL && PROC_FS | 841 | depends on DEBUG_KERNEL && PROC_FS |
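For reference, a sketch of what the check behind this option and the trace_stack.c hunk above amounts to; the exact definition of task_stack_end_corrupted() and the hook into schedule()'s debug path are assumptions, not shown in this diff.

	/* Assumed shape of the helper used above (roughly, in <linux/sched.h>): */
	#define task_stack_end_corrupted(task) \
		(*(end_of_stack(task)) != STACK_END_MAGIC)

	/* And the schedule()-time check that CONFIG_SCHED_STACK_END_CHECK enables: */
	static inline void schedule_stack_check_sketch(struct task_struct *prev)
	{
	#ifdef CONFIG_SCHED_STACK_END_CHECK
		BUG_ON(unlikely(task_stack_end_corrupted(prev)));
	#endif
	}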