29 files changed, 680 insertions, 480 deletions
diff --git a/Documentation/kernel-doc-nano-HOWTO.txt b/Documentation/kernel-doc-nano-HOWTO.txt index 0bd32748a467..c6841eee9598 100644 --- a/Documentation/kernel-doc-nano-HOWTO.txt +++ b/Documentation/kernel-doc-nano-HOWTO.txt | |||
| @@ -168,10 +168,10 @@ if ($#ARGV < 0) { | |||
| 168 | mkdir $ARGV[0],0777; | 168 | mkdir $ARGV[0],0777; |
| 169 | $state = 0; | 169 | $state = 0; |
| 170 | while (<STDIN>) { | 170 | while (<STDIN>) { |
| 171 | if (/^\.TH \"[^\"]*\" 4 \"([^\"]*)\"/) { | 171 | if (/^\.TH \"[^\"]*\" 9 \"([^\"]*)\"/) { |
| 172 | if ($state == 1) { close OUT } | 172 | if ($state == 1) { close OUT } |
| 173 | $state = 1; | 173 | $state = 1; |
| 174 | $fn = "$ARGV[0]/$1.4"; | 174 | $fn = "$ARGV[0]/$1.9"; |
| 175 | print STDERR "Creating $fn\n"; | 175 | print STDERR "Creating $fn\n"; |
| 176 | open OUT, ">$fn" or die "can't open $fn: $!\n"; | 176 | open OUT, ">$fn" or die "can't open $fn: $!\n"; |
| 177 | print OUT $_; | 177 | print OUT $_; |
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt index 88bcb8767335..9d8eb553884c 100644 --- a/Documentation/scheduler/sched-design-CFS.txt +++ b/Documentation/scheduler/sched-design-CFS.txt | |||
| @@ -1,151 +1,242 @@ | |||
| 1 | ============= | ||
| 2 | CFS Scheduler | ||
| 3 | ============= | ||
| 1 | 4 | ||
| 2 | This is the CFS scheduler. | ||
| 3 | |||
| 4 | 80% of CFS's design can be summed up in a single sentence: CFS basically | ||
| 5 | models an "ideal, precise multi-tasking CPU" on real hardware. | ||
| 6 | |||
| 7 | "Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100% | ||
| 8 | physical power and which can run each task at precise equal speed, in | ||
| 9 | parallel, each at 1/nr_running speed. For example: if there are 2 tasks | ||
| 10 | running then it runs each at 50% physical power - totally in parallel. | ||
| 11 | |||
| 12 | On real hardware, we can run only a single task at once, so while that | ||
| 13 | one task runs, the other tasks that are waiting for the CPU are at a | ||
| 14 | disadvantage - the current task gets an unfair amount of CPU time. In | ||
| 15 | CFS this fairness imbalance is expressed and tracked via the per-task | ||
| 16 | p->wait_runtime (nanosec-unit) value. "wait_runtime" is the amount of | ||
| 17 | time the task should now run on the CPU for it to become completely fair | ||
| 18 | and balanced. | ||
| 19 | |||
| 20 | ( small detail: on 'ideal' hardware, the p->wait_runtime value would | ||
| 21 | always be zero - no task would ever get 'out of balance' from the | ||
| 22 | 'ideal' share of CPU time. ) | ||
| 23 | |||
| 24 | CFS's task picking logic is based on this p->wait_runtime value and it | ||
| 25 | is thus very simple: it always tries to run the task with the largest | ||
| 26 | p->wait_runtime value. In other words, CFS tries to run the task with | ||
| 27 | the 'gravest need' for more CPU time. So CFS always tries to split up | ||
| 28 | CPU time between runnable tasks as close to 'ideal multitasking | ||
| 29 | hardware' as possible. | ||
| 30 | |||
| 31 | Most of the rest of CFS's design just falls out of this really simple | ||
| 32 | concept, with a few add-on embellishments like nice levels, | ||
| 33 | multiprocessing and various algorithm variants to recognize sleepers. | ||
| 34 | |||
| 35 | In practice it works like this: the system runs a task a bit, and when | ||
| 36 | the task schedules (or a scheduler tick happens) the task's CPU usage is | ||
| 37 | 'accounted for': the (small) time it just spent using the physical CPU | ||
| 38 | is deducted from p->wait_runtime. [minus the 'fair share' it would have | ||
| 39 | gotten anyway]. Once p->wait_runtime gets low enough so that another | ||
| 40 | task becomes the 'leftmost task' of the time-ordered rbtree it maintains | ||
| 41 | (plus a small amount of 'granularity' distance relative to the leftmost | ||
| 42 | task so that we do not over-schedule tasks and trash the cache) then the | ||
| 43 | new leftmost task is picked and the current task is preempted. | ||
| 44 | |||
| 45 | The rq->fair_clock value tracks the 'CPU time a runnable task would have | ||
| 46 | fairly gotten, had it been runnable during that time'. So by using | ||
| 47 | rq->fair_clock values we can accurately timestamp and measure the | ||
| 48 | 'expected CPU time' a task should have gotten. All runnable tasks are | ||
| 49 | sorted in the rbtree by the "rq->fair_clock - p->wait_runtime" key, and | ||
| 50 | CFS picks the 'leftmost' task and sticks to it. As the system progresses | ||
| 51 | forwards, newly woken tasks are put into the tree more and more to the | ||
| 52 | right - slowly but surely giving a chance for every task to become the | ||
| 53 | 'leftmost task' and thus get on the CPU within a deterministic amount of | ||
| 54 | time. | ||
| 55 | |||
| 56 | Some implementation details: | ||
| 57 | |||
| 58 | - the introduction of Scheduling Classes: an extensible hierarchy of | ||
| 59 | scheduler modules. These modules encapsulate scheduling policy | ||
| 60 | details and are handled by the scheduler core without the core | ||
| 61 | code assuming about them too much. | ||
| 62 | |||
| 63 | - sched_fair.c implements the 'CFS desktop scheduler': it is a | ||
| 64 | replacement for the vanilla scheduler's SCHED_OTHER interactivity | ||
| 65 | code. | ||
| 66 | |||
| 67 | I'd like to give credit to Con Kolivas for the general approach here: | ||
| 68 | he has proven via RSDL/SD that 'fair scheduling' is possible and that | ||
| 69 | it results in better desktop scheduling. Kudos Con! | ||
| 70 | |||
| 71 | The CFS patch uses a completely different approach and implementation | ||
| 72 | from RSDL/SD. My goal was to make CFS's interactivity quality exceed | ||
| 73 | that of RSDL/SD, which is a high standard to meet :-) Testing | ||
| 74 | feedback is welcome to decide this one way or another. [ and, in any | ||
| 75 | case, all of SD's logic could be added via a kernel/sched_sd.c module | ||
| 76 | as well, if Con is interested in such an approach. ] | ||
| 77 | |||
| 78 | CFS's design is quite radical: it does not use runqueues, it uses a | ||
| 79 | time-ordered rbtree to build a 'timeline' of future task execution, | ||
| 80 | and thus has no 'array switch' artifacts (by which both the vanilla | ||
| 81 | scheduler and RSDL/SD are affected). | ||
| 82 | |||
| 83 | CFS uses nanosecond granularity accounting and does not rely on any | ||
| 84 | jiffies or other HZ detail. Thus the CFS scheduler has no notion of | ||
| 85 | 'timeslices' and has no heuristics whatsoever. There is only one | ||
| 86 | central tunable (you have to switch on CONFIG_SCHED_DEBUG): | ||
| 87 | |||
| 88 | /proc/sys/kernel/sched_granularity_ns | ||
| 89 | |||
| 90 | which can be used to tune the scheduler from 'desktop' (low | ||
| 91 | latencies) to 'server' (good batching) workloads. It defaults to a | ||
| 92 | setting suitable for desktop workloads. SCHED_BATCH is handled by the | ||
| 93 | CFS scheduler module too. | ||
| 94 | |||
| 95 | Due to its design, the CFS scheduler is not prone to any of the | ||
| 96 | 'attacks' that exist today against the heuristics of the stock | ||
| 97 | scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all | ||
| 98 | work fine and do not impact interactivity and produce the expected | ||
| 99 | behavior. | ||
| 100 | |||
| 101 | the CFS scheduler has a much stronger handling of nice levels and | ||
| 102 | SCHED_BATCH: both types of workloads should be isolated much more | ||
| 103 | agressively than under the vanilla scheduler. | ||
| 104 | |||
| 105 | ( another detail: due to nanosec accounting and timeline sorting, | ||
| 106 | sched_yield() support is very simple under CFS, and in fact under | ||
| 107 | CFS sched_yield() behaves much better than under any other | ||
| 108 | scheduler i have tested so far. ) | ||
| 109 | |||
| 110 | - sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler | ||
| 111 | way than the vanilla scheduler does. It uses 100 runqueues (for all | ||
| 112 | 100 RT priority levels, instead of 140 in the vanilla scheduler) | ||
| 113 | and it needs no expired array. | ||
| 114 | |||
| 115 | - reworked/sanitized SMP load-balancing: the runqueue-walking | ||
| 116 | assumptions are gone from the load-balancing code now, and | ||
| 117 | iterators of the scheduling modules are used. The balancing code got | ||
| 118 | quite a bit simpler as a result. | ||
| 119 | |||
| 120 | |||
| 121 | Group scheduler extension to CFS | ||
| 122 | ================================ | ||
| 123 | |||
| 124 | Normally the scheduler operates on individual tasks and strives to provide | ||
| 125 | fair CPU time to each task. Sometimes, it may be desirable to group tasks | ||
| 126 | and provide fair CPU time to each such task group. For example, it may | ||
| 127 | be desirable to first provide fair CPU time to each user on the system | ||
| 128 | and then to each task belonging to a user. | ||
| 129 | |||
| 130 | CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets | ||
| 131 | SCHED_NORMAL/BATCH tasks be be grouped and divides CPU time fairly among such | ||
| 132 | groups. At present, there are two (mutually exclusive) mechanisms to group | ||
| 133 | tasks for CPU bandwidth control purpose: | ||
| 134 | |||
| 135 | - Based on user id (CONFIG_FAIR_USER_SCHED) | ||
| 136 | In this option, tasks are grouped according to their user id. | ||
| 137 | - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED) | ||
| 138 | This options lets the administrator create arbitrary groups | ||
| 139 | of tasks, using the "cgroup" pseudo filesystem. See | ||
| 140 | Documentation/cgroups.txt for more information about this | ||
| 141 | filesystem. | ||
| 142 | 5 | ||
| 143 | Only one of these options to group tasks can be chosen and not both. | 6 | 1. OVERVIEW |
| 7 | |||
| 8 | CFS stands for "Completely Fair Scheduler," and is the new "desktop" process | ||
| 9 | scheduler implemented by Ingo Molnar and merged in Linux 2.6.23. It is the | ||
| 10 | replacement for the previous vanilla scheduler's SCHED_OTHER interactivity | ||
| 11 | code. | ||
| 12 | |||
| 13 | 80% of CFS's design can be summed up in a single sentence: CFS basically models | ||
| 14 | an "ideal, precise multi-tasking CPU" on real hardware. | ||
| 15 | |||
| 16 | "Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100% physical | ||
| 17 | power and which can run each task at precise equal speed, in parallel, each at | ||
| 18 | 1/nr_running speed. For example: if there are 2 tasks running, then it runs | ||
| 19 | each at 50% physical power --- i.e., actually in parallel. | ||
| 20 | |||
| 21 | On real hardware, we can run only a single task at once, so we have to | ||
| 22 | introduce the concept of "virtual runtime." The virtual runtime of a task | ||
| 23 | specifies when its next timeslice would start execution on the ideal | ||
| 24 | multi-tasking CPU described above. In practice, the virtual runtime of a task | ||
| 25 | is its actual runtime normalized to the total number of running tasks. | ||
| 26 | |||
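[ Editor's illustration, not part of this patch: a small, self-contained userspace model of the "normalized runtime" idea. In the real kernel the scaling is done per task weight rather than literally per nr_running, which reduces to the 1/nr_running description above when all tasks have equal weight; the weights and names below are made up. ]

  /*
   * Toy userspace model of virtual runtime (illustrative only, not the
   * kernel implementation): each task's vruntime advances by the real
   * CPU time it consumed, scaled down for heavier (higher-weight) tasks.
   */
  #include <stdio.h>

  struct toy_task {
      const char *name;
      unsigned long weight;          /* larger weight => larger CPU share */
      unsigned long long vruntime;   /* ns-scale ordering key */
  };

  static void account_runtime(struct toy_task *t, unsigned long long delta_ns)
  {
      /* heavier tasks accrue vruntime more slowly, so they run longer */
      t->vruntime += delta_ns * 1024 / t->weight;
  }

  int main(void)
  {
      struct toy_task a = { "heavy", 2048, 0 };
      struct toy_task b = { "light", 1024, 0 };

      account_runtime(&a, 1000000);  /* both ran 1 ms of real time... */
      account_runtime(&b, 1000000);
      printf("%s: %llu\n%s: %llu\n", /* ...the heavier task ends up with the
                                        smaller vruntime, so it runs next */
             a.name, a.vruntime, b.name, b.vruntime);
      return 0;
  }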
| 27 | |||
| 28 | |||
| 29 | 2. FEW IMPLEMENTATION DETAILS | ||
| 30 | |||
| 31 | In CFS the virtual runtime is expressed and tracked via the per-task | ||
| 32 | p->se.vruntime (nanosec-unit) value. This way, it's possible to accurately | ||
| 33 | timestamp and measure the "expected CPU time" a task should have gotten. | ||
| 34 | |||
| 35 | [ small detail: on "ideal" hardware, at any time all tasks would have the same | ||
| 36 | p->se.vruntime value --- i.e., tasks would execute simultaneously and no task | ||
| 37 | would ever get "out of balance" from the "ideal" share of CPU time. ] | ||
| 38 | |||
| 39 | CFS's task picking logic is based on this p->se.vruntime value and it is thus | ||
| 40 | very simple: it always tries to run the task with the smallest p->se.vruntime | ||
| 41 | value (i.e., the task which executed least so far). CFS always tries to split | ||
| 42 | up CPU time between runnable tasks as close to "ideal multitasking hardware" as | ||
| 43 | possible. | ||
| 44 | |||
| 45 | Most of the rest of CFS's design just falls out of this really simple concept, | ||
| 46 | with a few add-on embellishments like nice levels, multiprocessing and various | ||
| 47 | algorithm variants to recognize sleepers. | ||
| 48 | |||
| 49 | |||
| 50 | |||
| 51 | 3. THE RBTREE | ||
| 52 | |||
| 53 | CFS's design is quite radical: it does not use the old data structures for the | ||
| 54 | runqueues, but it uses a time-ordered rbtree to build a "timeline" of future | ||
| 55 | task execution, and thus has no "array switch" artifacts (by which both the | ||
| 56 | previous vanilla scheduler and RSDL/SD are affected). | ||
| 57 | |||
| 58 | CFS also maintains the rq->cfs.min_vruntime value, which is a monotonically | ||
| 59 | increasing value tracking the smallest vruntime among all tasks in the | ||
| 60 | runqueue. The total amount of work done by the system is tracked using | ||
| 61 | min_vruntime; that value is used to place newly activated entities on the left | ||
| 62 | side of the tree as much as possible. | ||
| 63 | |||
| 64 | The total number of running tasks in the runqueue is accounted through the | ||
| 65 | rq->cfs.load value, which is the sum of the weights of the tasks queued on the | ||
| 66 | runqueue. | ||
| 67 | |||
| 68 | CFS maintains a time-ordered rbtree, where all runnable tasks are sorted by the | ||
| 69 | p->se.vruntime key (there is a subtraction using rq->cfs.min_vruntime to | ||
| 70 | account for possible wraparounds). CFS picks the "leftmost" task from this | ||
| 71 | tree and sticks to it. | ||
| 72 | As the system progresses forwards, the executed tasks are put into the tree | ||
| 73 | more and more to the right --- slowly but surely giving a chance for every task | ||
| 74 | to become the "leftmost task" and thus get on the CPU within a deterministic | ||
| 75 | amount of time. | ||
| 76 | |||
| 77 | Summing up, CFS works like this: it runs a task a bit, and when the task | ||
| 78 | schedules (or a scheduler tick happens) the task's CPU usage is "accounted | ||
| 79 | for": the (small) time it just spent using the physical CPU is added to | ||
| 80 | p->se.vruntime. Once p->se.vruntime gets high enough so that another task | ||
| 81 | becomes the "leftmost task" of the time-ordered rbtree it maintains (plus a | ||
| 82 | small amount of "granularity" distance relative to the leftmost task so that we | ||
| 83 | do not over-schedule tasks and trash the cache), then the new leftmost task is | ||
| 84 | picked and the current task is preempted. | ||
| 85 | |||
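[ Editor's illustration, not part of this patch: a rough userspace sketch of the picking rule described above. The "leftmost" task is simply the runnable task with the smallest vruntime, and the comparison is made on the signed difference against min_vruntime so counter wraparound does not break the ordering; the kernel gets the same answer from the leftmost rbtree node rather than a linear scan. ]

  /* Illustrative picker, not kernel code. */
  #include <stddef.h>

  struct toy_entity {
      unsigned long long vruntime;
      int on_rq;                       /* non-zero if runnable */
  };

  static struct toy_entity *pick_next(struct toy_entity *se, size_t n,
                                      unsigned long long min_vruntime)
  {
      struct toy_entity *leftmost = NULL;
      size_t i;

      for (i = 0; i < n; i++) {
          if (!se[i].on_rq)
              continue;
          /* signed comparison of (vruntime - min_vruntime) survives wraparound */
          if (!leftmost ||
              (long long)(se[i].vruntime - min_vruntime) <
              (long long)(leftmost->vruntime - min_vruntime))
              leftmost = &se[i];
      }
      return leftmost;                 /* NULL when nothing is runnable */
  }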
| 86 | |||
| 87 | |||
| 88 | 4. SOME FEATURES OF CFS | ||
| 89 | |||
| 90 | CFS uses nanosecond granularity accounting and does not rely on any jiffies or | ||
| 91 | other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the | ||
| 92 | way the previous scheduler had, and has no heuristics whatsoever. There is | ||
| 93 | only one central tunable (you have to switch on CONFIG_SCHED_DEBUG): | ||
| 94 | |||
| 95 | /proc/sys/kernel/sched_granularity_ns | ||
| 96 | |||
| 97 | which can be used to tune the scheduler from "desktop" (i.e., low latencies) to | ||
| 98 | "server" (i.e., good batching) workloads. It defaults to a setting suitable | ||
| 99 | for desktop workloads. SCHED_BATCH is handled by the CFS scheduler module too. | ||
| 100 | |||
| 101 | Due to its design, the CFS scheduler is not prone to any of the "attacks" that | ||
| 102 | exist today against the heuristics of the stock scheduler: fiftyp.c, thud.c, | ||
| 103 | chew.c, ring-test.c, massive_intr.c all work fine and do not impact | ||
| 104 | interactivity and produce the expected behavior. | ||
| 105 | |||
| 106 | The CFS scheduler has a much stronger handling of nice levels and SCHED_BATCH | ||
| 107 | than the previous vanilla scheduler: both types of workloads are isolated much | ||
| 108 | more aggressively. | ||
| 109 | |||
| 110 | SMP load-balancing has been reworked/sanitized: the runqueue-walking | ||
| 111 | assumptions are gone from the load-balancing code now, and iterators of the | ||
| 112 | scheduling modules are used. The balancing code got quite a bit simpler as a | ||
| 113 | result. | ||
| 114 | |||
| 115 | |||
| 116 | |||
| 117 | 5. Scheduling policies | ||
| 118 | |||
| 119 | CFS implements three scheduling policies: | ||
| 120 | |||
| 121 | - SCHED_NORMAL (traditionally called SCHED_OTHER): The scheduling | ||
| 122 | policy that is used for regular tasks. | ||
| 123 | |||
| 124 | - SCHED_BATCH: Does not preempt nearly as often as regular tasks | ||
| 125 | would, thereby allowing tasks to run longer and make better use of | ||
| 126 | caches but at the cost of interactivity. This is well suited for | ||
| 127 | batch jobs. | ||
| 128 | |||
| 129 | - SCHED_IDLE: This is even weaker than nice 19, but it is not a true | ||
| 130 | idle timer scheduler, in order to avoid getting into priority | ||
| 131 | inversion problems which would deadlock the machine. | ||
| 132 | |||
| 133 | SCHED_FIFO/_RR are implemented in sched_rt.c and are as specified by | ||
| 134 | POSIX. | ||
| 135 | |||
| 136 | The command chrt from util-linux-ng 2.13.1.1 can set all of these except | ||
| 137 | SCHED_IDLE. | ||
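[ Editor's illustration, not part of this patch: policies can also be selected programmatically via the standard sched_setscheduler() call. The sketch below switches the calling process to SCHED_BATCH; it assumes a libc that exposes SCHED_BATCH (the fallback define matches the kernel ABI value). ]

  /* Minimal sketch: put the calling process under SCHED_BATCH. */
  #define _GNU_SOURCE
  #include <sched.h>
  #include <stdio.h>

  #ifndef SCHED_BATCH
  #define SCHED_BATCH 3              /* kernel ABI value, if libc lacks it */
  #endif

  int main(void)
  {
      struct sched_param sp = { .sched_priority = 0 };  /* must be 0 here */

      if (sched_setscheduler(0, SCHED_BATCH, &sp) == -1) {
          perror("sched_setscheduler");
          return 1;
      }
      puts("now running under SCHED_BATCH");
      return 0;
  }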
| 144 | 138 | ||
| 145 | Group scheduler tunables: | ||
| 146 | 139 | ||
| 147 | When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for | 140 | |
| 148 | each new user and a "cpu_share" file is added in that directory. | 141 | 6. SCHEDULING CLASSES |
| 142 | |||
| 143 | The new CFS scheduler has been designed in such a way as to introduce "Scheduling | ||
| 144 | Classes," an extensible hierarchy of scheduler modules. These modules | ||
| 145 | encapsulate scheduling policy details and are handled by the scheduler core | ||
| 146 | without the core code assuming too much about them. | ||
| 147 | |||
| 148 | sched_fair.c implements the CFS scheduler described above. | ||
| 149 | |||
| 150 | sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler way than | ||
| 151 | the previous vanilla scheduler did. It uses 100 runqueues (for all 100 RT | ||
| 152 | priority levels, instead of 140 in the previous scheduler) and it needs no | ||
| 153 | expired array. | ||
| 154 | |||
| 155 | Scheduling classes are implemented through the sched_class structure, which | ||
| 156 | contains hooks to functions that must be called whenever an interesting event | ||
| 157 | occurs. | ||
| 158 | |||
| 159 | This is the (partial) list of the hooks: | ||
| 160 | |||
| 161 | - enqueue_task(...) | ||
| 162 | |||
| 163 | Called when a task enters a runnable state. | ||
| 164 | It puts the scheduling entity (task) into the red-black tree and | ||
| 165 | increments the nr_running variable. | ||
| 166 | |||
| 167 | - dequeue_task(...) | ||
| 168 | |||
| 169 | When a task is no longer runnable, this function is called to keep the | ||
| 170 | corresponding scheduling entity out of the red-black tree. It decrements | ||
| 171 | the nr_running variable. | ||
| 172 | |||
| 173 | - yield_task(...) | ||
| 174 | |||
| 175 | This function is basically just a dequeue followed by an enqueue, unless the | ||
| 176 | compat_yield sysctl is turned on; in that case, it places the scheduling | ||
| 177 | entity at the right-most end of the red-black tree. | ||
| 178 | |||
| 179 | - check_preempt_curr(...) | ||
| 180 | |||
| 181 | This function checks if a task that entered the runnable state should | ||
| 182 | preempt the currently running task. | ||
| 183 | |||
| 184 | - pick_next_task(...) | ||
| 185 | |||
| 186 | This function chooses the most appropriate task eligible to run next. | ||
| 187 | |||
| 188 | - set_curr_task(...) | ||
| 189 | |||
| 190 | This function is called when a task changes its scheduling class or changes | ||
| 191 | its task group. | ||
| 192 | |||
| 193 | - task_tick(...) | ||
| 194 | |||
| 195 | This function is mostly called from time tick functions; it might lead to | ||
| 196 | a process switch. This drives the preemption of the running task. | ||
| 197 | |||
| 198 | - task_new(...) | ||
| 199 | |||
| 200 | The core scheduler gives the scheduling module an opportunity to manage new | ||
| 201 | task startup. The CFS scheduling module uses it for group scheduling, while | ||
| 202 | the scheduling module for a real-time task does not use it. | ||
| 203 | |||
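[ Editor's illustration, not part of this patch: a toy C declaration that mirrors the hook list above. It is only a sketch; the real struct sched_class in include/linux/sched.h has different prototypes (several hooks take extra arguments such as the sync flag visible later in this patch) and more members. ]

  /*
   * Toy model of a "scheduling class": a table of hooks the core calls
   * without knowing the policy behind them. Prototypes are simplified.
   */
  struct toy_rq;                      /* stand-in for the per-CPU runqueue */
  struct toy_task;                    /* stand-in for task_struct */

  struct toy_sched_class {
      void (*enqueue_task)(struct toy_rq *rq, struct toy_task *p);
      void (*dequeue_task)(struct toy_rq *rq, struct toy_task *p);
      void (*yield_task)(struct toy_rq *rq);
      void (*check_preempt_curr)(struct toy_rq *rq, struct toy_task *p);
      struct toy_task *(*pick_next_task)(struct toy_rq *rq);
      void (*set_curr_task)(struct toy_rq *rq);
      void (*task_tick)(struct toy_rq *rq, struct toy_task *p);
      void (*task_new)(struct toy_rq *rq, struct toy_task *p);
  };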
| 204 | |||
| 205 | |||
| 206 | 7. GROUP SCHEDULER EXTENSIONS TO CFS | ||
| 207 | |||
| 208 | Normally, the scheduler operates on individual tasks and strives to provide | ||
| 209 | fair CPU time to each task. Sometimes, it may be desirable to group tasks and | ||
| 210 | provide fair CPU time to each such task group. For example, it may be | ||
| 211 | desirable to first provide fair CPU time to each user on the system and then to | ||
| 212 | each task belonging to a user. | ||
| 213 | |||
| 214 | CONFIG_GROUP_SCHED strives to achieve exactly that. It lets tasks be | ||
| 215 | grouped and divides CPU time fairly among such groups. | ||
| 216 | |||
| 217 | CONFIG_RT_GROUP_SCHED permits grouping real-time (i.e., SCHED_FIFO and | ||
| 218 | SCHED_RR) tasks. | ||
| 219 | |||
| 220 | CONFIG_FAIR_GROUP_SCHED permits grouping CFS (i.e., SCHED_NORMAL and | ||
| 221 | SCHED_BATCH) tasks. | ||
| 222 | |||
| 223 | At present, there are two (mutually exclusive) mechanisms to group tasks for | ||
| 224 | CPU bandwidth control purposes: | ||
| 225 | |||
| 226 | - Based on user id (CONFIG_USER_SCHED) | ||
| 227 | |||
| 228 | With this option, tasks are grouped according to their user id. | ||
| 229 | |||
| 230 | - Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED) | ||
| 231 | |||
| 232 | This option needs CONFIG_CGROUPS to be defined, and lets the administrator | ||
| 233 | create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See | ||
| 234 | Documentation/cgroups.txt for more information about this filesystem. | ||
| 235 | |||
| 236 | Only one of these options to group tasks can be chosen and not both. | ||
| 237 | |||
| 238 | When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new | ||
| 239 | user and a "cpu_share" file is added in that directory. | ||
| 149 | 240 | ||
| 150 | # cd /sys/kernel/uids | 241 | # cd /sys/kernel/uids |
| 151 | # cat 512/cpu_share # Display user 512's CPU share | 242 | # cat 512/cpu_share # Display user 512's CPU share |
| @@ -155,16 +246,14 @@ each new user and a "cpu_share" file is added in that directory. | |||
| 155 | 2048 | 246 | 2048 |
| 156 | # | 247 | # |
| 157 | 248 | ||
| 158 | CPU bandwidth between two users are divided in the ratio of their CPU shares. | 249 | CPU bandwidth between two users is divided in the ratio of their CPU shares. |
| 159 | For ex: if you would like user "root" to get twice the bandwidth of user | 250 | For example: if you would like user "root" to get twice the bandwidth of user |
| 160 | "guest", then set the cpu_share for both the users such that "root"'s | 251 | "guest," then set the cpu_share for both the users such that "root"'s cpu_share |
| 161 | cpu_share is twice "guest"'s cpu_share | 252 | is twice "guest"'s cpu_share. |
| 162 | |||
| 163 | 253 | ||
| 164 | When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created | 254 | When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each |
| 165 | for each group created using the pseudo filesystem. See example steps | 255 | group created using the pseudo filesystem. See example steps below to create |
| 166 | below to create task groups and modify their CPU share using the "cgroups" | 256 | task groups and modify their CPU share using the "cgroups" pseudo filesystem. |
| 167 | pseudo filesystem | ||
| 168 | 257 | ||
| 169 | # mkdir /dev/cpuctl | 258 | # mkdir /dev/cpuctl |
| 170 | # mount -t cgroup -ocpu none /dev/cpuctl | 259 | # mount -t cgroup -ocpu none /dev/cpuctl |
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 83df541650fc..06b6fdab639f 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c | |||
| @@ -149,6 +149,9 @@ smp_callin(void) | |||
| 149 | atomic_inc(&init_mm.mm_count); | 149 | atomic_inc(&init_mm.mm_count); |
| 150 | current->active_mm = &init_mm; | 150 | current->active_mm = &init_mm; |
| 151 | 151 | ||
| 152 | /* inform the notifiers about the new cpu */ | ||
| 153 | notify_cpu_starting(cpuid); | ||
| 154 | |||
| 152 | /* Must have completely accurate bogos. */ | 155 | /* Must have completely accurate bogos. */ |
| 153 | local_irq_enable(); | 156 | local_irq_enable(); |
| 154 | 157 | ||
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index e9842f6767f9..e42a749a56dd 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c | |||
| @@ -277,6 +277,7 @@ asmlinkage void __cpuinit secondary_start_kernel(void) | |||
| 277 | /* | 277 | /* |
| 278 | * Enable local interrupts. | 278 | * Enable local interrupts. |
| 279 | */ | 279 | */ |
| 280 | notify_cpu_starting(cpu); | ||
| 280 | local_irq_enable(); | 281 | local_irq_enable(); |
| 281 | local_fiq_enable(); | 282 | local_fiq_enable(); |
| 282 | 283 | ||
diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index 952a24b2f5a9..52e16c6436f9 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c | |||
| @@ -178,6 +178,7 @@ void __init smp_callin(void) | |||
| 178 | unmask_irq(IPI_INTR_VECT); | 178 | unmask_irq(IPI_INTR_VECT); |
| 179 | unmask_irq(TIMER0_INTR_VECT); | 179 | unmask_irq(TIMER0_INTR_VECT); |
| 180 | preempt_disable(); | 180 | preempt_disable(); |
| 181 | notify_cpu_starting(cpu); | ||
| 181 | local_irq_enable(); | 182 | local_irq_enable(); |
| 182 | 183 | ||
| 183 | cpu_set(cpu, cpu_online_map); | 184 | cpu_set(cpu, cpu_online_map); |
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index d8f05e504fbf..1dcbb85fc4ee 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c | |||
| @@ -401,6 +401,7 @@ smp_callin (void) | |||
| 401 | spin_lock(&vector_lock); | 401 | spin_lock(&vector_lock); |
| 402 | /* Setup the per cpu irq handling data structures */ | 402 | /* Setup the per cpu irq handling data structures */ |
| 403 | __setup_vector_irq(cpuid); | 403 | __setup_vector_irq(cpuid); |
| 404 | notify_cpu_starting(cpuid); | ||
| 404 | cpu_set(cpuid, cpu_online_map); | 405 | cpu_set(cpuid, cpu_online_map); |
| 405 | per_cpu(cpu_state, cpuid) = CPU_ONLINE; | 406 | per_cpu(cpu_state, cpuid) = CPU_ONLINE; |
| 406 | spin_unlock(&vector_lock); | 407 | spin_unlock(&vector_lock); |
diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c index 2c03ac1d005f..fc2994811f15 100644 --- a/arch/m32r/kernel/smpboot.c +++ b/arch/m32r/kernel/smpboot.c | |||
| @@ -498,6 +498,8 @@ static void __init smp_online(void) | |||
| 498 | { | 498 | { |
| 499 | int cpu_id = smp_processor_id(); | 499 | int cpu_id = smp_processor_id(); |
| 500 | 500 | ||
| 501 | notify_cpu_starting(cpu_id); | ||
| 502 | |||
| 501 | local_irq_enable(); | 503 | local_irq_enable(); |
| 502 | 504 | ||
| 503 | /* Get our bogomips. */ | 505 | /* Get our bogomips. */ |
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 4410f172b8ab..7b59cfb7e602 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c | |||
| @@ -121,6 +121,8 @@ asmlinkage __cpuinit void start_secondary(void) | |||
| 121 | cpu = smp_processor_id(); | 121 | cpu = smp_processor_id(); |
| 122 | cpu_data[cpu].udelay_val = loops_per_jiffy; | 122 | cpu_data[cpu].udelay_val = loops_per_jiffy; |
| 123 | 123 | ||
| 124 | notify_cpu_starting(cpu); | ||
| 125 | |||
| 124 | mp_ops->smp_finish(); | 126 | mp_ops->smp_finish(); |
| 125 | set_cpu_sibling_map(cpu); | 127 | set_cpu_sibling_map(cpu); |
| 126 | 128 | ||
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 5337ca7bb649..c27b10a1bd79 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c | |||
| @@ -453,6 +453,7 @@ int __devinit start_secondary(void *unused) | |||
| 453 | secondary_cpu_time_init(); | 453 | secondary_cpu_time_init(); |
| 454 | 454 | ||
| 455 | ipi_call_lock(); | 455 | ipi_call_lock(); |
| 456 | notify_cpu_starting(cpu); | ||
| 456 | cpu_set(cpu, cpu_online_map); | 457 | cpu_set(cpu, cpu_online_map); |
| 457 | /* Update sibling maps */ | 458 | /* Update sibling maps */ |
| 458 | base = cpu_first_thread_in_core(cpu); | 459 | base = cpu_first_thread_in_core(cpu); |
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 00b9b4dec5eb..9e8b1f9b8f4d 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c | |||
| @@ -585,6 +585,8 @@ int __cpuinit start_secondary(void *cpuvoid) | |||
| 585 | /* Enable pfault pseudo page faults on this cpu. */ | 585 | /* Enable pfault pseudo page faults on this cpu. */ |
| 586 | pfault_init(); | 586 | pfault_init(); |
| 587 | 587 | ||
| 588 | /* call cpu notifiers */ | ||
| 589 | notify_cpu_starting(smp_processor_id()); | ||
| 588 | /* Mark this cpu as online */ | 590 | /* Mark this cpu as online */ |
| 589 | spin_lock(&call_lock); | 591 | spin_lock(&call_lock); |
| 590 | cpu_set(smp_processor_id(), cpu_online_map); | 592 | cpu_set(smp_processor_id(), cpu_online_map); |
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 60c50841143e..001778f9adaf 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c | |||
| @@ -82,6 +82,8 @@ asmlinkage void __cpuinit start_secondary(void) | |||
| 82 | 82 | ||
| 83 | preempt_disable(); | 83 | preempt_disable(); |
| 84 | 84 | ||
| 85 | notify_cpu_starting(smp_processor_id()); | ||
| 86 | |||
| 85 | local_irq_enable(); | 87 | local_irq_enable(); |
| 86 | 88 | ||
| 87 | calibrate_delay(); | 89 | calibrate_delay(); |
diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c index 69596402a500..446767e8f569 100644 --- a/arch/sparc/kernel/sun4d_smp.c +++ b/arch/sparc/kernel/sun4d_smp.c | |||
| @@ -88,6 +88,7 @@ void __init smp4d_callin(void) | |||
| 88 | local_flush_cache_all(); | 88 | local_flush_cache_all(); |
| 89 | local_flush_tlb_all(); | 89 | local_flush_tlb_all(); |
| 90 | 90 | ||
| 91 | notify_cpu_starting(cpuid); | ||
| 91 | /* | 92 | /* |
| 92 | * Unblock the master CPU _only_ when the scheduler state | 93 | * Unblock the master CPU _only_ when the scheduler state |
| 93 | * of all secondary CPUs will be up-to-date, so after | 94 | * of all secondary CPUs will be up-to-date, so after |
diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c index a14a76ac7f36..9964890dc1db 100644 --- a/arch/sparc/kernel/sun4m_smp.c +++ b/arch/sparc/kernel/sun4m_smp.c | |||
| @@ -71,6 +71,8 @@ void __cpuinit smp4m_callin(void) | |||
| 71 | local_flush_cache_all(); | 71 | local_flush_cache_all(); |
| 72 | local_flush_tlb_all(); | 72 | local_flush_tlb_all(); |
| 73 | 73 | ||
| 74 | notify_cpu_starting(cpuid); | ||
| 75 | |||
| 74 | /* Get our local ticker going. */ | 76 | /* Get our local ticker going. */ |
| 75 | smp_setup_percpu_timer(); | 77 | smp_setup_percpu_timer(); |
| 76 | 78 | ||
diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c index be2d50c3aa95..045772142844 100644 --- a/arch/um/kernel/smp.c +++ b/arch/um/kernel/smp.c | |||
| @@ -85,6 +85,7 @@ static int idle_proc(void *cpup) | |||
| 85 | while (!cpu_isset(cpu, smp_commenced_mask)) | 85 | while (!cpu_isset(cpu, smp_commenced_mask)) |
| 86 | cpu_relax(); | 86 | cpu_relax(); |
| 87 | 87 | ||
| 88 | notify_cpu_starting(cpu); | ||
| 88 | cpu_set(cpu, cpu_online_map); | 89 | cpu_set(cpu, cpu_online_map); |
| 89 | default_idle(); | 90 | default_idle(); |
| 90 | return 0; | 91 | return 0; |
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7985c5b3f916..0b8261c3cac2 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
| @@ -257,6 +257,7 @@ static void __cpuinit smp_callin(void) | |||
| 257 | end_local_APIC_setup(); | 257 | end_local_APIC_setup(); |
| 258 | map_cpu_to_logical_apicid(); | 258 | map_cpu_to_logical_apicid(); |
| 259 | 259 | ||
| 260 | notify_cpu_starting(cpuid); | ||
| 260 | /* | 261 | /* |
| 261 | * Get our bogomips. | 262 | * Get our bogomips. |
| 262 | * | 263 | * |
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index ee0fba092157..199a5f4a873c 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c | |||
| @@ -448,6 +448,8 @@ static void __init start_secondary(void *unused) | |||
| 448 | 448 | ||
| 449 | VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid)); | 449 | VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid)); |
| 450 | 450 | ||
| 451 | notify_cpu_starting(cpuid); | ||
| 452 | |||
| 451 | /* enable interrupts */ | 453 | /* enable interrupts */ |
| 452 | local_irq_enable(); | 454 | local_irq_enable(); |
| 453 | 455 | ||
diff --git a/include/linux/completion.h b/include/linux/completion.h index 02ef8835999c..4a6b604ef7e4 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h | |||
| @@ -10,6 +10,18 @@ | |||
| 10 | 10 | ||
| 11 | #include <linux/wait.h> | 11 | #include <linux/wait.h> |
| 12 | 12 | ||
| 13 | /** | ||
| 14 | * struct completion - structure used to maintain state for a "completion" | ||
| 15 | * | ||
| 16 | * This is the opaque structure used to maintain the state for a "completion". | ||
| 17 | * Completions currently use a FIFO to queue threads that have to wait for | ||
| 18 | * the "completion" event. | ||
| 19 | * | ||
| 20 | * See also: complete(), wait_for_completion() (and friends _timeout, | ||
| 21 | * _interruptible, _interruptible_timeout, and _killable), init_completion(), | ||
| 22 | * and macros DECLARE_COMPLETION(), DECLARE_COMPLETION_ONSTACK(), and | ||
| 23 | * INIT_COMPLETION(). | ||
| 24 | */ | ||
| 13 | struct completion { | 25 | struct completion { |
| 14 | unsigned int done; | 26 | unsigned int done; |
| 15 | wait_queue_head_t wait; | 27 | wait_queue_head_t wait; |
| @@ -21,6 +33,14 @@ struct completion { | |||
| 21 | #define COMPLETION_INITIALIZER_ONSTACK(work) \ | 33 | #define COMPLETION_INITIALIZER_ONSTACK(work) \ |
| 22 | ({ init_completion(&work); work; }) | 34 | ({ init_completion(&work); work; }) |
| 23 | 35 | ||
| 36 | /** | ||
| 37 | * DECLARE_COMPLETION: - declare and initialize a completion structure | ||
| 38 | * @work: identifier for the completion structure | ||
| 39 | * | ||
| 40 | * This macro declares and initializes a completion structure. Generally used | ||
| 41 | * for static declarations. You should use the _ONSTACK variant for automatic | ||
| 42 | * variables. | ||
| 43 | */ | ||
| 24 | #define DECLARE_COMPLETION(work) \ | 44 | #define DECLARE_COMPLETION(work) \ |
| 25 | struct completion work = COMPLETION_INITIALIZER(work) | 45 | struct completion work = COMPLETION_INITIALIZER(work) |
| 26 | 46 | ||
| @@ -29,6 +49,13 @@ struct completion { | |||
| 29 | * completions - so we use the _ONSTACK() variant for those that | 49 | * completions - so we use the _ONSTACK() variant for those that |
| 30 | * are on the kernel stack: | 50 | * are on the kernel stack: |
| 31 | */ | 51 | */ |
| 52 | /** | ||
| 53 | * DECLARE_COMPLETION_ONSTACK: - declare and initialize a completion structure | ||
| 54 | * @work: identifier for the completion structure | ||
| 55 | * | ||
| 56 | * This macro declares and initializes a completion structure on the kernel | ||
| 57 | * stack. | ||
| 58 | */ | ||
| 32 | #ifdef CONFIG_LOCKDEP | 59 | #ifdef CONFIG_LOCKDEP |
| 33 | # define DECLARE_COMPLETION_ONSTACK(work) \ | 60 | # define DECLARE_COMPLETION_ONSTACK(work) \ |
| 34 | struct completion work = COMPLETION_INITIALIZER_ONSTACK(work) | 61 | struct completion work = COMPLETION_INITIALIZER_ONSTACK(work) |
| @@ -36,6 +63,13 @@ struct completion { | |||
| 36 | # define DECLARE_COMPLETION_ONSTACK(work) DECLARE_COMPLETION(work) | 63 | # define DECLARE_COMPLETION_ONSTACK(work) DECLARE_COMPLETION(work) |
| 37 | #endif | 64 | #endif |
| 38 | 65 | ||
| 66 | /** | ||
| 67 | * init_completion: - Initialize a dynamically allocated completion | ||
| 68 | * @x: completion structure that is to be initialized | ||
| 69 | * | ||
| 70 | * This inline function will initialize a dynamically created completion | ||
| 71 | * structure. | ||
| 72 | */ | ||
| 39 | static inline void init_completion(struct completion *x) | 73 | static inline void init_completion(struct completion *x) |
| 40 | { | 74 | { |
| 41 | x->done = 0; | 75 | x->done = 0; |
| @@ -55,6 +89,13 @@ extern bool completion_done(struct completion *x); | |||
| 55 | extern void complete(struct completion *); | 89 | extern void complete(struct completion *); |
| 56 | extern void complete_all(struct completion *); | 90 | extern void complete_all(struct completion *); |
| 57 | 91 | ||
| 92 | /** | ||
| 93 | * INIT_COMPLETION: - reinitialize a completion structure | ||
| 94 | * @x: completion structure to be reinitialized | ||
| 95 | * | ||
| 96 | * This macro should be used to reinitialize a completion structure so it can | ||
| 97 | * be reused. This is especially important after complete_all() is used. | ||
| 98 | */ | ||
| 58 | #define INIT_COMPLETION(x) ((x).done = 0) | 99 | #define INIT_COMPLETION(x) ((x).done = 0) |
| 59 | 100 | ||
| 60 | 101 | ||
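[ Editor's note, not part of this patch: a brief usage sketch for the completion interfaces documented in the hunk above. One context blocks in wait_for_completion() until another calls complete(); INIT_COMPLETION() rearms the object for reuse. The my_* names are made up for illustration. ]

  /* Usage sketch, assuming a kernel-module context. */
  #include <linux/completion.h>

  static DECLARE_COMPLETION(my_done);     /* static object, no _ONSTACK needed */

  /* completer side: e.g. run from an interrupt handler or another thread */
  static void my_work_finished(void)
  {
      complete(&my_done);                 /* wakes one waiter */
  }

  /* waiter side: blocks until my_work_finished() has run */
  static void wait_for_my_work(void)
  {
      wait_for_completion(&my_done);
      INIT_COMPLETION(my_done);           /* rearm before the next round */
  }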
diff --git a/include/linux/cpu.h b/include/linux/cpu.h index d7faf8808497..c2747ac2ae43 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h | |||
| @@ -69,6 +69,7 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb) | |||
| 69 | #endif | 69 | #endif |
| 70 | 70 | ||
| 71 | int cpu_up(unsigned int cpu); | 71 | int cpu_up(unsigned int cpu); |
| 72 | void notify_cpu_starting(unsigned int cpu); | ||
| 72 | extern void cpu_hotplug_init(void); | 73 | extern void cpu_hotplug_init(void); |
| 73 | extern void cpu_maps_update_begin(void); | 74 | extern void cpu_maps_update_begin(void); |
| 74 | extern void cpu_maps_update_done(void); | 75 | extern void cpu_maps_update_done(void); |
diff --git a/include/linux/notifier.h b/include/linux/notifier.h index da2698b0fdd1..b86fa2ffca0c 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h | |||
| @@ -213,9 +213,16 @@ static inline int notifier_to_errno(int ret) | |||
| 213 | #define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */ | 213 | #define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */ |
| 214 | #define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ | 214 | #define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ |
| 215 | #define CPU_DYING 0x0008 /* CPU (unsigned)v not running any task, | 215 | #define CPU_DYING 0x0008 /* CPU (unsigned)v not running any task, |
| 216 | * not handling interrupts, soon dead */ | 216 | * not handling interrupts, soon dead. |
| 217 | * Called on the dying cpu, interrupts | ||
| 218 | * are already disabled. Must not | ||
| 219 | * sleep, must not fail */ | ||
| 217 | #define CPU_POST_DEAD 0x0009 /* CPU (unsigned)v dead, cpu_hotplug | 220 | #define CPU_POST_DEAD 0x0009 /* CPU (unsigned)v dead, cpu_hotplug |
| 218 | * lock is dropped */ | 221 | * lock is dropped */ |
| 222 | #define CPU_STARTING 0x000A /* CPU (unsigned)v soon running. | ||
| 223 | * Called on the new cpu, just before | ||
| 224 | * enabling interrupts. Must not sleep, | ||
| 225 | * must not fail */ | ||
| 219 | 226 | ||
| 220 | /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend | 227 | /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend |
| 221 | * operation in progress | 228 | * operation in progress |
| @@ -229,6 +236,7 @@ static inline int notifier_to_errno(int ret) | |||
| 229 | #define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN) | 236 | #define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN) |
| 230 | #define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN) | 237 | #define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN) |
| 231 | #define CPU_DYING_FROZEN (CPU_DYING | CPU_TASKS_FROZEN) | 238 | #define CPU_DYING_FROZEN (CPU_DYING | CPU_TASKS_FROZEN) |
| 239 | #define CPU_STARTING_FROZEN (CPU_STARTING | CPU_TASKS_FROZEN) | ||
| 232 | 240 | ||
| 233 | /* Hibernation and suspend events */ | 241 | /* Hibernation and suspend events */ |
| 234 | #define PM_HIBERNATION_PREPARE 0x0001 /* Going to hibernate */ | 242 | #define PM_HIBERNATION_PREPARE 0x0001 /* Going to hibernate */ |
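[ Editor's note, not part of this patch: a hedged sketch of how a subsystem could consume the new CPU_STARTING event from its CPU-hotplug notifier, much like the existing CPU_DYING case. The my_* names are made up. Per the comment added above, the callback runs on the starting CPU with interrupts still disabled, so it must not sleep or fail. ]

  /* Sketch of a consumer of the new CPU_STARTING notification. */
  #include <linux/cpu.h>
  #include <linux/notifier.h>

  static void my_percpu_setup(unsigned int cpu)
  {
      /* per-CPU state initialization; must not sleep in this path */
  }

  static int my_cpu_callback(struct notifier_block *nb,
                             unsigned long action, void *hcpu)
  {
      unsigned int cpu = (unsigned long)hcpu;

      switch (action) {
      case CPU_STARTING:
      case CPU_STARTING_FROZEN:
          my_percpu_setup(cpu);           /* runs on 'cpu', IRQs still off */
          break;
      default:
          break;
      }
      return NOTIFY_OK;
  }

  static struct notifier_block my_cpu_nb = {
      .notifier_call = my_cpu_callback,
  };

  /* somewhere in init code:  register_cpu_notifier(&my_cpu_nb);  */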
diff --git a/include/linux/proportions.h b/include/linux/proportions.h index 5afc1b23346d..cf793bbbd05e 100644 --- a/include/linux/proportions.h +++ b/include/linux/proportions.h | |||
| @@ -104,8 +104,8 @@ struct prop_local_single { | |||
| 104 | * snapshot of the last seen global state | 104 | * snapshot of the last seen global state |
| 105 | * and a lock protecting this state | 105 | * and a lock protecting this state |
| 106 | */ | 106 | */ |
| 107 | int shift; | ||
| 108 | unsigned long period; | 107 | unsigned long period; |
| 108 | int shift; | ||
| 109 | spinlock_t lock; /* protect the snapshot state */ | 109 | spinlock_t lock; /* protect the snapshot state */ |
| 110 | }; | 110 | }; |
| 111 | 111 | ||
diff --git a/include/linux/sched.h b/include/linux/sched.h index 3d9120c5ad15..d8e699b55858 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -451,8 +451,8 @@ struct signal_struct { | |||
| 451 | * - everyone except group_exit_task is stopped during signal delivery | 451 | * - everyone except group_exit_task is stopped during signal delivery |
| 452 | * of fatal signals, group_exit_task processes the signal. | 452 | * of fatal signals, group_exit_task processes the signal. |
| 453 | */ | 453 | */ |
| 454 | struct task_struct *group_exit_task; | ||
| 455 | int notify_count; | 454 | int notify_count; |
| 455 | struct task_struct *group_exit_task; | ||
| 456 | 456 | ||
| 457 | /* thread group stop support, overloads group_exit_code too */ | 457 | /* thread group stop support, overloads group_exit_code too */ |
| 458 | int group_stop_count; | 458 | int group_stop_count; |
| @@ -897,7 +897,7 @@ struct sched_class { | |||
| 897 | void (*yield_task) (struct rq *rq); | 897 | void (*yield_task) (struct rq *rq); |
| 898 | int (*select_task_rq)(struct task_struct *p, int sync); | 898 | int (*select_task_rq)(struct task_struct *p, int sync); |
| 899 | 899 | ||
| 900 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); | 900 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync); |
| 901 | 901 | ||
| 902 | struct task_struct * (*pick_next_task) (struct rq *rq); | 902 | struct task_struct * (*pick_next_task) (struct rq *rq); |
| 903 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); | 903 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); |
| @@ -1010,8 +1010,8 @@ struct sched_entity { | |||
| 1010 | 1010 | ||
| 1011 | struct sched_rt_entity { | 1011 | struct sched_rt_entity { |
| 1012 | struct list_head run_list; | 1012 | struct list_head run_list; |
| 1013 | unsigned int time_slice; | ||
| 1014 | unsigned long timeout; | 1013 | unsigned long timeout; |
| 1014 | unsigned int time_slice; | ||
| 1015 | int nr_cpus_allowed; | 1015 | int nr_cpus_allowed; |
| 1016 | 1016 | ||
| 1017 | struct sched_rt_entity *back; | 1017 | struct sched_rt_entity *back; |
diff --git a/kernel/cpu.c b/kernel/cpu.c index f17e9854c246..86d49045daed 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param) | |||
| 199 | struct take_cpu_down_param *param = _param; | 199 | struct take_cpu_down_param *param = _param; |
| 200 | int err; | 200 | int err; |
| 201 | 201 | ||
| 202 | raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, | ||
| 203 | param->hcpu); | ||
| 204 | /* Ensure this CPU doesn't handle any more interrupts. */ | 202 | /* Ensure this CPU doesn't handle any more interrupts. */ |
| 205 | err = __cpu_disable(); | 203 | err = __cpu_disable(); |
| 206 | if (err < 0) | 204 | if (err < 0) |
| 207 | return err; | 205 | return err; |
| 208 | 206 | ||
| 207 | raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, | ||
| 208 | param->hcpu); | ||
| 209 | |||
| 209 | /* Force idle task to run as soon as we yield: it should | 210 | /* Force idle task to run as soon as we yield: it should |
| 210 | immediately notice cpu is offline and die quickly. */ | 211 | immediately notice cpu is offline and die quickly. */ |
| 211 | sched_idle_next(); | 212 | sched_idle_next(); |
| @@ -453,6 +454,25 @@ out: | |||
| 453 | } | 454 | } |
| 454 | #endif /* CONFIG_PM_SLEEP_SMP */ | 455 | #endif /* CONFIG_PM_SLEEP_SMP */ |
| 455 | 456 | ||
| 457 | /** | ||
| 458 | * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers | ||
| 459 | * @cpu: cpu that just started | ||
| 460 | * | ||
| 461 | * This function calls the cpu_chain notifiers with CPU_STARTING. | ||
| 462 | * It must be called by the arch code on the new cpu, before the new cpu | ||
| 463 | * enables interrupts and before the "boot" cpu returns from __cpu_up(). | ||
| 464 | */ | ||
| 465 | void notify_cpu_starting(unsigned int cpu) | ||
| 466 | { | ||
| 467 | unsigned long val = CPU_STARTING; | ||
| 468 | |||
| 469 | #ifdef CONFIG_PM_SLEEP_SMP | ||
| 470 | if (cpu_isset(cpu, frozen_cpus)) | ||
| 471 | val = CPU_STARTING_FROZEN; | ||
| 472 | #endif /* CONFIG_PM_SLEEP_SMP */ | ||
| 473 | raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); | ||
| 474 | } | ||
| 475 | |||
| 456 | #endif /* CONFIG_SMP */ | 476 | #endif /* CONFIG_SMP */ |
| 457 | 477 | ||
| 458 | /* | 478 | /* |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 827cd9adccb2..eab7bd6628e0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -1921,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | |||
| 1921 | * that has tasks along with an empty 'mems'. But if we did see such | 1921 | * that has tasks along with an empty 'mems'. But if we did see such |
| 1922 | * a cpuset, we'd handle it just like we do if its 'cpus' was empty. | 1922 | * a cpuset, we'd handle it just like we do if its 'cpus' was empty. |
| 1923 | */ | 1923 | */ |
| 1924 | static void scan_for_empty_cpusets(const struct cpuset *root) | 1924 | static void scan_for_empty_cpusets(struct cpuset *root) |
| 1925 | { | 1925 | { |
| 1926 | LIST_HEAD(queue); | 1926 | LIST_HEAD(queue); |
| 1927 | struct cpuset *cp; /* scans cpusets being updated */ | 1927 | struct cpuset *cp; /* scans cpusets being updated */ |
diff --git a/kernel/sched.c b/kernel/sched.c index ad1962dc0aa2..9715f4ce6cfe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -204,11 +204,16 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | |||
| 204 | rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; | 204 | rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; |
| 205 | } | 205 | } |
| 206 | 206 | ||
| 207 | static inline int rt_bandwidth_enabled(void) | ||
| 208 | { | ||
| 209 | return sysctl_sched_rt_runtime >= 0; | ||
| 210 | } | ||
| 211 | |||
| 207 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | 212 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) |
| 208 | { | 213 | { |
| 209 | ktime_t now; | 214 | ktime_t now; |
| 210 | 215 | ||
| 211 | if (rt_b->rt_runtime == RUNTIME_INF) | 216 | if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) |
| 212 | return; | 217 | return; |
| 213 | 218 | ||
| 214 | if (hrtimer_active(&rt_b->rt_period_timer)) | 219 | if (hrtimer_active(&rt_b->rt_period_timer)) |
| @@ -298,9 +303,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | |||
| 298 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 303 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
| 299 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 304 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; |
| 300 | #endif /* CONFIG_RT_GROUP_SCHED */ | 305 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 301 | #else /* !CONFIG_FAIR_GROUP_SCHED */ | 306 | #else /* !CONFIG_USER_SCHED */ |
| 302 | #define root_task_group init_task_group | 307 | #define root_task_group init_task_group |
| 303 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 308 | #endif /* CONFIG_USER_SCHED */ |
| 304 | 309 | ||
| 305 | /* task_group_lock serializes add/remove of task groups and also changes to | 310 | /* task_group_lock serializes add/remove of task groups and also changes to |
| 306 | * a task group's cpu shares. | 311 | * a task group's cpu shares. |
| @@ -604,9 +609,9 @@ struct rq { | |||
| 604 | 609 | ||
| 605 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 610 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
| 606 | 611 | ||
| 607 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) | 612 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) |
| 608 | { | 613 | { |
| 609 | rq->curr->sched_class->check_preempt_curr(rq, p); | 614 | rq->curr->sched_class->check_preempt_curr(rq, p, sync); |
| 610 | } | 615 | } |
| 611 | 616 | ||
| 612 | static inline int cpu_of(struct rq *rq) | 617 | static inline int cpu_of(struct rq *rq) |
| @@ -1102,7 +1107,7 @@ static void hrtick_start(struct rq *rq, u64 delay) | |||
| 1102 | hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); | 1107 | hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); |
| 1103 | } | 1108 | } |
| 1104 | 1109 | ||
| 1105 | static void init_hrtick(void) | 1110 | static inline void init_hrtick(void) |
| 1106 | { | 1111 | { |
| 1107 | } | 1112 | } |
| 1108 | #endif /* CONFIG_SMP */ | 1113 | #endif /* CONFIG_SMP */ |
| @@ -1121,7 +1126,7 @@ static void init_rq_hrtick(struct rq *rq) | |||
| 1121 | rq->hrtick_timer.function = hrtick; | 1126 | rq->hrtick_timer.function = hrtick; |
| 1122 | rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; | 1127 | rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; |
| 1123 | } | 1128 | } |
| 1124 | #else | 1129 | #else /* CONFIG_SCHED_HRTICK */ |
| 1125 | static inline void hrtick_clear(struct rq *rq) | 1130 | static inline void hrtick_clear(struct rq *rq) |
| 1126 | { | 1131 | { |
| 1127 | } | 1132 | } |
| @@ -1133,7 +1138,7 @@ static inline void init_rq_hrtick(struct rq *rq) | |||
| 1133 | static inline void init_hrtick(void) | 1138 | static inline void init_hrtick(void) |
| 1134 | { | 1139 | { |
| 1135 | } | 1140 | } |
| 1136 | #endif | 1141 | #endif /* CONFIG_SCHED_HRTICK */ |
| 1137 | 1142 | ||
| 1138 | /* | 1143 | /* |
| 1139 | * resched_task - mark a task 'to be rescheduled now'. | 1144 | * resched_task - mark a task 'to be rescheduled now'. |
| @@ -1380,38 +1385,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) | |||
| 1380 | update_load_sub(&rq->load, load); | 1385 | update_load_sub(&rq->load, load); |
| 1381 | } | 1386 | } |
| 1382 | 1387 | ||
| 1383 | #ifdef CONFIG_SMP | 1388 | #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) |
| 1384 | static unsigned long source_load(int cpu, int type); | 1389 | typedef int (*tg_visitor)(struct task_group *, void *); |
| 1385 | static unsigned long target_load(int cpu, int type); | ||
| 1386 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | ||
| 1387 | |||
| 1388 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
| 1389 | { | ||
| 1390 | struct rq *rq = cpu_rq(cpu); | ||
| 1391 | |||
| 1392 | if (rq->nr_running) | ||
| 1393 | rq->avg_load_per_task = rq->load.weight / rq->nr_running; | ||
| 1394 | |||
| 1395 | return rq->avg_load_per_task; | ||
| 1396 | } | ||
| 1397 | |||
| 1398 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1399 | |||
| 1400 | typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); | ||
| 1401 | 1390 | ||
| 1402 | /* | 1391 | /* |
| 1403 | * Iterate the full tree, calling @down when first entering a node and @up when | 1392 | * Iterate the full tree, calling @down when first entering a node and @up when |
| 1404 | * leaving it for the final time. | 1393 | * leaving it for the final time. |
| 1405 | */ | 1394 | */ |
| 1406 | static void | 1395 | static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) |
| 1407 | walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) | ||
| 1408 | { | 1396 | { |
| 1409 | struct task_group *parent, *child; | 1397 | struct task_group *parent, *child; |
| 1398 | int ret; | ||
| 1410 | 1399 | ||
| 1411 | rcu_read_lock(); | 1400 | rcu_read_lock(); |
| 1412 | parent = &root_task_group; | 1401 | parent = &root_task_group; |
| 1413 | down: | 1402 | down: |
| 1414 | (*down)(parent, cpu, sd); | 1403 | ret = (*down)(parent, data); |
| 1404 | if (ret) | ||
| 1405 | goto out_unlock; | ||
| 1415 | list_for_each_entry_rcu(child, &parent->children, siblings) { | 1406 | list_for_each_entry_rcu(child, &parent->children, siblings) { |
| 1416 | parent = child; | 1407 | parent = child; |
| 1417 | goto down; | 1408 | goto down; |
| @@ -1419,15 +1410,43 @@ down: | |||
| 1419 | up: | 1410 | up: |
| 1420 | continue; | 1411 | continue; |
| 1421 | } | 1412 | } |
| 1422 | (*up)(parent, cpu, sd); | 1413 | ret = (*up)(parent, data); |
| 1414 | if (ret) | ||
| 1415 | goto out_unlock; | ||
| 1423 | 1416 | ||
| 1424 | child = parent; | 1417 | child = parent; |
| 1425 | parent = parent->parent; | 1418 | parent = parent->parent; |
| 1426 | if (parent) | 1419 | if (parent) |
| 1427 | goto up; | 1420 | goto up; |
| 1421 | out_unlock: | ||
| 1428 | rcu_read_unlock(); | 1422 | rcu_read_unlock(); |
| 1423 | |||
| 1424 | return ret; | ||
| 1429 | } | 1425 | } |
| 1430 | 1426 | ||
| 1427 | static int tg_nop(struct task_group *tg, void *data) | ||
| 1428 | { | ||
| 1429 | return 0; | ||
| 1430 | } | ||
| 1431 | #endif | ||
| 1432 | |||
| 1433 | #ifdef CONFIG_SMP | ||
| 1434 | static unsigned long source_load(int cpu, int type); | ||
| 1435 | static unsigned long target_load(int cpu, int type); | ||
| 1436 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | ||
| 1437 | |||
| 1438 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
| 1439 | { | ||
| 1440 | struct rq *rq = cpu_rq(cpu); | ||
| 1441 | |||
| 1442 | if (rq->nr_running) | ||
| 1443 | rq->avg_load_per_task = rq->load.weight / rq->nr_running; | ||
| 1444 | |||
| 1445 | return rq->avg_load_per_task; | ||
| 1446 | } | ||
| 1447 | |||
| 1448 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1449 | |||
| 1431 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1450 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
| 1432 | 1451 | ||
| 1433 | /* | 1452 | /* |
| @@ -1486,11 +1505,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, | |||
| 1486 | * This needs to be done in a bottom-up fashion because the rq weight of a | 1505 | * This needs to be done in a bottom-up fashion because the rq weight of a |
| 1487 | * parent group depends on the shares of its child groups. | 1506 | * parent group depends on the shares of its child groups. |
| 1488 | */ | 1507 | */ |
| 1489 | static void | 1508 | static int tg_shares_up(struct task_group *tg, void *data) |
| 1490 | tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
| 1491 | { | 1509 | { |
| 1492 | unsigned long rq_weight = 0; | 1510 | unsigned long rq_weight = 0; |
| 1493 | unsigned long shares = 0; | 1511 | unsigned long shares = 0; |
| 1512 | struct sched_domain *sd = data; | ||
| 1494 | int i; | 1513 | int i; |
| 1495 | 1514 | ||
| 1496 | for_each_cpu_mask(i, sd->span) { | 1515 | for_each_cpu_mask(i, sd->span) { |
| @@ -1515,6 +1534,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) | |||
| 1515 | __update_group_shares_cpu(tg, i, shares, rq_weight); | 1534 | __update_group_shares_cpu(tg, i, shares, rq_weight); |
| 1516 | spin_unlock_irqrestore(&rq->lock, flags); | 1535 | spin_unlock_irqrestore(&rq->lock, flags); |
| 1517 | } | 1536 | } |
| 1537 | |||
| 1538 | return 0; | ||
| 1518 | } | 1539 | } |
| 1519 | 1540 | ||
| 1520 | /* | 1541 | /* |
| @@ -1522,10 +1543,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) | |||
| 1522 | * This needs to be done in a top-down fashion because the load of a child | 1543 | * This needs to be done in a top-down fashion because the load of a child |
| 1523 | * group is a fraction of its parents load. | 1544 | * group is a fraction of its parents load. |
| 1524 | */ | 1545 | */ |
| 1525 | static void | 1546 | static int tg_load_down(struct task_group *tg, void *data) |
| 1526 | tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
| 1527 | { | 1547 | { |
| 1528 | unsigned long load; | 1548 | unsigned long load; |
| 1549 | long cpu = (long)data; | ||
| 1529 | 1550 | ||
| 1530 | if (!tg->parent) { | 1551 | if (!tg->parent) { |
| 1531 | load = cpu_rq(cpu)->load.weight; | 1552 | load = cpu_rq(cpu)->load.weight; |
| @@ -1536,11 +1557,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) | |||
| 1536 | } | 1557 | } |
| 1537 | 1558 | ||
| 1538 | tg->cfs_rq[cpu]->h_load = load; | 1559 | tg->cfs_rq[cpu]->h_load = load; |
| 1539 | } | ||
| 1540 | 1560 | ||
| 1541 | static void | 1561 | return 0; |
| 1542 | tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
| 1543 | { | ||
| 1544 | } | 1562 | } |
| 1545 | 1563 | ||
| 1546 | static void update_shares(struct sched_domain *sd) | 1564 | static void update_shares(struct sched_domain *sd) |
| @@ -1550,7 +1568,7 @@ static void update_shares(struct sched_domain *sd) | |||
| 1550 | 1568 | ||
| 1551 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | 1569 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { |
| 1552 | sd->last_update = now; | 1570 | sd->last_update = now; |
| 1553 | walk_tg_tree(tg_nop, tg_shares_up, 0, sd); | 1571 | walk_tg_tree(tg_nop, tg_shares_up, sd); |
| 1554 | } | 1572 | } |
| 1555 | } | 1573 | } |
| 1556 | 1574 | ||
| @@ -1561,9 +1579,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | |||
| 1561 | spin_lock(&rq->lock); | 1579 | spin_lock(&rq->lock); |
| 1562 | } | 1580 | } |
| 1563 | 1581 | ||
| 1564 | static void update_h_load(int cpu) | 1582 | static void update_h_load(long cpu) |
| 1565 | { | 1583 | { |
| 1566 | walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); | 1584 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
| 1567 | } | 1585 | } |
| 1568 | 1586 | ||
| 1569 | #else | 1587 | #else |
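
The hunks above change the walk_tg_tree() callback contract: handlers now take (struct task_group *tg, void *data) and return an int, the opaque data pointer is threaded through to every level, and a non-zero return aborts the walk via the out_unlock path shown earlier. A minimal sketch of a new walker written against that contract follows; the callback, its data struct and the counter it fills are invented for illustration only:

    struct tg_count_data {
            long cpu;               /* which CPU's cfs_rq to inspect */
            int nr_loaded;          /* filled in by the walk */
    };

    static int tg_count_loaded(struct task_group *tg, void *data)
    {
            struct tg_count_data *d = data;

            if (tg->cfs_rq[d->cpu]->load.weight)
                    d->nr_loaded++;

            return 0;       /* a non-zero return would abort the walk early */
    }

    static int count_loaded_groups(long cpu)
    {
            struct tg_count_data data = { .cpu = cpu, .nr_loaded = 0 };

            /* top-down pass only, so tg_nop() serves as the up handler */
            walk_tg_tree(tg_count_loaded, tg_nop, &data);
            return data.nr_loaded;
    }
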
| @@ -1921,11 +1939,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
| 1921 | running = task_running(rq, p); | 1939 | running = task_running(rq, p); |
| 1922 | on_rq = p->se.on_rq; | 1940 | on_rq = p->se.on_rq; |
| 1923 | ncsw = 0; | 1941 | ncsw = 0; |
| 1924 | if (!match_state || p->state == match_state) { | 1942 | if (!match_state || p->state == match_state) |
| 1925 | ncsw = p->nivcsw + p->nvcsw; | 1943 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ |
| 1926 | if (unlikely(!ncsw)) | ||
| 1927 | ncsw = 1; | ||
| 1928 | } | ||
| 1929 | task_rq_unlock(rq, &flags); | 1944 | task_rq_unlock(rq, &flags); |
| 1930 | 1945 | ||
| 1931 | /* | 1946 | /* |
| @@ -2285,7 +2300,7 @@ out_running: | |||
| 2285 | trace_mark(kernel_sched_wakeup, | 2300 | trace_mark(kernel_sched_wakeup, |
| 2286 | "pid %d state %ld ## rq %p task %p rq->curr %p", | 2301 | "pid %d state %ld ## rq %p task %p rq->curr %p", |
| 2287 | p->pid, p->state, rq, p, rq->curr); | 2302 | p->pid, p->state, rq, p, rq->curr); |
| 2288 | check_preempt_curr(rq, p); | 2303 | check_preempt_curr(rq, p, sync); |
| 2289 | 2304 | ||
| 2290 | p->state = TASK_RUNNING; | 2305 | p->state = TASK_RUNNING; |
| 2291 | #ifdef CONFIG_SMP | 2306 | #ifdef CONFIG_SMP |
| @@ -2420,7 +2435,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
| 2420 | trace_mark(kernel_sched_wakeup_new, | 2435 | trace_mark(kernel_sched_wakeup_new, |
| 2421 | "pid %d state %ld ## rq %p task %p rq->curr %p", | 2436 | "pid %d state %ld ## rq %p task %p rq->curr %p", |
| 2422 | p->pid, p->state, rq, p, rq->curr); | 2437 | p->pid, p->state, rq, p, rq->curr); |
| 2423 | check_preempt_curr(rq, p); | 2438 | check_preempt_curr(rq, p, 0); |
| 2424 | #ifdef CONFIG_SMP | 2439 | #ifdef CONFIG_SMP |
| 2425 | if (p->sched_class->task_wake_up) | 2440 | if (p->sched_class->task_wake_up) |
| 2426 | p->sched_class->task_wake_up(rq, p); | 2441 | p->sched_class->task_wake_up(rq, p); |
| @@ -2880,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, | |||
| 2880 | * Note that idle threads have a prio of MAX_PRIO, for this test | 2895 | * Note that idle threads have a prio of MAX_PRIO, for this test |
| 2881 | * to be always true for them. | 2896 | * to be always true for them. |
| 2882 | */ | 2897 | */ |
| 2883 | check_preempt_curr(this_rq, p); | 2898 | check_preempt_curr(this_rq, p, 0); |
| 2884 | } | 2899 | } |
| 2885 | 2900 | ||
| 2886 | /* | 2901 | /* |
| @@ -4627,6 +4642,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | |||
| 4627 | } | 4642 | } |
| 4628 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | 4643 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ |
| 4629 | 4644 | ||
| 4645 | /** | ||
| 4646 | * complete: - signals a single thread waiting on this completion | ||
| 4647 | * @x: holds the state of this particular completion | ||
| 4648 | * | ||
| 4649 | * This will wake up a single thread waiting on this completion. Threads will be | ||
| 4650 | * awakened in the same order in which they were queued. | ||
| 4651 | * | ||
| 4652 | * See also complete_all(), wait_for_completion() and related routines. | ||
| 4653 | */ | ||
| 4630 | void complete(struct completion *x) | 4654 | void complete(struct completion *x) |
| 4631 | { | 4655 | { |
| 4632 | unsigned long flags; | 4656 | unsigned long flags; |
| @@ -4638,6 +4662,12 @@ void complete(struct completion *x) | |||
| 4638 | } | 4662 | } |
| 4639 | EXPORT_SYMBOL(complete); | 4663 | EXPORT_SYMBOL(complete); |
| 4640 | 4664 | ||
| 4665 | /** | ||
| 4666 | * complete_all: - signals all threads waiting on this completion | ||
| 4667 | * @x: holds the state of this particular completion | ||
| 4668 | * | ||
| 4669 | * This will wake up all threads waiting on this particular completion event. | ||
| 4670 | */ | ||
| 4641 | void complete_all(struct completion *x) | 4671 | void complete_all(struct completion *x) |
| 4642 | { | 4672 | { |
| 4643 | unsigned long flags; | 4673 | unsigned long flags; |
| @@ -4658,10 +4688,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) | |||
| 4658 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 4688 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
| 4659 | __add_wait_queue_tail(&x->wait, &wait); | 4689 | __add_wait_queue_tail(&x->wait, &wait); |
| 4660 | do { | 4690 | do { |
| 4661 | if ((state == TASK_INTERRUPTIBLE && | 4691 | if (signal_pending_state(state, current)) { |
| 4662 | signal_pending(current)) || | ||
| 4663 | (state == TASK_KILLABLE && | ||
| 4664 | fatal_signal_pending(current))) { | ||
| 4665 | timeout = -ERESTARTSYS; | 4692 | timeout = -ERESTARTSYS; |
| 4666 | break; | 4693 | break; |
| 4667 | } | 4694 | } |
| @@ -4689,12 +4716,31 @@ wait_for_common(struct completion *x, long timeout, int state) | |||
| 4689 | return timeout; | 4716 | return timeout; |
| 4690 | } | 4717 | } |
| 4691 | 4718 | ||
| 4719 | /** | ||
| 4720 | * wait_for_completion: - waits for completion of a task | ||
| 4721 | * @x: holds the state of this particular completion | ||
| 4722 | * | ||
| 4723 | * This waits to be signaled for completion of a specific task. It is NOT | ||
| 4724 | * interruptible and there is no timeout. | ||
| 4725 | * | ||
| 4726 | * See also similar routines (i.e. wait_for_completion_timeout()) with timeout | ||
| 4727 | * and interrupt capability. Also see complete(). | ||
| 4728 | */ | ||
| 4692 | void __sched wait_for_completion(struct completion *x) | 4729 | void __sched wait_for_completion(struct completion *x) |
| 4693 | { | 4730 | { |
| 4694 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | 4731 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); |
| 4695 | } | 4732 | } |
| 4696 | EXPORT_SYMBOL(wait_for_completion); | 4733 | EXPORT_SYMBOL(wait_for_completion); |
| 4697 | 4734 | ||
| 4735 | /** | ||
| 4736 | * wait_for_completion_timeout: - waits for completion of a task (w/timeout) | ||
| 4737 | * @x: holds the state of this particular completion | ||
| 4738 | * @timeout: timeout value in jiffies | ||
| 4739 | * | ||
| 4740 | * This waits for either a completion of a specific task to be signaled or for a | ||
| 4741 | * specified timeout to expire. The timeout is in jiffies. It is not | ||
| 4742 | * interruptible. | ||
| 4743 | */ | ||
| 4698 | unsigned long __sched | 4744 | unsigned long __sched |
| 4699 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | 4745 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) |
| 4700 | { | 4746 | { |
| @@ -4702,6 +4748,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) | |||
| 4702 | } | 4748 | } |
| 4703 | EXPORT_SYMBOL(wait_for_completion_timeout); | 4749 | EXPORT_SYMBOL(wait_for_completion_timeout); |
| 4704 | 4750 | ||
| 4751 | /** | ||
| 4752 | * wait_for_completion_interruptible: - waits for completion of a task (w/intr) | ||
| 4753 | * @x: holds the state of this particular completion | ||
| 4754 | * | ||
| 4755 | * This waits for completion of a specific task to be signaled. It is | ||
| 4756 | * interruptible. | ||
| 4757 | */ | ||
| 4705 | int __sched wait_for_completion_interruptible(struct completion *x) | 4758 | int __sched wait_for_completion_interruptible(struct completion *x) |
| 4706 | { | 4759 | { |
| 4707 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); | 4760 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); |
| @@ -4711,6 +4764,14 @@ int __sched wait_for_completion_interruptible(struct completion *x) | |||
| 4711 | } | 4764 | } |
| 4712 | EXPORT_SYMBOL(wait_for_completion_interruptible); | 4765 | EXPORT_SYMBOL(wait_for_completion_interruptible); |
| 4713 | 4766 | ||
| 4767 | /** | ||
| 4768 | * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) | ||
| 4769 | * @x: holds the state of this particular completion | ||
| 4770 | * @timeout: timeout value in jiffies | ||
| 4771 | * | ||
| 4772 | * This waits for either a completion of a specific task to be signaled or for a | ||
| 4773 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | ||
| 4774 | */ | ||
| 4714 | unsigned long __sched | 4775 | unsigned long __sched |
| 4715 | wait_for_completion_interruptible_timeout(struct completion *x, | 4776 | wait_for_completion_interruptible_timeout(struct completion *x, |
| 4716 | unsigned long timeout) | 4777 | unsigned long timeout) |
| @@ -4719,6 +4780,13 @@ wait_for_completion_interruptible_timeout(struct completion *x, | |||
| 4719 | } | 4780 | } |
| 4720 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | 4781 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); |
| 4721 | 4782 | ||
| 4783 | /** | ||
| 4784 | * wait_for_completion_killable: - waits for completion of a task (killable) | ||
| 4785 | * @x: holds the state of this particular completion | ||
| 4786 | * | ||
| 4787 | * This waits to be signaled for completion of a specific task. It can be | ||
| 4788 | * interrupted by a kill signal. | ||
| 4789 | */ | ||
| 4722 | int __sched wait_for_completion_killable(struct completion *x) | 4790 | int __sched wait_for_completion_killable(struct completion *x) |
| 4723 | { | 4791 | { |
| 4724 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); | 4792 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); |
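
The kerneldoc blocks added above describe the completion API as a pair: one side blocks in a wait_for_completion*() variant, the other signals with complete() or complete_all(). A minimal usage sketch of that pairing, with the thread name, event name and 5-second budget invented and error handling elided:

    #include <linux/completion.h>
    #include <linux/kthread.h>
    #include <linux/jiffies.h>
    #include <linux/errno.h>

    static DECLARE_COMPLETION(setup_done);

    static int setup_worker(void *unused)
    {
            /* ... perform the setup ... */
            complete(&setup_done);          /* wakes one waiter, in queue order */
            return 0;
    }

    static int start_and_wait(void)
    {
            kthread_run(setup_worker, NULL, "setup-worker");

            /* Block for at most 5 seconds; the timeout is in jiffies and the
             * wait is not interruptible, as the kerneldoc above notes.  A
             * zero return from wait_for_completion_timeout() means timeout. */
            if (!wait_for_completion_timeout(&setup_done, 5 * HZ))
                    return -ETIMEDOUT;

            return 0;
    }
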
| @@ -5121,7 +5189,8 @@ recheck: | |||
| 5121 | * Do not allow realtime tasks into groups that have no runtime | 5189 | * Do not allow realtime tasks into groups that have no runtime |
| 5122 | * assigned. | 5190 | * assigned. |
| 5123 | */ | 5191 | */ |
| 5124 | if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) | 5192 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
| 5193 | task_group(p)->rt_bandwidth.rt_runtime == 0) | ||
| 5125 | return -EPERM; | 5194 | return -EPERM; |
| 5126 | #endif | 5195 | #endif |
| 5127 | 5196 | ||
| @@ -5957,7 +6026,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
| 5957 | set_task_cpu(p, dest_cpu); | 6026 | set_task_cpu(p, dest_cpu); |
| 5958 | if (on_rq) { | 6027 | if (on_rq) { |
| 5959 | activate_task(rq_dest, p, 0); | 6028 | activate_task(rq_dest, p, 0); |
| 5960 | check_preempt_curr(rq_dest, p); | 6029 | check_preempt_curr(rq_dest, p, 0); |
| 5961 | } | 6030 | } |
| 5962 | done: | 6031 | done: |
| 5963 | ret = 1; | 6032 | ret = 1; |
| @@ -8242,20 +8311,25 @@ void __might_sleep(char *file, int line) | |||
| 8242 | #ifdef in_atomic | 8311 | #ifdef in_atomic |
| 8243 | static unsigned long prev_jiffy; /* ratelimiting */ | 8312 | static unsigned long prev_jiffy; /* ratelimiting */ |
| 8244 | 8313 | ||
| 8245 | if ((in_atomic() || irqs_disabled()) && | 8314 | if ((!in_atomic() && !irqs_disabled()) || |
| 8246 | system_state == SYSTEM_RUNNING && !oops_in_progress) { | 8315 | system_state != SYSTEM_RUNNING || oops_in_progress) |
| 8247 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 8316 | return; |
| 8248 | return; | 8317 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
| 8249 | prev_jiffy = jiffies; | 8318 | return; |
| 8250 | printk(KERN_ERR "BUG: sleeping function called from invalid" | 8319 | prev_jiffy = jiffies; |
| 8251 | " context at %s:%d\n", file, line); | 8320 | |
| 8252 | printk("in_atomic():%d, irqs_disabled():%d\n", | 8321 | printk(KERN_ERR |
| 8253 | in_atomic(), irqs_disabled()); | 8322 | "BUG: sleeping function called from invalid context at %s:%d\n", |
| 8254 | debug_show_held_locks(current); | 8323 | file, line); |
| 8255 | if (irqs_disabled()) | 8324 | printk(KERN_ERR |
| 8256 | print_irqtrace_events(current); | 8325 | "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", |
| 8257 | dump_stack(); | 8326 | in_atomic(), irqs_disabled(), |
| 8258 | } | 8327 | current->pid, current->comm); |
| 8328 | |||
| 8329 | debug_show_held_locks(current); | ||
| 8330 | if (irqs_disabled()) | ||
| 8331 | print_irqtrace_events(current); | ||
| 8332 | dump_stack(); | ||
| 8259 | #endif | 8333 | #endif |
| 8260 | } | 8334 | } |
| 8261 | EXPORT_SYMBOL(__might_sleep); | 8335 | EXPORT_SYMBOL(__might_sleep); |
| @@ -8753,73 +8827,95 @@ static DEFINE_MUTEX(rt_constraints_mutex); | |||
| 8753 | static unsigned long to_ratio(u64 period, u64 runtime) | 8827 | static unsigned long to_ratio(u64 period, u64 runtime) |
| 8754 | { | 8828 | { |
| 8755 | if (runtime == RUNTIME_INF) | 8829 | if (runtime == RUNTIME_INF) |
| 8756 | return 1ULL << 16; | 8830 | return 1ULL << 20; |
| 8757 | 8831 | ||
| 8758 | return div64_u64(runtime << 16, period); | 8832 | return div64_u64(runtime << 20, period); |
| 8759 | } | 8833 | } |
| 8760 | 8834 | ||
| 8761 | #ifdef CONFIG_CGROUP_SCHED | 8835 | /* Must be called with tasklist_lock held */ |
| 8762 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 8836 | static inline int tg_has_rt_tasks(struct task_group *tg) |
| 8763 | { | 8837 | { |
| 8764 | struct task_group *tgi, *parent = tg->parent; | 8838 | struct task_struct *g, *p; |
| 8765 | unsigned long total = 0; | ||
| 8766 | 8839 | ||
| 8767 | if (!parent) { | 8840 | do_each_thread(g, p) { |
| 8768 | if (global_rt_period() < period) | 8841 | if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) |
| 8769 | return 0; | 8842 | return 1; |
| 8843 | } while_each_thread(g, p); | ||
| 8770 | 8844 | ||
| 8771 | return to_ratio(period, runtime) < | 8845 | return 0; |
| 8772 | to_ratio(global_rt_period(), global_rt_runtime()); | 8846 | } |
| 8773 | } | ||
| 8774 | 8847 | ||
| 8775 | if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) | 8848 | struct rt_schedulable_data { |
| 8776 | return 0; | 8849 | struct task_group *tg; |
| 8850 | u64 rt_period; | ||
| 8851 | u64 rt_runtime; | ||
| 8852 | }; | ||
| 8777 | 8853 | ||
| 8778 | rcu_read_lock(); | 8854 | static int tg_schedulable(struct task_group *tg, void *data) |
| 8779 | list_for_each_entry_rcu(tgi, &parent->children, siblings) { | 8855 | { |
| 8780 | if (tgi == tg) | 8856 | struct rt_schedulable_data *d = data; |
| 8781 | continue; | 8857 | struct task_group *child; |
| 8858 | unsigned long total, sum = 0; | ||
| 8859 | u64 period, runtime; | ||
| 8782 | 8860 | ||
| 8783 | total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), | 8861 | period = ktime_to_ns(tg->rt_bandwidth.rt_period); |
| 8784 | tgi->rt_bandwidth.rt_runtime); | 8862 | runtime = tg->rt_bandwidth.rt_runtime; |
| 8863 | |||
| 8864 | if (tg == d->tg) { | ||
| 8865 | period = d->rt_period; | ||
| 8866 | runtime = d->rt_runtime; | ||
| 8785 | } | 8867 | } |
| 8786 | rcu_read_unlock(); | ||
| 8787 | 8868 | ||
| 8788 | return total + to_ratio(period, runtime) <= | 8869 | /* |
| 8789 | to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), | 8870 | * Cannot have more runtime than the period. |
| 8790 | parent->rt_bandwidth.rt_runtime); | 8871 | */ |
| 8791 | } | 8872 | if (runtime > period && runtime != RUNTIME_INF) |
| 8792 | #elif defined CONFIG_USER_SCHED | 8873 | return -EINVAL; |
| 8793 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | ||
| 8794 | { | ||
| 8795 | struct task_group *tgi; | ||
| 8796 | unsigned long total = 0; | ||
| 8797 | unsigned long global_ratio = | ||
| 8798 | to_ratio(global_rt_period(), global_rt_runtime()); | ||
| 8799 | 8874 | ||
| 8800 | rcu_read_lock(); | 8875 | /* |
| 8801 | list_for_each_entry_rcu(tgi, &task_groups, list) { | 8876 | * Ensure we don't starve existing RT tasks. |
| 8802 | if (tgi == tg) | 8877 | */ |
| 8803 | continue; | 8878 | if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) |
| 8879 | return -EBUSY; | ||
| 8880 | |||
| 8881 | total = to_ratio(period, runtime); | ||
| 8882 | |||
| 8883 | /* | ||
| 8884 | * Nobody can have more than the global setting allows. | ||
| 8885 | */ | ||
| 8886 | if (total > to_ratio(global_rt_period(), global_rt_runtime())) | ||
| 8887 | return -EINVAL; | ||
| 8888 | |||
| 8889 | /* | ||
| 8890 | * The sum of our children's runtime should not exceed our own. | ||
| 8891 | */ | ||
| 8892 | list_for_each_entry_rcu(child, &tg->children, siblings) { | ||
| 8893 | period = ktime_to_ns(child->rt_bandwidth.rt_period); | ||
| 8894 | runtime = child->rt_bandwidth.rt_runtime; | ||
| 8895 | |||
| 8896 | if (child == d->tg) { | ||
| 8897 | period = d->rt_period; | ||
| 8898 | runtime = d->rt_runtime; | ||
| 8899 | } | ||
| 8804 | 8900 | ||
| 8805 | total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), | 8901 | sum += to_ratio(period, runtime); |
| 8806 | tgi->rt_bandwidth.rt_runtime); | ||
| 8807 | } | 8902 | } |
| 8808 | rcu_read_unlock(); | ||
| 8809 | 8903 | ||
| 8810 | return total + to_ratio(period, runtime) < global_ratio; | 8904 | if (sum > total) |
| 8905 | return -EINVAL; | ||
| 8906 | |||
| 8907 | return 0; | ||
| 8811 | } | 8908 | } |
| 8812 | #endif | ||
| 8813 | 8909 | ||
| 8814 | /* Must be called with tasklist_lock held */ | 8910 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
| 8815 | static inline int tg_has_rt_tasks(struct task_group *tg) | ||
| 8816 | { | 8911 | { |
| 8817 | struct task_struct *g, *p; | 8912 | struct rt_schedulable_data data = { |
| 8818 | do_each_thread(g, p) { | 8913 | .tg = tg, |
| 8819 | if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) | 8914 | .rt_period = period, |
| 8820 | return 1; | 8915 | .rt_runtime = runtime, |
| 8821 | } while_each_thread(g, p); | 8916 | }; |
| 8822 | return 0; | 8917 | |
| 8918 | return walk_tg_tree(tg_schedulable, tg_nop, &data); | ||
| 8823 | } | 8919 | } |
| 8824 | 8920 | ||
| 8825 | static int tg_set_bandwidth(struct task_group *tg, | 8921 | static int tg_set_bandwidth(struct task_group *tg, |
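
The move from a 16-bit to a 20-bit shift in to_ratio() only raises the resolution of the fixed-point bandwidth ratios that tg_schedulable() now compares at every level of the hierarchy. As a worked example, assuming the usual sysctl defaults of a 1 s RT period and 950 ms RT runtime:

    to_ratio(1000000000, 950000000) = (950000000 << 20) / 1000000000
                                    ~= 0.95 * 2^20 = 996147

while an unlimited budget (RUNTIME_INF) maps to the full scale of 1 << 20 = 1048576. With those ratios, tg_schedulable() rejects a setting when the runtime exceeds the period, when a group would exceed the global ratio, when zeroing the runtime would starve existing RT tasks in the group, or when the sum of the children's ratios exceeds the parent's.
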
| @@ -8829,14 +8925,9 @@ static int tg_set_bandwidth(struct task_group *tg, | |||
| 8829 | 8925 | ||
| 8830 | mutex_lock(&rt_constraints_mutex); | 8926 | mutex_lock(&rt_constraints_mutex); |
| 8831 | read_lock(&tasklist_lock); | 8927 | read_lock(&tasklist_lock); |
| 8832 | if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { | 8928 | err = __rt_schedulable(tg, rt_period, rt_runtime); |
| 8833 | err = -EBUSY; | 8929 | if (err) |
| 8834 | goto unlock; | 8930 | goto unlock; |
| 8835 | } | ||
| 8836 | if (!__rt_schedulable(tg, rt_period, rt_runtime)) { | ||
| 8837 | err = -EINVAL; | ||
| 8838 | goto unlock; | ||
| 8839 | } | ||
| 8840 | 8931 | ||
| 8841 | spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); | 8932 | spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); |
| 8842 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); | 8933 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); |
| @@ -8905,19 +8996,25 @@ long sched_group_rt_period(struct task_group *tg) | |||
| 8905 | 8996 | ||
| 8906 | static int sched_rt_global_constraints(void) | 8997 | static int sched_rt_global_constraints(void) |
| 8907 | { | 8998 | { |
| 8908 | struct task_group *tg = &root_task_group; | 8999 | u64 runtime, period; |
| 8909 | u64 rt_runtime, rt_period; | ||
| 8910 | int ret = 0; | 9000 | int ret = 0; |
| 8911 | 9001 | ||
| 8912 | if (sysctl_sched_rt_period <= 0) | 9002 | if (sysctl_sched_rt_period <= 0) |
| 8913 | return -EINVAL; | 9003 | return -EINVAL; |
| 8914 | 9004 | ||
| 8915 | rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); | 9005 | runtime = global_rt_runtime(); |
| 8916 | rt_runtime = tg->rt_bandwidth.rt_runtime; | 9006 | period = global_rt_period(); |
| 9007 | |||
| 9008 | /* | ||
| 9009 | * Sanity check on the sysctl variables. | ||
| 9010 | */ | ||
| 9011 | if (runtime > period && runtime != RUNTIME_INF) | ||
| 9012 | return -EINVAL; | ||
| 8917 | 9013 | ||
| 8918 | mutex_lock(&rt_constraints_mutex); | 9014 | mutex_lock(&rt_constraints_mutex); |
| 8919 | if (!__rt_schedulable(tg, rt_period, rt_runtime)) | 9015 | read_lock(&tasklist_lock); |
| 8920 | ret = -EINVAL; | 9016 | ret = __rt_schedulable(NULL, 0, 0); |
| 9017 | read_unlock(&tasklist_lock); | ||
| 8921 | mutex_unlock(&rt_constraints_mutex); | 9018 | mutex_unlock(&rt_constraints_mutex); |
| 8922 | 9019 | ||
| 8923 | return ret; | 9020 | return ret; |
| @@ -8991,7 +9088,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
| 8991 | 9088 | ||
| 8992 | if (!cgrp->parent) { | 9089 | if (!cgrp->parent) { |
| 8993 | /* This is early initialization for the top cgroup */ | 9090 | /* This is early initialization for the top cgroup */ |
| 8994 | init_task_group.css.cgroup = cgrp; | ||
| 8995 | return &init_task_group.css; | 9091 | return &init_task_group.css; |
| 8996 | } | 9092 | } |
| 8997 | 9093 | ||
| @@ -9000,9 +9096,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
| 9000 | if (IS_ERR(tg)) | 9096 | if (IS_ERR(tg)) |
| 9001 | return ERR_PTR(-ENOMEM); | 9097 | return ERR_PTR(-ENOMEM); |
| 9002 | 9098 | ||
| 9003 | /* Bind the cgroup to task_group object we just created */ | ||
| 9004 | tg->css.cgroup = cgrp; | ||
| 9005 | |||
| 9006 | return &tg->css; | 9099 | return &tg->css; |
| 9007 | } | 9100 | } |
| 9008 | 9101 | ||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fb8994c6d4bb..fcbe850a5a90 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -409,64 +409,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 409 | } | 409 | } |
| 410 | 410 | ||
| 411 | /* | 411 | /* |
| 412 | * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in | ||
| 413 | * that it favours >=0 over <0. | ||
| 414 | * | ||
| 415 | * -20 | | ||
| 416 | * | | ||
| 417 | * 0 --------+------- | ||
| 418 | * .' | ||
| 419 | * 19 .' | ||
| 420 | * | ||
| 421 | */ | ||
| 422 | static unsigned long | ||
| 423 | calc_delta_asym(unsigned long delta, struct sched_entity *se) | ||
| 424 | { | ||
| 425 | struct load_weight lw = { | ||
| 426 | .weight = NICE_0_LOAD, | ||
| 427 | .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) | ||
| 428 | }; | ||
| 429 | |||
| 430 | for_each_sched_entity(se) { | ||
| 431 | struct load_weight *se_lw = &se->load; | ||
| 432 | unsigned long rw = cfs_rq_of(se)->load.weight; | ||
| 433 | |||
| 434 | #ifdef CONFIG_FAIR_SCHED_GROUP | ||
| 435 | struct cfs_rq *cfs_rq = se->my_q; | ||
| 436 | struct task_group *tg = NULL | ||
| 437 | |||
| 438 | if (cfs_rq) | ||
| 439 | tg = cfs_rq->tg; | ||
| 440 | |||
| 441 | if (tg && tg->shares < NICE_0_LOAD) { | ||
| 442 | /* | ||
| 443 | * scale shares to what it would have been had | ||
| 444 | * tg->weight been NICE_0_LOAD: | ||
| 445 | * | ||
| 446 | * weight = 1024 * shares / tg->weight | ||
| 447 | */ | ||
| 448 | lw.weight *= se->load.weight; | ||
| 449 | lw.weight /= tg->shares; | ||
| 450 | |||
| 451 | lw.inv_weight = 0; | ||
| 452 | |||
| 453 | se_lw = &lw; | ||
| 454 | rw += lw.weight - se->load.weight; | ||
| 455 | } else | ||
| 456 | #endif | ||
| 457 | |||
| 458 | if (se->load.weight < NICE_0_LOAD) { | ||
| 459 | se_lw = &lw; | ||
| 460 | rw += NICE_0_LOAD - se->load.weight; | ||
| 461 | } | ||
| 462 | |||
| 463 | delta = calc_delta_mine(delta, rw, se_lw); | ||
| 464 | } | ||
| 465 | |||
| 466 | return delta; | ||
| 467 | } | ||
| 468 | |||
| 469 | /* | ||
| 470 | * Update the current task's runtime statistics. Skip current tasks that | 412 | * Update the current task's runtime statistics. Skip current tasks that |
| 471 | * are not in our scheduling class. | 413 | * are not in our scheduling class. |
| 472 | */ | 414 | */ |
| @@ -586,11 +528,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 586 | update_load_add(&cfs_rq->load, se->load.weight); | 528 | update_load_add(&cfs_rq->load, se->load.weight); |
| 587 | if (!parent_entity(se)) | 529 | if (!parent_entity(se)) |
| 588 | inc_cpu_load(rq_of(cfs_rq), se->load.weight); | 530 | inc_cpu_load(rq_of(cfs_rq), se->load.weight); |
| 589 | if (entity_is_task(se)) | 531 | if (entity_is_task(se)) { |
| 590 | add_cfs_task_weight(cfs_rq, se->load.weight); | 532 | add_cfs_task_weight(cfs_rq, se->load.weight); |
| 533 | list_add(&se->group_node, &cfs_rq->tasks); | ||
| 534 | } | ||
| 591 | cfs_rq->nr_running++; | 535 | cfs_rq->nr_running++; |
| 592 | se->on_rq = 1; | 536 | se->on_rq = 1; |
| 593 | list_add(&se->group_node, &cfs_rq->tasks); | ||
| 594 | } | 537 | } |
| 595 | 538 | ||
| 596 | static void | 539 | static void |
| @@ -599,11 +542,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 599 | update_load_sub(&cfs_rq->load, se->load.weight); | 542 | update_load_sub(&cfs_rq->load, se->load.weight); |
| 600 | if (!parent_entity(se)) | 543 | if (!parent_entity(se)) |
| 601 | dec_cpu_load(rq_of(cfs_rq), se->load.weight); | 544 | dec_cpu_load(rq_of(cfs_rq), se->load.weight); |
| 602 | if (entity_is_task(se)) | 545 | if (entity_is_task(se)) { |
| 603 | add_cfs_task_weight(cfs_rq, -se->load.weight); | 546 | add_cfs_task_weight(cfs_rq, -se->load.weight); |
| 547 | list_del_init(&se->group_node); | ||
| 548 | } | ||
| 604 | cfs_rq->nr_running--; | 549 | cfs_rq->nr_running--; |
| 605 | se->on_rq = 0; | 550 | se->on_rq = 0; |
| 606 | list_del_init(&se->group_node); | ||
| 607 | } | 551 | } |
| 608 | 552 | ||
| 609 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 553 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| @@ -1085,7 +1029,6 @@ static long effective_load(struct task_group *tg, int cpu, | |||
| 1085 | long wl, long wg) | 1029 | long wl, long wg) |
| 1086 | { | 1030 | { |
| 1087 | struct sched_entity *se = tg->se[cpu]; | 1031 | struct sched_entity *se = tg->se[cpu]; |
| 1088 | long more_w; | ||
| 1089 | 1032 | ||
| 1090 | if (!tg->parent) | 1033 | if (!tg->parent) |
| 1091 | return wl; | 1034 | return wl; |
| @@ -1097,18 +1040,17 @@ static long effective_load(struct task_group *tg, int cpu, | |||
| 1097 | if (!wl && sched_feat(ASYM_EFF_LOAD)) | 1040 | if (!wl && sched_feat(ASYM_EFF_LOAD)) |
| 1098 | return wl; | 1041 | return wl; |
| 1099 | 1042 | ||
| 1100 | /* | ||
| 1101 | * Instead of using this increment, also add the difference | ||
| 1102 | * between when the shares were last updated and now. | ||
| 1103 | */ | ||
| 1104 | more_w = se->my_q->load.weight - se->my_q->rq_weight; | ||
| 1105 | wl += more_w; | ||
| 1106 | wg += more_w; | ||
| 1107 | |||
| 1108 | for_each_sched_entity(se) { | 1043 | for_each_sched_entity(se) { |
| 1109 | #define D(n) (likely(n) ? (n) : 1) | ||
| 1110 | |||
| 1111 | long S, rw, s, a, b; | 1044 | long S, rw, s, a, b; |
| 1045 | long more_w; | ||
| 1046 | |||
| 1047 | /* | ||
| 1048 | * Instead of using this increment, also add the difference | ||
| 1049 | * between when the shares were last updated and now. | ||
| 1050 | */ | ||
| 1051 | more_w = se->my_q->load.weight - se->my_q->rq_weight; | ||
| 1052 | wl += more_w; | ||
| 1053 | wg += more_w; | ||
| 1112 | 1054 | ||
| 1113 | S = se->my_q->tg->shares; | 1055 | S = se->my_q->tg->shares; |
| 1114 | s = se->my_q->shares; | 1056 | s = se->my_q->shares; |
| @@ -1117,7 +1059,11 @@ static long effective_load(struct task_group *tg, int cpu, | |||
| 1117 | a = S*(rw + wl); | 1059 | a = S*(rw + wl); |
| 1118 | b = S*rw + s*wg; | 1060 | b = S*rw + s*wg; |
| 1119 | 1061 | ||
| 1120 | wl = s*(a-b)/D(b); | 1062 | wl = s*(a-b); |
| 1063 | |||
| 1064 | if (likely(b)) | ||
| 1065 | wl /= b; | ||
| 1066 | |||
| 1121 | /* | 1067 | /* |
| 1122 | * Assume the group is already running and will | 1068 | * Assume the group is already running and will |
| 1123 | * thus already be accounted for in the weight. | 1069 | * thus already be accounted for in the weight. |
| @@ -1126,7 +1072,6 @@ static long effective_load(struct task_group *tg, int cpu, | |||
| 1126 | * alter the group weight. | 1072 | * alter the group weight. |
| 1127 | */ | 1073 | */ |
| 1128 | wg = 0; | 1074 | wg = 0; |
| 1129 | #undef D | ||
| 1130 | } | 1075 | } |
| 1131 | 1076 | ||
| 1132 | return wl; | 1077 | return wl; |
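
With the D() macro gone, the per-level step in effective_load() reads directly as wl = s*(a - b)/b, where a = S*(rw + wl) and b = S*rw + s*wg, and the division is simply skipped when b is zero, which leaves wl = s*(a - b), exactly what dividing by the macro's fallback of 1 used to produce. A small illustrative calculation with invented numbers: group shares S = 1024, this CPU holding s = 256 of them over a runqueue weight rw = 2048, and a waking task adding wl = wg = 1024 gives a = 1024*(2048 + 1024) = 3145728 and b = 1024*2048 + 256*1024 = 2359296, so the weight change handed to the next level up is 256*(3145728 - 2359296)/2359296 ~= 85.
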
| @@ -1143,7 +1088,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, | |||
| 1143 | #endif | 1088 | #endif |
| 1144 | 1089 | ||
| 1145 | static int | 1090 | static int |
| 1146 | wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, | 1091 | wake_affine(struct sched_domain *this_sd, struct rq *this_rq, |
| 1147 | struct task_struct *p, int prev_cpu, int this_cpu, int sync, | 1092 | struct task_struct *p, int prev_cpu, int this_cpu, int sync, |
| 1148 | int idx, unsigned long load, unsigned long this_load, | 1093 | int idx, unsigned long load, unsigned long this_load, |
| 1149 | unsigned int imbalance) | 1094 | unsigned int imbalance) |
| @@ -1191,8 +1136,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, | |||
| 1191 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | 1136 | schedstat_inc(p, se.nr_wakeups_affine_attempts); |
| 1192 | tl_per_task = cpu_avg_load_per_task(this_cpu); | 1137 | tl_per_task = cpu_avg_load_per_task(this_cpu); |
| 1193 | 1138 | ||
| 1194 | if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || | 1139 | if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= |
| 1195 | balanced) { | 1140 | tl_per_task)) { |
| 1196 | /* | 1141 | /* |
| 1197 | * This domain has SD_WAKE_AFFINE and | 1142 | * This domain has SD_WAKE_AFFINE and |
| 1198 | * p is cache cold in this domain, and | 1143 | * p is cache cold in this domain, and |
| @@ -1211,16 +1156,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync) | |||
| 1211 | struct sched_domain *sd, *this_sd = NULL; | 1156 | struct sched_domain *sd, *this_sd = NULL; |
| 1212 | int prev_cpu, this_cpu, new_cpu; | 1157 | int prev_cpu, this_cpu, new_cpu; |
| 1213 | unsigned long load, this_load; | 1158 | unsigned long load, this_load; |
| 1214 | struct rq *rq, *this_rq; | 1159 | struct rq *this_rq; |
| 1215 | unsigned int imbalance; | 1160 | unsigned int imbalance; |
| 1216 | int idx; | 1161 | int idx; |
| 1217 | 1162 | ||
| 1218 | prev_cpu = task_cpu(p); | 1163 | prev_cpu = task_cpu(p); |
| 1219 | rq = task_rq(p); | ||
| 1220 | this_cpu = smp_processor_id(); | 1164 | this_cpu = smp_processor_id(); |
| 1221 | this_rq = cpu_rq(this_cpu); | 1165 | this_rq = cpu_rq(this_cpu); |
| 1222 | new_cpu = prev_cpu; | 1166 | new_cpu = prev_cpu; |
| 1223 | 1167 | ||
| 1168 | if (prev_cpu == this_cpu) | ||
| 1169 | goto out; | ||
| 1224 | /* | 1170 | /* |
| 1225 | * 'this_sd' is the first domain that both | 1171 | * 'this_sd' is the first domain that both |
| 1226 | * this_cpu and prev_cpu are present in: | 1172 | * this_cpu and prev_cpu are present in: |
| @@ -1248,13 +1194,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync) | |||
| 1248 | load = source_load(prev_cpu, idx); | 1194 | load = source_load(prev_cpu, idx); |
| 1249 | this_load = target_load(this_cpu, idx); | 1195 | this_load = target_load(this_cpu, idx); |
| 1250 | 1196 | ||
| 1251 | if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, | 1197 | if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, |
| 1252 | load, this_load, imbalance)) | 1198 | load, this_load, imbalance)) |
| 1253 | return this_cpu; | 1199 | return this_cpu; |
| 1254 | 1200 | ||
| 1255 | if (prev_cpu == this_cpu) | ||
| 1256 | goto out; | ||
| 1257 | |||
| 1258 | /* | 1201 | /* |
| 1259 | * Start passive balancing when half the imbalance_pct | 1202 | * Start passive balancing when half the imbalance_pct |
| 1260 | * limit is reached. | 1203 | * limit is reached. |
| @@ -1281,62 +1224,20 @@ static unsigned long wakeup_gran(struct sched_entity *se) | |||
| 1281 | * + nice tasks. | 1224 | * + nice tasks. |
| 1282 | */ | 1225 | */ |
| 1283 | if (sched_feat(ASYM_GRAN)) | 1226 | if (sched_feat(ASYM_GRAN)) |
| 1284 | gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); | 1227 | gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load); |
| 1285 | else | ||
| 1286 | gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); | ||
| 1287 | 1228 | ||
| 1288 | return gran; | 1229 | return gran; |
| 1289 | } | 1230 | } |
| 1290 | 1231 | ||
| 1291 | /* | 1232 | /* |
| 1292 | * Should 'se' preempt 'curr'. | ||
| 1293 | * | ||
| 1294 | * |s1 | ||
| 1295 | * |s2 | ||
| 1296 | * |s3 | ||
| 1297 | * g | ||
| 1298 | * |<--->|c | ||
| 1299 | * | ||
| 1300 | * w(c, s1) = -1 | ||
| 1301 | * w(c, s2) = 0 | ||
| 1302 | * w(c, s3) = 1 | ||
| 1303 | * | ||
| 1304 | */ | ||
| 1305 | static int | ||
| 1306 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) | ||
| 1307 | { | ||
| 1308 | s64 gran, vdiff = curr->vruntime - se->vruntime; | ||
| 1309 | |||
| 1310 | if (vdiff < 0) | ||
| 1311 | return -1; | ||
| 1312 | |||
| 1313 | gran = wakeup_gran(curr); | ||
| 1314 | if (vdiff > gran) | ||
| 1315 | return 1; | ||
| 1316 | |||
| 1317 | return 0; | ||
| 1318 | } | ||
| 1319 | |||
| 1320 | /* return depth at which a sched entity is present in the hierarchy */ | ||
| 1321 | static inline int depth_se(struct sched_entity *se) | ||
| 1322 | { | ||
| 1323 | int depth = 0; | ||
| 1324 | |||
| 1325 | for_each_sched_entity(se) | ||
| 1326 | depth++; | ||
| 1327 | |||
| 1328 | return depth; | ||
| 1329 | } | ||
| 1330 | |||
| 1331 | /* | ||
| 1332 | * Preempt the current task with a newly woken task if needed: | 1233 | * Preempt the current task with a newly woken task if needed: |
| 1333 | */ | 1234 | */ |
| 1334 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | 1235 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) |
| 1335 | { | 1236 | { |
| 1336 | struct task_struct *curr = rq->curr; | 1237 | struct task_struct *curr = rq->curr; |
| 1337 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1238 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
| 1338 | struct sched_entity *se = &curr->se, *pse = &p->se; | 1239 | struct sched_entity *se = &curr->se, *pse = &p->se; |
| 1339 | int se_depth, pse_depth; | 1240 | s64 delta_exec; |
| 1340 | 1241 | ||
| 1341 | if (unlikely(rt_prio(p->prio))) { | 1242 | if (unlikely(rt_prio(p->prio))) { |
| 1342 | update_rq_clock(rq); | 1243 | update_rq_clock(rq); |
| @@ -1351,6 +1252,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
| 1351 | cfs_rq_of(pse)->next = pse; | 1252 | cfs_rq_of(pse)->next = pse; |
| 1352 | 1253 | ||
| 1353 | /* | 1254 | /* |
| 1255 | * We can come here with TIF_NEED_RESCHED already set from new task | ||
| 1256 | * wake up path. | ||
| 1257 | */ | ||
| 1258 | if (test_tsk_need_resched(curr)) | ||
| 1259 | return; | ||
| 1260 | |||
| 1261 | /* | ||
| 1354 | * Batch tasks do not preempt (their preemption is driven by | 1262 | * Batch tasks do not preempt (their preemption is driven by |
| 1355 | * the tick): | 1263 | * the tick): |
| 1356 | */ | 1264 | */ |
| @@ -1360,33 +1268,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
| 1360 | if (!sched_feat(WAKEUP_PREEMPT)) | 1268 | if (!sched_feat(WAKEUP_PREEMPT)) |
| 1361 | return; | 1269 | return; |
| 1362 | 1270 | ||
| 1363 | /* | 1271 | if (sched_feat(WAKEUP_OVERLAP) && sync && |
| 1364 | * preemption test can be made between sibling entities who are in the | 1272 | se->avg_overlap < sysctl_sched_migration_cost && |
| 1365 | * same cfs_rq i.e who have a common parent. Walk up the hierarchy of | 1273 | pse->avg_overlap < sysctl_sched_migration_cost) { |
| 1366 | * both tasks until we find their ancestors who are siblings of common | 1274 | resched_task(curr); |
| 1367 | * parent. | 1275 | return; |
| 1368 | */ | ||
| 1369 | |||
| 1370 | /* First walk up until both entities are at same depth */ | ||
| 1371 | se_depth = depth_se(se); | ||
| 1372 | pse_depth = depth_se(pse); | ||
| 1373 | |||
| 1374 | while (se_depth > pse_depth) { | ||
| 1375 | se_depth--; | ||
| 1376 | se = parent_entity(se); | ||
| 1377 | } | ||
| 1378 | |||
| 1379 | while (pse_depth > se_depth) { | ||
| 1380 | pse_depth--; | ||
| 1381 | pse = parent_entity(pse); | ||
| 1382 | } | ||
| 1383 | |||
| 1384 | while (!is_same_group(se, pse)) { | ||
| 1385 | se = parent_entity(se); | ||
| 1386 | pse = parent_entity(pse); | ||
| 1387 | } | 1276 | } |
| 1388 | 1277 | ||
| 1389 | if (wakeup_preempt_entity(se, pse) == 1) | 1278 | delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; |
| 1279 | if (delta_exec > wakeup_gran(pse)) | ||
| 1390 | resched_task(curr); | 1280 | resched_task(curr); |
| 1391 | } | 1281 | } |
| 1392 | 1282 | ||
| @@ -1445,19 +1335,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) | |||
| 1445 | if (next == &cfs_rq->tasks) | 1335 | if (next == &cfs_rq->tasks) |
| 1446 | return NULL; | 1336 | return NULL; |
| 1447 | 1337 | ||
| 1448 | /* Skip over entities that are not tasks */ | 1338 | se = list_entry(next, struct sched_entity, group_node); |
| 1449 | do { | 1339 | p = task_of(se); |
| 1450 | se = list_entry(next, struct sched_entity, group_node); | 1340 | cfs_rq->balance_iterator = next->next; |
| 1451 | next = next->next; | ||
| 1452 | } while (next != &cfs_rq->tasks && !entity_is_task(se)); | ||
| 1453 | |||
| 1454 | if (next == &cfs_rq->tasks) | ||
| 1455 | return NULL; | ||
| 1456 | |||
| 1457 | cfs_rq->balance_iterator = next; | ||
| 1458 | |||
| 1459 | if (entity_is_task(se)) | ||
| 1460 | p = task_of(se); | ||
| 1461 | 1341 | ||
| 1462 | return p; | 1342 | return p; |
| 1463 | } | 1343 | } |
| @@ -1507,7 +1387,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 1507 | rcu_read_lock(); | 1387 | rcu_read_lock(); |
| 1508 | update_h_load(busiest_cpu); | 1388 | update_h_load(busiest_cpu); |
| 1509 | 1389 | ||
| 1510 | list_for_each_entry(tg, &task_groups, list) { | 1390 | list_for_each_entry_rcu(tg, &task_groups, list) { |
| 1511 | struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; | 1391 | struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; |
| 1512 | unsigned long busiest_h_load = busiest_cfs_rq->h_load; | 1392 | unsigned long busiest_h_load = busiest_cfs_rq->h_load; |
| 1513 | unsigned long busiest_weight = busiest_cfs_rq->load.weight; | 1393 | unsigned long busiest_weight = busiest_cfs_rq->load.weight; |
| @@ -1620,10 +1500,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) | |||
| 1620 | * 'current' within the tree based on its new key value. | 1500 | * 'current' within the tree based on its new key value. |
| 1621 | */ | 1501 | */ |
| 1622 | swap(curr->vruntime, se->vruntime); | 1502 | swap(curr->vruntime, se->vruntime); |
| 1503 | resched_task(rq->curr); | ||
| 1623 | } | 1504 | } |
| 1624 | 1505 | ||
| 1625 | enqueue_task_fair(rq, p, 0); | 1506 | enqueue_task_fair(rq, p, 0); |
| 1626 | resched_task(rq->curr); | ||
| 1627 | } | 1507 | } |
| 1628 | 1508 | ||
| 1629 | /* | 1509 | /* |
| @@ -1642,7 +1522,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p, | |||
| 1642 | if (p->prio > oldprio) | 1522 | if (p->prio > oldprio) |
| 1643 | resched_task(rq->curr); | 1523 | resched_task(rq->curr); |
| 1644 | } else | 1524 | } else |
| 1645 | check_preempt_curr(rq, p); | 1525 | check_preempt_curr(rq, p, 0); |
| 1646 | } | 1526 | } |
| 1647 | 1527 | ||
| 1648 | /* | 1528 | /* |
| @@ -1659,7 +1539,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p, | |||
| 1659 | if (running) | 1539 | if (running) |
| 1660 | resched_task(rq->curr); | 1540 | resched_task(rq->curr); |
| 1661 | else | 1541 | else |
| 1662 | check_preempt_curr(rq, p); | 1542 | check_preempt_curr(rq, p, 0); |
| 1663 | } | 1543 | } |
| 1664 | 1544 | ||
| 1665 | /* Account for a task changing its policy or group. | 1545 | /* Account for a task changing its policy or group. |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 9353ca78154e..7c9e8f4a049f 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
| @@ -11,3 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1) | |||
| 11 | SCHED_FEAT(LB_BIAS, 1) | 11 | SCHED_FEAT(LB_BIAS, 1) |
| 12 | SCHED_FEAT(LB_WAKEUP_UPDATE, 1) | 12 | SCHED_FEAT(LB_WAKEUP_UPDATE, 1) |
| 13 | SCHED_FEAT(ASYM_EFF_LOAD, 1) | 13 | SCHED_FEAT(ASYM_EFF_LOAD, 1) |
| 14 | SCHED_FEAT(WAKEUP_OVERLAP, 0) | ||
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 3a4f92dbbe66..dec4ccabe2f5 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
| @@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync) | |||
| 14 | /* | 14 | /* |
| 15 | * Idle tasks are unconditionally rescheduled: | 15 | * Idle tasks are unconditionally rescheduled: |
| 16 | */ | 16 | */ |
| 17 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) | 17 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) |
| 18 | { | 18 | { |
| 19 | resched_task(rq->idle); | 19 | resched_task(rq->idle); |
| 20 | } | 20 | } |
| @@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p, | |||
| 76 | if (running) | 76 | if (running) |
| 77 | resched_task(rq->curr); | 77 | resched_task(rq->curr); |
| 78 | else | 78 | else |
| 79 | check_preempt_curr(rq, p); | 79 | check_preempt_curr(rq, p, 0); |
| 80 | } | 80 | } |
| 81 | 81 | ||
| 82 | static void prio_changed_idle(struct rq *rq, struct task_struct *p, | 82 | static void prio_changed_idle(struct rq *rq, struct task_struct *p, |
| @@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, | |||
| 93 | if (p->prio > oldprio) | 93 | if (p->prio > oldprio) |
| 94 | resched_task(rq->curr); | 94 | resched_task(rq->curr); |
| 95 | } else | 95 | } else |
| 96 | check_preempt_curr(rq, p); | 96 | check_preempt_curr(rq, p, 0); |
| 97 | } | 97 | } |
| 98 | 98 | ||
| 99 | /* | 99 | /* |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1113157b2058..cdf5740ab03e 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
| @@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); | |||
| 102 | 102 | ||
| 103 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | 103 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) |
| 104 | { | 104 | { |
| 105 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; | ||
| 105 | struct sched_rt_entity *rt_se = rt_rq->rt_se; | 106 | struct sched_rt_entity *rt_se = rt_rq->rt_se; |
| 106 | 107 | ||
| 107 | if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { | 108 | if (rt_rq->rt_nr_running) { |
| 108 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; | 109 | if (rt_se && !on_rt_rq(rt_se)) |
| 109 | 110 | enqueue_rt_entity(rt_se); | |
| 110 | enqueue_rt_entity(rt_se); | ||
| 111 | if (rt_rq->highest_prio < curr->prio) | 111 | if (rt_rq->highest_prio < curr->prio) |
| 112 | resched_task(curr); | 112 | resched_task(curr); |
| 113 | } | 113 | } |
| @@ -231,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | |||
| 231 | #endif /* CONFIG_RT_GROUP_SCHED */ | 231 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 232 | 232 | ||
| 233 | #ifdef CONFIG_SMP | 233 | #ifdef CONFIG_SMP |
| 234 | /* | ||
| 235 | * We ran out of runtime, see if we can borrow some from our neighbours. | ||
| 236 | */ | ||
| 234 | static int do_balance_runtime(struct rt_rq *rt_rq) | 237 | static int do_balance_runtime(struct rt_rq *rt_rq) |
| 235 | { | 238 | { |
| 236 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 239 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
| @@ -250,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq) | |||
| 250 | continue; | 253 | continue; |
| 251 | 254 | ||
| 252 | spin_lock(&iter->rt_runtime_lock); | 255 | spin_lock(&iter->rt_runtime_lock); |
| 256 | /* | ||
| 257 | * Either all rqs have inf runtime and there's nothing to steal | ||
| 258 | * or __disable_runtime() below sets a specific rq to inf to | ||
| 259 | * indicate its been disabled and disalow stealing. | ||
| 260 | */ | ||
| 253 | if (iter->rt_runtime == RUNTIME_INF) | 261 | if (iter->rt_runtime == RUNTIME_INF) |
| 254 | goto next; | 262 | goto next; |
| 255 | 263 | ||
| 264 | /* | ||
| 265 | * From runqueues with spare time, take 1/n part of their | ||
| 266 | * spare time, but no more than our period. | ||
| 267 | */ | ||
| 256 | diff = iter->rt_runtime - iter->rt_time; | 268 | diff = iter->rt_runtime - iter->rt_time; |
| 257 | if (diff > 0) { | 269 | if (diff > 0) { |
| 258 | diff = div_u64((u64)diff, weight); | 270 | diff = div_u64((u64)diff, weight); |
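
The new comments spell out the borrowing rule; as a rough worked example with assumed numbers: on a 4-CPU root domain the divisor weight is 4, so a neighbour entitled to 950 ms of runtime per period that has consumed only 150 ms exposes diff = 800 ms of spare time, of which this runqueue may take 800/4 = 200 ms, and never so much that its own rt_runtime would grow beyond its period.
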
| @@ -274,6 +286,9 @@ next: | |||
| 274 | return more; | 286 | return more; |
| 275 | } | 287 | } |
| 276 | 288 | ||
| 289 | /* | ||
| 290 | * Ensure this RQ takes back all the runtime it lend to its neighbours. | ||
| 291 | */ | ||
| 277 | static void __disable_runtime(struct rq *rq) | 292 | static void __disable_runtime(struct rq *rq) |
| 278 | { | 293 | { |
| 279 | struct root_domain *rd = rq->rd; | 294 | struct root_domain *rd = rq->rd; |
| @@ -289,17 +304,33 @@ static void __disable_runtime(struct rq *rq) | |||
| 289 | 304 | ||
| 290 | spin_lock(&rt_b->rt_runtime_lock); | 305 | spin_lock(&rt_b->rt_runtime_lock); |
| 291 | spin_lock(&rt_rq->rt_runtime_lock); | 306 | spin_lock(&rt_rq->rt_runtime_lock); |
| 307 | /* | ||
| 308 | * Either we're all inf and nobody needs to borrow, or we're | ||
| 309 | * already disabled and thus have nothing to do, or we have | ||
| 310 | * exactly the right amount of runtime to take out. | ||
| 311 | */ | ||
| 292 | if (rt_rq->rt_runtime == RUNTIME_INF || | 312 | if (rt_rq->rt_runtime == RUNTIME_INF || |
| 293 | rt_rq->rt_runtime == rt_b->rt_runtime) | 313 | rt_rq->rt_runtime == rt_b->rt_runtime) |
| 294 | goto balanced; | 314 | goto balanced; |
| 295 | spin_unlock(&rt_rq->rt_runtime_lock); | 315 | spin_unlock(&rt_rq->rt_runtime_lock); |
| 296 | 316 | ||
| 317 | /* | ||
| 318 | * Calculate the difference between what we started out with | ||
| 319 | * and what we currently have, that's the amount of runtime | ||
| 320 | * we lent and now have to reclaim. | ||
| 321 | */ | ||
| 297 | want = rt_b->rt_runtime - rt_rq->rt_runtime; | 322 | want = rt_b->rt_runtime - rt_rq->rt_runtime; |
| 298 | 323 | ||
| 324 | /* | ||
| 325 | * Greedy reclaim, take back as much as we can. | ||
| 326 | */ | ||
| 299 | for_each_cpu_mask(i, rd->span) { | 327 | for_each_cpu_mask(i, rd->span) { |
| 300 | struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); | 328 | struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); |
| 301 | s64 diff; | 329 | s64 diff; |
| 302 | 330 | ||
| 331 | /* | ||
| 332 | * Can't reclaim from ourselves or disabled runqueues. | ||
| 333 | */ | ||
| 303 | if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) | 334 | if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) |
| 304 | continue; | 335 | continue; |
| 305 | 336 | ||
| @@ -319,8 +350,16 @@ static void __disable_runtime(struct rq *rq) | |||
| 319 | } | 350 | } |
| 320 | 351 | ||
| 321 | spin_lock(&rt_rq->rt_runtime_lock); | 352 | spin_lock(&rt_rq->rt_runtime_lock); |
| 353 | /* | ||
| 354 | * We cannot be left wanting - that would mean some runtime | ||
| 355 | * leaked out of the system. | ||
| 356 | */ | ||
| 322 | BUG_ON(want); | 357 | BUG_ON(want); |
| 323 | balanced: | 358 | balanced: |
| 359 | /* | ||
| 360 | * Disable all the borrow logic by pretending we have inf | ||
| 361 | * runtime - in which case borrowing doesn't make sense. | ||
| 362 | */ | ||
| 324 | rt_rq->rt_runtime = RUNTIME_INF; | 363 | rt_rq->rt_runtime = RUNTIME_INF; |
| 325 | spin_unlock(&rt_rq->rt_runtime_lock); | 364 | spin_unlock(&rt_rq->rt_runtime_lock); |
| 326 | spin_unlock(&rt_b->rt_runtime_lock); | 365 | spin_unlock(&rt_b->rt_runtime_lock); |
| @@ -343,6 +382,9 @@ static void __enable_runtime(struct rq *rq) | |||
| 343 | if (unlikely(!scheduler_running)) | 382 | if (unlikely(!scheduler_running)) |
| 344 | return; | 383 | return; |
| 345 | 384 | ||
| 385 | /* | ||
| 386 | * Reset each runqueue's bandwidth settings | ||
| 387 | */ | ||
| 346 | for_each_leaf_rt_rq(rt_rq, rq) { | 388 | for_each_leaf_rt_rq(rt_rq, rq) { |
| 347 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 389 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
| 348 | 390 | ||
| @@ -389,7 +431,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
| 389 | int i, idle = 1; | 431 | int i, idle = 1; |
| 390 | cpumask_t span; | 432 | cpumask_t span; |
| 391 | 433 | ||
| 392 | if (rt_b->rt_runtime == RUNTIME_INF) | 434 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) |
| 393 | return 1; | 435 | return 1; |
| 394 | 436 | ||
| 395 | span = sched_rt_period_mask(); | 437 | span = sched_rt_period_mask(); |
| @@ -487,6 +529,9 @@ static void update_curr_rt(struct rq *rq) | |||
| 487 | curr->se.exec_start = rq->clock; | 529 | curr->se.exec_start = rq->clock; |
| 488 | cpuacct_charge(curr, delta_exec); | 530 | cpuacct_charge(curr, delta_exec); |
| 489 | 531 | ||
| 532 | if (!rt_bandwidth_enabled()) | ||
| 533 | return; | ||
| 534 | |||
| 490 | for_each_sched_rt_entity(rt_se) { | 535 | for_each_sched_rt_entity(rt_se) { |
| 491 | rt_rq = rt_rq_of_se(rt_se); | 536 | rt_rq = rt_rq_of_se(rt_se); |
| 492 | 537 | ||
| @@ -784,7 +829,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | |||
| 784 | /* | 829 | /* |
| 785 | * Preempt the current task with a newly woken task if needed: | 830 | * Preempt the current task with a newly woken task if needed: |
| 786 | */ | 831 | */ |
| 787 | static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) | 832 | static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) |
| 788 | { | 833 | { |
| 789 | if (p->prio < rq->curr->prio) { | 834 | if (p->prio < rq->curr->prio) { |
| 790 | resched_task(rq->curr); | 835 | resched_task(rq->curr); |
diff --git a/kernel/user.c b/kernel/user.c index 865ecf57a096..39d6159fae43 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
| @@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj, | |||
| 169 | { | 169 | { |
| 170 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | 170 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); |
| 171 | 171 | ||
| 172 | return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); | 172 | return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg)); |
| 173 | } | 173 | } |
| 174 | 174 | ||
| 175 | static ssize_t cpu_rt_runtime_store(struct kobject *kobj, | 175 | static ssize_t cpu_rt_runtime_store(struct kobject *kobj, |
| @@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, | |||
| 180 | unsigned long rt_runtime; | 180 | unsigned long rt_runtime; |
| 181 | int rc; | 181 | int rc; |
| 182 | 182 | ||
| 183 | sscanf(buf, "%lu", &rt_runtime); | 183 | sscanf(buf, "%ld", &rt_runtime); |
| 184 | 184 | ||
| 185 | rc = sched_group_set_rt_runtime(up->tg, rt_runtime); | 185 | rc = sched_group_set_rt_runtime(up->tg, rt_runtime); |
| 186 | 186 | ||
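
A note on the %lu to %ld switch: sched_group_rt_runtime() reports an unlimited budget as -1 (RUNTIME_INF), which the unsigned conversion would render as 18446744073709551615 on a 64-bit kernel; the signed specifiers keep "-1" readable and writable through the cpu_rt_runtime attribute as intended.
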
