author    | Linus Torvalds <torvalds@linux-foundation.org> | 2008-04-21 18:40:24 -0400
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-04-21 18:40:24 -0400
commit    | ec965350bb98bd291eb34f6ecddfdcfc36da1e6e (patch)
tree      | 983bcaf33ed00b48a86f7f8790cc460cf15dd252
parent    | 5f033bb9bc5cb3bb37a79e3ef131f50ecdcb72b0 (diff)
parent    | 486fdae21458bd9f4e125099bb3c38a4064e450e (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel: (62 commits)
sched: build fix
sched: better rt-group documentation
sched: features fix
sched: /debug/sched_features
sched: add SCHED_FEAT_DEADLINE
sched: debug: show a weight tree
sched: fair: weight calculations
sched: fair-group: de-couple load-balancing from the rb-trees
sched: fair-group scheduling vs latency
sched: rt-group: optimize dequeue_rt_stack
sched: debug: add some debug code to handle the full hierarchy
sched: fair-group: SMP-nice for group scheduling
sched, cpuset: customize sched domains, core
sched, cpuset: customize sched domains, docs
sched: prepatory code movement
sched: rt: multi level group constraints
sched: task_group hierarchy
sched: fix the task_group hierarchy for UID grouping
sched: allow the group scheduler to have multiple levels
sched: mix tasks and groups
...
68 files changed, 3157 insertions, 997 deletions
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt index ad2bb3b3acc1..aa854b9b18cd 100644 --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt | |||
@@ -8,6 +8,7 @@ Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. | |||
8 | Modified by Paul Jackson <pj@sgi.com> | 8 | Modified by Paul Jackson <pj@sgi.com> |
9 | Modified by Christoph Lameter <clameter@sgi.com> | 9 | Modified by Christoph Lameter <clameter@sgi.com> |
10 | Modified by Paul Menage <menage@google.com> | 10 | Modified by Paul Menage <menage@google.com> |
11 | Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> | ||
11 | 12 | ||
12 | CONTENTS: | 13 | CONTENTS: |
13 | ========= | 14 | ========= |
@@ -20,7 +21,8 @@ CONTENTS: | |||
20 | 1.5 What is memory_pressure ? | 21 | 1.5 What is memory_pressure ? |
21 | 1.6 What is memory spread ? | 22 | 1.6 What is memory spread ? |
22 | 1.7 What is sched_load_balance ? | 23 | 1.7 What is sched_load_balance ? |
23 | 1.8 How do I use cpusets ? | 24 | 1.8 What is sched_relax_domain_level ? |
25 | 1.9 How do I use cpusets ? | ||
24 | 2. Usage Examples and Syntax | 26 | 2. Usage Examples and Syntax |
25 | 2.1 Basic Usage | 27 | 2.1 Basic Usage |
26 | 2.2 Adding/removing cpus | 28 | 2.2 Adding/removing cpus |
@@ -497,7 +499,73 @@ the cpuset code to update these sched domains, it compares the new | |||
497 | partition requested with the current, and updates its sched domains, | 499 | partition requested with the current, and updates its sched domains, |
498 | removing the old and adding the new, for each change. | 500 | removing the old and adding the new, for each change. |
499 | 501 | ||
500 | 1.8 How do I use cpusets ? | 502 | |
503 | 1.8 What is sched_relax_domain_level ? | ||
504 | -------------------------------------- | ||
505 | |||
506 | Within a sched domain, the scheduler migrates tasks in two ways: periodic load | ||
507 | balancing on each tick, and at the time of certain scheduling events. | ||
508 | |||
509 | When a task is woken up, the scheduler tries to move it to an idle CPU. | ||
510 | For example, if task A running on CPU X activates another task B | ||
511 | on the same CPU X, and CPU Y is a sibling of X and currently idle, | ||
512 | then the scheduler migrates task B to CPU Y so that task B can start on | ||
513 | CPU Y without waiting for task A on CPU X. | ||
514 | |||
515 | And if a CPU runs out of tasks in its runqueue, it tries to pull | ||
516 | extra tasks from other busy CPUs to help them before it goes | ||
517 | idle. | ||
518 | |||
519 | Of course it costs some searching effort to find movable tasks and/or | ||
520 | idle CPUs, so the scheduler might not search all CPUs in the domain | ||
521 | every time. In fact, on some architectures the search range for these | ||
522 | events is limited to the socket or node where the CPU is located, | ||
523 | while the load balance on tick searches all of them. | ||
524 | |||
525 | For example, assume CPU Z is relatively far from CPU X. Even if CPU Z | ||
526 | is idle while CPU X and its siblings are busy, the scheduler can't migrate | ||
527 | the woken task B from X to Z since Z is outside its search range. | ||
528 | As a result, task B on CPU X has to wait for task A or for the load balance | ||
529 | on the next tick. For some applications in special situations, waiting | ||
530 | 1 tick may be too long. | ||
531 | |||
532 | The 'sched_relax_domain_level' file allows you to request changing | ||
533 | this search range as you like. This file takes an int value that | ||
534 | indicates the size of the search range in levels, ideally as follows; | ||
535 | otherwise the initial value -1 indicates that the cpuset has no request. | ||
536 | |||
537 | -1 : no request. use system default or follow request of others. | ||
538 | 0 : no search. | ||
539 | 1 : search siblings (hyperthreads in a core). | ||
540 | 2 : search cores in a package. | ||
541 | 3 : search cpus in a node [= system wide on non-NUMA system] | ||
542 | ( 4 : search nodes in a chunk of node [on NUMA system] ) | ||
543 | ( 5~ : search system wide [on NUMA system]) | ||
544 | |||
545 | This file is per-cpuset and affects the sched domain the cpuset | ||
546 | belongs to. Therefore, if the flag 'sched_load_balance' of a cpuset | ||
547 | is disabled, then 'sched_relax_domain_level' has no effect since | ||
548 | there is no sched domain belonging to the cpuset. | ||
549 | |||
550 | If multiple cpusets overlap and hence form a single sched | ||
551 | domain, the largest value among them is used. Be careful: if one | ||
552 | requests 0 and others are -1 then 0 is used. | ||
553 | |||
554 | Note that modifying this file will have both good and bad effects, | ||
555 | and whether that is acceptable will depend on your situation. | ||
556 | Don't modify this file if you are not sure. | ||
557 | |||
558 | If your situation is: | ||
559 | - The migration cost between CPUs can be assumed to be considerably | ||
560 | small (for you) due to your application's special behavior or | ||
561 | special hardware support for the CPU cache etc. | ||
562 | - The search cost has no impact (for you), or you can make | ||
563 | the search cost small enough, e.g. by keeping the cpuset compact. | ||
564 | - Low latency is required even if it sacrifices cache hit rate etc. | ||
565 | then increasing 'sched_relax_domain_level' would benefit you. | ||
566 | |||
567 | |||
568 | 1.9 How do I use cpusets ? | ||
501 | -------------------------- | 569 | -------------------------- |
502 | 570 | ||
503 | In order to minimize the impact of cpusets on critical kernel | 571 | In order to minimize the impact of cpusets on critical kernel |
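
The new section 1.8 above comes down to writing a small integer into the per-cpuset
'sched_relax_domain_level' file. As a rough sketch only (the patch adds no such
program, and both the /dev/cpuset mount point and the "rt-set" cpuset name are
assumptions for the example), a userspace helper requesting sibling-level wake-up
balancing could look like this:

    /* Sketch: request sibling-level wake-up balancing (level 1) for one cpuset.
     * Assumes the cpuset filesystem is mounted at /dev/cpuset and that a cpuset
     * named "rt-set" already exists; both names are illustrative.
     */
    #include <stdio.h>
    #include <stdlib.h>

    static int set_relax_domain_level(const char *cpuset_path, int level)
    {
            char path[256];
            FILE *f;

            snprintf(path, sizeof(path), "%s/sched_relax_domain_level", cpuset_path);
            f = fopen(path, "w");
            if (!f)
                    return -1;
            /* -1 = no request, 0 = no search, 1 = siblings, 2 = cores, 3 = node, ... */
            fprintf(f, "%d\n", level);
            return fclose(f);
    }

    int main(void)
    {
            if (set_relax_domain_level("/dev/cpuset/rt-set", 1) < 0) {
                    perror("sched_relax_domain_level");
                    return EXIT_FAILURE;
            }
            return 0;
    }
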
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index 1c6332f4543c..14f901f639ee 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt | |||
@@ -1,59 +1,177 @@ | |||
1 | Real-Time group scheduling | ||
2 | -------------------------- | ||
1 | 3 | ||
4 | CONTENTS | ||
5 | ======== | ||
2 | 6 | ||
3 | Real-Time group scheduling. | 7 | 1. Overview |
8 | 1.1 The problem | ||
9 | 1.2 The solution | ||
10 | 2. The interface | ||
11 | 2.1 System-wide settings | ||
12 | 2.2 Default behaviour | ||
13 | 2.3 Basis for grouping tasks | ||
14 | 3. Future plans | ||
4 | 15 | ||
5 | The problem space: | ||
6 | 16 | ||
7 | In order to schedule multiple groups of realtime tasks each group must | 17 | 1. Overview |
8 | be assigned a fixed portion of the CPU time available. Without a minimum | 18 | =========== |
9 | guarantee a realtime group can obviously fall short. A fuzzy upper limit | ||
10 | is of no use since it cannot be relied upon. Which leaves us with just | ||
11 | the single fixed portion. | ||
12 | 19 | ||
13 | CPU time is divided by means of specifying how much time can be spent | ||
14 | running in a given period. Say a frame fixed realtime renderer must | ||
15 | deliver 25 frames a second, which yields a period of 0.04s. Now say | ||
16 | it will also have to play some music and respond to input, leaving it | ||
17 | with around 80% for the graphics. We can then give this group a runtime | ||
18 | of 0.8 * 0.04s = 0.032s. | ||
19 | 20 | ||
20 | This way the graphics group will have a 0.04s period with a 0.032s runtime | 21 | 1.1 The problem |
21 | limit. | 22 | --------------- |
22 | 23 | ||
23 | Now if the audio thread needs to refill the DMA buffer every 0.005s, but | 24 | Realtime scheduling is all about determinism: a group has to be able to rely on |
24 | needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s | 25 | the amount of bandwidth (e.g. CPU time) being constant. In order to schedule |
25 | = 0.00015s. | 26 | multiple groups of realtime tasks, each group must be assigned a fixed portion |
27 | of the CPU time available. Without a minimum guarantee a realtime group can | ||
28 | obviously fall short. A fuzzy upper limit is of no use since it cannot be | ||
29 | relied upon. Which leaves us with just the single fixed portion. | ||
26 | 30 | ||
31 | 1.2 The solution | ||
32 | ---------------- | ||
27 | 33 | ||
28 | The Interface: | 34 | CPU time is divided by means of specifying how much time can be spent running |
35 | in a given period. We allocate this "run time" for each realtime group which | ||
36 | the other realtime groups will not be permitted to use. | ||
29 | 37 | ||
30 | system wide: | 38 | Any time not allocated to a realtime group will be used to run normal priority |
39 | tasks (SCHED_OTHER). Any allocated run time not used will also be picked up by | ||
40 | SCHED_OTHER. | ||
31 | 41 | ||
32 | /proc/sys/kernel/sched_rt_period_ms | 42 | Let's consider an example: a frame fixed realtime renderer must deliver 25 |
33 | /proc/sys/kernel/sched_rt_runtime_us | 43 | frames a second, which yields a period of 0.04s per frame. Now say it will also |
44 | have to play some music and respond to input, leaving it with around 80% CPU | ||
45 | time dedicated for the graphics. We can then give this group a run time of 0.8 | ||
46 | * 0.04s = 0.032s. | ||
34 | 47 | ||
35 | CONFIG_FAIR_USER_SCHED | 48 | This way the graphics group will have a 0.04s period with a 0.032s run time |
49 | limit. Now if the audio thread needs to refill the DMA buffer every 0.005s, but | ||
50 | needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s = | ||
51 | 0.00015s. So this group can be scheduled with a period of 0.005s and a run time | ||
52 | of 0.00015s. | ||
36 | 53 | ||
37 | /sys/kernel/uids/<uid>/cpu_rt_runtime_us | 54 | The remaining CPU time will be used for user input and other tasks. Because |
55 | realtime tasks have explicitly allocated the CPU time they need to perform | ||
56 | their tasks, buffer underruns in the graphics or audio can be eliminated. | ||
38 | 57 | ||
39 | or | 58 | NOTE: the above example is not fully implemented as of yet (2.6.25). We still |
59 | lack an EDF scheduler to make non-uniform periods usable. | ||
40 | 60 | ||
41 | CONFIG_FAIR_CGROUP_SCHED | ||
42 | 61 | ||
43 | /cgroup/<cgroup>/cpu.rt_runtime_us | 62 | 2. The Interface |
63 | ================ | ||
44 | 64 | ||
45 | [ time is specified in us because the interface is s32; this gives an | ||
46 | operating range of ~35m to 1us ] | ||
47 | 65 | ||
48 | The period takes values in [ 1, INT_MAX ], runtime in [ -1, INT_MAX - 1 ]. | 66 | 2.1 System wide settings |
67 | ------------------------ | ||
49 | 68 | ||
50 | A runtime of -1 specifies runtime == period, ie. no limit. | 69 | The system wide settings are configured under the /proc virtual file system: |
51 | 70 | ||
52 | New groups get the period from /proc/sys/kernel/sched_rt_period_us and | 71 | /proc/sys/kernel/sched_rt_period_us: |
53 | a runtime of 0. | 72 | The scheduling period that is equivalent to 100% CPU bandwidth |
54 | 73 | ||
55 | Settings are constrained to: | 74 | /proc/sys/kernel/sched_rt_runtime_us: |
75 | A global limit on how much time realtime scheduling may use. Even without | ||
76 | CONFIG_RT_GROUP_SCHED enabled, this will limit time reserved to realtime | ||
77 | processes. With CONFIG_RT_GROUP_SCHED it signifies the total bandwidth | ||
78 | available to all realtime groups. | ||
79 | |||
80 | * Time is specified in us because the interface is s32. This gives an | ||
81 | operating range from 1us to about 35 minutes. | ||
82 | * sched_rt_period_us takes values from 1 to INT_MAX. | ||
83 | * sched_rt_runtime_us takes values from -1 to (INT_MAX - 1). | ||
84 | * A run time of -1 specifies runtime == period, ie. no limit. | ||
85 | |||
86 | |||
87 | 2.2 Default behaviour | ||
88 | --------------------- | ||
89 | |||
90 | The default values are sched_rt_period_us = 1000000 (1s) and | ||
91 | sched_rt_runtime_us = 950000 (0.95s). This gives 0.05s to be used by | ||
92 | SCHED_OTHER (non-RT tasks). These defaults were chosen so that a run-away | ||
93 | realtime task cannot lock up the machine and a little time is left to recover | ||
94 | it. By setting runtime to -1 you'd get the old behaviour back. | ||
95 | |||
96 | By default all bandwidth is assigned to the root group and new groups get the | ||
97 | period from /proc/sys/kernel/sched_rt_period_us and a run time of 0. If you | ||
98 | want to assign bandwidth to another group, reduce the root group's bandwidth | ||
99 | and assign some or all of the difference to another group. | ||
100 | |||
101 | Realtime group scheduling means you have to assign a portion of total CPU | ||
102 | bandwidth to the group before it will accept realtime tasks. Therefore you will | ||
103 | not be able to run realtime tasks as any user other than root until you have | ||
104 | done that, even if the user has the rights to run processes with realtime | ||
105 | priority! | ||
106 | |||
107 | |||
108 | 2.3 Basis for grouping tasks | ||
109 | ---------------------------- | ||
110 | |||
111 | There are two compile-time settings for allocating CPU bandwidth. These are | ||
112 | configured using the "Basis for grouping tasks" multiple choice menu under | ||
113 | General setup > Group CPU Scheduler: | ||
114 | |||
115 | a. CONFIG_USER_SCHED (aka "Basis for grouping tasks" = "user id") | ||
116 | |||
117 | This lets you use the virtual files under | ||
118 | "/sys/kernel/uids/<uid>/cpu_rt_runtime_us" to control he CPU time reserved for | ||
119 | each user . | ||
120 | |||
121 | The other option is: | ||
122 | |||
123 | b. CONFIG_CGROUP_SCHED (aka "Basis for grouping tasks" = "Control groups") | ||
124 | |||
125 | This uses the /cgroup virtual file system and "/cgroup/<cgroup>/cpu.rt_runtime_us" | ||
126 | to control the CPU time reserved for each control group instead. | ||
127 | |||
128 | For more information on working with control groups, you should read | ||
129 | Documentation/cgroups.txt as well. | ||
130 | |||
131 | Group settings are checked against the following limits in order to keep the configuration | ||
132 | schedulable: | ||
56 | 133 | ||
57 | \Sum_{i} runtime_{i} / global_period <= global_runtime / global_period | 134 | \Sum_{i} runtime_{i} / global_period <= global_runtime / global_period |
58 | 135 | ||
59 | in order to keep the configuration schedulable. | 136 | For now, this can be simplified to just the following (but see Future plans): |
137 | |||
138 | \Sum_{i} runtime_{i} <= global_runtime | ||
139 | |||
140 | |||
141 | 3. Future plans | ||
142 | =============== | ||
143 | |||
144 | There is work in progress to make the scheduling period for each group | ||
145 | ("/sys/kernel/uids/<uid>/cpu_rt_period_us" or | ||
146 | "/cgroup/<cgroup>/cpu.rt_period_us" respectively) configurable as well. | ||
147 | |||
148 | The constraint on the period is that a subgroup must have a smaller or | ||
149 | equal period to its parent. But realistically its not very useful _yet_ | ||
150 | as its prone to starvation without deadline scheduling. | ||
151 | |||
152 | Consider two sibling groups A and B; both have 50% bandwidth, but A's | ||
153 | period is twice the length of B's. | ||
154 | |||
155 | * group A: period=100000us, runtime=10000us | ||
156 | - this runs for 0.01s once every 0.1s | ||
157 | |||
158 | * group B: period= 50000us, runtime=10000us | ||
159 | - this runs for 0.01s twice every 0.1s (or once every 0.05 sec). | ||
160 | |||
161 | This means that currently a while (1) loop in A will run for the full period of | ||
162 | B and can starve B's tasks (assuming they are of lower priority) for a whole | ||
163 | period. | ||
164 | |||
165 | The next project will be SCHED_EDF (Earliest Deadline First scheduling) to bring | ||
166 | full deadline scheduling to the linux kernel. Deadline scheduling the above | ||
167 | groups and treating end of the period as a deadline will ensure that they both | ||
168 | get their allocated time. | ||
169 | |||
170 | Implementing SCHED_EDF might take a while to complete. Priority Inheritance is | ||
171 | the biggest challenge as the current linux PI infrastructure is geared towards | ||
172 | the limited static priority levels 0-139. With deadline scheduling you need to | ||
173 | do deadline inheritance (since priority is inversely proportional to the | ||
174 | deadline delta (deadline - now). | ||
175 | |||
176 | This means the whole PI machinery will have to be reworked - and that is one of | ||
177 | the most complex pieces of code we have. | ||
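
To make the admission rule above concrete: with equal periods the check collapses to
summing the per-group runtimes and comparing against the global runtime. The sketch
below (not part of the patch; the three group values are made-up examples) performs
that simplified check in plain C:

    /* Simplified RT-group admission check: the sum of per-group runtimes must
     * not exceed the global runtime. All values are in microseconds; the sample
     * group values are illustrative only.
     */
    #include <stdio.h>

    int main(void)
    {
            const long global_runtime_us = 950000; /* default sched_rt_runtime_us */
            const long group_runtime_us[] = { 300000, 400000, 200000 };
            const int ngroups = sizeof(group_runtime_us) / sizeof(group_runtime_us[0]);
            long sum = 0;
            int i;

            for (i = 0; i < ngroups; i++)
                    sum += group_runtime_us[i];

            if (sum <= global_runtime_us)
                    printf("schedulable: %ld us of %ld us allocated\n",
                           sum, global_runtime_us);
            else
                    printf("rejected: %ld us exceeds the global limit of %ld us\n",
                           sum, global_runtime_us);
            return 0;
    }
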
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 07cf77113565..87a693cf2bb7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -117,6 +117,9 @@ config ARCH_HAS_CPU_RELAX | |||
117 | config HAVE_SETUP_PER_CPU_AREA | 117 | config HAVE_SETUP_PER_CPU_AREA |
118 | def_bool X86_64 || (X86_SMP && !X86_VOYAGER) | 118 | def_bool X86_64 || (X86_SMP && !X86_VOYAGER) |
119 | 119 | ||
120 | config HAVE_CPUMASK_OF_CPU_MAP | ||
121 | def_bool X86_64_SMP | ||
122 | |||
120 | config ARCH_HIBERNATION_POSSIBLE | 123 | config ARCH_HIBERNATION_POSSIBLE |
121 | def_bool y | 124 | def_bool y |
122 | depends on !SMP || !X86_VOYAGER | 125 | depends on !SMP || !X86_VOYAGER |
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 9366fb68d8d8..c2502eb9aa83 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c | |||
@@ -91,7 +91,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, | |||
91 | 91 | ||
92 | /* Make sure we are running on right CPU */ | 92 | /* Make sure we are running on right CPU */ |
93 | saved_mask = current->cpus_allowed; | 93 | saved_mask = current->cpus_allowed; |
94 | retval = set_cpus_allowed(current, cpumask_of_cpu(cpu)); | 94 | retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); |
95 | if (retval) | 95 | if (retval) |
96 | return -1; | 96 | return -1; |
97 | 97 | ||
@@ -128,7 +128,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, | |||
128 | cx->address); | 128 | cx->address); |
129 | 129 | ||
130 | out: | 130 | out: |
131 | set_cpus_allowed(current, saved_mask); | 131 | set_cpus_allowed_ptr(current, &saved_mask); |
132 | return retval; | 132 | return retval; |
133 | } | 133 | } |
134 | EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); | 134 | EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); |
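
Nearly every driver hunk in this merge that touched set_cpus_allowed() follows the
same save/pin/restore idiom seen here, only with the cpumask now handed over by
pointer so that a potentially large cpumask_t is not copied by value on every call.
A condensed kernel-side sketch of the idiom (run_on_cpu() is an invented helper
name, not something this merge adds) is:

    /* Pin the current task to one CPU, do some work there, then restore the
     * task's previous affinity. Sketch against the ~2.6.25 cpumask API.
     */
    #include <linux/cpumask.h>
    #include <linux/sched.h>

    static int run_on_cpu(unsigned int cpu, void (*fn)(void *data), void *data)
    {
            cpumask_t saved_mask = current->cpus_allowed;   /* remember old affinity */
            int ret;

            /* pin to the target CPU, passing the mask by pointer */
            ret = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
            if (ret)
                    return ret;

            fn(data);                                       /* runs on 'cpu' */

            set_cpus_allowed_ptr(current, &saved_mask);     /* restore old affinity */
            return 0;
    }
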
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index a962dcb9c408..e2d870de837c 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | |||
@@ -192,9 +192,9 @@ static void drv_read(struct drv_cmd *cmd) | |||
192 | cpumask_t saved_mask = current->cpus_allowed; | 192 | cpumask_t saved_mask = current->cpus_allowed; |
193 | cmd->val = 0; | 193 | cmd->val = 0; |
194 | 194 | ||
195 | set_cpus_allowed(current, cmd->mask); | 195 | set_cpus_allowed_ptr(current, &cmd->mask); |
196 | do_drv_read(cmd); | 196 | do_drv_read(cmd); |
197 | set_cpus_allowed(current, saved_mask); | 197 | set_cpus_allowed_ptr(current, &saved_mask); |
198 | } | 198 | } |
199 | 199 | ||
200 | static void drv_write(struct drv_cmd *cmd) | 200 | static void drv_write(struct drv_cmd *cmd) |
@@ -203,30 +203,30 @@ static void drv_write(struct drv_cmd *cmd) | |||
203 | unsigned int i; | 203 | unsigned int i; |
204 | 204 | ||
205 | for_each_cpu_mask(i, cmd->mask) { | 205 | for_each_cpu_mask(i, cmd->mask) { |
206 | set_cpus_allowed(current, cpumask_of_cpu(i)); | 206 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); |
207 | do_drv_write(cmd); | 207 | do_drv_write(cmd); |
208 | } | 208 | } |
209 | 209 | ||
210 | set_cpus_allowed(current, saved_mask); | 210 | set_cpus_allowed_ptr(current, &saved_mask); |
211 | return; | 211 | return; |
212 | } | 212 | } |
213 | 213 | ||
214 | static u32 get_cur_val(cpumask_t mask) | 214 | static u32 get_cur_val(const cpumask_t *mask) |
215 | { | 215 | { |
216 | struct acpi_processor_performance *perf; | 216 | struct acpi_processor_performance *perf; |
217 | struct drv_cmd cmd; | 217 | struct drv_cmd cmd; |
218 | 218 | ||
219 | if (unlikely(cpus_empty(mask))) | 219 | if (unlikely(cpus_empty(*mask))) |
220 | return 0; | 220 | return 0; |
221 | 221 | ||
222 | switch (per_cpu(drv_data, first_cpu(mask))->cpu_feature) { | 222 | switch (per_cpu(drv_data, first_cpu(*mask))->cpu_feature) { |
223 | case SYSTEM_INTEL_MSR_CAPABLE: | 223 | case SYSTEM_INTEL_MSR_CAPABLE: |
224 | cmd.type = SYSTEM_INTEL_MSR_CAPABLE; | 224 | cmd.type = SYSTEM_INTEL_MSR_CAPABLE; |
225 | cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; | 225 | cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; |
226 | break; | 226 | break; |
227 | case SYSTEM_IO_CAPABLE: | 227 | case SYSTEM_IO_CAPABLE: |
228 | cmd.type = SYSTEM_IO_CAPABLE; | 228 | cmd.type = SYSTEM_IO_CAPABLE; |
229 | perf = per_cpu(drv_data, first_cpu(mask))->acpi_data; | 229 | perf = per_cpu(drv_data, first_cpu(*mask))->acpi_data; |
230 | cmd.addr.io.port = perf->control_register.address; | 230 | cmd.addr.io.port = perf->control_register.address; |
231 | cmd.addr.io.bit_width = perf->control_register.bit_width; | 231 | cmd.addr.io.bit_width = perf->control_register.bit_width; |
232 | break; | 232 | break; |
@@ -234,7 +234,7 @@ static u32 get_cur_val(cpumask_t mask) | |||
234 | return 0; | 234 | return 0; |
235 | } | 235 | } |
236 | 236 | ||
237 | cmd.mask = mask; | 237 | cmd.mask = *mask; |
238 | 238 | ||
239 | drv_read(&cmd); | 239 | drv_read(&cmd); |
240 | 240 | ||
@@ -271,7 +271,7 @@ static unsigned int get_measured_perf(unsigned int cpu) | |||
271 | unsigned int retval; | 271 | unsigned int retval; |
272 | 272 | ||
273 | saved_mask = current->cpus_allowed; | 273 | saved_mask = current->cpus_allowed; |
274 | set_cpus_allowed(current, cpumask_of_cpu(cpu)); | 274 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); |
275 | if (get_cpu() != cpu) { | 275 | if (get_cpu() != cpu) { |
276 | /* We were not able to run on requested processor */ | 276 | /* We were not able to run on requested processor */ |
277 | put_cpu(); | 277 | put_cpu(); |
@@ -329,7 +329,7 @@ static unsigned int get_measured_perf(unsigned int cpu) | |||
329 | retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; | 329 | retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; |
330 | 330 | ||
331 | put_cpu(); | 331 | put_cpu(); |
332 | set_cpus_allowed(current, saved_mask); | 332 | set_cpus_allowed_ptr(current, &saved_mask); |
333 | 333 | ||
334 | dprintk("cpu %d: performance percent %d\n", cpu, perf_percent); | 334 | dprintk("cpu %d: performance percent %d\n", cpu, perf_percent); |
335 | return retval; | 335 | return retval; |
@@ -347,13 +347,13 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu) | |||
347 | return 0; | 347 | return 0; |
348 | } | 348 | } |
349 | 349 | ||
350 | freq = extract_freq(get_cur_val(cpumask_of_cpu(cpu)), data); | 350 | freq = extract_freq(get_cur_val(&cpumask_of_cpu(cpu)), data); |
351 | dprintk("cur freq = %u\n", freq); | 351 | dprintk("cur freq = %u\n", freq); |
352 | 352 | ||
353 | return freq; | 353 | return freq; |
354 | } | 354 | } |
355 | 355 | ||
356 | static unsigned int check_freqs(cpumask_t mask, unsigned int freq, | 356 | static unsigned int check_freqs(const cpumask_t *mask, unsigned int freq, |
357 | struct acpi_cpufreq_data *data) | 357 | struct acpi_cpufreq_data *data) |
358 | { | 358 | { |
359 | unsigned int cur_freq; | 359 | unsigned int cur_freq; |
@@ -449,7 +449,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, | |||
449 | drv_write(&cmd); | 449 | drv_write(&cmd); |
450 | 450 | ||
451 | if (acpi_pstate_strict) { | 451 | if (acpi_pstate_strict) { |
452 | if (!check_freqs(cmd.mask, freqs.new, data)) { | 452 | if (!check_freqs(&cmd.mask, freqs.new, data)) { |
453 | dprintk("acpi_cpufreq_target failed (%d)\n", | 453 | dprintk("acpi_cpufreq_target failed (%d)\n", |
454 | policy->cpu); | 454 | policy->cpu); |
455 | return -EAGAIN; | 455 | return -EAGAIN; |
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index c99d59d8ef2e..46d4034d9f37 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c | |||
@@ -478,12 +478,12 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvi | |||
478 | 478 | ||
479 | static int check_supported_cpu(unsigned int cpu) | 479 | static int check_supported_cpu(unsigned int cpu) |
480 | { | 480 | { |
481 | cpumask_t oldmask = CPU_MASK_ALL; | 481 | cpumask_t oldmask; |
482 | u32 eax, ebx, ecx, edx; | 482 | u32 eax, ebx, ecx, edx; |
483 | unsigned int rc = 0; | 483 | unsigned int rc = 0; |
484 | 484 | ||
485 | oldmask = current->cpus_allowed; | 485 | oldmask = current->cpus_allowed; |
486 | set_cpus_allowed(current, cpumask_of_cpu(cpu)); | 486 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); |
487 | 487 | ||
488 | if (smp_processor_id() != cpu) { | 488 | if (smp_processor_id() != cpu) { |
489 | printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu); | 489 | printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu); |
@@ -528,7 +528,7 @@ static int check_supported_cpu(unsigned int cpu) | |||
528 | rc = 1; | 528 | rc = 1; |
529 | 529 | ||
530 | out: | 530 | out: |
531 | set_cpus_allowed(current, oldmask); | 531 | set_cpus_allowed_ptr(current, &oldmask); |
532 | return rc; | 532 | return rc; |
533 | } | 533 | } |
534 | 534 | ||
@@ -1015,7 +1015,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i | |||
1015 | /* Driver entry point to switch to the target frequency */ | 1015 | /* Driver entry point to switch to the target frequency */ |
1016 | static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) | 1016 | static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) |
1017 | { | 1017 | { |
1018 | cpumask_t oldmask = CPU_MASK_ALL; | 1018 | cpumask_t oldmask; |
1019 | struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); | 1019 | struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); |
1020 | u32 checkfid; | 1020 | u32 checkfid; |
1021 | u32 checkvid; | 1021 | u32 checkvid; |
@@ -1030,7 +1030,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi | |||
1030 | 1030 | ||
1031 | /* only run on specific CPU from here on */ | 1031 | /* only run on specific CPU from here on */ |
1032 | oldmask = current->cpus_allowed; | 1032 | oldmask = current->cpus_allowed; |
1033 | set_cpus_allowed(current, cpumask_of_cpu(pol->cpu)); | 1033 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); |
1034 | 1034 | ||
1035 | if (smp_processor_id() != pol->cpu) { | 1035 | if (smp_processor_id() != pol->cpu) { |
1036 | printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); | 1036 | printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); |
@@ -1085,7 +1085,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi | |||
1085 | ret = 0; | 1085 | ret = 0; |
1086 | 1086 | ||
1087 | err_out: | 1087 | err_out: |
1088 | set_cpus_allowed(current, oldmask); | 1088 | set_cpus_allowed_ptr(current, &oldmask); |
1089 | return ret; | 1089 | return ret; |
1090 | } | 1090 | } |
1091 | 1091 | ||
@@ -1104,7 +1104,7 @@ static int powernowk8_verify(struct cpufreq_policy *pol) | |||
1104 | static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) | 1104 | static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) |
1105 | { | 1105 | { |
1106 | struct powernow_k8_data *data; | 1106 | struct powernow_k8_data *data; |
1107 | cpumask_t oldmask = CPU_MASK_ALL; | 1107 | cpumask_t oldmask; |
1108 | int rc; | 1108 | int rc; |
1109 | 1109 | ||
1110 | if (!cpu_online(pol->cpu)) | 1110 | if (!cpu_online(pol->cpu)) |
@@ -1145,7 +1145,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) | |||
1145 | 1145 | ||
1146 | /* only run on specific CPU from here on */ | 1146 | /* only run on specific CPU from here on */ |
1147 | oldmask = current->cpus_allowed; | 1147 | oldmask = current->cpus_allowed; |
1148 | set_cpus_allowed(current, cpumask_of_cpu(pol->cpu)); | 1148 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); |
1149 | 1149 | ||
1150 | if (smp_processor_id() != pol->cpu) { | 1150 | if (smp_processor_id() != pol->cpu) { |
1151 | printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); | 1151 | printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); |
@@ -1164,7 +1164,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) | |||
1164 | fidvid_msr_init(); | 1164 | fidvid_msr_init(); |
1165 | 1165 | ||
1166 | /* run on any CPU again */ | 1166 | /* run on any CPU again */ |
1167 | set_cpus_allowed(current, oldmask); | 1167 | set_cpus_allowed_ptr(current, &oldmask); |
1168 | 1168 | ||
1169 | if (cpu_family == CPU_HW_PSTATE) | 1169 | if (cpu_family == CPU_HW_PSTATE) |
1170 | pol->cpus = cpumask_of_cpu(pol->cpu); | 1170 | pol->cpus = cpumask_of_cpu(pol->cpu); |
@@ -1205,7 +1205,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) | |||
1205 | return 0; | 1205 | return 0; |
1206 | 1206 | ||
1207 | err_out: | 1207 | err_out: |
1208 | set_cpus_allowed(current, oldmask); | 1208 | set_cpus_allowed_ptr(current, &oldmask); |
1209 | powernow_k8_cpu_exit_acpi(data); | 1209 | powernow_k8_cpu_exit_acpi(data); |
1210 | 1210 | ||
1211 | kfree(data); | 1211 | kfree(data); |
@@ -1242,10 +1242,11 @@ static unsigned int powernowk8_get (unsigned int cpu) | |||
1242 | if (!data) | 1242 | if (!data) |
1243 | return -EINVAL; | 1243 | return -EINVAL; |
1244 | 1244 | ||
1245 | set_cpus_allowed(current, cpumask_of_cpu(cpu)); | 1245 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); |
1246 | if (smp_processor_id() != cpu) { | 1246 | if (smp_processor_id() != cpu) { |
1247 | printk(KERN_ERR PFX "limiting to CPU %d failed in powernowk8_get\n", cpu); | 1247 | printk(KERN_ERR PFX |
1248 | set_cpus_allowed(current, oldmask); | 1248 | "limiting to CPU %d failed in powernowk8_get\n", cpu); |
1249 | set_cpus_allowed_ptr(current, &oldmask); | ||
1249 | return 0; | 1250 | return 0; |
1250 | } | 1251 | } |
1251 | 1252 | ||
@@ -1253,13 +1254,14 @@ static unsigned int powernowk8_get (unsigned int cpu) | |||
1253 | goto out; | 1254 | goto out; |
1254 | 1255 | ||
1255 | if (cpu_family == CPU_HW_PSTATE) | 1256 | if (cpu_family == CPU_HW_PSTATE) |
1256 | khz = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); | 1257 | khz = find_khz_freq_from_pstate(data->powernow_table, |
1258 | data->currpstate); | ||
1257 | else | 1259 | else |
1258 | khz = find_khz_freq_from_fid(data->currfid); | 1260 | khz = find_khz_freq_from_fid(data->currfid); |
1259 | 1261 | ||
1260 | 1262 | ||
1261 | out: | 1263 | out: |
1262 | set_cpus_allowed(current, oldmask); | 1264 | set_cpus_allowed_ptr(current, &oldmask); |
1263 | return khz; | 1265 | return khz; |
1264 | } | 1266 | } |
1265 | 1267 | ||
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 3031f1196192..908dd347c67e 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | |||
@@ -315,7 +315,7 @@ static unsigned int get_cur_freq(unsigned int cpu) | |||
315 | cpumask_t saved_mask; | 315 | cpumask_t saved_mask; |
316 | 316 | ||
317 | saved_mask = current->cpus_allowed; | 317 | saved_mask = current->cpus_allowed; |
318 | set_cpus_allowed(current, cpumask_of_cpu(cpu)); | 318 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); |
319 | if (smp_processor_id() != cpu) | 319 | if (smp_processor_id() != cpu) |
320 | return 0; | 320 | return 0; |
321 | 321 | ||
@@ -333,7 +333,7 @@ static unsigned int get_cur_freq(unsigned int cpu) | |||
333 | clock_freq = extract_clock(l, cpu, 1); | 333 | clock_freq = extract_clock(l, cpu, 1); |
334 | } | 334 | } |
335 | 335 | ||
336 | set_cpus_allowed(current, saved_mask); | 336 | set_cpus_allowed_ptr(current, &saved_mask); |
337 | return clock_freq; | 337 | return clock_freq; |
338 | } | 338 | } |
339 | 339 | ||
@@ -487,7 +487,7 @@ static int centrino_target (struct cpufreq_policy *policy, | |||
487 | else | 487 | else |
488 | cpu_set(j, set_mask); | 488 | cpu_set(j, set_mask); |
489 | 489 | ||
490 | set_cpus_allowed(current, set_mask); | 490 | set_cpus_allowed_ptr(current, &set_mask); |
491 | preempt_disable(); | 491 | preempt_disable(); |
492 | if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) { | 492 | if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) { |
493 | dprintk("couldn't limit to CPUs in this domain\n"); | 493 | dprintk("couldn't limit to CPUs in this domain\n"); |
@@ -555,7 +555,8 @@ static int centrino_target (struct cpufreq_policy *policy, | |||
555 | 555 | ||
556 | if (!cpus_empty(covered_cpus)) { | 556 | if (!cpus_empty(covered_cpus)) { |
557 | for_each_cpu_mask(j, covered_cpus) { | 557 | for_each_cpu_mask(j, covered_cpus) { |
558 | set_cpus_allowed(current, cpumask_of_cpu(j)); | 558 | set_cpus_allowed_ptr(current, |
559 | &cpumask_of_cpu(j)); | ||
559 | wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); | 560 | wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); |
560 | } | 561 | } |
561 | } | 562 | } |
@@ -569,12 +570,12 @@ static int centrino_target (struct cpufreq_policy *policy, | |||
569 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); | 570 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); |
570 | } | 571 | } |
571 | } | 572 | } |
572 | set_cpus_allowed(current, saved_mask); | 573 | set_cpus_allowed_ptr(current, &saved_mask); |
573 | return 0; | 574 | return 0; |
574 | 575 | ||
575 | migrate_end: | 576 | migrate_end: |
576 | preempt_enable(); | 577 | preempt_enable(); |
577 | set_cpus_allowed(current, saved_mask); | 578 | set_cpus_allowed_ptr(current, &saved_mask); |
578 | return 0; | 579 | return 0; |
579 | } | 580 | } |
580 | 581 | ||
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 14d68aa301ee..1b50244b1fdf 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | |||
@@ -229,22 +229,22 @@ static unsigned int speedstep_detect_chipset (void) | |||
229 | return 0; | 229 | return 0; |
230 | } | 230 | } |
231 | 231 | ||
232 | static unsigned int _speedstep_get(cpumask_t cpus) | 232 | static unsigned int _speedstep_get(const cpumask_t *cpus) |
233 | { | 233 | { |
234 | unsigned int speed; | 234 | unsigned int speed; |
235 | cpumask_t cpus_allowed; | 235 | cpumask_t cpus_allowed; |
236 | 236 | ||
237 | cpus_allowed = current->cpus_allowed; | 237 | cpus_allowed = current->cpus_allowed; |
238 | set_cpus_allowed(current, cpus); | 238 | set_cpus_allowed_ptr(current, cpus); |
239 | speed = speedstep_get_processor_frequency(speedstep_processor); | 239 | speed = speedstep_get_processor_frequency(speedstep_processor); |
240 | set_cpus_allowed(current, cpus_allowed); | 240 | set_cpus_allowed_ptr(current, &cpus_allowed); |
241 | dprintk("detected %u kHz as current frequency\n", speed); | 241 | dprintk("detected %u kHz as current frequency\n", speed); |
242 | return speed; | 242 | return speed; |
243 | } | 243 | } |
244 | 244 | ||
245 | static unsigned int speedstep_get(unsigned int cpu) | 245 | static unsigned int speedstep_get(unsigned int cpu) |
246 | { | 246 | { |
247 | return _speedstep_get(cpumask_of_cpu(cpu)); | 247 | return _speedstep_get(&cpumask_of_cpu(cpu)); |
248 | } | 248 | } |
249 | 249 | ||
250 | /** | 250 | /** |
@@ -267,7 +267,7 @@ static int speedstep_target (struct cpufreq_policy *policy, | |||
267 | if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate)) | 267 | if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate)) |
268 | return -EINVAL; | 268 | return -EINVAL; |
269 | 269 | ||
270 | freqs.old = _speedstep_get(policy->cpus); | 270 | freqs.old = _speedstep_get(&policy->cpus); |
271 | freqs.new = speedstep_freqs[newstate].frequency; | 271 | freqs.new = speedstep_freqs[newstate].frequency; |
272 | freqs.cpu = policy->cpu; | 272 | freqs.cpu = policy->cpu; |
273 | 273 | ||
@@ -285,12 +285,12 @@ static int speedstep_target (struct cpufreq_policy *policy, | |||
285 | } | 285 | } |
286 | 286 | ||
287 | /* switch to physical CPU where state is to be changed */ | 287 | /* switch to physical CPU where state is to be changed */ |
288 | set_cpus_allowed(current, policy->cpus); | 288 | set_cpus_allowed_ptr(current, &policy->cpus); |
289 | 289 | ||
290 | speedstep_set_state(newstate); | 290 | speedstep_set_state(newstate); |
291 | 291 | ||
292 | /* allow to be run on all CPUs */ | 292 | /* allow to be run on all CPUs */ |
293 | set_cpus_allowed(current, cpus_allowed); | 293 | set_cpus_allowed_ptr(current, &cpus_allowed); |
294 | 294 | ||
295 | for_each_cpu_mask(i, policy->cpus) { | 295 | for_each_cpu_mask(i, policy->cpus) { |
296 | freqs.cpu = i; | 296 | freqs.cpu = i; |
@@ -326,7 +326,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy) | |||
326 | #endif | 326 | #endif |
327 | 327 | ||
328 | cpus_allowed = current->cpus_allowed; | 328 | cpus_allowed = current->cpus_allowed; |
329 | set_cpus_allowed(current, policy->cpus); | 329 | set_cpus_allowed_ptr(current, &policy->cpus); |
330 | 330 | ||
331 | /* detect low and high frequency and transition latency */ | 331 | /* detect low and high frequency and transition latency */ |
332 | result = speedstep_get_freqs(speedstep_processor, | 332 | result = speedstep_get_freqs(speedstep_processor, |
@@ -334,12 +334,12 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy) | |||
334 | &speedstep_freqs[SPEEDSTEP_HIGH].frequency, | 334 | &speedstep_freqs[SPEEDSTEP_HIGH].frequency, |
335 | &policy->cpuinfo.transition_latency, | 335 | &policy->cpuinfo.transition_latency, |
336 | &speedstep_set_state); | 336 | &speedstep_set_state); |
337 | set_cpus_allowed(current, cpus_allowed); | 337 | set_cpus_allowed_ptr(current, &cpus_allowed); |
338 | if (result) | 338 | if (result) |
339 | return result; | 339 | return result; |
340 | 340 | ||
341 | /* get current speed setting */ | 341 | /* get current speed setting */ |
342 | speed = _speedstep_get(policy->cpus); | 342 | speed = _speedstep_get(&policy->cpus); |
343 | if (!speed) | 343 | if (!speed) |
344 | return -EIO; | 344 | return -EIO; |
345 | 345 | ||
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 1b889860eb73..26d615dcb149 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c | |||
@@ -129,7 +129,7 @@ struct _cpuid4_info { | |||
129 | union _cpuid4_leaf_ebx ebx; | 129 | union _cpuid4_leaf_ebx ebx; |
130 | union _cpuid4_leaf_ecx ecx; | 130 | union _cpuid4_leaf_ecx ecx; |
131 | unsigned long size; | 131 | unsigned long size; |
132 | cpumask_t shared_cpu_map; | 132 | cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ |
133 | }; | 133 | }; |
134 | 134 | ||
135 | unsigned short num_cache_leaves; | 135 | unsigned short num_cache_leaves; |
@@ -451,8 +451,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) | |||
451 | } | 451 | } |
452 | 452 | ||
453 | /* pointer to _cpuid4_info array (for each cache leaf) */ | 453 | /* pointer to _cpuid4_info array (for each cache leaf) */ |
454 | static struct _cpuid4_info *cpuid4_info[NR_CPUS]; | 454 | static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); |
455 | #define CPUID4_INFO_IDX(x,y) (&((cpuid4_info[x])[y])) | 455 | #define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) |
456 | 456 | ||
457 | #ifdef CONFIG_SMP | 457 | #ifdef CONFIG_SMP |
458 | static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) | 458 | static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) |
@@ -474,7 +474,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) | |||
474 | if (cpu_data(i).apicid >> index_msb == | 474 | if (cpu_data(i).apicid >> index_msb == |
475 | c->apicid >> index_msb) { | 475 | c->apicid >> index_msb) { |
476 | cpu_set(i, this_leaf->shared_cpu_map); | 476 | cpu_set(i, this_leaf->shared_cpu_map); |
477 | if (i != cpu && cpuid4_info[i]) { | 477 | if (i != cpu && per_cpu(cpuid4_info, i)) { |
478 | sibling_leaf = CPUID4_INFO_IDX(i, index); | 478 | sibling_leaf = CPUID4_INFO_IDX(i, index); |
479 | cpu_set(cpu, sibling_leaf->shared_cpu_map); | 479 | cpu_set(cpu, sibling_leaf->shared_cpu_map); |
480 | } | 480 | } |
@@ -505,8 +505,8 @@ static void __cpuinit free_cache_attributes(unsigned int cpu) | |||
505 | for (i = 0; i < num_cache_leaves; i++) | 505 | for (i = 0; i < num_cache_leaves; i++) |
506 | cache_remove_shared_cpu_map(cpu, i); | 506 | cache_remove_shared_cpu_map(cpu, i); |
507 | 507 | ||
508 | kfree(cpuid4_info[cpu]); | 508 | kfree(per_cpu(cpuid4_info, cpu)); |
509 | cpuid4_info[cpu] = NULL; | 509 | per_cpu(cpuid4_info, cpu) = NULL; |
510 | } | 510 | } |
511 | 511 | ||
512 | static int __cpuinit detect_cache_attributes(unsigned int cpu) | 512 | static int __cpuinit detect_cache_attributes(unsigned int cpu) |
@@ -519,13 +519,13 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) | |||
519 | if (num_cache_leaves == 0) | 519 | if (num_cache_leaves == 0) |
520 | return -ENOENT; | 520 | return -ENOENT; |
521 | 521 | ||
522 | cpuid4_info[cpu] = kzalloc( | 522 | per_cpu(cpuid4_info, cpu) = kzalloc( |
523 | sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); | 523 | sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); |
524 | if (cpuid4_info[cpu] == NULL) | 524 | if (per_cpu(cpuid4_info, cpu) == NULL) |
525 | return -ENOMEM; | 525 | return -ENOMEM; |
526 | 526 | ||
527 | oldmask = current->cpus_allowed; | 527 | oldmask = current->cpus_allowed; |
528 | retval = set_cpus_allowed(current, cpumask_of_cpu(cpu)); | 528 | retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); |
529 | if (retval) | 529 | if (retval) |
530 | goto out; | 530 | goto out; |
531 | 531 | ||
@@ -542,12 +542,12 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) | |||
542 | } | 542 | } |
543 | cache_shared_cpu_map_setup(cpu, j); | 543 | cache_shared_cpu_map_setup(cpu, j); |
544 | } | 544 | } |
545 | set_cpus_allowed(current, oldmask); | 545 | set_cpus_allowed_ptr(current, &oldmask); |
546 | 546 | ||
547 | out: | 547 | out: |
548 | if (retval) { | 548 | if (retval) { |
549 | kfree(cpuid4_info[cpu]); | 549 | kfree(per_cpu(cpuid4_info, cpu)); |
550 | cpuid4_info[cpu] = NULL; | 550 | per_cpu(cpuid4_info, cpu) = NULL; |
551 | } | 551 | } |
552 | 552 | ||
553 | return retval; | 553 | return retval; |
@@ -561,7 +561,7 @@ out: | |||
561 | extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ | 561 | extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ |
562 | 562 | ||
563 | /* pointer to kobject for cpuX/cache */ | 563 | /* pointer to kobject for cpuX/cache */ |
564 | static struct kobject * cache_kobject[NR_CPUS]; | 564 | static DEFINE_PER_CPU(struct kobject *, cache_kobject); |
565 | 565 | ||
566 | struct _index_kobject { | 566 | struct _index_kobject { |
567 | struct kobject kobj; | 567 | struct kobject kobj; |
@@ -570,8 +570,8 @@ struct _index_kobject { | |||
570 | }; | 570 | }; |
571 | 571 | ||
572 | /* pointer to array of kobjects for cpuX/cache/indexY */ | 572 | /* pointer to array of kobjects for cpuX/cache/indexY */ |
573 | static struct _index_kobject *index_kobject[NR_CPUS]; | 573 | static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); |
574 | #define INDEX_KOBJECT_PTR(x,y) (&((index_kobject[x])[y])) | 574 | #define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) |
575 | 575 | ||
576 | #define show_one_plus(file_name, object, val) \ | 576 | #define show_one_plus(file_name, object, val) \ |
577 | static ssize_t show_##file_name \ | 577 | static ssize_t show_##file_name \ |
@@ -591,11 +591,32 @@ static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) | |||
591 | return sprintf (buf, "%luK\n", this_leaf->size / 1024); | 591 | return sprintf (buf, "%luK\n", this_leaf->size / 1024); |
592 | } | 592 | } |
593 | 593 | ||
594 | static ssize_t show_shared_cpu_map(struct _cpuid4_info *this_leaf, char *buf) | 594 | static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, |
595 | int type, char *buf) | ||
595 | { | 596 | { |
596 | char mask_str[NR_CPUS]; | 597 | ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; |
597 | cpumask_scnprintf(mask_str, NR_CPUS, this_leaf->shared_cpu_map); | 598 | int n = 0; |
598 | return sprintf(buf, "%s\n", mask_str); | 599 | |
600 | if (len > 1) { | ||
601 | cpumask_t *mask = &this_leaf->shared_cpu_map; | ||
602 | |||
603 | n = type? | ||
604 | cpulist_scnprintf(buf, len-2, *mask): | ||
605 | cpumask_scnprintf(buf, len-2, *mask); | ||
606 | buf[n++] = '\n'; | ||
607 | buf[n] = '\0'; | ||
608 | } | ||
609 | return n; | ||
610 | } | ||
611 | |||
612 | static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf) | ||
613 | { | ||
614 | return show_shared_cpu_map_func(leaf, 0, buf); | ||
615 | } | ||
616 | |||
617 | static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf) | ||
618 | { | ||
619 | return show_shared_cpu_map_func(leaf, 1, buf); | ||
599 | } | 620 | } |
600 | 621 | ||
601 | static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { | 622 | static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { |
@@ -633,6 +654,7 @@ define_one_ro(ways_of_associativity); | |||
633 | define_one_ro(number_of_sets); | 654 | define_one_ro(number_of_sets); |
634 | define_one_ro(size); | 655 | define_one_ro(size); |
635 | define_one_ro(shared_cpu_map); | 656 | define_one_ro(shared_cpu_map); |
657 | define_one_ro(shared_cpu_list); | ||
636 | 658 | ||
637 | static struct attribute * default_attrs[] = { | 659 | static struct attribute * default_attrs[] = { |
638 | &type.attr, | 660 | &type.attr, |
@@ -643,6 +665,7 @@ static struct attribute * default_attrs[] = { | |||
643 | &number_of_sets.attr, | 665 | &number_of_sets.attr, |
644 | &size.attr, | 666 | &size.attr, |
645 | &shared_cpu_map.attr, | 667 | &shared_cpu_map.attr, |
668 | &shared_cpu_list.attr, | ||
646 | NULL | 669 | NULL |
647 | }; | 670 | }; |
648 | 671 | ||
@@ -684,10 +707,10 @@ static struct kobj_type ktype_percpu_entry = { | |||
684 | 707 | ||
685 | static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu) | 708 | static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu) |
686 | { | 709 | { |
687 | kfree(cache_kobject[cpu]); | 710 | kfree(per_cpu(cache_kobject, cpu)); |
688 | kfree(index_kobject[cpu]); | 711 | kfree(per_cpu(index_kobject, cpu)); |
689 | cache_kobject[cpu] = NULL; | 712 | per_cpu(cache_kobject, cpu) = NULL; |
690 | index_kobject[cpu] = NULL; | 713 | per_cpu(index_kobject, cpu) = NULL; |
691 | free_cache_attributes(cpu); | 714 | free_cache_attributes(cpu); |
692 | } | 715 | } |
693 | 716 | ||
@@ -703,13 +726,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu) | |||
703 | return err; | 726 | return err; |
704 | 727 | ||
705 | /* Allocate all required memory */ | 728 | /* Allocate all required memory */ |
706 | cache_kobject[cpu] = kzalloc(sizeof(struct kobject), GFP_KERNEL); | 729 | per_cpu(cache_kobject, cpu) = |
707 | if (unlikely(cache_kobject[cpu] == NULL)) | 730 | kzalloc(sizeof(struct kobject), GFP_KERNEL); |
731 | if (unlikely(per_cpu(cache_kobject, cpu) == NULL)) | ||
708 | goto err_out; | 732 | goto err_out; |
709 | 733 | ||
710 | index_kobject[cpu] = kzalloc( | 734 | per_cpu(index_kobject, cpu) = kzalloc( |
711 | sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); | 735 | sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); |
712 | if (unlikely(index_kobject[cpu] == NULL)) | 736 | if (unlikely(per_cpu(index_kobject, cpu) == NULL)) |
713 | goto err_out; | 737 | goto err_out; |
714 | 738 | ||
715 | return 0; | 739 | return 0; |
@@ -733,7 +757,8 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) | |||
733 | if (unlikely(retval < 0)) | 757 | if (unlikely(retval < 0)) |
734 | return retval; | 758 | return retval; |
735 | 759 | ||
736 | retval = kobject_init_and_add(cache_kobject[cpu], &ktype_percpu_entry, | 760 | retval = kobject_init_and_add(per_cpu(cache_kobject, cpu), |
761 | &ktype_percpu_entry, | ||
737 | &sys_dev->kobj, "%s", "cache"); | 762 | &sys_dev->kobj, "%s", "cache"); |
738 | if (retval < 0) { | 763 | if (retval < 0) { |
739 | cpuid4_cache_sysfs_exit(cpu); | 764 | cpuid4_cache_sysfs_exit(cpu); |
@@ -745,13 +770,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) | |||
745 | this_object->cpu = cpu; | 770 | this_object->cpu = cpu; |
746 | this_object->index = i; | 771 | this_object->index = i; |
747 | retval = kobject_init_and_add(&(this_object->kobj), | 772 | retval = kobject_init_and_add(&(this_object->kobj), |
748 | &ktype_cache, cache_kobject[cpu], | 773 | &ktype_cache, |
774 | per_cpu(cache_kobject, cpu), | ||
749 | "index%1lu", i); | 775 | "index%1lu", i); |
750 | if (unlikely(retval)) { | 776 | if (unlikely(retval)) { |
751 | for (j = 0; j < i; j++) { | 777 | for (j = 0; j < i; j++) { |
752 | kobject_put(&(INDEX_KOBJECT_PTR(cpu,j)->kobj)); | 778 | kobject_put(&(INDEX_KOBJECT_PTR(cpu,j)->kobj)); |
753 | } | 779 | } |
754 | kobject_put(cache_kobject[cpu]); | 780 | kobject_put(per_cpu(cache_kobject, cpu)); |
755 | cpuid4_cache_sysfs_exit(cpu); | 781 | cpuid4_cache_sysfs_exit(cpu); |
756 | break; | 782 | break; |
757 | } | 783 | } |
@@ -760,7 +786,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) | |||
760 | if (!retval) | 786 | if (!retval) |
761 | cpu_set(cpu, cache_dev_map); | 787 | cpu_set(cpu, cache_dev_map); |
762 | 788 | ||
763 | kobject_uevent(cache_kobject[cpu], KOBJ_ADD); | 789 | kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); |
764 | return retval; | 790 | return retval; |
765 | } | 791 | } |
766 | 792 | ||
@@ -769,7 +795,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) | |||
769 | unsigned int cpu = sys_dev->id; | 795 | unsigned int cpu = sys_dev->id; |
770 | unsigned long i; | 796 | unsigned long i; |
771 | 797 | ||
772 | if (cpuid4_info[cpu] == NULL) | 798 | if (per_cpu(cpuid4_info, cpu) == NULL) |
773 | return; | 799 | return; |
774 | if (!cpu_isset(cpu, cache_dev_map)) | 800 | if (!cpu_isset(cpu, cache_dev_map)) |
775 | return; | 801 | return; |
@@ -777,7 +803,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) | |||
777 | 803 | ||
778 | for (i = 0; i < num_cache_leaves; i++) | 804 | for (i = 0; i < num_cache_leaves; i++) |
779 | kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); | 805 | kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); |
780 | kobject_put(cache_kobject[cpu]); | 806 | kobject_put(per_cpu(cache_kobject, cpu)); |
781 | cpuid4_cache_sysfs_exit(cpu); | 807 | cpuid4_cache_sysfs_exit(cpu); |
782 | } | 808 | } |
783 | 809 | ||
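
The intel_cacheinfo.c changes above also convert NR_CPUS-sized pointer arrays
(cpuid4_info, cache_kobject, index_kobject) into per-CPU variables, so the storage
scales with the CPUs that actually exist rather than the compile-time maximum. A
stripped-down sketch of the same conversion (struct widget and the helper names are
invented for illustration, not taken from the patch) is:

    /* Before: static struct widget *widgets[NR_CPUS];
     * After:  a per-CPU pointer, indexed with per_cpu(widgets, cpu).
     */
    #include <linux/errno.h>
    #include <linux/percpu.h>
    #include <linux/slab.h>

    struct widget {
            int value;
    };

    static DEFINE_PER_CPU(struct widget *, widgets);

    static int widget_init(unsigned int cpu)
    {
            struct widget *w = kzalloc(sizeof(*w), GFP_KERNEL);

            if (!w)
                    return -ENOMEM;
            per_cpu(widgets, cpu) = w;      /* replaces the old widgets[cpu] indexing */
            return 0;
    }

    static void widget_exit(unsigned int cpu)
    {
            kfree(per_cpu(widgets, cpu));
            per_cpu(widgets, cpu) = NULL;
    }
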
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 32671da8184e..7c9a813e1193 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c | |||
@@ -251,18 +251,18 @@ struct threshold_attr { | |||
251 | ssize_t(*store) (struct threshold_block *, const char *, size_t count); | 251 | ssize_t(*store) (struct threshold_block *, const char *, size_t count); |
252 | }; | 252 | }; |
253 | 253 | ||
254 | static cpumask_t affinity_set(unsigned int cpu) | 254 | static void affinity_set(unsigned int cpu, cpumask_t *oldmask, |
255 | cpumask_t *newmask) | ||
255 | { | 256 | { |
256 | cpumask_t oldmask = current->cpus_allowed; | 257 | *oldmask = current->cpus_allowed; |
257 | cpumask_t newmask = CPU_MASK_NONE; | 258 | cpus_clear(*newmask); |
258 | cpu_set(cpu, newmask); | 259 | cpu_set(cpu, *newmask); |
259 | set_cpus_allowed(current, newmask); | 260 | set_cpus_allowed_ptr(current, newmask); |
260 | return oldmask; | ||
261 | } | 261 | } |
262 | 262 | ||
263 | static void affinity_restore(cpumask_t oldmask) | 263 | static void affinity_restore(const cpumask_t *oldmask) |
264 | { | 264 | { |
265 | set_cpus_allowed(current, oldmask); | 265 | set_cpus_allowed_ptr(current, oldmask); |
266 | } | 266 | } |
267 | 267 | ||
268 | #define SHOW_FIELDS(name) \ | 268 | #define SHOW_FIELDS(name) \ |
@@ -277,15 +277,15 @@ static ssize_t store_interrupt_enable(struct threshold_block *b, | |||
277 | const char *buf, size_t count) | 277 | const char *buf, size_t count) |
278 | { | 278 | { |
279 | char *end; | 279 | char *end; |
280 | cpumask_t oldmask; | 280 | cpumask_t oldmask, newmask; |
281 | unsigned long new = simple_strtoul(buf, &end, 0); | 281 | unsigned long new = simple_strtoul(buf, &end, 0); |
282 | if (end == buf) | 282 | if (end == buf) |
283 | return -EINVAL; | 283 | return -EINVAL; |
284 | b->interrupt_enable = !!new; | 284 | b->interrupt_enable = !!new; |
285 | 285 | ||
286 | oldmask = affinity_set(b->cpu); | 286 | affinity_set(b->cpu, &oldmask, &newmask); |
287 | threshold_restart_bank(b, 0, 0); | 287 | threshold_restart_bank(b, 0, 0); |
288 | affinity_restore(oldmask); | 288 | affinity_restore(&oldmask); |
289 | 289 | ||
290 | return end - buf; | 290 | return end - buf; |
291 | } | 291 | } |
@@ -294,7 +294,7 @@ static ssize_t store_threshold_limit(struct threshold_block *b, | |||
294 | const char *buf, size_t count) | 294 | const char *buf, size_t count) |
295 | { | 295 | { |
296 | char *end; | 296 | char *end; |
297 | cpumask_t oldmask; | 297 | cpumask_t oldmask, newmask; |
298 | u16 old; | 298 | u16 old; |
299 | unsigned long new = simple_strtoul(buf, &end, 0); | 299 | unsigned long new = simple_strtoul(buf, &end, 0); |
300 | if (end == buf) | 300 | if (end == buf) |
@@ -306,9 +306,9 @@ static ssize_t store_threshold_limit(struct threshold_block *b, | |||
306 | old = b->threshold_limit; | 306 | old = b->threshold_limit; |
307 | b->threshold_limit = new; | 307 | b->threshold_limit = new; |
308 | 308 | ||
309 | oldmask = affinity_set(b->cpu); | 309 | affinity_set(b->cpu, &oldmask, &newmask); |
310 | threshold_restart_bank(b, 0, old); | 310 | threshold_restart_bank(b, 0, old); |
311 | affinity_restore(oldmask); | 311 | affinity_restore(&oldmask); |
312 | 312 | ||
313 | return end - buf; | 313 | return end - buf; |
314 | } | 314 | } |
@@ -316,10 +316,10 @@ static ssize_t store_threshold_limit(struct threshold_block *b, | |||
316 | static ssize_t show_error_count(struct threshold_block *b, char *buf) | 316 | static ssize_t show_error_count(struct threshold_block *b, char *buf) |
317 | { | 317 | { |
318 | u32 high, low; | 318 | u32 high, low; |
319 | cpumask_t oldmask; | 319 | cpumask_t oldmask, newmask; |
320 | oldmask = affinity_set(b->cpu); | 320 | affinity_set(b->cpu, &oldmask, &newmask); |
321 | rdmsr(b->address, low, high); | 321 | rdmsr(b->address, low, high); |
322 | affinity_restore(oldmask); | 322 | affinity_restore(&oldmask); |
323 | return sprintf(buf, "%x\n", | 323 | return sprintf(buf, "%x\n", |
324 | (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); | 324 | (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); |
325 | } | 325 | } |
@@ -327,10 +327,10 @@ static ssize_t show_error_count(struct threshold_block *b, char *buf) | |||
327 | static ssize_t store_error_count(struct threshold_block *b, | 327 | static ssize_t store_error_count(struct threshold_block *b, |
328 | const char *buf, size_t count) | 328 | const char *buf, size_t count) |
329 | { | 329 | { |
330 | cpumask_t oldmask; | 330 | cpumask_t oldmask, newmask; |
331 | oldmask = affinity_set(b->cpu); | 331 | affinity_set(b->cpu, &oldmask, &newmask); |
332 | threshold_restart_bank(b, 1, 0); | 332 | threshold_restart_bank(b, 1, 0); |
333 | affinity_restore(oldmask); | 333 | affinity_restore(&oldmask); |
334 | return 1; | 334 | return 1; |
335 | } | 335 | } |
336 | 336 | ||
@@ -468,7 +468,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) | |||
468 | { | 468 | { |
469 | int i, err = 0; | 469 | int i, err = 0; |
470 | struct threshold_bank *b = NULL; | 470 | struct threshold_bank *b = NULL; |
471 | cpumask_t oldmask = CPU_MASK_NONE; | 471 | cpumask_t oldmask, newmask; |
472 | char name[32]; | 472 | char name[32]; |
473 | 473 | ||
474 | sprintf(name, "threshold_bank%i", bank); | 474 | sprintf(name, "threshold_bank%i", bank); |
@@ -519,10 +519,10 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) | |||
519 | 519 | ||
520 | per_cpu(threshold_banks, cpu)[bank] = b; | 520 | per_cpu(threshold_banks, cpu)[bank] = b; |
521 | 521 | ||
522 | oldmask = affinity_set(cpu); | 522 | affinity_set(cpu, &oldmask, &newmask); |
523 | err = allocate_threshold_blocks(cpu, bank, 0, | 523 | err = allocate_threshold_blocks(cpu, bank, 0, |
524 | MSR_IA32_MC0_MISC + bank * 4); | 524 | MSR_IA32_MC0_MISC + bank * 4); |
525 | affinity_restore(oldmask); | 525 | affinity_restore(&oldmask); |
526 | 526 | ||
527 | if (err) | 527 | if (err) |
528 | goto out_free; | 528 | goto out_free; |
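Note: the mce_amd_64.c hunks above switch affinity_set()/affinity_restore() to a pointer-based calling convention so that no cpumask_t is ever returned by value on the stack. The body of the new affinity_set() is not part of this hunk; the sketch below is only an assumed shape, included to make the converted call sites easier to read.

        /* Sketch (assumed, not shown in this hunk): build the target mask in
         * caller-provided storage and migrate via the pointer-based API. */
        static void affinity_set(unsigned int cpu, cpumask_t *oldmask, cpumask_t *newmask)
        {
                *oldmask = current->cpus_allowed;       /* remember the original affinity */
                cpus_clear(*newmask);
                cpu_set(cpu, *newmask);                 /* single-CPU mask, built in place */
                set_cpus_allowed_ptr(current, newmask); /* migrate onto the target CPU */
        }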
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index b54464b26658..9ba11d07920f 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c | |||
@@ -785,7 +785,7 @@ static void __clear_irq_vector(int irq) | |||
785 | per_cpu(vector_irq, cpu)[vector] = -1; | 785 | per_cpu(vector_irq, cpu)[vector] = -1; |
786 | 786 | ||
787 | cfg->vector = 0; | 787 | cfg->vector = 0; |
788 | cfg->domain = CPU_MASK_NONE; | 788 | cpus_clear(cfg->domain); |
789 | } | 789 | } |
790 | 790 | ||
791 | void __setup_vector_irq(int cpu) | 791 | void __setup_vector_irq(int cpu) |
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 25cf6dee4e56..69729e38b78a 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c | |||
@@ -402,7 +402,7 @@ static int do_microcode_update (void) | |||
402 | 402 | ||
403 | if (!uci->valid) | 403 | if (!uci->valid) |
404 | continue; | 404 | continue; |
405 | set_cpus_allowed(current, cpumask_of_cpu(cpu)); | 405 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); |
406 | error = get_maching_microcode(new_mc, cpu); | 406 | error = get_maching_microcode(new_mc, cpu); |
407 | if (error < 0) | 407 | if (error < 0) |
408 | goto out; | 408 | goto out; |
@@ -416,7 +416,7 @@ out: | |||
416 | vfree(new_mc); | 416 | vfree(new_mc); |
417 | if (cursor < 0) | 417 | if (cursor < 0) |
418 | error = cursor; | 418 | error = cursor; |
419 | set_cpus_allowed(current, old); | 419 | set_cpus_allowed_ptr(current, &old); |
420 | return error; | 420 | return error; |
421 | } | 421 | } |
422 | 422 | ||
@@ -579,7 +579,7 @@ static int apply_microcode_check_cpu(int cpu) | |||
579 | return 0; | 579 | return 0; |
580 | 580 | ||
581 | old = current->cpus_allowed; | 581 | old = current->cpus_allowed; |
582 | set_cpus_allowed(current, cpumask_of_cpu(cpu)); | 582 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); |
583 | 583 | ||
584 | /* Check if the microcode we have in memory matches the CPU */ | 584 | /* Check if the microcode we have in memory matches the CPU */ |
585 | if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || | 585 | if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || |
@@ -610,7 +610,7 @@ static int apply_microcode_check_cpu(int cpu) | |||
610 | " sig=0x%x, pf=0x%x, rev=0x%x\n", | 610 | " sig=0x%x, pf=0x%x, rev=0x%x\n", |
611 | cpu, uci->sig, uci->pf, uci->rev); | 611 | cpu, uci->sig, uci->pf, uci->rev); |
612 | 612 | ||
613 | set_cpus_allowed(current, old); | 613 | set_cpus_allowed_ptr(current, &old); |
614 | return err; | 614 | return err; |
615 | } | 615 | } |
616 | 616 | ||
@@ -621,13 +621,13 @@ static void microcode_init_cpu(int cpu, int resume) | |||
621 | 621 | ||
622 | old = current->cpus_allowed; | 622 | old = current->cpus_allowed; |
623 | 623 | ||
624 | set_cpus_allowed(current, cpumask_of_cpu(cpu)); | 624 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); |
625 | mutex_lock(&microcode_mutex); | 625 | mutex_lock(&microcode_mutex); |
626 | collect_cpu_info(cpu); | 626 | collect_cpu_info(cpu); |
627 | if (uci->valid && system_state == SYSTEM_RUNNING && !resume) | 627 | if (uci->valid && system_state == SYSTEM_RUNNING && !resume) |
628 | cpu_request_microcode(cpu); | 628 | cpu_request_microcode(cpu); |
629 | mutex_unlock(&microcode_mutex); | 629 | mutex_unlock(&microcode_mutex); |
630 | set_cpus_allowed(current, old); | 630 | set_cpus_allowed_ptr(current, &old); |
631 | } | 631 | } |
632 | 632 | ||
633 | static void microcode_fini_cpu(int cpu) | 633 | static void microcode_fini_cpu(int cpu) |
@@ -657,14 +657,14 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) | |||
657 | old = current->cpus_allowed; | 657 | old = current->cpus_allowed; |
658 | 658 | ||
659 | get_online_cpus(); | 659 | get_online_cpus(); |
660 | set_cpus_allowed(current, cpumask_of_cpu(cpu)); | 660 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); |
661 | 661 | ||
662 | mutex_lock(&microcode_mutex); | 662 | mutex_lock(&microcode_mutex); |
663 | if (uci->valid) | 663 | if (uci->valid) |
664 | err = cpu_request_microcode(cpu); | 664 | err = cpu_request_microcode(cpu); |
665 | mutex_unlock(&microcode_mutex); | 665 | mutex_unlock(&microcode_mutex); |
666 | put_online_cpus(); | 666 | put_online_cpus(); |
667 | set_cpus_allowed(current, old); | 667 | set_cpus_allowed_ptr(current, &old); |
668 | } | 668 | } |
669 | if (err) | 669 | if (err) |
670 | return err; | 670 | return err; |
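Note: the microcode.c conversion is mechanical — every call that passed a cpumask_t by value now passes a pointer. The recurring save/pin/restore pattern at these call sites is sketched below as an illustrative helper (run_on_cpu_example is not part of the patch); taking &cpumask_of_cpu(cpu) is only valid because of the include/linux/cpumask.h change further down in this diff.

        static int run_on_cpu_example(int cpu)
        {
                cpumask_t old = current->cpus_allowed;                  /* save caller affinity */

                set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));    /* pin to 'cpu' */
                /* ... per-CPU work: MSR access, microcode load, ... */
                set_cpus_allowed_ptr(current, &old);                    /* restore */
                return 0;
        }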
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 9692202d3bfb..19c9386ac118 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c | |||
@@ -420,7 +420,7 @@ static void native_machine_shutdown(void) | |||
420 | reboot_cpu_id = smp_processor_id(); | 420 | reboot_cpu_id = smp_processor_id(); |
421 | 421 | ||
422 | /* Make certain I only run on the appropriate processor */ | 422 | /* Make certain I only run on the appropriate processor */ |
423 | set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); | 423 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(reboot_cpu_id)); |
424 | 424 | ||
425 | /* O.K Now that I'm on the appropriate processor, | 425 | /* O.K Now that I'm on the appropriate processor, |
426 | * stop all of the others. | 426 | * stop all of the others. |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ed157c90412e..0d1f44ae6eea 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -54,6 +54,24 @@ static void __init setup_per_cpu_maps(void) | |||
54 | #endif | 54 | #endif |
55 | } | 55 | } |
56 | 56 | ||
57 | #ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP | ||
58 | cpumask_t *cpumask_of_cpu_map __read_mostly; | ||
59 | EXPORT_SYMBOL(cpumask_of_cpu_map); | ||
60 | |||
61 | /* requires nr_cpu_ids to be initialized */ | ||
62 | static void __init setup_cpumask_of_cpu(void) | ||
63 | { | ||
64 | int i; | ||
65 | |||
66 | /* alloc_bootmem zeroes memory */ | ||
67 | cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids); | ||
68 | for (i = 0; i < nr_cpu_ids; i++) | ||
69 | cpu_set(i, cpumask_of_cpu_map[i]); | ||
70 | } | ||
71 | #else | ||
72 | static inline void setup_cpumask_of_cpu(void) { } | ||
73 | #endif | ||
74 | |||
57 | #ifdef CONFIG_X86_32 | 75 | #ifdef CONFIG_X86_32 |
58 | /* | 76 | /* |
59 | * Great future not-so-futuristic plan: make i386 and x86_64 do it | 77 | * Great future not-so-futuristic plan: make i386 and x86_64 do it |
@@ -70,7 +88,7 @@ EXPORT_SYMBOL(__per_cpu_offset); | |||
70 | */ | 88 | */ |
71 | void __init setup_per_cpu_areas(void) | 89 | void __init setup_per_cpu_areas(void) |
72 | { | 90 | { |
73 | int i; | 91 | int i, highest_cpu = 0; |
74 | unsigned long size; | 92 | unsigned long size; |
75 | 93 | ||
76 | #ifdef CONFIG_HOTPLUG_CPU | 94 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -104,10 +122,18 @@ void __init setup_per_cpu_areas(void) | |||
104 | __per_cpu_offset[i] = ptr - __per_cpu_start; | 122 | __per_cpu_offset[i] = ptr - __per_cpu_start; |
105 | #endif | 123 | #endif |
106 | memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); | 124 | memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); |
125 | |||
126 | highest_cpu = i; | ||
107 | } | 127 | } |
108 | 128 | ||
129 | nr_cpu_ids = highest_cpu + 1; | ||
130 | printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids); | ||
131 | |||
109 | /* Setup percpu data maps */ | 132 | /* Setup percpu data maps */ |
110 | setup_per_cpu_maps(); | 133 | setup_per_cpu_maps(); |
134 | |||
135 | /* Setup cpumask_of_cpu map */ | ||
136 | setup_cpumask_of_cpu(); | ||
111 | } | 137 | } |
112 | 138 | ||
113 | #endif | 139 | #endif |
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index cb3170186355..9a6892200b27 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c | |||
@@ -386,9 +386,10 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, | |||
386 | * Sets up the system RAM area from start_pfn to end_pfn according to the | 386 | * Sets up the system RAM area from start_pfn to end_pfn according to the |
387 | * numa=fake command-line option. | 387 | * numa=fake command-line option. |
388 | */ | 388 | */ |
389 | static struct bootnode nodes[MAX_NUMNODES] __initdata; | ||
390 | |||
389 | static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) | 391 | static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) |
390 | { | 392 | { |
391 | struct bootnode nodes[MAX_NUMNODES]; | ||
392 | u64 size, addr = start_pfn << PAGE_SHIFT; | 393 | u64 size, addr = start_pfn << PAGE_SHIFT; |
393 | u64 max_addr = end_pfn << PAGE_SHIFT; | 394 | u64 max_addr = end_pfn << PAGE_SHIFT; |
394 | int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; | 395 | int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; |
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 1f11cf0a307f..cc48d3fde545 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c | |||
@@ -23,8 +23,8 @@ | |||
23 | #include "op_x86_model.h" | 23 | #include "op_x86_model.h" |
24 | 24 | ||
25 | static struct op_x86_model_spec const *model; | 25 | static struct op_x86_model_spec const *model; |
26 | static struct op_msrs cpu_msrs[NR_CPUS]; | 26 | static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); |
27 | static unsigned long saved_lvtpc[NR_CPUS]; | 27 | static DEFINE_PER_CPU(unsigned long, saved_lvtpc); |
28 | 28 | ||
29 | static int nmi_start(void); | 29 | static int nmi_start(void); |
30 | static void nmi_stop(void); | 30 | static void nmi_stop(void); |
@@ -89,7 +89,7 @@ static int profile_exceptions_notify(struct notifier_block *self, | |||
89 | 89 | ||
90 | switch (val) { | 90 | switch (val) { |
91 | case DIE_NMI: | 91 | case DIE_NMI: |
92 | if (model->check_ctrs(args->regs, &cpu_msrs[cpu])) | 92 | if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu))) |
93 | ret = NOTIFY_STOP; | 93 | ret = NOTIFY_STOP; |
94 | break; | 94 | break; |
95 | default: | 95 | default: |
@@ -126,7 +126,7 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs) | |||
126 | static void nmi_save_registers(void *dummy) | 126 | static void nmi_save_registers(void *dummy) |
127 | { | 127 | { |
128 | int cpu = smp_processor_id(); | 128 | int cpu = smp_processor_id(); |
129 | struct op_msrs *msrs = &cpu_msrs[cpu]; | 129 | struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); |
130 | nmi_cpu_save_registers(msrs); | 130 | nmi_cpu_save_registers(msrs); |
131 | } | 131 | } |
132 | 132 | ||
@@ -134,10 +134,10 @@ static void free_msrs(void) | |||
134 | { | 134 | { |
135 | int i; | 135 | int i; |
136 | for_each_possible_cpu(i) { | 136 | for_each_possible_cpu(i) { |
137 | kfree(cpu_msrs[i].counters); | 137 | kfree(per_cpu(cpu_msrs, i).counters); |
138 | cpu_msrs[i].counters = NULL; | 138 | per_cpu(cpu_msrs, i).counters = NULL; |
139 | kfree(cpu_msrs[i].controls); | 139 | kfree(per_cpu(cpu_msrs, i).controls); |
140 | cpu_msrs[i].controls = NULL; | 140 | per_cpu(cpu_msrs, i).controls = NULL; |
141 | } | 141 | } |
142 | } | 142 | } |
143 | 143 | ||
@@ -149,13 +149,15 @@ static int allocate_msrs(void) | |||
149 | 149 | ||
150 | int i; | 150 | int i; |
151 | for_each_possible_cpu(i) { | 151 | for_each_possible_cpu(i) { |
152 | cpu_msrs[i].counters = kmalloc(counters_size, GFP_KERNEL); | 152 | per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, |
153 | if (!cpu_msrs[i].counters) { | 153 | GFP_KERNEL); |
154 | if (!per_cpu(cpu_msrs, i).counters) { | ||
154 | success = 0; | 155 | success = 0; |
155 | break; | 156 | break; |
156 | } | 157 | } |
157 | cpu_msrs[i].controls = kmalloc(controls_size, GFP_KERNEL); | 158 | per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, |
158 | if (!cpu_msrs[i].controls) { | 159 | GFP_KERNEL); |
160 | if (!per_cpu(cpu_msrs, i).controls) { | ||
159 | success = 0; | 161 | success = 0; |
160 | break; | 162 | break; |
161 | } | 163 | } |
@@ -170,11 +172,11 @@ static int allocate_msrs(void) | |||
170 | static void nmi_cpu_setup(void *dummy) | 172 | static void nmi_cpu_setup(void *dummy) |
171 | { | 173 | { |
172 | int cpu = smp_processor_id(); | 174 | int cpu = smp_processor_id(); |
173 | struct op_msrs *msrs = &cpu_msrs[cpu]; | 175 | struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); |
174 | spin_lock(&oprofilefs_lock); | 176 | spin_lock(&oprofilefs_lock); |
175 | model->setup_ctrs(msrs); | 177 | model->setup_ctrs(msrs); |
176 | spin_unlock(&oprofilefs_lock); | 178 | spin_unlock(&oprofilefs_lock); |
177 | saved_lvtpc[cpu] = apic_read(APIC_LVTPC); | 179 | per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); |
178 | apic_write(APIC_LVTPC, APIC_DM_NMI); | 180 | apic_write(APIC_LVTPC, APIC_DM_NMI); |
179 | } | 181 | } |
180 | 182 | ||
@@ -203,13 +205,15 @@ static int nmi_setup(void) | |||
203 | */ | 205 | */ |
204 | 206 | ||
205 | /* Assume saved/restored counters are the same on all CPUs */ | 207 | /* Assume saved/restored counters are the same on all CPUs */ |
206 | model->fill_in_addresses(&cpu_msrs[0]); | 208 | model->fill_in_addresses(&per_cpu(cpu_msrs, 0)); |
207 | for_each_possible_cpu(cpu) { | 209 | for_each_possible_cpu(cpu) { |
208 | if (cpu != 0) { | 210 | if (cpu != 0) { |
209 | memcpy(cpu_msrs[cpu].counters, cpu_msrs[0].counters, | 211 | memcpy(per_cpu(cpu_msrs, cpu).counters, |
212 | per_cpu(cpu_msrs, 0).counters, | ||
210 | sizeof(struct op_msr) * model->num_counters); | 213 | sizeof(struct op_msr) * model->num_counters); |
211 | 214 | ||
212 | memcpy(cpu_msrs[cpu].controls, cpu_msrs[0].controls, | 215 | memcpy(per_cpu(cpu_msrs, cpu).controls, |
216 | per_cpu(cpu_msrs, 0).controls, | ||
213 | sizeof(struct op_msr) * model->num_controls); | 217 | sizeof(struct op_msr) * model->num_controls); |
214 | } | 218 | } |
215 | 219 | ||
@@ -249,7 +253,7 @@ static void nmi_cpu_shutdown(void *dummy) | |||
249 | { | 253 | { |
250 | unsigned int v; | 254 | unsigned int v; |
251 | int cpu = smp_processor_id(); | 255 | int cpu = smp_processor_id(); |
252 | struct op_msrs *msrs = &cpu_msrs[cpu]; | 256 | struct op_msrs *msrs = &__get_cpu_var(cpu_msrs); |
253 | 257 | ||
254 | /* restoring APIC_LVTPC can trigger an apic error because the delivery | 258 | /* restoring APIC_LVTPC can trigger an apic error because the delivery |
255 | * mode and vector nr combination can be illegal. That's by design: on | 259 | * mode and vector nr combination can be illegal. That's by design: on |
@@ -258,23 +262,24 @@ static void nmi_cpu_shutdown(void *dummy) | |||
258 | */ | 262 | */ |
259 | v = apic_read(APIC_LVTERR); | 263 | v = apic_read(APIC_LVTERR); |
260 | apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); | 264 | apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); |
261 | apic_write(APIC_LVTPC, saved_lvtpc[cpu]); | 265 | apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); |
262 | apic_write(APIC_LVTERR, v); | 266 | apic_write(APIC_LVTERR, v); |
263 | nmi_restore_registers(msrs); | 267 | nmi_restore_registers(msrs); |
264 | } | 268 | } |
265 | 269 | ||
266 | static void nmi_shutdown(void) | 270 | static void nmi_shutdown(void) |
267 | { | 271 | { |
272 | struct op_msrs *msrs = &__get_cpu_var(cpu_msrs); | ||
268 | nmi_enabled = 0; | 273 | nmi_enabled = 0; |
269 | on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1); | 274 | on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1); |
270 | unregister_die_notifier(&profile_exceptions_nb); | 275 | unregister_die_notifier(&profile_exceptions_nb); |
271 | model->shutdown(cpu_msrs); | 276 | model->shutdown(msrs); |
272 | free_msrs(); | 277 | free_msrs(); |
273 | } | 278 | } |
274 | 279 | ||
275 | static void nmi_cpu_start(void *dummy) | 280 | static void nmi_cpu_start(void *dummy) |
276 | { | 281 | { |
277 | struct op_msrs const *msrs = &cpu_msrs[smp_processor_id()]; | 282 | struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); |
278 | model->start(msrs); | 283 | model->start(msrs); |
279 | } | 284 | } |
280 | 285 | ||
@@ -286,7 +291,7 @@ static int nmi_start(void) | |||
286 | 291 | ||
287 | static void nmi_cpu_stop(void *dummy) | 292 | static void nmi_cpu_stop(void *dummy) |
288 | { | 293 | { |
289 | struct op_msrs const *msrs = &cpu_msrs[smp_processor_id()]; | 294 | struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); |
290 | model->stop(msrs); | 295 | model->stop(msrs); |
291 | } | 296 | } |
292 | 297 | ||
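Note: the oprofile changes replace NR_CPUS-sized arrays with per-CPU variables, which shrinks the static footprint when NR_CPUS is large and keeps each CPU's data in its own per-CPU area. A simplified sketch of the two access forms the converted code relies on (example_counter is an illustrative name, not part of the patch):

        #include <linux/percpu.h>

        static DEFINE_PER_CPU(unsigned long, example_counter);

        static void bump_remote(int cpu)
        {
                per_cpu(example_counter, cpu)++;        /* access a named CPU's copy */
        }

        static void bump_local(void)
        {
                __get_cpu_var(example_counter)++;       /* this CPU's copy (preemption assumed disabled) */
        }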
diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index 1b8e592a8241..0bba3a914e86 100644 --- a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c | |||
@@ -838,10 +838,10 @@ static int acpi_processor_get_throttling(struct acpi_processor *pr) | |||
838 | * Migrate task to the cpu pointed by pr. | 838 | * Migrate task to the cpu pointed by pr. |
839 | */ | 839 | */ |
840 | saved_mask = current->cpus_allowed; | 840 | saved_mask = current->cpus_allowed; |
841 | set_cpus_allowed(current, cpumask_of_cpu(pr->id)); | 841 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(pr->id)); |
842 | ret = pr->throttling.acpi_processor_get_throttling(pr); | 842 | ret = pr->throttling.acpi_processor_get_throttling(pr); |
843 | /* restore the previous state */ | 843 | /* restore the previous state */ |
844 | set_cpus_allowed(current, saved_mask); | 844 | set_cpus_allowed_ptr(current, &saved_mask); |
845 | 845 | ||
846 | return ret; | 846 | return ret; |
847 | } | 847 | } |
@@ -1025,7 +1025,7 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, int state) | |||
1025 | * it can be called only for the cpu pointed by pr. | 1025 | * it can be called only for the cpu pointed by pr. |
1026 | */ | 1026 | */ |
1027 | if (p_throttling->shared_type == DOMAIN_COORD_TYPE_SW_ANY) { | 1027 | if (p_throttling->shared_type == DOMAIN_COORD_TYPE_SW_ANY) { |
1028 | set_cpus_allowed(current, cpumask_of_cpu(pr->id)); | 1028 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(pr->id)); |
1029 | ret = p_throttling->acpi_processor_set_throttling(pr, | 1029 | ret = p_throttling->acpi_processor_set_throttling(pr, |
1030 | t_state.target_state); | 1030 | t_state.target_state); |
1031 | } else { | 1031 | } else { |
@@ -1056,7 +1056,7 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, int state) | |||
1056 | continue; | 1056 | continue; |
1057 | } | 1057 | } |
1058 | t_state.cpu = i; | 1058 | t_state.cpu = i; |
1059 | set_cpus_allowed(current, cpumask_of_cpu(i)); | 1059 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); |
1060 | ret = match_pr->throttling. | 1060 | ret = match_pr->throttling. |
1061 | acpi_processor_set_throttling( | 1061 | acpi_processor_set_throttling( |
1062 | match_pr, t_state.target_state); | 1062 | match_pr, t_state.target_state); |
@@ -1074,7 +1074,7 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, int state) | |||
1074 | &t_state); | 1074 | &t_state); |
1075 | } | 1075 | } |
1076 | /* restore the previous state */ | 1076 | /* restore the previous state */ |
1077 | set_cpus_allowed(current, saved_mask); | 1077 | set_cpus_allowed_ptr(current, &saved_mask); |
1078 | return ret; | 1078 | return ret; |
1079 | } | 1079 | } |
1080 | 1080 | ||
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 499b003f9278..2c76afff3b15 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c | |||
@@ -103,6 +103,51 @@ static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL); | |||
103 | #endif | 103 | #endif |
104 | 104 | ||
105 | /* | 105 | /* |
106 | * Print cpu online, possible, present, and system maps | ||
107 | */ | ||
108 | static ssize_t print_cpus_map(char *buf, cpumask_t *map) | ||
109 | { | ||
110 | int n = cpulist_scnprintf(buf, PAGE_SIZE-2, *map); | ||
111 | |||
112 | buf[n++] = '\n'; | ||
113 | buf[n] = '\0'; | ||
114 | return n; | ||
115 | } | ||
116 | |||
117 | #define print_cpus_func(type) \ | ||
118 | static ssize_t print_cpus_##type(struct sysdev_class *class, char *buf) \ | ||
119 | { \ | ||
120 | return print_cpus_map(buf, &cpu_##type##_map); \ | ||
121 | } \ | ||
122 | struct sysdev_class_attribute attr_##type##_map = \ | ||
123 | _SYSDEV_CLASS_ATTR(type, 0444, print_cpus_##type, NULL) | ||
124 | |||
125 | print_cpus_func(online); | ||
126 | print_cpus_func(possible); | ||
127 | print_cpus_func(present); | ||
128 | |||
129 | struct sysdev_class_attribute *cpu_state_attr[] = { | ||
130 | &attr_online_map, | ||
131 | &attr_possible_map, | ||
132 | &attr_present_map, | ||
133 | }; | ||
134 | |||
135 | static int cpu_states_init(void) | ||
136 | { | ||
137 | int i; | ||
138 | int err = 0; | ||
139 | |||
140 | for (i = 0; i < ARRAY_SIZE(cpu_state_attr); i++) { | ||
141 | int ret; | ||
142 | ret = sysdev_class_create_file(&cpu_sysdev_class, | ||
143 | cpu_state_attr[i]); | ||
144 | if (!err) | ||
145 | err = ret; | ||
146 | } | ||
147 | return err; | ||
148 | } | ||
149 | |||
150 | /* | ||
106 | * register_cpu - Setup a sysfs device for a CPU. | 151 | * register_cpu - Setup a sysfs device for a CPU. |
107 | * @cpu - cpu->hotpluggable field set to 1 will generate a control file in | 152 | * @cpu - cpu->hotpluggable field set to 1 will generate a control file in |
108 | * sysfs for this CPU. | 153 | * sysfs for this CPU. |
@@ -147,6 +192,9 @@ int __init cpu_dev_init(void) | |||
147 | int err; | 192 | int err; |
148 | 193 | ||
149 | err = sysdev_class_register(&cpu_sysdev_class); | 194 | err = sysdev_class_register(&cpu_sysdev_class); |
195 | if (!err) | ||
196 | err = cpu_states_init(); | ||
197 | |||
150 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 198 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
151 | if (!err) | 199 | if (!err) |
152 | err = sched_create_sysfs_power_savings_entries(&cpu_sysdev_class); | 200 | err = sched_create_sysfs_power_savings_entries(&cpu_sysdev_class); |
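Note: cpu_states_init() registers three new read-only attributes on the cpu sysdev class (normally visible as /sys/devices/system/cpu/online, possible and present), all rendered in cpulist form by print_cpus_map(). For reference, print_cpus_func(online) expands to roughly:

        static ssize_t print_cpus_online(struct sysdev_class *class, char *buf)
        {
                return print_cpus_map(buf, &cpu_online_map);
        }
        struct sysdev_class_attribute attr_online_map =
                _SYSDEV_CLASS_ATTR(online, 0444, print_cpus_online, NULL);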
diff --git a/drivers/base/node.c b/drivers/base/node.c index e59861f18ce5..12fde2d03d69 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c | |||
@@ -19,21 +19,34 @@ static struct sysdev_class node_class = { | |||
19 | }; | 19 | }; |
20 | 20 | ||
21 | 21 | ||
22 | static ssize_t node_read_cpumap(struct sys_device * dev, char * buf) | 22 | static ssize_t node_read_cpumap(struct sys_device *dev, int type, char *buf) |
23 | { | 23 | { |
24 | struct node *node_dev = to_node(dev); | 24 | struct node *node_dev = to_node(dev); |
25 | cpumask_t mask = node_to_cpumask(node_dev->sysdev.id); | 25 | node_to_cpumask_ptr(mask, node_dev->sysdev.id); |
26 | int len; | 26 | int len; |
27 | 27 | ||
28 | /* 2004/06/03: buf currently PAGE_SIZE, need > 1 char per 4 bits. */ | 28 | /* 2008/04/07: buf currently PAGE_SIZE, need 9 chars per 32 bits. */ |
29 | BUILD_BUG_ON(MAX_NUMNODES/4 > PAGE_SIZE/2); | 29 | BUILD_BUG_ON((NR_CPUS/32 * 9) > (PAGE_SIZE-1)); |
30 | 30 | ||
31 | len = cpumask_scnprintf(buf, PAGE_SIZE-1, mask); | 31 | len = type? |
32 | len += sprintf(buf + len, "\n"); | 32 | cpulist_scnprintf(buf, PAGE_SIZE-2, *mask): |
33 | cpumask_scnprintf(buf, PAGE_SIZE-2, *mask); | ||
34 | buf[len++] = '\n'; | ||
35 | buf[len] = '\0'; | ||
33 | return len; | 36 | return len; |
34 | } | 37 | } |
35 | 38 | ||
36 | static SYSDEV_ATTR(cpumap, S_IRUGO, node_read_cpumap, NULL); | 39 | static inline ssize_t node_read_cpumask(struct sys_device *dev, char *buf) |
40 | { | ||
41 | return node_read_cpumap(dev, 0, buf); | ||
42 | } | ||
43 | static inline ssize_t node_read_cpulist(struct sys_device *dev, char *buf) | ||
44 | { | ||
45 | return node_read_cpumap(dev, 1, buf); | ||
46 | } | ||
47 | |||
48 | static SYSDEV_ATTR(cpumap, S_IRUGO, node_read_cpumask, NULL); | ||
49 | static SYSDEV_ATTR(cpulist, S_IRUGO, node_read_cpulist, NULL); | ||
37 | 50 | ||
38 | #define K(x) ((x) << (PAGE_SHIFT - 10)) | 51 | #define K(x) ((x) << (PAGE_SHIFT - 10)) |
39 | static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) | 52 | static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) |
@@ -149,6 +162,7 @@ int register_node(struct node *node, int num, struct node *parent) | |||
149 | 162 | ||
150 | if (!error){ | 163 | if (!error){ |
151 | sysdev_create_file(&node->sysdev, &attr_cpumap); | 164 | sysdev_create_file(&node->sysdev, &attr_cpumap); |
165 | sysdev_create_file(&node->sysdev, &attr_cpulist); | ||
152 | sysdev_create_file(&node->sysdev, &attr_meminfo); | 166 | sysdev_create_file(&node->sysdev, &attr_meminfo); |
153 | sysdev_create_file(&node->sysdev, &attr_numastat); | 167 | sysdev_create_file(&node->sysdev, &attr_numastat); |
154 | sysdev_create_file(&node->sysdev, &attr_distance); | 168 | sysdev_create_file(&node->sysdev, &attr_distance); |
@@ -166,6 +180,7 @@ int register_node(struct node *node, int num, struct node *parent) | |||
166 | void unregister_node(struct node *node) | 180 | void unregister_node(struct node *node) |
167 | { | 181 | { |
168 | sysdev_remove_file(&node->sysdev, &attr_cpumap); | 182 | sysdev_remove_file(&node->sysdev, &attr_cpumap); |
183 | sysdev_remove_file(&node->sysdev, &attr_cpulist); | ||
169 | sysdev_remove_file(&node->sysdev, &attr_meminfo); | 184 | sysdev_remove_file(&node->sysdev, &attr_meminfo); |
170 | sysdev_remove_file(&node->sysdev, &attr_numastat); | 185 | sysdev_remove_file(&node->sysdev, &attr_numastat); |
171 | sysdev_remove_file(&node->sysdev, &attr_distance); | 186 | sysdev_remove_file(&node->sysdev, &attr_distance); |
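Note: node_read_cpumap() now serves two formats through one helper — type 0 keeps the existing hex mask ("cpumap"), type 1 adds a human-readable range list ("cpulist"). For a node holding CPUs 0-3 the two files would read back roughly as below (the hex width depends on NR_CPUS); the snippet is illustrative only.

        char buf[64];
        cpumask_t mask = CPU_MASK_NONE;
        int cpu;

        for (cpu = 0; cpu < 4; cpu++)
                cpu_set(cpu, mask);

        cpumask_scnprintf(buf, sizeof(buf) - 2, mask);  /* "f"   -> node0/cpumap  */
        cpulist_scnprintf(buf, sizeof(buf) - 2, mask);  /* "0-3" -> node0/cpulist */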
diff --git a/drivers/base/topology.c b/drivers/base/topology.c index e1d3ad4db2f0..fdf4044d2e74 100644 --- a/drivers/base/topology.c +++ b/drivers/base/topology.c | |||
@@ -40,15 +40,38 @@ static ssize_t show_##name(struct sys_device *dev, char *buf) \ | |||
40 | return sprintf(buf, "%d\n", topology_##name(cpu)); \ | 40 | return sprintf(buf, "%d\n", topology_##name(cpu)); \ |
41 | } | 41 | } |
42 | 42 | ||
43 | #define define_siblings_show_func(name) \ | 43 | static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf) |
44 | static ssize_t show_##name(struct sys_device *dev, char *buf) \ | 44 | { |
45 | ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; | ||
46 | int n = 0; | ||
47 | |||
48 | if (len > 1) { | ||
49 | n = type? | ||
50 | cpulist_scnprintf(buf, len-2, *mask): | ||
51 | cpumask_scnprintf(buf, len-2, *mask); | ||
52 | buf[n++] = '\n'; | ||
53 | buf[n] = '\0'; | ||
54 | } | ||
55 | return n; | ||
56 | } | ||
57 | |||
58 | #define define_siblings_show_map(name) \ | ||
59 | static inline ssize_t show_##name(struct sys_device *dev, char *buf) \ | ||
45 | { \ | 60 | { \ |
46 | ssize_t len = -1; \ | ||
47 | unsigned int cpu = dev->id; \ | 61 | unsigned int cpu = dev->id; \ |
48 | len = cpumask_scnprintf(buf, NR_CPUS+1, topology_##name(cpu)); \ | 62 | return show_cpumap(0, &(topology_##name(cpu)), buf); \ |
49 | return (len + sprintf(buf + len, "\n")); \ | ||
50 | } | 63 | } |
51 | 64 | ||
65 | #define define_siblings_show_list(name) \ | ||
66 | static inline ssize_t show_##name##_list(struct sys_device *dev, char *buf) \ | ||
67 | { \ | ||
68 | unsigned int cpu = dev->id; \ | ||
69 | return show_cpumap(1, &(topology_##name(cpu)), buf); \ | ||
70 | } | ||
71 | |||
72 | #define define_siblings_show_func(name) \ | ||
73 | define_siblings_show_map(name); define_siblings_show_list(name) | ||
74 | |||
52 | #ifdef topology_physical_package_id | 75 | #ifdef topology_physical_package_id |
53 | define_id_show_func(physical_package_id); | 76 | define_id_show_func(physical_package_id); |
54 | define_one_ro(physical_package_id); | 77 | define_one_ro(physical_package_id); |
@@ -68,7 +91,9 @@ define_one_ro(core_id); | |||
68 | #ifdef topology_thread_siblings | 91 | #ifdef topology_thread_siblings |
69 | define_siblings_show_func(thread_siblings); | 92 | define_siblings_show_func(thread_siblings); |
70 | define_one_ro(thread_siblings); | 93 | define_one_ro(thread_siblings); |
71 | #define ref_thread_siblings_attr &attr_thread_siblings.attr, | 94 | define_one_ro(thread_siblings_list); |
95 | #define ref_thread_siblings_attr \ | ||
96 | &attr_thread_siblings.attr, &attr_thread_siblings_list.attr, | ||
72 | #else | 97 | #else |
73 | #define ref_thread_siblings_attr | 98 | #define ref_thread_siblings_attr |
74 | #endif | 99 | #endif |
@@ -76,7 +101,9 @@ define_one_ro(thread_siblings); | |||
76 | #ifdef topology_core_siblings | 101 | #ifdef topology_core_siblings |
77 | define_siblings_show_func(core_siblings); | 102 | define_siblings_show_func(core_siblings); |
78 | define_one_ro(core_siblings); | 103 | define_one_ro(core_siblings); |
79 | #define ref_core_siblings_attr &attr_core_siblings.attr, | 104 | define_one_ro(core_siblings_list); |
105 | #define ref_core_siblings_attr \ | ||
106 | &attr_core_siblings.attr, &attr_core_siblings_list.attr, | ||
80 | #else | 107 | #else |
81 | #define ref_core_siblings_attr | 108 | #define ref_core_siblings_attr |
82 | #endif | 109 | #endif |
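Note: define_siblings_show_func() now emits a pair of show routines per topology attribute, both going through the bounds-checked show_cpumap() helper. For example, define_siblings_show_func(thread_siblings) expands to approximately:

        static inline ssize_t show_thread_siblings(struct sys_device *dev, char *buf)
        {
                unsigned int cpu = dev->id;
                return show_cpumap(0, &(topology_thread_siblings(cpu)), buf);   /* hex mask */
        }
        static inline ssize_t show_thread_siblings_list(struct sys_device *dev, char *buf)
        {
                unsigned int cpu = dev->id;
                return show_cpumap(1, &(topology_thread_siblings(cpu)), buf);   /* range list */
        }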
diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c index 1636806ec55e..0ffef3b7c6ca 100644 --- a/drivers/firmware/dcdbas.c +++ b/drivers/firmware/dcdbas.c | |||
@@ -265,7 +265,7 @@ static int smi_request(struct smi_cmd *smi_cmd) | |||
265 | 265 | ||
266 | /* SMI requires CPU 0 */ | 266 | /* SMI requires CPU 0 */ |
267 | old_mask = current->cpus_allowed; | 267 | old_mask = current->cpus_allowed; |
268 | set_cpus_allowed(current, cpumask_of_cpu(0)); | 268 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(0)); |
269 | if (smp_processor_id() != 0) { | 269 | if (smp_processor_id() != 0) { |
270 | dev_dbg(&dcdbas_pdev->dev, "%s: failed to get CPU 0\n", | 270 | dev_dbg(&dcdbas_pdev->dev, "%s: failed to get CPU 0\n", |
271 | __FUNCTION__); | 271 | __FUNCTION__); |
@@ -285,7 +285,7 @@ static int smi_request(struct smi_cmd *smi_cmd) | |||
285 | ); | 285 | ); |
286 | 286 | ||
287 | out: | 287 | out: |
288 | set_cpus_allowed(current, old_mask); | 288 | set_cpus_allowed_ptr(current, &old_mask); |
289 | return ret; | 289 | return ret; |
290 | } | 290 | } |
291 | 291 | ||
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index e571c72e6753..e8d94fafc280 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c | |||
@@ -182,15 +182,18 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, | |||
182 | struct mempolicy *oldpol; | 182 | struct mempolicy *oldpol; |
183 | cpumask_t oldmask = current->cpus_allowed; | 183 | cpumask_t oldmask = current->cpus_allowed; |
184 | int node = pcibus_to_node(dev->bus); | 184 | int node = pcibus_to_node(dev->bus); |
185 | if (node >= 0 && node_online(node)) | 185 | |
186 | set_cpus_allowed(current, node_to_cpumask(node)); | 186 | if (node >= 0) { |
187 | node_to_cpumask_ptr(nodecpumask, node); | ||
188 | set_cpus_allowed_ptr(current, nodecpumask); | ||
189 | } | ||
187 | /* And set default memory allocation policy */ | 190 | /* And set default memory allocation policy */ |
188 | oldpol = current->mempolicy; | 191 | oldpol = current->mempolicy; |
189 | current->mempolicy = NULL; /* fall back to system default policy */ | 192 | current->mempolicy = NULL; /* fall back to system default policy */ |
190 | #endif | 193 | #endif |
191 | error = drv->probe(dev, id); | 194 | error = drv->probe(dev, id); |
192 | #ifdef CONFIG_NUMA | 195 | #ifdef CONFIG_NUMA |
193 | set_cpus_allowed(current, oldmask); | 196 | set_cpus_allowed_ptr(current, &oldmask); |
194 | current->mempolicy = oldpol; | 197 | current->mempolicy = oldpol; |
195 | #endif | 198 | #endif |
196 | return error; | 199 | return error; |
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 8dcf1458aa2f..8d9d648daeba 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c | |||
@@ -73,8 +73,23 @@ static ssize_t local_cpus_show(struct device *dev, | |||
73 | 73 | ||
74 | mask = pcibus_to_cpumask(to_pci_dev(dev)->bus); | 74 | mask = pcibus_to_cpumask(to_pci_dev(dev)->bus); |
75 | len = cpumask_scnprintf(buf, PAGE_SIZE-2, mask); | 75 | len = cpumask_scnprintf(buf, PAGE_SIZE-2, mask); |
76 | strcat(buf,"\n"); | 76 | buf[len++] = '\n'; |
77 | return 1+len; | 77 | buf[len] = '\0'; |
78 | return len; | ||
79 | } | ||
80 | |||
81 | |||
82 | static ssize_t local_cpulist_show(struct device *dev, | ||
83 | struct device_attribute *attr, char *buf) | ||
84 | { | ||
85 | cpumask_t mask; | ||
86 | int len; | ||
87 | |||
88 | mask = pcibus_to_cpumask(to_pci_dev(dev)->bus); | ||
89 | len = cpulist_scnprintf(buf, PAGE_SIZE-2, mask); | ||
90 | buf[len++] = '\n'; | ||
91 | buf[len] = '\0'; | ||
92 | return len; | ||
78 | } | 93 | } |
79 | 94 | ||
80 | /* show resources */ | 95 | /* show resources */ |
@@ -201,6 +216,7 @@ struct device_attribute pci_dev_attrs[] = { | |||
201 | __ATTR_RO(class), | 216 | __ATTR_RO(class), |
202 | __ATTR_RO(irq), | 217 | __ATTR_RO(irq), |
203 | __ATTR_RO(local_cpus), | 218 | __ATTR_RO(local_cpus), |
219 | __ATTR_RO(local_cpulist), | ||
204 | __ATTR_RO(modalias), | 220 | __ATTR_RO(modalias), |
205 | #ifdef CONFIG_NUMA | 221 | #ifdef CONFIG_NUMA |
206 | __ATTR_RO(numa_node), | 222 | __ATTR_RO(numa_node), |
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 2db2e4bb0d1e..4b3011a23eff 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c | |||
@@ -82,6 +82,7 @@ void pci_remove_legacy_files(struct pci_bus *bus) { return; } | |||
82 | * PCI Bus Class Devices | 82 | * PCI Bus Class Devices |
83 | */ | 83 | */ |
84 | static ssize_t pci_bus_show_cpuaffinity(struct device *dev, | 84 | static ssize_t pci_bus_show_cpuaffinity(struct device *dev, |
85 | int type, | ||
85 | struct device_attribute *attr, | 86 | struct device_attribute *attr, |
86 | char *buf) | 87 | char *buf) |
87 | { | 88 | { |
@@ -89,12 +90,30 @@ static ssize_t pci_bus_show_cpuaffinity(struct device *dev, | |||
89 | cpumask_t cpumask; | 90 | cpumask_t cpumask; |
90 | 91 | ||
91 | cpumask = pcibus_to_cpumask(to_pci_bus(dev)); | 92 | cpumask = pcibus_to_cpumask(to_pci_bus(dev)); |
92 | ret = cpumask_scnprintf(buf, PAGE_SIZE, cpumask); | 93 | ret = type? |
93 | if (ret < PAGE_SIZE) | 94 | cpulist_scnprintf(buf, PAGE_SIZE-2, cpumask): |
94 | buf[ret++] = '\n'; | 95 | cpumask_scnprintf(buf, PAGE_SIZE-2, cpumask); |
96 | buf[ret++] = '\n'; | ||
97 | buf[ret] = '\0'; | ||
95 | return ret; | 98 | return ret; |
96 | } | 99 | } |
97 | DEVICE_ATTR(cpuaffinity, S_IRUGO, pci_bus_show_cpuaffinity, NULL); | 100 | |
101 | static ssize_t inline pci_bus_show_cpumaskaffinity(struct device *dev, | ||
102 | struct device_attribute *attr, | ||
103 | char *buf) | ||
104 | { | ||
105 | return pci_bus_show_cpuaffinity(dev, 0, attr, buf); | ||
106 | } | ||
107 | |||
108 | static ssize_t inline pci_bus_show_cpulistaffinity(struct device *dev, | ||
109 | struct device_attribute *attr, | ||
110 | char *buf) | ||
111 | { | ||
112 | return pci_bus_show_cpuaffinity(dev, 1, attr, buf); | ||
113 | } | ||
114 | |||
115 | DEVICE_ATTR(cpuaffinity, S_IRUGO, pci_bus_show_cpumaskaffinity, NULL); | ||
116 | DEVICE_ATTR(cpulistaffinity, S_IRUGO, pci_bus_show_cpulistaffinity, NULL); | ||
98 | 117 | ||
99 | /* | 118 | /* |
100 | * PCI Bus Class | 119 | * PCI Bus Class |
diff --git a/include/asm-alpha/topology.h b/include/asm-alpha/topology.h index 420ccde6b916..149532e162c4 100644 --- a/include/asm-alpha/topology.h +++ b/include/asm-alpha/topology.h | |||
@@ -41,8 +41,7 @@ static inline cpumask_t node_to_cpumask(int node) | |||
41 | 41 | ||
42 | #define pcibus_to_cpumask(bus) (cpu_online_map) | 42 | #define pcibus_to_cpumask(bus) (cpu_online_map) |
43 | 43 | ||
44 | #else /* CONFIG_NUMA */ | ||
45 | # include <asm-generic/topology.h> | ||
46 | #endif /* !CONFIG_NUMA */ | 44 | #endif /* !CONFIG_NUMA */ |
45 | # include <asm-generic/topology.h> | ||
47 | 46 | ||
48 | #endif /* _ASM_ALPHA_TOPOLOGY_H */ | 47 | #endif /* _ASM_ALPHA_TOPOLOGY_H */ |
diff --git a/include/asm-frv/topology.h b/include/asm-frv/topology.h index abe7298742ac..942724352705 100644 --- a/include/asm-frv/topology.h +++ b/include/asm-frv/topology.h | |||
@@ -5,10 +5,8 @@ | |||
5 | 5 | ||
6 | #error NUMA not supported yet | 6 | #error NUMA not supported yet |
7 | 7 | ||
8 | #else /* !CONFIG_NUMA */ | 8 | #endif /* CONFIG_NUMA */ |
9 | 9 | ||
10 | #include <asm-generic/topology.h> | 10 | #include <asm-generic/topology.h> |
11 | 11 | ||
12 | #endif /* CONFIG_NUMA */ | ||
13 | |||
14 | #endif /* _ASM_TOPOLOGY_H */ | 12 | #endif /* _ASM_TOPOLOGY_H */ |
diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h index 342a2a0105c4..a6aea79bca4f 100644 --- a/include/asm-generic/topology.h +++ b/include/asm-generic/topology.h | |||
@@ -27,6 +27,8 @@ | |||
27 | #ifndef _ASM_GENERIC_TOPOLOGY_H | 27 | #ifndef _ASM_GENERIC_TOPOLOGY_H |
28 | #define _ASM_GENERIC_TOPOLOGY_H | 28 | #define _ASM_GENERIC_TOPOLOGY_H |
29 | 29 | ||
30 | #ifndef CONFIG_NUMA | ||
31 | |||
30 | /* Other architectures wishing to use this simple topology API should fill | 32 | /* Other architectures wishing to use this simple topology API should fill |
31 | in the below functions as appropriate in their own <asm/topology.h> file. */ | 33 | in the below functions as appropriate in their own <asm/topology.h> file. */ |
32 | #ifndef cpu_to_node | 34 | #ifndef cpu_to_node |
@@ -52,4 +54,16 @@ | |||
52 | ) | 54 | ) |
53 | #endif | 55 | #endif |
54 | 56 | ||
57 | #endif /* CONFIG_NUMA */ | ||
58 | |||
59 | /* returns pointer to cpumask for specified node */ | ||
60 | #ifndef node_to_cpumask_ptr | ||
61 | |||
62 | #define node_to_cpumask_ptr(v, node) \ | ||
63 | cpumask_t _##v = node_to_cpumask(node), *v = &_##v | ||
64 | |||
65 | #define node_to_cpumask_ptr_next(v, node) \ | ||
66 | _##v = node_to_cpumask(node) | ||
67 | #endif | ||
68 | |||
55 | #endif /* _ASM_GENERIC_TOPOLOGY_H */ | 69 | #endif /* _ASM_GENERIC_TOPOLOGY_H */ |
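Note: the generic node_to_cpumask_ptr() fallback declares both the backing storage (a hidden local copy named _v) and a pointer v to it, while the x86_64 NUMA variant later in this diff points straight into node_to_cpumask_map[] and copies nothing. Usage sketch, mirroring the pci_call_probe() hunk earlier in this diff (pin_to_node is an illustrative name):

        static void pin_to_node(int node)
        {
                node_to_cpumask_ptr(nodecpumask, node); /* declares cpumask_t *nodecpumask */

                set_cpus_allowed_ptr(current, nodecpumask);
        }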
diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h index 2d67b72b18d0..f2f72ef2a897 100644 --- a/include/asm-ia64/topology.h +++ b/include/asm-ia64/topology.h | |||
@@ -93,7 +93,7 @@ void build_cpu_to_node_map(void); | |||
93 | .cache_nice_tries = 2, \ | 93 | .cache_nice_tries = 2, \ |
94 | .busy_idx = 3, \ | 94 | .busy_idx = 3, \ |
95 | .idle_idx = 2, \ | 95 | .idle_idx = 2, \ |
96 | .newidle_idx = 0, /* unused */ \ | 96 | .newidle_idx = 2, \ |
97 | .wake_idx = 1, \ | 97 | .wake_idx = 1, \ |
98 | .forkexec_idx = 1, \ | 98 | .forkexec_idx = 1, \ |
99 | .flags = SD_LOAD_BALANCE \ | 99 | .flags = SD_LOAD_BALANCE \ |
@@ -116,6 +116,11 @@ void build_cpu_to_node_map(void); | |||
116 | #define smt_capable() (smp_num_siblings > 1) | 116 | #define smt_capable() (smp_num_siblings > 1) |
117 | #endif | 117 | #endif |
118 | 118 | ||
119 | #define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \ | ||
120 | CPU_MASK_ALL : \ | ||
121 | node_to_cpumask(pcibus_to_node(bus)) \ | ||
122 | ) | ||
123 | |||
119 | #include <asm-generic/topology.h> | 124 | #include <asm-generic/topology.h> |
120 | 125 | ||
121 | #endif /* _ASM_IA64_TOPOLOGY_H */ | 126 | #endif /* _ASM_IA64_TOPOLOGY_H */ |
diff --git a/include/asm-powerpc/topology.h b/include/asm-powerpc/topology.h index ca23b681ad05..100c6fbfc587 100644 --- a/include/asm-powerpc/topology.h +++ b/include/asm-powerpc/topology.h | |||
@@ -96,11 +96,10 @@ static inline void sysfs_remove_device_from_node(struct sys_device *dev, | |||
96 | { | 96 | { |
97 | } | 97 | } |
98 | 98 | ||
99 | #endif /* CONFIG_NUMA */ | ||
99 | 100 | ||
100 | #include <asm-generic/topology.h> | 101 | #include <asm-generic/topology.h> |
101 | 102 | ||
102 | #endif /* CONFIG_NUMA */ | ||
103 | |||
104 | #ifdef CONFIG_SMP | 103 | #ifdef CONFIG_SMP |
105 | #include <asm/cputable.h> | 104 | #include <asm/cputable.h> |
106 | #define smt_capable() (cpu_has_feature(CPU_FTR_SMT)) | 105 | #define smt_capable() (cpu_has_feature(CPU_FTR_SMT)) |
diff --git a/include/asm-sh/topology.h b/include/asm-sh/topology.h index f402a3b1cfa4..34cdb28e8f44 100644 --- a/include/asm-sh/topology.h +++ b/include/asm-sh/topology.h | |||
@@ -16,7 +16,7 @@ | |||
16 | .cache_nice_tries = 2, \ | 16 | .cache_nice_tries = 2, \ |
17 | .busy_idx = 3, \ | 17 | .busy_idx = 3, \ |
18 | .idle_idx = 2, \ | 18 | .idle_idx = 2, \ |
19 | .newidle_idx = 0, \ | 19 | .newidle_idx = 2, \ |
20 | .wake_idx = 1, \ | 20 | .wake_idx = 1, \ |
21 | .forkexec_idx = 1, \ | 21 | .forkexec_idx = 1, \ |
22 | .flags = SD_LOAD_BALANCE \ | 22 | .flags = SD_LOAD_BALANCE \ |
diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h index 81a29eb08ac4..22073268b481 100644 --- a/include/asm-x86/topology.h +++ b/include/asm-x86/topology.h | |||
@@ -88,6 +88,17 @@ static inline int cpu_to_node(int cpu) | |||
88 | #endif | 88 | #endif |
89 | return per_cpu(x86_cpu_to_node_map, cpu); | 89 | return per_cpu(x86_cpu_to_node_map, cpu); |
90 | } | 90 | } |
91 | |||
92 | #ifdef CONFIG_NUMA | ||
93 | |||
94 | /* Returns a pointer to the cpumask of CPUs on Node 'node'. */ | ||
95 | #define node_to_cpumask_ptr(v, node) \ | ||
96 | cpumask_t *v = &(node_to_cpumask_map[node]) | ||
97 | |||
98 | #define node_to_cpumask_ptr_next(v, node) \ | ||
99 | v = &(node_to_cpumask_map[node]) | ||
100 | #endif | ||
101 | |||
91 | #endif /* CONFIG_X86_64 */ | 102 | #endif /* CONFIG_X86_64 */ |
92 | 103 | ||
93 | /* | 104 | /* |
@@ -136,17 +147,13 @@ extern unsigned long node_remap_size[]; | |||
136 | 147 | ||
137 | # define SD_CACHE_NICE_TRIES 2 | 148 | # define SD_CACHE_NICE_TRIES 2 |
138 | # define SD_IDLE_IDX 2 | 149 | # define SD_IDLE_IDX 2 |
139 | # define SD_NEWIDLE_IDX 0 | 150 | # define SD_NEWIDLE_IDX 2 |
140 | # define SD_FORKEXEC_IDX 1 | 151 | # define SD_FORKEXEC_IDX 1 |
141 | 152 | ||
142 | #endif | 153 | #endif |
143 | 154 | ||
144 | /* sched_domains SD_NODE_INIT for NUMAQ machines */ | 155 | /* sched_domains SD_NODE_INIT for NUMAQ machines */ |
145 | #define SD_NODE_INIT (struct sched_domain) { \ | 156 | #define SD_NODE_INIT (struct sched_domain) { \ |
146 | .span = CPU_MASK_NONE, \ | ||
147 | .parent = NULL, \ | ||
148 | .child = NULL, \ | ||
149 | .groups = NULL, \ | ||
150 | .min_interval = 8, \ | 157 | .min_interval = 8, \ |
151 | .max_interval = 32, \ | 158 | .max_interval = 32, \ |
152 | .busy_factor = 32, \ | 159 | .busy_factor = 32, \ |
@@ -164,7 +171,6 @@ extern unsigned long node_remap_size[]; | |||
164 | | SD_WAKE_BALANCE, \ | 171 | | SD_WAKE_BALANCE, \ |
165 | .last_balance = jiffies, \ | 172 | .last_balance = jiffies, \ |
166 | .balance_interval = 1, \ | 173 | .balance_interval = 1, \ |
167 | .nr_balance_failed = 0, \ | ||
168 | } | 174 | } |
169 | 175 | ||
170 | #ifdef CONFIG_X86_64_ACPI_NUMA | 176 | #ifdef CONFIG_X86_64_ACPI_NUMA |
@@ -174,10 +180,10 @@ extern int __node_distance(int, int); | |||
174 | 180 | ||
175 | #else /* CONFIG_NUMA */ | 181 | #else /* CONFIG_NUMA */ |
176 | 182 | ||
177 | #include <asm-generic/topology.h> | ||
178 | |||
179 | #endif | 183 | #endif |
180 | 184 | ||
185 | #include <asm-generic/topology.h> | ||
186 | |||
181 | extern cpumask_t cpu_coregroup_map(int cpu); | 187 | extern cpumask_t cpu_coregroup_map(int cpu); |
182 | 188 | ||
183 | #ifdef ENABLE_TOPO_DEFINES | 189 | #ifdef ENABLE_TOPO_DEFINES |
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index acad1105d942..1dbe074f1c64 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h | |||
@@ -108,6 +108,7 @@ extern int __bitmap_weight(const unsigned long *bitmap, int bits); | |||
108 | 108 | ||
109 | extern int bitmap_scnprintf(char *buf, unsigned int len, | 109 | extern int bitmap_scnprintf(char *buf, unsigned int len, |
110 | const unsigned long *src, int nbits); | 110 | const unsigned long *src, int nbits); |
111 | extern int bitmap_scnprintf_len(unsigned int len); | ||
111 | extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user, | 112 | extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user, |
112 | unsigned long *dst, int nbits); | 113 | unsigned long *dst, int nbits); |
113 | extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, | 114 | extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, |
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 7047f58306a7..259c8051155d 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h | |||
@@ -222,8 +222,13 @@ int __next_cpu(int n, const cpumask_t *srcp); | |||
222 | #define next_cpu(n, src) ({ (void)(src); 1; }) | 222 | #define next_cpu(n, src) ({ (void)(src); 1; }) |
223 | #endif | 223 | #endif |
224 | 224 | ||
225 | #ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP | ||
226 | extern cpumask_t *cpumask_of_cpu_map; | ||
227 | #define cpumask_of_cpu(cpu) (cpumask_of_cpu_map[cpu]) | ||
228 | |||
229 | #else | ||
225 | #define cpumask_of_cpu(cpu) \ | 230 | #define cpumask_of_cpu(cpu) \ |
226 | ({ \ | 231 | (*({ \ |
227 | typeof(_unused_cpumask_arg_) m; \ | 232 | typeof(_unused_cpumask_arg_) m; \ |
228 | if (sizeof(m) == sizeof(unsigned long)) { \ | 233 | if (sizeof(m) == sizeof(unsigned long)) { \ |
229 | m.bits[0] = 1UL<<(cpu); \ | 234 | m.bits[0] = 1UL<<(cpu); \ |
@@ -231,8 +236,9 @@ int __next_cpu(int n, const cpumask_t *srcp); | |||
231 | cpus_clear(m); \ | 236 | cpus_clear(m); \ |
232 | cpu_set((cpu), m); \ | 237 | cpu_set((cpu), m); \ |
233 | } \ | 238 | } \ |
234 | m; \ | 239 | &m; \ |
235 | }) | 240 | })) |
241 | #endif | ||
236 | 242 | ||
237 | #define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS) | 243 | #define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS) |
238 | 244 | ||
@@ -243,6 +249,8 @@ int __next_cpu(int n, const cpumask_t *srcp); | |||
243 | [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ | 249 | [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ |
244 | } } | 250 | } } |
245 | 251 | ||
252 | #define CPU_MASK_ALL_PTR (&CPU_MASK_ALL) | ||
253 | |||
246 | #else | 254 | #else |
247 | 255 | ||
248 | #define CPU_MASK_ALL \ | 256 | #define CPU_MASK_ALL \ |
@@ -251,6 +259,10 @@ int __next_cpu(int n, const cpumask_t *srcp); | |||
251 | [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ | 259 | [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ |
252 | } } | 260 | } } |
253 | 261 | ||
262 | /* cpu_mask_all is in init/main.c */ | ||
263 | extern cpumask_t cpu_mask_all; | ||
264 | #define CPU_MASK_ALL_PTR (&cpu_mask_all) | ||
265 | |||
254 | #endif | 266 | #endif |
255 | 267 | ||
256 | #define CPU_MASK_NONE \ | 268 | #define CPU_MASK_NONE \ |
@@ -273,6 +285,13 @@ static inline int __cpumask_scnprintf(char *buf, int len, | |||
273 | return bitmap_scnprintf(buf, len, srcp->bits, nbits); | 285 | return bitmap_scnprintf(buf, len, srcp->bits, nbits); |
274 | } | 286 | } |
275 | 287 | ||
288 | #define cpumask_scnprintf_len(len) \ | ||
289 | __cpumask_scnprintf_len((len)) | ||
290 | static inline int __cpumask_scnprintf_len(int len) | ||
291 | { | ||
292 | return bitmap_scnprintf_len(len); | ||
293 | } | ||
294 | |||
276 | #define cpumask_parse_user(ubuf, ulen, dst) \ | 295 | #define cpumask_parse_user(ubuf, ulen, dst) \ |
277 | __cpumask_parse_user((ubuf), (ulen), &(dst), NR_CPUS) | 296 | __cpumask_parse_user((ubuf), (ulen), &(dst), NR_CPUS) |
278 | static inline int __cpumask_parse_user(const char __user *buf, int len, | 297 | static inline int __cpumask_parse_user(const char __user *buf, int len, |
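Note: the cpumask_of_cpu() rework is what lets call sites throughout this patch write &cpumask_of_cpu(cpu): the statement expression now ends in &m and is immediately dereferenced, so the result is addressable, and with CONFIG_HAVE_CPUMASK_OF_CPU_MAP it simply indexes the boot-time map. Minimal illustration (pinning to CPU 0, as the dcdbas SMI path above does):

        /* Previously cpumask_of_cpu(cpu) was an rvalue copy and its address could
         * not be taken; now it can be passed straight to the *_ptr interfaces. */
        set_cpus_allowed_ptr(current, &cpumask_of_cpu(0));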
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 0a26be353cb3..726761e24003 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h | |||
@@ -20,8 +20,8 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */ | |||
20 | extern int cpuset_init_early(void); | 20 | extern int cpuset_init_early(void); |
21 | extern int cpuset_init(void); | 21 | extern int cpuset_init(void); |
22 | extern void cpuset_init_smp(void); | 22 | extern void cpuset_init_smp(void); |
23 | extern cpumask_t cpuset_cpus_allowed(struct task_struct *p); | 23 | extern void cpuset_cpus_allowed(struct task_struct *p, cpumask_t *mask); |
24 | extern cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p); | 24 | extern void cpuset_cpus_allowed_locked(struct task_struct *p, cpumask_t *mask); |
25 | extern nodemask_t cpuset_mems_allowed(struct task_struct *p); | 25 | extern nodemask_t cpuset_mems_allowed(struct task_struct *p); |
26 | #define cpuset_current_mems_allowed (current->mems_allowed) | 26 | #define cpuset_current_mems_allowed (current->mems_allowed) |
27 | void cpuset_init_current_mems_allowed(void); | 27 | void cpuset_init_current_mems_allowed(void); |
@@ -84,13 +84,14 @@ static inline int cpuset_init_early(void) { return 0; } | |||
84 | static inline int cpuset_init(void) { return 0; } | 84 | static inline int cpuset_init(void) { return 0; } |
85 | static inline void cpuset_init_smp(void) {} | 85 | static inline void cpuset_init_smp(void) {} |
86 | 86 | ||
87 | static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p) | 87 | static inline void cpuset_cpus_allowed(struct task_struct *p, cpumask_t *mask) |
88 | { | 88 | { |
89 | return cpu_possible_map; | 89 | *mask = cpu_possible_map; |
90 | } | 90 | } |
91 | static inline cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p) | 91 | static inline void cpuset_cpus_allowed_locked(struct task_struct *p, |
92 | cpumask_t *mask) | ||
92 | { | 93 | { |
93 | return cpu_possible_map; | 94 | *mask = cpu_possible_map; |
94 | } | 95 | } |
95 | 96 | ||
96 | static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) | 97 | static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) |
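Note: cpuset_cpus_allowed() and its _locked variant now fill a caller-supplied mask instead of returning a cpumask_t by value. Converted callers are expected to follow this shape (follow_cpuset_example is an illustrative name):

        static void follow_cpuset_example(struct task_struct *p)
        {
                cpumask_t mask;

                cpuset_cpus_allowed(p, &mask);  /* was: mask = cpuset_cpus_allowed(p); */
                set_cpus_allowed_ptr(p, &mask);
        }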
diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 1f74e1d7415f..37a6f5bc4a92 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h | |||
@@ -151,6 +151,9 @@ extern struct group_info init_groups; | |||
151 | .cpus_allowed = CPU_MASK_ALL, \ | 151 | .cpus_allowed = CPU_MASK_ALL, \ |
152 | .mm = NULL, \ | 152 | .mm = NULL, \ |
153 | .active_mm = &init_mm, \ | 153 | .active_mm = &init_mm, \ |
154 | .se = { \ | ||
155 | .group_node = LIST_HEAD_INIT(tsk.se.group_node), \ | ||
156 | }, \ | ||
154 | .rt = { \ | 157 | .rt = { \ |
155 | .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ | 158 | .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ |
156 | .time_slice = HZ, \ | 159 | .time_slice = HZ, \ |
diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 2cd7fa73d1af..ce5983225be4 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h | |||
@@ -327,4 +327,10 @@ extern void ktime_get_ts(struct timespec *ts); | |||
327 | /* Get the real (wall-) time in timespec format: */ | 327 | /* Get the real (wall-) time in timespec format: */ |
328 | #define ktime_get_real_ts(ts) getnstimeofday(ts) | 328 | #define ktime_get_real_ts(ts) getnstimeofday(ts) |
329 | 329 | ||
330 | static inline ktime_t ns_to_ktime(u64 ns) | ||
331 | { | ||
332 | static const ktime_t ktime_zero = { .tv64 = 0 }; | ||
333 | return ktime_add_ns(ktime_zero, ns); | ||
334 | } | ||
335 | |||
330 | #endif | 336 | #endif |
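Note: ns_to_ktime() is a small new helper for converting a nanosecond count into a ktime_t without caring about the underlying representation. Usage sketch:

        ktime_t period = ns_to_ktime(1000000ULL);       /* 1 ms as a ktime_t */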
diff --git a/include/linux/sched.h b/include/linux/sched.h index 6a1e7afb099b..be6914014c70 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -704,6 +704,7 @@ enum cpu_idle_type { | |||
704 | #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ | 704 | #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ |
705 | #define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ | 705 | #define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ |
706 | #define SD_SERIALIZE 1024 /* Only a single load balancing instance */ | 706 | #define SD_SERIALIZE 1024 /* Only a single load balancing instance */ |
707 | #define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */ | ||
707 | 708 | ||
708 | #define BALANCE_FOR_MC_POWER \ | 709 | #define BALANCE_FOR_MC_POWER \ |
709 | (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0) | 710 | (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0) |
@@ -733,12 +734,31 @@ struct sched_group { | |||
733 | u32 reciprocal_cpu_power; | 734 | u32 reciprocal_cpu_power; |
734 | }; | 735 | }; |
735 | 736 | ||
737 | enum sched_domain_level { | ||
738 | SD_LV_NONE = 0, | ||
739 | SD_LV_SIBLING, | ||
740 | SD_LV_MC, | ||
741 | SD_LV_CPU, | ||
742 | SD_LV_NODE, | ||
743 | SD_LV_ALLNODES, | ||
744 | SD_LV_MAX | ||
745 | }; | ||
746 | |||
747 | struct sched_domain_attr { | ||
748 | int relax_domain_level; | ||
749 | }; | ||
750 | |||
751 | #define SD_ATTR_INIT (struct sched_domain_attr) { \ | ||
752 | .relax_domain_level = -1, \ | ||
753 | } | ||
754 | |||
736 | struct sched_domain { | 755 | struct sched_domain { |
737 | /* These fields must be setup */ | 756 | /* These fields must be setup */ |
738 | struct sched_domain *parent; /* top domain must be null terminated */ | 757 | struct sched_domain *parent; /* top domain must be null terminated */ |
739 | struct sched_domain *child; /* bottom domain must be null terminated */ | 758 | struct sched_domain *child; /* bottom domain must be null terminated */ |
740 | struct sched_group *groups; /* the balancing groups of the domain */ | 759 | struct sched_group *groups; /* the balancing groups of the domain */ |
741 | cpumask_t span; /* span of all CPUs in this domain */ | 760 | cpumask_t span; /* span of all CPUs in this domain */ |
761 | int first_cpu; /* cache of the first cpu in this domain */ | ||
742 | unsigned long min_interval; /* Minimum balance interval ms */ | 762 | unsigned long min_interval; /* Minimum balance interval ms */ |
743 | unsigned long max_interval; /* Maximum balance interval ms */ | 763 | unsigned long max_interval; /* Maximum balance interval ms */ |
744 | unsigned int busy_factor; /* less balancing by factor if busy */ | 764 | unsigned int busy_factor; /* less balancing by factor if busy */ |
@@ -750,6 +770,7 @@ struct sched_domain { | |||
750 | unsigned int wake_idx; | 770 | unsigned int wake_idx; |
751 | unsigned int forkexec_idx; | 771 | unsigned int forkexec_idx; |
752 | int flags; /* See SD_* */ | 772 | int flags; /* See SD_* */ |
773 | enum sched_domain_level level; | ||
753 | 774 | ||
754 | /* Runtime fields. */ | 775 | /* Runtime fields. */ |
755 | unsigned long last_balance; /* init to jiffies. units in jiffies */ | 776 | unsigned long last_balance; /* init to jiffies. units in jiffies */ |
@@ -789,7 +810,8 @@ struct sched_domain { | |||
789 | #endif | 810 | #endif |
790 | }; | 811 | }; |
791 | 812 | ||
792 | extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new); | 813 | extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, |
814 | struct sched_domain_attr *dattr_new); | ||
793 | extern int arch_reinit_sched_domains(void); | 815 | extern int arch_reinit_sched_domains(void); |
794 | 816 | ||
795 | #endif /* CONFIG_SMP */ | 817 | #endif /* CONFIG_SMP */ |
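Note: partition_sched_domains() gains a third argument carrying per-partition sched_domain_attr values, which is how a cpuset's relax_domain_level reaches the scheduler; a NULL dattr_new, or a relax_domain_level of -1 as set by SD_ATTR_INIT, is treated as "use the default". Hedged sketch of the new calling convention (the real caller allocates these arrays dynamically; the function name here is illustrative):

        static void rebuild_one_partition(cpumask_t *doms, struct sched_domain_attr *dattr)
        {
                dattr[0] = SD_ATTR_INIT;        /* relax_domain_level = -1: default */
                doms[0] = cpu_online_map;
                partition_sched_domains(1, doms, dattr);
        }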
@@ -889,7 +911,8 @@ struct sched_class { | |||
889 | void (*set_curr_task) (struct rq *rq); | 911 | void (*set_curr_task) (struct rq *rq); |
890 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); | 912 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); |
891 | void (*task_new) (struct rq *rq, struct task_struct *p); | 913 | void (*task_new) (struct rq *rq, struct task_struct *p); |
892 | void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask); | 914 | void (*set_cpus_allowed)(struct task_struct *p, |
915 | const cpumask_t *newmask); | ||
893 | 916 | ||
894 | void (*join_domain)(struct rq *rq); | 917 | void (*join_domain)(struct rq *rq); |
895 | void (*leave_domain)(struct rq *rq); | 918 | void (*leave_domain)(struct rq *rq); |
@@ -923,6 +946,7 @@ struct load_weight { | |||
923 | struct sched_entity { | 946 | struct sched_entity { |
924 | struct load_weight load; /* for load-balancing */ | 947 | struct load_weight load; /* for load-balancing */ |
925 | struct rb_node run_node; | 948 | struct rb_node run_node; |
949 | struct list_head group_node; | ||
926 | unsigned int on_rq; | 950 | unsigned int on_rq; |
927 | 951 | ||
928 | u64 exec_start; | 952 | u64 exec_start; |
@@ -982,6 +1006,7 @@ struct sched_rt_entity { | |||
982 | unsigned long timeout; | 1006 | unsigned long timeout; |
983 | int nr_cpus_allowed; | 1007 | int nr_cpus_allowed; |
984 | 1008 | ||
1009 | struct sched_rt_entity *back; | ||
985 | #ifdef CONFIG_RT_GROUP_SCHED | 1010 | #ifdef CONFIG_RT_GROUP_SCHED |
986 | struct sched_rt_entity *parent; | 1011 | struct sched_rt_entity *parent; |
987 | /* rq on which this entity is (to be) queued: */ | 1012 | /* rq on which this entity is (to be) queued: */ |
@@ -1502,15 +1527,21 @@ static inline void put_task_struct(struct task_struct *t) | |||
1502 | #define used_math() tsk_used_math(current) | 1527 | #define used_math() tsk_used_math(current) |
1503 | 1528 | ||
1504 | #ifdef CONFIG_SMP | 1529 | #ifdef CONFIG_SMP |
1505 | extern int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask); | 1530 | extern int set_cpus_allowed_ptr(struct task_struct *p, |
1531 | const cpumask_t *new_mask); | ||
1506 | #else | 1532 | #else |
1507 | static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | 1533 | static inline int set_cpus_allowed_ptr(struct task_struct *p, |
1534 | const cpumask_t *new_mask) | ||
1508 | { | 1535 | { |
1509 | if (!cpu_isset(0, new_mask)) | 1536 | if (!cpu_isset(0, *new_mask)) |
1510 | return -EINVAL; | 1537 | return -EINVAL; |
1511 | return 0; | 1538 | return 0; |
1512 | } | 1539 | } |
1513 | #endif | 1540 | #endif |
1541 | static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | ||
1542 | { | ||
1543 | return set_cpus_allowed_ptr(p, &new_mask); | ||
1544 | } | ||
1514 | 1545 | ||
1515 | extern unsigned long long sched_clock(void); | 1546 | extern unsigned long long sched_clock(void); |
1516 | 1547 | ||
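The wrapper above keeps the old set_cpus_allowed() name compiling while steering callers toward the pointer-based set_cpus_allowed_ptr(), which avoids copying a potentially large cpumask_t by value. A minimal sketch of how a caller migrates (the task pointer and CPU number are placeholders):

        cpumask_t mask = cpumask_of_cpu(cpu);   /* build the mask as before */

        set_cpus_allowed(p, mask);              /* old style: whole cpumask_t copied */
        set_cpus_allowed_ptr(p, &mask);         /* new style: only a pointer is passed */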
@@ -1551,7 +1582,6 @@ static inline void wake_up_idle_cpu(int cpu) { } | |||
1551 | extern unsigned int sysctl_sched_latency; | 1582 | extern unsigned int sysctl_sched_latency; |
1552 | extern unsigned int sysctl_sched_min_granularity; | 1583 | extern unsigned int sysctl_sched_min_granularity; |
1553 | extern unsigned int sysctl_sched_wakeup_granularity; | 1584 | extern unsigned int sysctl_sched_wakeup_granularity; |
1554 | extern unsigned int sysctl_sched_batch_wakeup_granularity; | ||
1555 | extern unsigned int sysctl_sched_child_runs_first; | 1585 | extern unsigned int sysctl_sched_child_runs_first; |
1556 | extern unsigned int sysctl_sched_features; | 1586 | extern unsigned int sysctl_sched_features; |
1557 | extern unsigned int sysctl_sched_migration_cost; | 1587 | extern unsigned int sysctl_sched_migration_cost; |
@@ -1564,6 +1594,10 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, | |||
1564 | extern unsigned int sysctl_sched_rt_period; | 1594 | extern unsigned int sysctl_sched_rt_period; |
1565 | extern int sysctl_sched_rt_runtime; | 1595 | extern int sysctl_sched_rt_runtime; |
1566 | 1596 | ||
1597 | int sched_rt_handler(struct ctl_table *table, int write, | ||
1598 | struct file *filp, void __user *buffer, size_t *lenp, | ||
1599 | loff_t *ppos); | ||
1600 | |||
1567 | extern unsigned int sysctl_sched_compat_yield; | 1601 | extern unsigned int sysctl_sched_compat_yield; |
1568 | 1602 | ||
1569 | #ifdef CONFIG_RT_MUTEXES | 1603 | #ifdef CONFIG_RT_MUTEXES |
@@ -2031,7 +2065,7 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) | |||
2031 | } | 2065 | } |
2032 | #endif | 2066 | #endif |
2033 | 2067 | ||
2034 | extern long sched_setaffinity(pid_t pid, cpumask_t new_mask); | 2068 | extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); |
2035 | extern long sched_getaffinity(pid_t pid, cpumask_t *mask); | 2069 | extern long sched_getaffinity(pid_t pid, cpumask_t *mask); |
2036 | 2070 | ||
2037 | extern int sched_mc_power_savings, sched_smt_power_savings; | 2071 | extern int sched_mc_power_savings, sched_smt_power_savings; |
@@ -2041,8 +2075,11 @@ extern void normalize_rt_tasks(void); | |||
2041 | #ifdef CONFIG_GROUP_SCHED | 2075 | #ifdef CONFIG_GROUP_SCHED |
2042 | 2076 | ||
2043 | extern struct task_group init_task_group; | 2077 | extern struct task_group init_task_group; |
2078 | #ifdef CONFIG_USER_SCHED | ||
2079 | extern struct task_group root_task_group; | ||
2080 | #endif | ||
2044 | 2081 | ||
2045 | extern struct task_group *sched_create_group(void); | 2082 | extern struct task_group *sched_create_group(struct task_group *parent); |
2046 | extern void sched_destroy_group(struct task_group *tg); | 2083 | extern void sched_destroy_group(struct task_group *tg); |
2047 | extern void sched_move_task(struct task_struct *tsk); | 2084 | extern void sched_move_task(struct task_struct *tsk); |
2048 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2085 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -2053,6 +2090,9 @@ extern unsigned long sched_group_shares(struct task_group *tg); | |||
2053 | extern int sched_group_set_rt_runtime(struct task_group *tg, | 2090 | extern int sched_group_set_rt_runtime(struct task_group *tg, |
2054 | long rt_runtime_us); | 2091 | long rt_runtime_us); |
2055 | extern long sched_group_rt_runtime(struct task_group *tg); | 2092 | extern long sched_group_rt_runtime(struct task_group *tg); |
2093 | extern int sched_group_set_rt_period(struct task_group *tg, | ||
2094 | long rt_period_us); | ||
2095 | extern long sched_group_rt_period(struct task_group *tg); | ||
2056 | #endif | 2096 | #endif |
2057 | #endif | 2097 | #endif |
2058 | 2098 | ||
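sched_create_group() now takes a parent group, matching the task_group parent/siblings/children fields introduced later in this diff. A hedged sketch of the updated call pattern, assuming the usual ERR_PTR() convention on failure; the variable name is illustrative:

        struct task_group *tg;

        tg = sched_create_group(&init_task_group);      /* child of the default group */
        if (IS_ERR(tg))
                return PTR_ERR(tg);
        /* ... use the group ... */
        sched_destroy_group(tg);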
diff --git a/include/linux/sysdev.h b/include/linux/sysdev.h index f752e73bf977..f2767bc6b735 100644 --- a/include/linux/sysdev.h +++ b/include/linux/sysdev.h | |||
@@ -45,12 +45,16 @@ struct sysdev_class_attribute { | |||
45 | ssize_t (*store)(struct sysdev_class *, const char *, size_t); | 45 | ssize_t (*store)(struct sysdev_class *, const char *, size_t); |
46 | }; | 46 | }; |
47 | 47 | ||
48 | #define SYSDEV_CLASS_ATTR(_name,_mode,_show,_store) \ | 48 | #define _SYSDEV_CLASS_ATTR(_name,_mode,_show,_store) \ |
49 | struct sysdev_class_attribute attr_##_name = { \ | 49 | { \ |
50 | .attr = {.name = __stringify(_name), .mode = _mode }, \ | 50 | .attr = {.name = __stringify(_name), .mode = _mode }, \ |
51 | .show = _show, \ | 51 | .show = _show, \ |
52 | .store = _store, \ | 52 | .store = _store, \ |
53 | }; | 53 | } |
54 | |||
55 | #define SYSDEV_CLASS_ATTR(_name,_mode,_show,_store) \ | ||
56 | struct sysdev_class_attribute attr_##_name = \ | ||
57 | _SYSDEV_CLASS_ATTR(_name,_mode,_show,_store) | ||
54 | 58 | ||
55 | 59 | ||
56 | extern int sysdev_class_register(struct sysdev_class *); | 60 | extern int sysdev_class_register(struct sysdev_class *); |
@@ -100,15 +104,16 @@ struct sysdev_attribute { | |||
100 | }; | 104 | }; |
101 | 105 | ||
102 | 106 | ||
103 | #define _SYSDEV_ATTR(_name,_mode,_show,_store) \ | 107 | #define _SYSDEV_ATTR(_name, _mode, _show, _store) \ |
104 | { \ | 108 | { \ |
105 | .attr = { .name = __stringify(_name), .mode = _mode }, \ | 109 | .attr = { .name = __stringify(_name), .mode = _mode }, \ |
106 | .show = _show, \ | 110 | .show = _show, \ |
107 | .store = _store, \ | 111 | .store = _store, \ |
108 | } | 112 | } |
109 | 113 | ||
110 | #define SYSDEV_ATTR(_name,_mode,_show,_store) \ | 114 | #define SYSDEV_ATTR(_name, _mode, _show, _store) \ |
111 | struct sysdev_attribute attr_##_name = _SYSDEV_ATTR(_name,_mode,_show,_store); | 115 | struct sysdev_attribute attr_##_name = \ |
116 | _SYSDEV_ATTR(_name, _mode, _show, _store); | ||
112 | 117 | ||
113 | extern int sysdev_create_file(struct sys_device *, struct sysdev_attribute *); | 118 | extern int sysdev_create_file(struct sys_device *, struct sysdev_attribute *); |
114 | extern void sysdev_remove_file(struct sys_device *, struct sysdev_attribute *); | 119 | extern void sysdev_remove_file(struct sys_device *, struct sysdev_attribute *); |
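The class-attribute macro is split the same way _SYSDEV_ATTR()/SYSDEV_ATTR() already were: the underscore variant expands to a bare initializer, so several attributes can be defined in one static array instead of one named variable each. Illustrative only, with hypothetical names and handlers:

        static struct sysdev_class_attribute foo_class_attrs[] = {
                _SYSDEV_CLASS_ATTR(mode,  0644, mode_show,  mode_store),
                _SYSDEV_CLASS_ATTR(state, 0444, state_show, NULL),
        };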
diff --git a/include/linux/topology.h b/include/linux/topology.h index bd14f8b30f09..4bb7074a2c3a 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h | |||
@@ -38,16 +38,15 @@ | |||
38 | #endif | 38 | #endif |
39 | 39 | ||
40 | #ifndef nr_cpus_node | 40 | #ifndef nr_cpus_node |
41 | #define nr_cpus_node(node) \ | 41 | #define nr_cpus_node(node) \ |
42 | ({ \ | 42 | ({ \ |
43 | cpumask_t __tmp__; \ | 43 | node_to_cpumask_ptr(__tmp__, node); \ |
44 | __tmp__ = node_to_cpumask(node); \ | 44 | cpus_weight(*__tmp__); \ |
45 | cpus_weight(__tmp__); \ | ||
46 | }) | 45 | }) |
47 | #endif | 46 | #endif |
48 | 47 | ||
49 | #define for_each_node_with_cpus(node) \ | 48 | #define for_each_node_with_cpus(node) \ |
50 | for_each_online_node(node) \ | 49 | for_each_online_node(node) \ |
51 | if (nr_cpus_node(node)) | 50 | if (nr_cpus_node(node)) |
52 | 51 | ||
53 | void arch_update_cpu_topology(void); | 52 | void arch_update_cpu_topology(void); |
@@ -80,7 +79,9 @@ void arch_update_cpu_topology(void); | |||
80 | * by defining their own arch-specific initializer in include/asm/topology.h. | 79 | * by defining their own arch-specific initializer in include/asm/topology.h. |
81 | * A definition there will automagically override these default initializers | 80 | * A definition there will automagically override these default initializers |
82 | * and allow arch-specific performance tuning of sched_domains. | 81 | * and allow arch-specific performance tuning of sched_domains. |
82 | * (Only non-zero and non-null fields need be specified.) | ||
83 | */ | 83 | */ |
84 | |||
84 | #ifdef CONFIG_SCHED_SMT | 85 | #ifdef CONFIG_SCHED_SMT |
85 | /* MCD - Do we really need this? It is always on if CONFIG_SCHED_SMT is, | 86 | /* MCD - Do we really need this? It is always on if CONFIG_SCHED_SMT is, |
86 | * so can't we drop this in favor of CONFIG_SCHED_SMT? | 87 | * so can't we drop this in favor of CONFIG_SCHED_SMT? |
@@ -89,20 +90,10 @@ void arch_update_cpu_topology(void); | |||
89 | /* Common values for SMT siblings */ | 90 | /* Common values for SMT siblings */ |
90 | #ifndef SD_SIBLING_INIT | 91 | #ifndef SD_SIBLING_INIT |
91 | #define SD_SIBLING_INIT (struct sched_domain) { \ | 92 | #define SD_SIBLING_INIT (struct sched_domain) { \ |
92 | .span = CPU_MASK_NONE, \ | ||
93 | .parent = NULL, \ | ||
94 | .child = NULL, \ | ||
95 | .groups = NULL, \ | ||
96 | .min_interval = 1, \ | 93 | .min_interval = 1, \ |
97 | .max_interval = 2, \ | 94 | .max_interval = 2, \ |
98 | .busy_factor = 64, \ | 95 | .busy_factor = 64, \ |
99 | .imbalance_pct = 110, \ | 96 | .imbalance_pct = 110, \ |
100 | .cache_nice_tries = 0, \ | ||
101 | .busy_idx = 0, \ | ||
102 | .idle_idx = 0, \ | ||
103 | .newidle_idx = 0, \ | ||
104 | .wake_idx = 0, \ | ||
105 | .forkexec_idx = 0, \ | ||
106 | .flags = SD_LOAD_BALANCE \ | 97 | .flags = SD_LOAD_BALANCE \ |
107 | | SD_BALANCE_NEWIDLE \ | 98 | | SD_BALANCE_NEWIDLE \ |
108 | | SD_BALANCE_FORK \ | 99 | | SD_BALANCE_FORK \ |
@@ -112,7 +103,6 @@ void arch_update_cpu_topology(void); | |||
112 | | SD_SHARE_CPUPOWER, \ | 103 | | SD_SHARE_CPUPOWER, \ |
113 | .last_balance = jiffies, \ | 104 | .last_balance = jiffies, \ |
114 | .balance_interval = 1, \ | 105 | .balance_interval = 1, \ |
115 | .nr_balance_failed = 0, \ | ||
116 | } | 106 | } |
117 | #endif | 107 | #endif |
118 | #endif /* CONFIG_SCHED_SMT */ | 108 | #endif /* CONFIG_SCHED_SMT */ |
@@ -121,18 +111,12 @@ void arch_update_cpu_topology(void); | |||
121 | /* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ | 111 | /* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ |
122 | #ifndef SD_MC_INIT | 112 | #ifndef SD_MC_INIT |
123 | #define SD_MC_INIT (struct sched_domain) { \ | 113 | #define SD_MC_INIT (struct sched_domain) { \ |
124 | .span = CPU_MASK_NONE, \ | ||
125 | .parent = NULL, \ | ||
126 | .child = NULL, \ | ||
127 | .groups = NULL, \ | ||
128 | .min_interval = 1, \ | 114 | .min_interval = 1, \ |
129 | .max_interval = 4, \ | 115 | .max_interval = 4, \ |
130 | .busy_factor = 64, \ | 116 | .busy_factor = 64, \ |
131 | .imbalance_pct = 125, \ | 117 | .imbalance_pct = 125, \ |
132 | .cache_nice_tries = 1, \ | 118 | .cache_nice_tries = 1, \ |
133 | .busy_idx = 2, \ | 119 | .busy_idx = 2, \ |
134 | .idle_idx = 0, \ | ||
135 | .newidle_idx = 0, \ | ||
136 | .wake_idx = 1, \ | 120 | .wake_idx = 1, \ |
137 | .forkexec_idx = 1, \ | 121 | .forkexec_idx = 1, \ |
138 | .flags = SD_LOAD_BALANCE \ | 122 | .flags = SD_LOAD_BALANCE \ |
@@ -144,7 +128,6 @@ void arch_update_cpu_topology(void); | |||
144 | | BALANCE_FOR_MC_POWER, \ | 128 | | BALANCE_FOR_MC_POWER, \ |
145 | .last_balance = jiffies, \ | 129 | .last_balance = jiffies, \ |
146 | .balance_interval = 1, \ | 130 | .balance_interval = 1, \ |
147 | .nr_balance_failed = 0, \ | ||
148 | } | 131 | } |
149 | #endif | 132 | #endif |
150 | #endif /* CONFIG_SCHED_MC */ | 133 | #endif /* CONFIG_SCHED_MC */ |
@@ -152,10 +135,6 @@ void arch_update_cpu_topology(void); | |||
152 | /* Common values for CPUs */ | 135 | /* Common values for CPUs */ |
153 | #ifndef SD_CPU_INIT | 136 | #ifndef SD_CPU_INIT |
154 | #define SD_CPU_INIT (struct sched_domain) { \ | 137 | #define SD_CPU_INIT (struct sched_domain) { \ |
155 | .span = CPU_MASK_NONE, \ | ||
156 | .parent = NULL, \ | ||
157 | .child = NULL, \ | ||
158 | .groups = NULL, \ | ||
159 | .min_interval = 1, \ | 138 | .min_interval = 1, \ |
160 | .max_interval = 4, \ | 139 | .max_interval = 4, \ |
161 | .busy_factor = 64, \ | 140 | .busy_factor = 64, \ |
@@ -174,16 +153,11 @@ void arch_update_cpu_topology(void); | |||
174 | | BALANCE_FOR_PKG_POWER,\ | 153 | | BALANCE_FOR_PKG_POWER,\ |
175 | .last_balance = jiffies, \ | 154 | .last_balance = jiffies, \ |
176 | .balance_interval = 1, \ | 155 | .balance_interval = 1, \ |
177 | .nr_balance_failed = 0, \ | ||
178 | } | 156 | } |
179 | #endif | 157 | #endif |
180 | 158 | ||
181 | /* sched_domains SD_ALLNODES_INIT for NUMA machines */ | 159 | /* sched_domains SD_ALLNODES_INIT for NUMA machines */ |
182 | #define SD_ALLNODES_INIT (struct sched_domain) { \ | 160 | #define SD_ALLNODES_INIT (struct sched_domain) { \ |
183 | .span = CPU_MASK_NONE, \ | ||
184 | .parent = NULL, \ | ||
185 | .child = NULL, \ | ||
186 | .groups = NULL, \ | ||
187 | .min_interval = 64, \ | 161 | .min_interval = 64, \ |
188 | .max_interval = 64*num_online_cpus(), \ | 162 | .max_interval = 64*num_online_cpus(), \ |
189 | .busy_factor = 128, \ | 163 | .busy_factor = 128, \ |
@@ -191,14 +165,10 @@ void arch_update_cpu_topology(void); | |||
191 | .cache_nice_tries = 1, \ | 165 | .cache_nice_tries = 1, \ |
192 | .busy_idx = 3, \ | 166 | .busy_idx = 3, \ |
193 | .idle_idx = 3, \ | 167 | .idle_idx = 3, \ |
194 | .newidle_idx = 0, /* unused */ \ | ||
195 | .wake_idx = 0, /* unused */ \ | ||
196 | .forkexec_idx = 0, /* unused */ \ | ||
197 | .flags = SD_LOAD_BALANCE \ | 168 | .flags = SD_LOAD_BALANCE \ |
198 | | SD_SERIALIZE, \ | 169 | | SD_SERIALIZE, \ |
199 | .last_balance = jiffies, \ | 170 | .last_balance = jiffies, \ |
200 | .balance_interval = 64, \ | 171 | .balance_interval = 64, \ |
201 | .nr_balance_failed = 0, \ | ||
202 | } | 172 | } |
203 | 173 | ||
204 | #ifdef CONFIG_NUMA | 174 | #ifdef CONFIG_NUMA |
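Every .span/.parent/.child/.groups/.*_idx/.nr_balance_failed line removed from these initializers was redundant: with C designated initializers, fields that are not named are zero-initialized, which is exactly what the deleted assignments did by hand (as the new "(Only non-zero and non-null fields need be specified.)" comment states). A minimal illustration:

        struct sched_domain sd = (struct sched_domain) {
                .min_interval = 1,
                .max_interval = 2,
        };
        /* sd.parent, sd.child and sd.groups are NULL, sd.span is empty
         * and the unnamed index fields are all 0, with no explicit
         * assignments needed. */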
diff --git a/init/Kconfig b/init/Kconfig index 7fccf09bb95a..ba3a389fab94 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -328,6 +328,13 @@ config RT_GROUP_SCHED | |||
328 | depends on EXPERIMENTAL | 328 | depends on EXPERIMENTAL |
329 | depends on GROUP_SCHED | 329 | depends on GROUP_SCHED |
330 | default n | 330 | default n |
331 | help | ||
332 | This feature lets you explicitly allocate real CPU bandwidth | ||
333 | to users or control groups (depending on the "Basis for grouping tasks" | ||
334 | setting below. If enabled, it will also make it impossible to | ||
335 | schedule realtime tasks for non-root users until you allocate | ||
336 | realtime bandwidth for them. | ||
337 | See Documentation/sched-rt-group.txt for more information. | ||
331 | 338 | ||
332 | choice | 339 | choice |
333 | depends on GROUP_SCHED | 340 | depends on GROUP_SCHED |
diff --git a/init/main.c b/init/main.c index 99ce94930b09..833a67df1f7e 100644 --- a/init/main.c +++ b/init/main.c | |||
@@ -359,10 +359,31 @@ static void __init smp_init(void) | |||
359 | #endif | 359 | #endif |
360 | 360 | ||
361 | static inline void setup_per_cpu_areas(void) { } | 361 | static inline void setup_per_cpu_areas(void) { } |
362 | static inline void setup_nr_cpu_ids(void) { } | ||
362 | static inline void smp_prepare_cpus(unsigned int maxcpus) { } | 363 | static inline void smp_prepare_cpus(unsigned int maxcpus) { } |
363 | 364 | ||
364 | #else | 365 | #else |
365 | 366 | ||
367 | #if NR_CPUS > BITS_PER_LONG | ||
368 | cpumask_t cpu_mask_all __read_mostly = CPU_MASK_ALL; | ||
369 | EXPORT_SYMBOL(cpu_mask_all); | ||
370 | #endif | ||
371 | |||
372 | /* Setup number of possible processor ids */ | ||
373 | int nr_cpu_ids __read_mostly = NR_CPUS; | ||
374 | EXPORT_SYMBOL(nr_cpu_ids); | ||
375 | |||
376 | /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ | ||
377 | static void __init setup_nr_cpu_ids(void) | ||
378 | { | ||
379 | int cpu, highest_cpu = 0; | ||
380 | |||
381 | for_each_possible_cpu(cpu) | ||
382 | highest_cpu = cpu; | ||
383 | |||
384 | nr_cpu_ids = highest_cpu + 1; | ||
385 | } | ||
386 | |||
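Worked example for setup_nr_cpu_ids(): if cpu_possible_map contains CPUs 0-3, the loop leaves highest_cpu at 3 and nr_cpu_ids becomes 4, even when the kernel was built with a much larger NR_CPUS, so later loops bounded by nr_cpu_ids stop at 4 instead of scanning all NR_CPUS slots.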
366 | #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA | 387 | #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA |
367 | unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; | 388 | unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; |
368 | 389 | ||
@@ -537,6 +558,7 @@ asmlinkage void __init start_kernel(void) | |||
537 | setup_command_line(command_line); | 558 | setup_command_line(command_line); |
538 | unwind_setup(); | 559 | unwind_setup(); |
539 | setup_per_cpu_areas(); | 560 | setup_per_cpu_areas(); |
561 | setup_nr_cpu_ids(); | ||
540 | smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ | 562 | smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ |
541 | 563 | ||
542 | /* | 564 | /* |
@@ -811,7 +833,7 @@ static int __init kernel_init(void * unused) | |||
811 | /* | 833 | /* |
812 | * init can run on any cpu. | 834 | * init can run on any cpu. |
813 | */ | 835 | */ |
814 | set_cpus_allowed(current, CPU_MASK_ALL); | 836 | set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR); |
815 | /* | 837 | /* |
816 | * Tell the world that we're going to be the grim | 838 | * Tell the world that we're going to be the grim |
817 | * reaper of innocent orphaned children. | 839 | * reaper of innocent orphaned children. |
diff --git a/kernel/compat.c b/kernel/compat.c index 9c48abfcd4a5..e1ef04870c2a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -445,7 +445,7 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, | |||
445 | if (retval) | 445 | if (retval) |
446 | return retval; | 446 | return retval; |
447 | 447 | ||
448 | return sched_setaffinity(pid, new_mask); | 448 | return sched_setaffinity(pid, &new_mask); |
449 | } | 449 | } |
450 | 450 | ||
451 | asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, | 451 | asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 2eff3f63abed..2011ad8d2697 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -232,9 +232,9 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
232 | 232 | ||
233 | /* Ensure that we are not runnable on dying cpu */ | 233 | /* Ensure that we are not runnable on dying cpu */ |
234 | old_allowed = current->cpus_allowed; | 234 | old_allowed = current->cpus_allowed; |
235 | tmp = CPU_MASK_ALL; | 235 | cpus_setall(tmp); |
236 | cpu_clear(cpu, tmp); | 236 | cpu_clear(cpu, tmp); |
237 | set_cpus_allowed(current, tmp); | 237 | set_cpus_allowed_ptr(current, &tmp); |
238 | 238 | ||
239 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); | 239 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); |
240 | 240 | ||
@@ -268,7 +268,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
268 | out_thread: | 268 | out_thread: |
269 | err = kthread_stop(p); | 269 | err = kthread_stop(p); |
270 | out_allowed: | 270 | out_allowed: |
271 | set_cpus_allowed(current, old_allowed); | 271 | set_cpus_allowed_ptr(current, &old_allowed); |
272 | out_release: | 272 | out_release: |
273 | cpu_hotplug_done(); | 273 | cpu_hotplug_done(); |
274 | return err; | 274 | return err; |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a1b61f414228..8b35fbd8292f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -98,6 +98,9 @@ struct cpuset { | |||
98 | /* partition number for rebuild_sched_domains() */ | 98 | /* partition number for rebuild_sched_domains() */ |
99 | int pn; | 99 | int pn; |
100 | 100 | ||
101 | /* for custom sched domain */ | ||
102 | int relax_domain_level; | ||
103 | |||
101 | /* used for walking a cpuset hierarchy */ | 104 | |
102 | struct list_head stack_list; | 105 | struct list_head stack_list; |
103 | }; | 106 | }; |
@@ -478,6 +481,16 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | |||
478 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); | 481 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); |
479 | } | 482 | } |
480 | 483 | ||
484 | static void | ||
485 | update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) | ||
486 | { | ||
487 | if (!dattr) | ||
488 | return; | ||
489 | if (dattr->relax_domain_level < c->relax_domain_level) | ||
490 | dattr->relax_domain_level = c->relax_domain_level; | ||
491 | return; | ||
492 | } | ||
493 | |||
481 | /* | 494 | /* |
482 | * rebuild_sched_domains() | 495 | * rebuild_sched_domains() |
483 | * | 496 | * |
@@ -553,12 +566,14 @@ static void rebuild_sched_domains(void) | |||
553 | int csn; /* how many cpuset ptrs in csa so far */ | 566 | int csn; /* how many cpuset ptrs in csa so far */ |
554 | int i, j, k; /* indices for partition finding loops */ | 567 | int i, j, k; /* indices for partition finding loops */ |
555 | cpumask_t *doms; /* resulting partition; i.e. sched domains */ | 568 | cpumask_t *doms; /* resulting partition; i.e. sched domains */ |
569 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | ||
556 | int ndoms; /* number of sched domains in result */ | 570 | int ndoms; /* number of sched domains in result */ |
557 | int nslot; /* next empty doms[] cpumask_t slot */ | 571 | int nslot; /* next empty doms[] cpumask_t slot */ |
558 | 572 | ||
559 | q = NULL; | 573 | q = NULL; |
560 | csa = NULL; | 574 | csa = NULL; |
561 | doms = NULL; | 575 | doms = NULL; |
576 | dattr = NULL; | ||
562 | 577 | ||
563 | /* Special case for the 99% of systems with one, full, sched domain */ | 578 | /* Special case for the 99% of systems with one, full, sched domain */ |
564 | if (is_sched_load_balance(&top_cpuset)) { | 579 | if (is_sched_load_balance(&top_cpuset)) { |
@@ -566,6 +581,11 @@ static void rebuild_sched_domains(void) | |||
566 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); | 581 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); |
567 | if (!doms) | 582 | if (!doms) |
568 | goto rebuild; | 583 | goto rebuild; |
584 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); | ||
585 | if (dattr) { | ||
586 | *dattr = SD_ATTR_INIT; | ||
587 | update_domain_attr(dattr, &top_cpuset); | ||
588 | } | ||
569 | *doms = top_cpuset.cpus_allowed; | 589 | *doms = top_cpuset.cpus_allowed; |
570 | goto rebuild; | 590 | goto rebuild; |
571 | } | 591 | } |
@@ -622,6 +642,7 @@ restart: | |||
622 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); | 642 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); |
623 | if (!doms) | 643 | if (!doms) |
624 | goto rebuild; | 644 | goto rebuild; |
645 | dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); | ||
625 | 646 | ||
626 | for (nslot = 0, i = 0; i < csn; i++) { | 647 | for (nslot = 0, i = 0; i < csn; i++) { |
627 | struct cpuset *a = csa[i]; | 648 | struct cpuset *a = csa[i]; |
@@ -644,12 +665,15 @@ restart: | |||
644 | } | 665 | } |
645 | 666 | ||
646 | cpus_clear(*dp); | 667 | cpus_clear(*dp); |
668 | if (dattr) | ||
669 | *(dattr + nslot) = SD_ATTR_INIT; | ||
647 | for (j = i; j < csn; j++) { | 670 | for (j = i; j < csn; j++) { |
648 | struct cpuset *b = csa[j]; | 671 | struct cpuset *b = csa[j]; |
649 | 672 | ||
650 | if (apn == b->pn) { | 673 | if (apn == b->pn) { |
651 | cpus_or(*dp, *dp, b->cpus_allowed); | 674 | cpus_or(*dp, *dp, b->cpus_allowed); |
652 | b->pn = -1; | 675 | b->pn = -1; |
676 | update_domain_attr(dattr, b); | ||
653 | } | 677 | } |
654 | } | 678 | } |
655 | nslot++; | 679 | nslot++; |
@@ -660,7 +684,7 @@ restart: | |||
660 | rebuild: | 684 | rebuild: |
661 | /* Have scheduler rebuild sched domains */ | 685 | /* Have scheduler rebuild sched domains */ |
662 | get_online_cpus(); | 686 | get_online_cpus(); |
663 | partition_sched_domains(ndoms, doms); | 687 | partition_sched_domains(ndoms, doms, dattr); |
664 | put_online_cpus(); | 688 | put_online_cpus(); |
665 | 689 | ||
666 | done: | 690 | done: |
@@ -668,6 +692,7 @@ done: | |||
668 | kfifo_free(q); | 692 | kfifo_free(q); |
669 | kfree(csa); | 693 | kfree(csa); |
670 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ | 694 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ |
695 | /* Don't kfree(dattr) -- partition_sched_domains() does that. */ | ||
671 | } | 696 | } |
672 | 697 | ||
673 | static inline int started_after_time(struct task_struct *t1, | 698 | static inline int started_after_time(struct task_struct *t1, |
@@ -729,7 +754,7 @@ int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) | |||
729 | */ | 754 | */ |
730 | void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) | 755 | void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) |
731 | { | 756 | { |
732 | set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed); | 757 | set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); |
733 | } | 758 | } |
734 | 759 | ||
735 | /** | 760 | /** |
@@ -1011,6 +1036,21 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) | |||
1011 | return 0; | 1036 | return 0; |
1012 | } | 1037 | } |
1013 | 1038 | ||
1039 | static int update_relax_domain_level(struct cpuset *cs, char *buf) | ||
1040 | { | ||
1041 | int val = simple_strtol(buf, NULL, 10); | ||
1042 | |||
1043 | if (val < 0) | ||
1044 | val = -1; | ||
1045 | |||
1046 | if (val != cs->relax_domain_level) { | ||
1047 | cs->relax_domain_level = val; | ||
1048 | rebuild_sched_domains(); | ||
1049 | } | ||
1050 | |||
1051 | return 0; | ||
1052 | } | ||
1053 | |||
1014 | /* | 1054 | /* |
1015 | * update_flag - read a 0 or a 1 in a file and update associated flag | 1055 | * update_flag - read a 0 or a 1 in a file and update associated flag |
1016 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, | 1056 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, |
@@ -1178,7 +1218,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, | |||
1178 | 1218 | ||
1179 | mutex_lock(&callback_mutex); | 1219 | mutex_lock(&callback_mutex); |
1180 | guarantee_online_cpus(cs, &cpus); | 1220 | guarantee_online_cpus(cs, &cpus); |
1181 | set_cpus_allowed(tsk, cpus); | 1221 | set_cpus_allowed_ptr(tsk, &cpus); |
1182 | mutex_unlock(&callback_mutex); | 1222 | mutex_unlock(&callback_mutex); |
1183 | 1223 | ||
1184 | from = oldcs->mems_allowed; | 1224 | from = oldcs->mems_allowed; |
@@ -1202,6 +1242,7 @@ typedef enum { | |||
1202 | FILE_CPU_EXCLUSIVE, | 1242 | FILE_CPU_EXCLUSIVE, |
1203 | FILE_MEM_EXCLUSIVE, | 1243 | FILE_MEM_EXCLUSIVE, |
1204 | FILE_SCHED_LOAD_BALANCE, | 1244 | FILE_SCHED_LOAD_BALANCE, |
1245 | FILE_SCHED_RELAX_DOMAIN_LEVEL, | ||
1205 | FILE_MEMORY_PRESSURE_ENABLED, | 1246 | FILE_MEMORY_PRESSURE_ENABLED, |
1206 | FILE_MEMORY_PRESSURE, | 1247 | FILE_MEMORY_PRESSURE, |
1207 | FILE_SPREAD_PAGE, | 1248 | FILE_SPREAD_PAGE, |
@@ -1256,6 +1297,9 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, | |||
1256 | case FILE_SCHED_LOAD_BALANCE: | 1297 | case FILE_SCHED_LOAD_BALANCE: |
1257 | retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); | 1298 | retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); |
1258 | break; | 1299 | break; |
1300 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | ||
1301 | retval = update_relax_domain_level(cs, buffer); | ||
1302 | break; | ||
1259 | case FILE_MEMORY_MIGRATE: | 1303 | case FILE_MEMORY_MIGRATE: |
1260 | retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); | 1304 | retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); |
1261 | break; | 1305 | break; |
@@ -1354,6 +1398,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont, | |||
1354 | case FILE_SCHED_LOAD_BALANCE: | 1398 | case FILE_SCHED_LOAD_BALANCE: |
1355 | *s++ = is_sched_load_balance(cs) ? '1' : '0'; | 1399 | *s++ = is_sched_load_balance(cs) ? '1' : '0'; |
1356 | break; | 1400 | break; |
1401 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | ||
1402 | s += sprintf(s, "%d", cs->relax_domain_level); | ||
1403 | break; | ||
1357 | case FILE_MEMORY_MIGRATE: | 1404 | case FILE_MEMORY_MIGRATE: |
1358 | *s++ = is_memory_migrate(cs) ? '1' : '0'; | 1405 | *s++ = is_memory_migrate(cs) ? '1' : '0'; |
1359 | break; | 1406 | break; |
@@ -1424,6 +1471,13 @@ static struct cftype cft_sched_load_balance = { | |||
1424 | .private = FILE_SCHED_LOAD_BALANCE, | 1471 | .private = FILE_SCHED_LOAD_BALANCE, |
1425 | }; | 1472 | }; |
1426 | 1473 | ||
1474 | static struct cftype cft_sched_relax_domain_level = { | ||
1475 | .name = "sched_relax_domain_level", | ||
1476 | .read = cpuset_common_file_read, | ||
1477 | .write = cpuset_common_file_write, | ||
1478 | .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, | ||
1479 | }; | ||
1480 | |||
1427 | static struct cftype cft_memory_migrate = { | 1481 | static struct cftype cft_memory_migrate = { |
1428 | .name = "memory_migrate", | 1482 | .name = "memory_migrate", |
1429 | .read = cpuset_common_file_read, | 1483 | .read = cpuset_common_file_read, |
@@ -1475,6 +1529,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) | |||
1475 | return err; | 1529 | return err; |
1476 | if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0) | 1530 | if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0) |
1477 | return err; | 1531 | return err; |
1532 | if ((err = cgroup_add_file(cont, ss, | ||
1533 | &cft_sched_relax_domain_level)) < 0) | ||
1534 | return err; | ||
1478 | if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0) | 1535 | if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0) |
1479 | return err; | 1536 | return err; |
1480 | if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0) | 1537 | if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0) |
@@ -1555,10 +1612,11 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1555 | if (is_spread_slab(parent)) | 1612 | if (is_spread_slab(parent)) |
1556 | set_bit(CS_SPREAD_SLAB, &cs->flags); | 1613 | set_bit(CS_SPREAD_SLAB, &cs->flags); |
1557 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); | 1614 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); |
1558 | cs->cpus_allowed = CPU_MASK_NONE; | 1615 | cpus_clear(cs->cpus_allowed); |
1559 | cs->mems_allowed = NODE_MASK_NONE; | 1616 | nodes_clear(cs->mems_allowed); |
1560 | cs->mems_generation = cpuset_mems_generation++; | 1617 | cs->mems_generation = cpuset_mems_generation++; |
1561 | fmeter_init(&cs->fmeter); | 1618 | fmeter_init(&cs->fmeter); |
1619 | cs->relax_domain_level = -1; | ||
1562 | 1620 | ||
1563 | cs->parent = parent; | 1621 | cs->parent = parent; |
1564 | number_of_cpusets++; | 1622 | number_of_cpusets++; |
@@ -1625,12 +1683,13 @@ int __init cpuset_init(void) | |||
1625 | { | 1683 | { |
1626 | int err = 0; | 1684 | int err = 0; |
1627 | 1685 | ||
1628 | top_cpuset.cpus_allowed = CPU_MASK_ALL; | 1686 | cpus_setall(top_cpuset.cpus_allowed); |
1629 | top_cpuset.mems_allowed = NODE_MASK_ALL; | 1687 | nodes_setall(top_cpuset.mems_allowed); |
1630 | 1688 | ||
1631 | fmeter_init(&top_cpuset.fmeter); | 1689 | fmeter_init(&top_cpuset.fmeter); |
1632 | top_cpuset.mems_generation = cpuset_mems_generation++; | 1690 | top_cpuset.mems_generation = cpuset_mems_generation++; |
1633 | set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); | 1691 | set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); |
1692 | top_cpuset.relax_domain_level = -1; | ||
1634 | 1693 | ||
1635 | err = register_filesystem(&cpuset_fs_type); | 1694 | err = register_filesystem(&cpuset_fs_type); |
1636 | if (err < 0) | 1695 | if (err < 0) |
@@ -1844,6 +1903,7 @@ void __init cpuset_init_smp(void) | |||
1844 | 1903 | ||
1845 | * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. | 1904 | * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. |
1846 | * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. | 1905 | * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. |
1906 | * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. | ||
1847 | * | 1907 | * |
1848 | * Description: Returns the cpumask_t cpus_allowed of the cpuset | 1908 | * Description: Returns the cpumask_t cpus_allowed of the cpuset |
1849 | * attached to the specified @tsk. Guaranteed to return some non-empty | 1909 | * attached to the specified @tsk. Guaranteed to return some non-empty |
@@ -1851,35 +1911,27 @@ void __init cpuset_init_smp(void) | |||
1851 | * tasks cpuset. | 1911 | * tasks cpuset. |
1852 | **/ | 1912 | **/ |
1853 | 1913 | ||
1854 | cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) | 1914 | void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) |
1855 | { | 1915 | { |
1856 | cpumask_t mask; | ||
1857 | |||
1858 | mutex_lock(&callback_mutex); | 1916 | mutex_lock(&callback_mutex); |
1859 | mask = cpuset_cpus_allowed_locked(tsk); | 1917 | cpuset_cpus_allowed_locked(tsk, pmask); |
1860 | mutex_unlock(&callback_mutex); | 1918 | mutex_unlock(&callback_mutex); |
1861 | |||
1862 | return mask; | ||
1863 | } | 1919 | } |
1864 | 1920 | ||
1865 | /** | 1921 | /** |
1866 | * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. | 1922 | * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. |
1867 | * Must be called with callback_mutex held. | 1923 | * Must be called with callback_mutex held. |
1868 | **/ | 1924 | **/ |
1869 | cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) | 1925 | void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) |
1870 | { | 1926 | { |
1871 | cpumask_t mask; | ||
1872 | |||
1873 | task_lock(tsk); | 1927 | task_lock(tsk); |
1874 | guarantee_online_cpus(task_cs(tsk), &mask); | 1928 | guarantee_online_cpus(task_cs(tsk), pmask); |
1875 | task_unlock(tsk); | 1929 | task_unlock(tsk); |
1876 | |||
1877 | return mask; | ||
1878 | } | 1930 | } |
1879 | 1931 | ||
1880 | void cpuset_init_current_mems_allowed(void) | 1932 | void cpuset_init_current_mems_allowed(void) |
1881 | { | 1933 | { |
1882 | current->mems_allowed = NODE_MASK_ALL; | 1934 | nodes_setall(current->mems_allowed); |
1883 | } | 1935 | } |
1884 | 1936 | ||
1885 | /** | 1937 | /** |
@@ -2261,8 +2313,16 @@ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) | |||
2261 | m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, | 2313 | m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, |
2262 | task->cpus_allowed); | 2314 | task->cpus_allowed); |
2263 | seq_printf(m, "\n"); | 2315 | seq_printf(m, "\n"); |
2316 | seq_printf(m, "Cpus_allowed_list:\t"); | ||
2317 | m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, | ||
2318 | task->cpus_allowed); | ||
2319 | seq_printf(m, "\n"); | ||
2264 | seq_printf(m, "Mems_allowed:\t"); | 2320 | seq_printf(m, "Mems_allowed:\t"); |
2265 | m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, | 2321 | m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, |
2266 | task->mems_allowed); | 2322 | task->mems_allowed); |
2267 | seq_printf(m, "\n"); | 2323 | seq_printf(m, "\n"); |
2324 | seq_printf(m, "Mems_allowed_list:\t"); | ||
2325 | m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, | ||
2326 | task->mems_allowed); | ||
2327 | seq_printf(m, "\n"); | ||
2268 | } | 2328 | } |
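The sched_relax_domain_level file added in this file is used like the other per-cpuset control files: update_relax_domain_level() parses one integer and treats any negative value as -1 (the system default). A hypothetical userspace sketch; the cpuset mount point and group name are assumptions, only the file name comes from this diff:

        #include <fcntl.h>
        #include <string.h>
        #include <unistd.h>

        static int set_relax_domain_level(const char *level)   /* e.g. "-1" or "2" */
        {
                int fd = open("/dev/cpuset/mygroup/sched_relax_domain_level", O_WRONLY);

                if (fd < 0)
                        return -1;
                if (write(fd, level, strlen(level)) < 0) {
                        close(fd);
                        return -1;
                }
                return close(fd);
        }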
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index fdb3fbe2b0c4..964964baefa2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -47,7 +47,7 @@ void dynamic_irq_init(unsigned int irq) | |||
47 | desc->irq_count = 0; | 47 | desc->irq_count = 0; |
48 | desc->irqs_unhandled = 0; | 48 | desc->irqs_unhandled = 0; |
49 | #ifdef CONFIG_SMP | 49 | #ifdef CONFIG_SMP |
50 | desc->affinity = CPU_MASK_ALL; | 50 | cpus_setall(desc->affinity); |
51 | #endif | 51 | #endif |
52 | spin_unlock_irqrestore(&desc->lock, flags); | 52 | spin_unlock_irqrestore(&desc->lock, flags); |
53 | } | 53 | } |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 22be3ff3f363..e2764047ec03 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -165,7 +165,7 @@ static int ____call_usermodehelper(void *data) | |||
165 | } | 165 | } |
166 | 166 | ||
167 | /* We can run anywhere, unlike our parent keventd(). */ | 167 | /* We can run anywhere, unlike our parent keventd(). */ |
168 | set_cpus_allowed(current, CPU_MASK_ALL); | 168 | set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR); |
169 | 169 | ||
170 | /* | 170 | /* |
171 | * Our parent is keventd, which runs with elevated scheduling priority. | 171 | * Our parent is keventd, which runs with elevated scheduling priority. |
diff --git a/kernel/kthread.c b/kernel/kthread.c index 0ac887882f90..25241d6ec8cd 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) | |||
180 | wait_task_inactive(k); | 180 | wait_task_inactive(k); |
181 | set_task_cpu(k, cpu); | 181 | set_task_cpu(k, cpu); |
182 | k->cpus_allowed = cpumask_of_cpu(cpu); | 182 | k->cpus_allowed = cpumask_of_cpu(cpu); |
183 | k->rt.nr_cpus_allowed = 1; | ||
183 | } | 184 | } |
184 | EXPORT_SYMBOL(kthread_bind); | 185 | EXPORT_SYMBOL(kthread_bind); |
185 | 186 | ||
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index b4e3c85abe74..7c74dab0d21b 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c | |||
@@ -64,8 +64,8 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record | |||
64 | return; | 64 | return; |
65 | 65 | ||
66 | for (i = 0; i < MAXLR; i++) { | 66 | for (i = 0; i < MAXLR; i++) { |
67 | int q; | 67 | int q, same = 1; |
68 | int same = 1; | 68 | |
69 | /* Nothing stored: */ | 69 | /* Nothing stored: */ |
70 | if (!latency_record[i].backtrace[0]) { | 70 | if (!latency_record[i].backtrace[0]) { |
71 | if (firstnonnull > i) | 71 | if (firstnonnull > i) |
@@ -73,12 +73,15 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record | |||
73 | continue; | 73 | continue; |
74 | } | 74 | } |
75 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { | 75 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { |
76 | if (latency_record[i].backtrace[q] != | 76 | unsigned long record = lat->backtrace[q]; |
77 | lat->backtrace[q]) | 77 | |
78 | if (latency_record[i].backtrace[q] != record) { | ||
78 | same = 0; | 79 | same = 0; |
79 | if (same && lat->backtrace[q] == 0) | ||
80 | break; | 80 | break; |
81 | if (same && lat->backtrace[q] == ULONG_MAX) | 81 | } |
82 | |||
83 | /* 0 and ULONG_MAX entries mean end of backtrace: */ | ||
84 | if (record == 0 || record == ULONG_MAX) | ||
82 | break; | 85 | break; |
83 | } | 86 | } |
84 | if (same) { | 87 | if (same) { |
@@ -143,14 +146,18 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | |||
143 | for (i = 0; i < LT_SAVECOUNT ; i++) { | 146 | for (i = 0; i < LT_SAVECOUNT ; i++) { |
144 | struct latency_record *mylat; | 147 | struct latency_record *mylat; |
145 | int same = 1; | 148 | int same = 1; |
149 | |||
146 | mylat = &tsk->latency_record[i]; | 150 | mylat = &tsk->latency_record[i]; |
147 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { | 151 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { |
148 | if (mylat->backtrace[q] != | 152 | unsigned long record = lat.backtrace[q]; |
149 | lat.backtrace[q]) | 153 | |
154 | if (mylat->backtrace[q] != record) { | ||
150 | same = 0; | 155 | same = 0; |
151 | if (same && lat.backtrace[q] == 0) | ||
152 | break; | 156 | break; |
153 | if (same && lat.backtrace[q] == ULONG_MAX) | 157 | } |
158 | |||
159 | /* 0 and ULONG_MAX entries mean end of backtrace: */ | ||
160 | if (record == 0 || record == ULONG_MAX) | ||
154 | break; | 161 | break; |
155 | } | 162 | } |
156 | if (same) { | 163 | if (same) { |
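Both rewritten loops above compare one stored backtrace against a new one entry by entry and stop at the first 0 or ULONG_MAX entry, which marks the end of a recorded trace. The equivalent logic pulled out into a standalone helper, shown only to make the intent explicit (the helper name is made up):

        static int backtrace_matches(unsigned long *stored, unsigned long *incoming,
                                     int depth)
        {
                int q;

                for (q = 0; q < depth; q++) {
                        unsigned long record = incoming[q];

                        if (stored[q] != record)
                                return 0;               /* mismatch */
                        if (record == 0 || record == ULONG_MAX)
                                break;                  /* end of backtrace */
                }
                return 1;                               /* same backtrace */
        }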
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index e9517014b57c..e1cdf196a515 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c | |||
@@ -1007,10 +1007,10 @@ void __synchronize_sched(void) | |||
1007 | if (sched_getaffinity(0, &oldmask) < 0) | 1007 | if (sched_getaffinity(0, &oldmask) < 0) |
1008 | oldmask = cpu_possible_map; | 1008 | oldmask = cpu_possible_map; |
1009 | for_each_online_cpu(cpu) { | 1009 | for_each_online_cpu(cpu) { |
1010 | sched_setaffinity(0, cpumask_of_cpu(cpu)); | 1010 | sched_setaffinity(0, &cpumask_of_cpu(cpu)); |
1011 | schedule(); | 1011 | schedule(); |
1012 | } | 1012 | } |
1013 | sched_setaffinity(0, oldmask); | 1013 | sched_setaffinity(0, &oldmask); |
1014 | } | 1014 | } |
1015 | EXPORT_SYMBOL_GPL(__synchronize_sched); | 1015 | EXPORT_SYMBOL_GPL(__synchronize_sched); |
1016 | 1016 | ||
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index fd599829e72a..47894f919d4e 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -723,9 +723,10 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ | |||
723 | */ | 723 | */ |
724 | static void rcu_torture_shuffle_tasks(void) | 724 | static void rcu_torture_shuffle_tasks(void) |
725 | { | 725 | { |
726 | cpumask_t tmp_mask = CPU_MASK_ALL; | 726 | cpumask_t tmp_mask; |
727 | int i; | 727 | int i; |
728 | 728 | ||
729 | cpus_setall(tmp_mask); | ||
729 | get_online_cpus(); | 730 | get_online_cpus(); |
730 | 731 | ||
731 | /* No point in shuffling if there is only one online CPU (ex: UP) */ | 732 | /* No point in shuffling if there is only one online CPU (ex: UP) */ |
@@ -737,25 +738,27 @@ static void rcu_torture_shuffle_tasks(void) | |||
737 | if (rcu_idle_cpu != -1) | 738 | if (rcu_idle_cpu != -1) |
738 | cpu_clear(rcu_idle_cpu, tmp_mask); | 739 | cpu_clear(rcu_idle_cpu, tmp_mask); |
739 | 740 | ||
740 | set_cpus_allowed(current, tmp_mask); | 741 | set_cpus_allowed_ptr(current, &tmp_mask); |
741 | 742 | ||
742 | if (reader_tasks) { | 743 | if (reader_tasks) { |
743 | for (i = 0; i < nrealreaders; i++) | 744 | for (i = 0; i < nrealreaders; i++) |
744 | if (reader_tasks[i]) | 745 | if (reader_tasks[i]) |
745 | set_cpus_allowed(reader_tasks[i], tmp_mask); | 746 | set_cpus_allowed_ptr(reader_tasks[i], |
747 | &tmp_mask); | ||
746 | } | 748 | } |
747 | 749 | ||
748 | if (fakewriter_tasks) { | 750 | if (fakewriter_tasks) { |
749 | for (i = 0; i < nfakewriters; i++) | 751 | for (i = 0; i < nfakewriters; i++) |
750 | if (fakewriter_tasks[i]) | 752 | if (fakewriter_tasks[i]) |
751 | set_cpus_allowed(fakewriter_tasks[i], tmp_mask); | 753 | set_cpus_allowed_ptr(fakewriter_tasks[i], |
754 | &tmp_mask); | ||
752 | } | 755 | } |
753 | 756 | ||
754 | if (writer_task) | 757 | if (writer_task) |
755 | set_cpus_allowed(writer_task, tmp_mask); | 758 | set_cpus_allowed_ptr(writer_task, &tmp_mask); |
756 | 759 | ||
757 | if (stats_task) | 760 | if (stats_task) |
758 | set_cpus_allowed(stats_task, tmp_mask); | 761 | set_cpus_allowed_ptr(stats_task, &tmp_mask); |
759 | 762 | ||
760 | if (rcu_idle_cpu == -1) | 763 | if (rcu_idle_cpu == -1) |
761 | rcu_idle_cpu = num_online_cpus() - 1; | 764 | rcu_idle_cpu = num_online_cpus() - 1; |
diff --git a/kernel/sched.c b/kernel/sched.c index 8dcdec6fe0fe..57ba7ea9b744 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -66,6 +66,10 @@ | |||
66 | #include <linux/unistd.h> | 66 | #include <linux/unistd.h> |
67 | #include <linux/pagemap.h> | 67 | #include <linux/pagemap.h> |
68 | #include <linux/hrtimer.h> | 68 | #include <linux/hrtimer.h> |
69 | #include <linux/tick.h> | ||
70 | #include <linux/bootmem.h> | ||
71 | #include <linux/debugfs.h> | ||
72 | #include <linux/ctype.h> | ||
69 | 73 | ||
70 | #include <asm/tlb.h> | 74 | #include <asm/tlb.h> |
71 | #include <asm/irq_regs.h> | 75 | #include <asm/irq_regs.h> |
@@ -114,6 +118,11 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
114 | */ | 118 | */ |
115 | #define DEF_TIMESLICE (100 * HZ / 1000) | 119 | #define DEF_TIMESLICE (100 * HZ / 1000) |
116 | 120 | ||
121 | /* | ||
122 | * single value that denotes runtime == period, ie unlimited time. | ||
123 | */ | ||
124 | #define RUNTIME_INF ((u64)~0ULL) | ||
125 | |||
117 | #ifdef CONFIG_SMP | 126 | #ifdef CONFIG_SMP |
118 | /* | 127 | /* |
119 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) | 128 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) |
@@ -155,6 +164,84 @@ struct rt_prio_array { | |||
155 | struct list_head queue[MAX_RT_PRIO]; | 164 | struct list_head queue[MAX_RT_PRIO]; |
156 | }; | 165 | }; |
157 | 166 | ||
167 | struct rt_bandwidth { | ||
168 | /* nests inside the rq lock: */ | ||
169 | spinlock_t rt_runtime_lock; | ||
170 | ktime_t rt_period; | ||
171 | u64 rt_runtime; | ||
172 | struct hrtimer rt_period_timer; | ||
173 | }; | ||
174 | |||
175 | static struct rt_bandwidth def_rt_bandwidth; | ||
176 | |||
177 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); | ||
178 | |||
179 | static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) | ||
180 | { | ||
181 | struct rt_bandwidth *rt_b = | ||
182 | container_of(timer, struct rt_bandwidth, rt_period_timer); | ||
183 | ktime_t now; | ||
184 | int overrun; | ||
185 | int idle = 0; | ||
186 | |||
187 | for (;;) { | ||
188 | now = hrtimer_cb_get_time(timer); | ||
189 | overrun = hrtimer_forward(timer, now, rt_b->rt_period); | ||
190 | |||
191 | if (!overrun) | ||
192 | break; | ||
193 | |||
194 | idle = do_sched_rt_period_timer(rt_b, overrun); | ||
195 | } | ||
196 | |||
197 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
198 | } | ||
199 | |||
200 | static | ||
201 | void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | ||
202 | { | ||
203 | rt_b->rt_period = ns_to_ktime(period); | ||
204 | rt_b->rt_runtime = runtime; | ||
205 | |||
206 | spin_lock_init(&rt_b->rt_runtime_lock); | ||
207 | |||
208 | hrtimer_init(&rt_b->rt_period_timer, | ||
209 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
210 | rt_b->rt_period_timer.function = sched_rt_period_timer; | ||
211 | rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; | ||
212 | } | ||
213 | |||
214 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
215 | { | ||
216 | ktime_t now; | ||
217 | |||
218 | if (rt_b->rt_runtime == RUNTIME_INF) | ||
219 | return; | ||
220 | |||
221 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
222 | return; | ||
223 | |||
224 | spin_lock(&rt_b->rt_runtime_lock); | ||
225 | for (;;) { | ||
226 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
227 | break; | ||
228 | |||
229 | now = hrtimer_cb_get_time(&rt_b->rt_period_timer); | ||
230 | hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); | ||
231 | hrtimer_start(&rt_b->rt_period_timer, | ||
232 | rt_b->rt_period_timer.expires, | ||
233 | HRTIMER_MODE_ABS); | ||
234 | } | ||
235 | spin_unlock(&rt_b->rt_runtime_lock); | ||
236 | } | ||
237 | |||
238 | #ifdef CONFIG_RT_GROUP_SCHED | ||
239 | static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
240 | { | ||
241 | hrtimer_cancel(&rt_b->rt_period_timer); | ||
242 | } | ||
243 | #endif | ||
244 | |||
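def_rt_bandwidth still has to be initialized at boot for start_rt_bandwidth() to see sensible values; a hedged sketch of such a call, assuming the sysctl_sched_rt_period/sysctl_sched_rt_runtime sysctls declared earlier in this diff are expressed in microseconds (the exact call site is not part of this hunk):

        init_rt_bandwidth(&def_rt_bandwidth,
                          (u64)sysctl_sched_rt_period * NSEC_PER_USEC,
                          (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC);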
158 | #ifdef CONFIG_GROUP_SCHED | 245 | #ifdef CONFIG_GROUP_SCHED |
159 | 246 | ||
160 | #include <linux/cgroup.h> | 247 | #include <linux/cgroup.h> |
@@ -181,29 +268,39 @@ struct task_group { | |||
181 | struct sched_rt_entity **rt_se; | 268 | struct sched_rt_entity **rt_se; |
182 | struct rt_rq **rt_rq; | 269 | struct rt_rq **rt_rq; |
183 | 270 | ||
184 | u64 rt_runtime; | 271 | struct rt_bandwidth rt_bandwidth; |
185 | #endif | 272 | #endif |
186 | 273 | ||
187 | struct rcu_head rcu; | 274 | struct rcu_head rcu; |
188 | struct list_head list; | 275 | struct list_head list; |
276 | |||
277 | struct task_group *parent; | ||
278 | struct list_head siblings; | ||
279 | struct list_head children; | ||
189 | }; | 280 | }; |
190 | 281 | ||
282 | #ifdef CONFIG_USER_SCHED | ||
283 | |||
284 | /* | ||
285 | * Root task group. | ||
286 | * Every UID task group (including init_task_group aka UID-0) will | ||
287 | * be a child to this group. | ||
288 | */ | ||
289 | struct task_group root_task_group; | ||
290 | |||
191 | #ifdef CONFIG_FAIR_GROUP_SCHED | 291 | #ifdef CONFIG_FAIR_GROUP_SCHED |
192 | /* Default task group's sched entity on each cpu */ | 292 | /* Default task group's sched entity on each cpu */ |
193 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 293 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
194 | /* Default task group's cfs_rq on each cpu */ | 294 | /* Default task group's cfs_rq on each cpu */ |
195 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 295 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; |
196 | |||
197 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; | ||
198 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; | ||
199 | #endif | 296 | #endif |
200 | 297 | ||
201 | #ifdef CONFIG_RT_GROUP_SCHED | 298 | #ifdef CONFIG_RT_GROUP_SCHED |
202 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 299 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
203 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 300 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; |
204 | 301 | #endif | |
205 | static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; | 302 | #else |
206 | static struct rt_rq *init_rt_rq_p[NR_CPUS]; | 303 | #define root_task_group init_task_group |
207 | #endif | 304 | #endif |
208 | 305 | ||
209 | /* task_group_lock serializes add/remove of task groups and also changes to | 306 | /* task_group_lock serializes add/remove of task groups and also changes to |
@@ -221,23 +318,15 @@ static DEFINE_MUTEX(doms_cur_mutex); | |||
221 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | 318 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
222 | #endif | 319 | #endif |
223 | 320 | ||
321 | #define MIN_SHARES 2 | ||
322 | |||
224 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | 323 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; |
225 | #endif | 324 | #endif |
226 | 325 | ||
227 | /* Default task group. | 326 | /* Default task group. |
228 | * Every task in the system belongs to this group at bootup. | 327 | * Every task in the system belongs to this group at bootup. |
229 | */ | 328 | */ |
230 | struct task_group init_task_group = { | 329 | struct task_group init_task_group; |
231 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
232 | .se = init_sched_entity_p, | ||
233 | .cfs_rq = init_cfs_rq_p, | ||
234 | #endif | ||
235 | |||
236 | #ifdef CONFIG_RT_GROUP_SCHED | ||
237 | .rt_se = init_sched_rt_entity_p, | ||
238 | .rt_rq = init_rt_rq_p, | ||
239 | #endif | ||
240 | }; | ||
241 | 330 | ||
242 | /* return group to which a task belongs */ | 331 | /* return group to which a task belongs */ |
243 | static inline struct task_group *task_group(struct task_struct *p) | 332 | static inline struct task_group *task_group(struct task_struct *p) |
@@ -297,8 +386,12 @@ struct cfs_rq { | |||
297 | 386 | ||
298 | struct rb_root tasks_timeline; | 387 | struct rb_root tasks_timeline; |
299 | struct rb_node *rb_leftmost; | 388 | struct rb_node *rb_leftmost; |
300 | struct rb_node *rb_load_balance_curr; | 389 | |
301 | /* 'curr' points to currently running entity on this cfs_rq. | 390 | struct list_head tasks; |
391 | struct list_head *balance_iterator; | ||
392 | |||
393 | /* | ||
394 | * 'curr' points to currently running entity on this cfs_rq. | ||
302 | * It is set to NULL otherwise (i.e when none are currently running). | 395 | * It is set to NULL otherwise (i.e when none are currently running). |
303 | */ | 396 | */ |
304 | struct sched_entity *curr, *next; | 397 | struct sched_entity *curr, *next; |
@@ -318,6 +411,43 @@ struct cfs_rq { | |||
318 | */ | 411 | */ |
319 | struct list_head leaf_cfs_rq_list; | 412 | struct list_head leaf_cfs_rq_list; |
320 | struct task_group *tg; /* group that "owns" this runqueue */ | 413 | struct task_group *tg; /* group that "owns" this runqueue */ |
414 | |||
415 | #ifdef CONFIG_SMP | ||
416 | unsigned long task_weight; | ||
417 | unsigned long shares; | ||
418 | /* | ||
419 | * We need space to build a sched_domain-wide view of the full task | ||
420 | * group tree. To avoid depending on dynamic memory allocation during | ||
421 | * load balancing, we place this in the per-cpu task group hierarchy. | ||
422 | * This limits the load balancing to one instance per cpu, but more | ||
423 | * should not be needed anyway. | ||
424 | */ | ||
425 | struct aggregate_struct { | ||
426 | /* | ||
427 | * load = weight(cpus) * f(tg) | ||
428 | * | ||
429 | * Where f(tg) is the recursive weight fraction assigned to | ||
430 | * this group. | ||
431 | */ | ||
432 | unsigned long load; | ||
433 | |||
434 | /* | ||
435 | * part of the group weight distributed to this span. | ||
436 | */ | ||
437 | unsigned long shares; | ||
438 | |||
439 | /* | ||
440 | * The sum of all runqueue weights within this span. | ||
441 | */ | ||
442 | unsigned long rq_weight; | ||
443 | |||
444 | /* | ||
445 | * Weight contributed by tasks; this is the part we can | ||
446 | * influence by moving tasks around. | ||
447 | */ | ||
448 | unsigned long task_weight; | ||
449 | } aggregate; | ||
450 | #endif | ||
321 | #endif | 451 | #endif |
322 | }; | 452 | }; |
323 | 453 | ||
@@ -334,6 +464,9 @@ struct rt_rq { | |||
334 | #endif | 464 | #endif |
335 | int rt_throttled; | 465 | int rt_throttled; |
336 | u64 rt_time; | 466 | u64 rt_time; |
467 | u64 rt_runtime; | ||
468 | /* Nests inside the rq lock: */ | ||
469 | spinlock_t rt_runtime_lock; | ||
337 | 470 | ||
338 | #ifdef CONFIG_RT_GROUP_SCHED | 471 | #ifdef CONFIG_RT_GROUP_SCHED |
339 | unsigned long rt_nr_boosted; | 472 | unsigned long rt_nr_boosted; |
@@ -396,6 +529,7 @@ struct rq { | |||
396 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 529 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
397 | unsigned char idle_at_tick; | 530 | unsigned char idle_at_tick; |
398 | #ifdef CONFIG_NO_HZ | 531 | #ifdef CONFIG_NO_HZ |
532 | unsigned long last_tick_seen; | ||
399 | unsigned char in_nohz_recently; | 533 | unsigned char in_nohz_recently; |
400 | #endif | 534 | #endif |
401 | /* capture load from *all* tasks on this cpu: */ | 535 | /* capture load from *all* tasks on this cpu: */ |
@@ -405,8 +539,6 @@ struct rq { | |||
405 | 539 | ||
406 | struct cfs_rq cfs; | 540 | struct cfs_rq cfs; |
407 | struct rt_rq rt; | 541 | struct rt_rq rt; |
408 | u64 rt_period_expire; | ||
409 | int rt_throttled; | ||
410 | 542 | ||
411 | #ifdef CONFIG_FAIR_GROUP_SCHED | 543 | #ifdef CONFIG_FAIR_GROUP_SCHED |
412 | /* list of leaf cfs_rq on this cpu: */ | 544 | /* list of leaf cfs_rq on this cpu: */ |
@@ -499,6 +631,32 @@ static inline int cpu_of(struct rq *rq) | |||
499 | #endif | 631 | #endif |
500 | } | 632 | } |
501 | 633 | ||
634 | #ifdef CONFIG_NO_HZ | ||
635 | static inline bool nohz_on(int cpu) | ||
636 | { | ||
637 | return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE; | ||
638 | } | ||
639 | |||
640 | static inline u64 max_skipped_ticks(struct rq *rq) | ||
641 | { | ||
642 | return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1; | ||
643 | } | ||
644 | |||
645 | static inline void update_last_tick_seen(struct rq *rq) | ||
646 | { | ||
647 | rq->last_tick_seen = jiffies; | ||
648 | } | ||
649 | #else | ||
650 | static inline u64 max_skipped_ticks(struct rq *rq) | ||
651 | { | ||
652 | return 1; | ||
653 | } | ||
654 | |||
655 | static inline void update_last_tick_seen(struct rq *rq) | ||
656 | { | ||
657 | } | ||
658 | #endif | ||
659 | |||
502 | /* | 660 | /* |
503 | * Update the per-runqueue clock, as finegrained as the platform can give | 661 | * Update the per-runqueue clock, as finegrained as the platform can give |
504 | * us, but without assuming monotonicity, etc.: | 662 | * us, but without assuming monotonicity, etc.: |
@@ -523,9 +681,12 @@ static void __update_rq_clock(struct rq *rq) | |||
523 | /* | 681 | /* |
524 | * Catch too large forward jumps too: | 682 | * Catch too large forward jumps too: |
525 | */ | 683 | */ |
526 | if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { | 684 | u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC; |
527 | if (clock < rq->tick_timestamp + TICK_NSEC) | 685 | u64 max_time = rq->tick_timestamp + max_jump; |
528 | clock = rq->tick_timestamp + TICK_NSEC; | 686 | |
687 | if (unlikely(clock + delta > max_time)) { | ||
688 | if (clock < max_time) | ||
689 | clock = max_time; | ||
529 | else | 690 | else |
530 | clock++; | 691 | clock++; |
531 | rq->clock_overflows++; | 692 | rq->clock_overflows++; |
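Editor's sketch (not part of the patch) of the new clamping rule above: with NO_HZ active the permitted forward jump grows with the number of ticks skipped since last_tick_seen, presumably so legitimate jumps across skipped ticks are not clamped as overflows. TICK_NSEC value and helper name are assumptions.

/* assumes HZ=1000, i.e. TICK_NSEC = 1000000; skipped_ticks is what
 * max_skipped_ticks() would return */
static unsigned long long clamp_rq_clock(unsigned long long clock,
					 unsigned long long delta,
					 unsigned long long tick_timestamp,
					 unsigned long long skipped_ticks)
{
	unsigned long long max_jump = skipped_ticks * 1000000ULL;
	unsigned long long max_time = tick_timestamp + max_jump;

	/* too large a forward jump: clamp to the allowed window */
	if (clock + delta > max_time)
		return clock < max_time ? max_time : clock + 1;

	return clock + delta;
}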
@@ -561,23 +722,6 @@ static void update_rq_clock(struct rq *rq) | |||
561 | #define task_rq(p) cpu_rq(task_cpu(p)) | 722 | #define task_rq(p) cpu_rq(task_cpu(p)) |
562 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 723 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
563 | 724 | ||
564 | unsigned long rt_needs_cpu(int cpu) | ||
565 | { | ||
566 | struct rq *rq = cpu_rq(cpu); | ||
567 | u64 delta; | ||
568 | |||
569 | if (!rq->rt_throttled) | ||
570 | return 0; | ||
571 | |||
572 | if (rq->clock > rq->rt_period_expire) | ||
573 | return 1; | ||
574 | |||
575 | delta = rq->rt_period_expire - rq->clock; | ||
576 | do_div(delta, NSEC_PER_SEC / HZ); | ||
577 | |||
578 | return (unsigned long)delta; | ||
579 | } | ||
580 | |||
581 | /* | 725 | /* |
582 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | 726 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: |
583 | */ | 727 | */ |
@@ -590,22 +734,137 @@ unsigned long rt_needs_cpu(int cpu) | |||
590 | /* | 734 | /* |
591 | * Debugging: various feature bits | 735 | * Debugging: various feature bits |
592 | */ | 736 | */ |
737 | |||
738 | #define SCHED_FEAT(name, enabled) \ | ||
739 | __SCHED_FEAT_##name , | ||
740 | |||
593 | enum { | 741 | enum { |
594 | SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, | 742 | #include "sched_features.h" |
595 | SCHED_FEAT_WAKEUP_PREEMPT = 2, | ||
596 | SCHED_FEAT_START_DEBIT = 4, | ||
597 | SCHED_FEAT_HRTICK = 8, | ||
598 | SCHED_FEAT_DOUBLE_TICK = 16, | ||
599 | }; | 743 | }; |
600 | 744 | ||
745 | #undef SCHED_FEAT | ||
746 | |||
747 | #define SCHED_FEAT(name, enabled) \ | ||
748 | (1UL << __SCHED_FEAT_##name) * enabled | | ||
749 | |||
601 | const_debug unsigned int sysctl_sched_features = | 750 | const_debug unsigned int sysctl_sched_features = |
602 | SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | | 751 | #include "sched_features.h" |
603 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | | 752 | 0; |
604 | SCHED_FEAT_START_DEBIT * 1 | | 753 | |
605 | SCHED_FEAT_HRTICK * 1 | | 754 | #undef SCHED_FEAT |
606 | SCHED_FEAT_DOUBLE_TICK * 0; | 755 | |
756 | #ifdef CONFIG_SCHED_DEBUG | ||
757 | #define SCHED_FEAT(name, enabled) \ | ||
758 | #name , | ||
759 | |||
760 | __read_mostly char *sched_feat_names[] = { | ||
761 | #include "sched_features.h" | ||
762 | NULL | ||
763 | }; | ||
764 | |||
765 | #undef SCHED_FEAT | ||
766 | |||
767 | int sched_feat_open(struct inode *inode, struct file *filp) | ||
768 | { | ||
769 | filp->private_data = inode->i_private; | ||
770 | return 0; | ||
771 | } | ||
772 | |||
773 | static ssize_t | ||
774 | sched_feat_read(struct file *filp, char __user *ubuf, | ||
775 | size_t cnt, loff_t *ppos) | ||
776 | { | ||
777 | char *buf; | ||
778 | int r = 0; | ||
779 | int len = 0; | ||
780 | int i; | ||
607 | 781 | ||
608 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) | 782 | for (i = 0; sched_feat_names[i]; i++) { |
783 | len += strlen(sched_feat_names[i]); | ||
784 | len += 4; | ||
785 | } | ||
786 | |||
787 | buf = kmalloc(len + 2, GFP_KERNEL); | ||
788 | if (!buf) | ||
789 | return -ENOMEM; | ||
790 | |||
791 | for (i = 0; sched_feat_names[i]; i++) { | ||
792 | if (sysctl_sched_features & (1UL << i)) | ||
793 | r += sprintf(buf + r, "%s ", sched_feat_names[i]); | ||
794 | else | ||
795 | r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]); | ||
796 | } | ||
797 | |||
798 | r += sprintf(buf + r, "\n"); | ||
799 | WARN_ON(r >= len + 2); | ||
800 | |||
801 | r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | ||
802 | |||
803 | kfree(buf); | ||
804 | |||
805 | return r; | ||
806 | } | ||
807 | |||
808 | static ssize_t | ||
809 | sched_feat_write(struct file *filp, const char __user *ubuf, | ||
810 | size_t cnt, loff_t *ppos) | ||
811 | { | ||
812 | char buf[64]; | ||
813 | char *cmp = buf; | ||
814 | int neg = 0; | ||
815 | int i; | ||
816 | |||
817 | if (cnt > 63) | ||
818 | cnt = 63; | ||
819 | |||
820 | if (copy_from_user(&buf, ubuf, cnt)) | ||
821 | return -EFAULT; | ||
822 | |||
823 | buf[cnt] = 0; | ||
824 | |||
825 | if (strncmp(buf, "NO_", 3) == 0) { | ||
826 | neg = 1; | ||
827 | cmp += 3; | ||
828 | } | ||
829 | |||
830 | for (i = 0; sched_feat_names[i]; i++) { | ||
831 | int len = strlen(sched_feat_names[i]); | ||
832 | |||
833 | if (strncmp(cmp, sched_feat_names[i], len) == 0) { | ||
834 | if (neg) | ||
835 | sysctl_sched_features &= ~(1UL << i); | ||
836 | else | ||
837 | sysctl_sched_features |= (1UL << i); | ||
838 | break; | ||
839 | } | ||
840 | } | ||
841 | |||
842 | if (!sched_feat_names[i]) | ||
843 | return -EINVAL; | ||
844 | |||
845 | filp->f_pos += cnt; | ||
846 | |||
847 | return cnt; | ||
848 | } | ||
849 | |||
850 | static struct file_operations sched_feat_fops = { | ||
851 | .open = sched_feat_open, | ||
852 | .read = sched_feat_read, | ||
853 | .write = sched_feat_write, | ||
854 | }; | ||
855 | |||
856 | static __init int sched_init_debug(void) | ||
857 | { | ||
858 | debugfs_create_file("sched_features", 0644, NULL, NULL, | ||
859 | &sched_feat_fops); | ||
860 | |||
861 | return 0; | ||
862 | } | ||
863 | late_initcall(sched_init_debug); | ||
864 | |||
865 | #endif | ||
866 | |||
867 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | ||
609 | 868 | ||
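The SCHED_FEAT() construction above is an X-macro: sched_features.h (not shown in this hunk) is included three times with different expansions to generate the enum of bit positions, the default bitmask and the printable names. A stand-alone sketch of the same technique, using feature names taken from the removed enum; the list macro itself is an editor's stand-in for the header:

#define FEATURE_LIST \
	FEAT(NEW_FAIR_SLEEPERS, 1) \
	FEAT(WAKEUP_PREEMPT, 1) \
	FEAT(DOUBLE_TICK, 0)

/* 1st expansion: enum of bit positions */
#define FEAT(name, enabled) __FEAT_##name,
enum { FEATURE_LIST };
#undef FEAT

/* 2nd expansion: default feature bitmask */
#define FEAT(name, enabled) (1UL << __FEAT_##name) * (enabled) |
static unsigned long features = FEATURE_LIST 0;
#undef FEAT

/* 3rd expansion: printable names, e.g. for a debugfs file */
#define FEAT(name, enabled) #name,
static const char *feature_names[] = { FEATURE_LIST NULL };
#undef FEAT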
610 | /* | 869 | /* |
611 | * Number of tasks to iterate in a single balance run. | 870 | * Number of tasks to iterate in a single balance run. |
@@ -627,16 +886,52 @@ static __read_mostly int scheduler_running; | |||
627 | */ | 886 | */ |
628 | int sysctl_sched_rt_runtime = 950000; | 887 | int sysctl_sched_rt_runtime = 950000; |
629 | 888 | ||
630 | /* | 889 | static inline u64 global_rt_period(void) |
631 | * single value that denotes runtime == period, ie unlimited time. | 890 | { |
632 | */ | 891 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; |
633 | #define RUNTIME_INF ((u64)~0ULL) | 892 | } |
893 | |||
894 | static inline u64 global_rt_runtime(void) | ||
895 | { | ||
896 | if (sysctl_sched_rt_period < 0) | ||
897 | return RUNTIME_INF; | ||
898 | |||
899 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | ||
900 | } | ||
901 | |||
902 | static const unsigned long long time_sync_thresh = 100000; | ||
903 | |||
904 | static DEFINE_PER_CPU(unsigned long long, time_offset); | ||
905 | static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); | ||
634 | 906 | ||
635 | /* | 907 | /* |
636 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 908 | * Global lock which we take every now and then to synchronize |
637 | * clock constructed from sched_clock(): | 909 | * the CPUs' time. This method is not warp-safe, but it's good

910 | * enough to synchronize slowly diverging time sources and thus | ||
911 | * it's good enough for tracing: | ||
638 | */ | 912 | */ |
639 | unsigned long long cpu_clock(int cpu) | 913 | static DEFINE_SPINLOCK(time_sync_lock); |
914 | static unsigned long long prev_global_time; | ||
915 | |||
916 | static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) | ||
917 | { | ||
918 | unsigned long flags; | ||
919 | |||
920 | spin_lock_irqsave(&time_sync_lock, flags); | ||
921 | |||
922 | if (time < prev_global_time) { | ||
923 | per_cpu(time_offset, cpu) += prev_global_time - time; | ||
924 | time = prev_global_time; | ||
925 | } else { | ||
926 | prev_global_time = time; | ||
927 | } | ||
928 | |||
929 | spin_unlock_irqrestore(&time_sync_lock, flags); | ||
930 | |||
931 | return time; | ||
932 | } | ||
933 | |||
934 | static unsigned long long __cpu_clock(int cpu) | ||
640 | { | 935 | { |
641 | unsigned long long now; | 936 | unsigned long long now; |
642 | unsigned long flags; | 937 | unsigned long flags; |
@@ -657,6 +952,24 @@ unsigned long long cpu_clock(int cpu) | |||
657 | 952 | ||
658 | return now; | 953 | return now; |
659 | } | 954 | } |
955 | |||
956 | /* | ||
957 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | ||
958 | * clock constructed from sched_clock(): | ||
959 | */ | ||
960 | unsigned long long cpu_clock(int cpu) | ||
961 | { | ||
962 | unsigned long long prev_cpu_time, time, delta_time; | ||
963 | |||
964 | prev_cpu_time = per_cpu(prev_cpu_time, cpu); | ||
965 | time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); | ||
966 | delta_time = time - prev_cpu_time; | ||
967 | |||
968 | if (unlikely(delta_time > time_sync_thresh)) | ||
969 | time = __sync_cpu_clock(time, cpu); | ||
970 | |||
971 | return time; | ||
972 | } | ||
660 | EXPORT_SYMBOL_GPL(cpu_clock); | 973 | EXPORT_SYMBOL_GPL(cpu_clock); |
661 | 974 | ||
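Editor's sketch of the invariant the per-cpu offset maintains: whenever a CPU's local clock is seen to lag the last globally observed value, the gap is folded into that CPU's offset, so cpu_clock() never falls behind prev_global_time. The real code only takes this path when the local clock has moved more than time_sync_thresh since the last reading, and does so under time_sync_lock; the model below drops the locking for brevity.

#define NR_MODEL_CPUS 8

static unsigned long long prev_global;
static unsigned long long offset[NR_MODEL_CPUS];

static unsigned long long model_cpu_clock(int cpu, unsigned long long local)
{
	unsigned long long t = local + offset[cpu];

	if (t < prev_global) {
		/* this cpu lags: absorb the gap into its offset */
		offset[cpu] += prev_global - t;
		t = prev_global;
	} else {
		/* this cpu leads: raise the global watermark */
		prev_global = t;
	}
	return t;
}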
662 | #ifndef prepare_arch_switch | 975 | #ifndef prepare_arch_switch |
@@ -1116,6 +1429,9 @@ static void __resched_task(struct task_struct *p, int tif_bit) | |||
1116 | */ | 1429 | */ |
1117 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | 1430 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) |
1118 | 1431 | ||
1432 | /* | ||
1433 | * delta *= weight / lw | ||
1434 | */ | ||
1119 | static unsigned long | 1435 | static unsigned long |
1120 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | 1436 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, |
1121 | struct load_weight *lw) | 1437 | struct load_weight *lw) |
@@ -1138,12 +1454,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, | |||
1138 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | 1454 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); |
1139 | } | 1455 | } |
1140 | 1456 | ||
1141 | static inline unsigned long | ||
1142 | calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) | ||
1143 | { | ||
1144 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); | ||
1145 | } | ||
1146 | |||
1147 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | 1457 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
1148 | { | 1458 | { |
1149 | lw->weight += inc; | 1459 | lw->weight += inc; |
@@ -1241,11 +1551,390 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); | |||
1241 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | 1551 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} |
1242 | #endif | 1552 | #endif |
1243 | 1553 | ||
1554 | static inline void inc_cpu_load(struct rq *rq, unsigned long load) | ||
1555 | { | ||
1556 | update_load_add(&rq->load, load); | ||
1557 | } | ||
1558 | |||
1559 | static inline void dec_cpu_load(struct rq *rq, unsigned long load) | ||
1560 | { | ||
1561 | update_load_sub(&rq->load, load); | ||
1562 | } | ||
1563 | |||
1244 | #ifdef CONFIG_SMP | 1564 | #ifdef CONFIG_SMP |
1245 | static unsigned long source_load(int cpu, int type); | 1565 | static unsigned long source_load(int cpu, int type); |
1246 | static unsigned long target_load(int cpu, int type); | 1566 | static unsigned long target_load(int cpu, int type); |
1247 | static unsigned long cpu_avg_load_per_task(int cpu); | 1567 | static unsigned long cpu_avg_load_per_task(int cpu); |
1248 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | 1568 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); |
1569 | |||
1570 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1571 | |||
1572 | /* | ||
1573 | * Group load balancing. | ||
1574 | * | ||
1575 | * We calculate a few balance-domain-wide aggregate numbers: load and weight. | ||
1576 | * Given the picture below, and assuming each item has equal weight: | ||
1577 | * | ||
1578 | * root 1 - thread | ||
1579 | * / | \ A - group | ||
1580 | * A 1 B | ||
1581 | * /|\ / \ | ||
1582 | * C 2 D 3 4 | ||
1583 | * | | | ||
1584 | * 5 6 | ||
1585 | * | ||
1586 | * load: | ||
1587 | * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd, | ||
1588 | * which equals 1/9-th of the total load. | ||
1589 | * | ||
1590 | * shares: | ||
1591 | * The weight of this group on the selected cpus. | ||
1592 | * | ||
1593 | * rq_weight: | ||
1594 | * Direct sum of all the cpus' rq weights, e.g. A would get 3 while | ||
1595 | * B would get 2. | ||
1596 | * | ||
1597 | * task_weight: | ||
1598 | * Part of the rq_weight contributed by tasks; all groups except B would | ||
1599 | * get 1, B gets 2. | ||
1600 | */ | ||
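/*
 * Editor's note -- worked numbers for the picture above, assuming every
 * task and group entity has weight 1 (hypothetical, for illustration):
 *
 *   load:        f(A) = 1/3 of the total,   f(C) = 1/3 * 1/3 = 1/9
 *   rq_weight:   A = 3 (entities C, 2, D),  B = 2 (tasks 3, 4)
 *   task_weight: A = 1 (only task 2),       B = 2 (tasks 3, 4)
 *
 * Only the task_weight portion can be influenced by migrating tasks,
 * which is why it is tracked separately from rq_weight.
 */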
1601 | |||
1602 | static inline struct aggregate_struct * | ||
1603 | aggregate(struct task_group *tg, struct sched_domain *sd) | ||
1604 | { | ||
1605 | return &tg->cfs_rq[sd->first_cpu]->aggregate; | ||
1606 | } | ||
1607 | |||
1608 | typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); | ||
1609 | |||
1610 | /* | ||
1611 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
1612 | * leaving it for the final time. | ||
1613 | */ | ||
1614 | static | ||
1615 | void aggregate_walk_tree(aggregate_func down, aggregate_func up, | ||
1616 | struct sched_domain *sd) | ||
1617 | { | ||
1618 | struct task_group *parent, *child; | ||
1619 | |||
1620 | rcu_read_lock(); | ||
1621 | parent = &root_task_group; | ||
1622 | down: | ||
1623 | (*down)(parent, sd); | ||
1624 | list_for_each_entry_rcu(child, &parent->children, siblings) { | ||
1625 | parent = child; | ||
1626 | goto down; | ||
1627 | |||
1628 | up: | ||
1629 | continue; | ||
1630 | } | ||
1631 | (*up)(parent, sd); | ||
1632 | |||
1633 | child = parent; | ||
1634 | parent = parent->parent; | ||
1635 | if (parent) | ||
1636 | goto up; | ||
1637 | rcu_read_unlock(); | ||
1638 | } | ||
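/*
 * Editor's sketch (not part of the patch): the goto construction above is
 * equivalent to this recursion -- @down in pre-order, @up in post-order --
 * but walks back up through tg->parent instead of consuming stack frames.
 * The caller is assumed to hold rcu_read_lock().
 */
static void walk_tree_recursive(aggregate_func down, aggregate_func up,
				struct task_group *tg, struct sched_domain *sd)
{
	struct task_group *child;

	(*down)(tg, sd);
	list_for_each_entry_rcu(child, &tg->children, siblings)
		walk_tree_recursive(down, up, child, sd);
	(*up)(tg, sd);
}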
1639 | |||
1640 | /* | ||
1641 | * Calculate the aggregate runqueue weight. | ||
1642 | */ | ||
1643 | static | ||
1644 | void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) | ||
1645 | { | ||
1646 | unsigned long rq_weight = 0; | ||
1647 | unsigned long task_weight = 0; | ||
1648 | int i; | ||
1649 | |||
1650 | for_each_cpu_mask(i, sd->span) { | ||
1651 | rq_weight += tg->cfs_rq[i]->load.weight; | ||
1652 | task_weight += tg->cfs_rq[i]->task_weight; | ||
1653 | } | ||
1654 | |||
1655 | aggregate(tg, sd)->rq_weight = rq_weight; | ||
1656 | aggregate(tg, sd)->task_weight = task_weight; | ||
1657 | } | ||
1658 | |||
1659 | /* | ||
1660 | * Redistribute tg->shares amongst all tg->cfs_rq[]s. | ||
1661 | */ | ||
1662 | static void __aggregate_redistribute_shares(struct task_group *tg) | ||
1663 | { | ||
1664 | int i, max_cpu = smp_processor_id(); | ||
1665 | unsigned long rq_weight = 0; | ||
1666 | unsigned long shares, max_shares = 0, shares_rem = tg->shares; | ||
1667 | |||
1668 | for_each_possible_cpu(i) | ||
1669 | rq_weight += tg->cfs_rq[i]->load.weight; | ||
1670 | |||
1671 | for_each_possible_cpu(i) { | ||
1672 | /* | ||
1673 | * divide shares proportional to the rq_weights. | ||
1674 | */ | ||
1675 | shares = tg->shares * tg->cfs_rq[i]->load.weight; | ||
1676 | shares /= rq_weight + 1; | ||
1677 | |||
1678 | tg->cfs_rq[i]->shares = shares; | ||
1679 | |||
1680 | if (shares > max_shares) { | ||
1681 | max_shares = shares; | ||
1682 | max_cpu = i; | ||
1683 | } | ||
1684 | shares_rem -= shares; | ||
1685 | } | ||
1686 | |||
1687 | /* | ||
1688 | * Ensure it all adds up to tg->shares; we can lose a few | ||
1689 | * due to rounding down when computing the per-cpu shares. | ||
1690 | */ | ||
1691 | if (shares_rem) | ||
1692 | tg->cfs_rq[max_cpu]->shares += shares_rem; | ||
1693 | } | ||
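/*
 * Editor's note -- worked numbers for the remainder handling above
 * (hypothetical values): tg->shares = 1024 and per-cpu rq weights
 * {2048, 1024, 1024} give rq_weight = 4096, so with the +1 divisor:
 *
 *   cpu0: 1024 * 2048 / 4097 = 511
 *   cpu1: 1024 * 1024 / 4097 = 255
 *   cpu2: 1024 * 1024 / 4097 = 255
 *
 * which sums to 1021; the remaining 3 shares are handed to cpu0, the
 * cpu that already holds the largest portion.
 */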
1694 | |||
1695 | /* | ||
1696 | * Compute the weight of this group on the given cpus. | ||
1697 | */ | ||
1698 | static | ||
1699 | void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) | ||
1700 | { | ||
1701 | unsigned long shares = 0; | ||
1702 | int i; | ||
1703 | |||
1704 | again: | ||
1705 | for_each_cpu_mask(i, sd->span) | ||
1706 | shares += tg->cfs_rq[i]->shares; | ||
1707 | |||
1708 | /* | ||
1709 | * When the span doesn't have any shares assigned, but does have | ||
1710 | * tasks to run, do a machine-wide rebalance (should be rare). | ||
1711 | */ | ||
1712 | if (unlikely(!shares && aggregate(tg, sd)->rq_weight)) { | ||
1713 | __aggregate_redistribute_shares(tg); | ||
1714 | goto again; | ||
1715 | } | ||
1716 | |||
1717 | aggregate(tg, sd)->shares = shares; | ||
1718 | } | ||
1719 | |||
1720 | /* | ||
1721 | * Compute the load fraction assigned to this group, relies on the aggregate | ||
1722 | * weight and this group's parent's load, i.e. top-down. | ||
1723 | */ | ||
1724 | static | ||
1725 | void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) | ||
1726 | { | ||
1727 | unsigned long load; | ||
1728 | |||
1729 | if (!tg->parent) { | ||
1730 | int i; | ||
1731 | |||
1732 | load = 0; | ||
1733 | for_each_cpu_mask(i, sd->span) | ||
1734 | load += cpu_rq(i)->load.weight; | ||
1735 | |||
1736 | } else { | ||
1737 | load = aggregate(tg->parent, sd)->load; | ||
1738 | |||
1739 | /* | ||
1740 | * shares is our weight in the parent's rq so | ||
1741 | * shares/parent->rq_weight gives our fraction of the load | ||
1742 | */ | ||
1743 | load *= aggregate(tg, sd)->shares; | ||
1744 | load /= aggregate(tg->parent, sd)->rq_weight + 1; | ||
1745 | } | ||
1746 | |||
1747 | aggregate(tg, sd)->load = load; | ||
1748 | } | ||
1749 | |||
1750 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | ||
1751 | |||
1752 | /* | ||
1753 | * Calculate and set the cpu's group shares. | ||
1754 | */ | ||
1755 | static void | ||
1756 | __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, | ||
1757 | int tcpu) | ||
1758 | { | ||
1759 | int boost = 0; | ||
1760 | unsigned long shares; | ||
1761 | unsigned long rq_weight; | ||
1762 | |||
1763 | if (!tg->se[tcpu]) | ||
1764 | return; | ||
1765 | |||
1766 | rq_weight = tg->cfs_rq[tcpu]->load.weight; | ||
1767 | |||
1768 | /* | ||
1769 | * If there are currently no tasks on the cpu, pretend there is one of | ||
1770 | * average load so that when a new task gets to run here it will not | ||
1771 | * get delayed by group starvation. | ||
1772 | */ | ||
1773 | if (!rq_weight) { | ||
1774 | boost = 1; | ||
1775 | rq_weight = NICE_0_LOAD; | ||
1776 | } | ||
1777 | |||
1778 | /* | ||
1779 | * \Sum shares * rq_weight | ||
1780 | * shares = ----------------------- | ||
1781 | * \Sum rq_weight | ||
1782 | * | ||
1783 | */ | ||
1784 | shares = aggregate(tg, sd)->shares * rq_weight; | ||
1785 | shares /= aggregate(tg, sd)->rq_weight + 1; | ||
1786 | |||
1787 | /* | ||
1788 | * record the actual number of shares, not the boosted amount. | ||
1789 | */ | ||
1790 | tg->cfs_rq[tcpu]->shares = boost ? 0 : shares; | ||
1791 | |||
1792 | if (shares < MIN_SHARES) | ||
1793 | shares = MIN_SHARES; | ||
1794 | |||
1795 | __set_se_shares(tg->se[tcpu], shares); | ||
1796 | } | ||
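/*
 * Editor's note -- example numbers for the formula above (hypothetical):
 * with aggregate shares = 1024 and an aggregate rq_weight of 3072, a cpu
 * whose cfs_rq carries load.weight = 1024 receives
 *
 *   1024 * 1024 / 3073 = 341
 *
 * i.e. roughly one third of the group weight, minus rounding. A cpu with
 * no load is computed as if it carried NICE_0_LOAD (the "boost" case),
 * but its cfs_rq->shares records the actual amount, 0, not the boosted
 * one.
 */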
1797 | |||
1798 | /* | ||
1799 | * Re-adjust the weights on the cpu the task came from and on the cpu the | ||
1800 | * task went to. | ||
1801 | */ | ||
1802 | static void | ||
1803 | __move_group_shares(struct task_group *tg, struct sched_domain *sd, | ||
1804 | int scpu, int dcpu) | ||
1805 | { | ||
1806 | unsigned long shares; | ||
1807 | |||
1808 | shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; | ||
1809 | |||
1810 | __update_group_shares_cpu(tg, sd, scpu); | ||
1811 | __update_group_shares_cpu(tg, sd, dcpu); | ||
1812 | |||
1813 | /* | ||
1814 | * ensure we never lose shares due to rounding errors in the | ||
1815 | * above redistribution. | ||
1816 | */ | ||
1817 | shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; | ||
1818 | if (shares) | ||
1819 | tg->cfs_rq[dcpu]->shares += shares; | ||
1820 | } | ||
1821 | |||
1822 | /* | ||
1823 | * Because changing a group's shares changes the weight of the super-group | ||
1824 | * we need to walk up the tree and change all shares until we hit the root. | ||
1825 | */ | ||
1826 | static void | ||
1827 | move_group_shares(struct task_group *tg, struct sched_domain *sd, | ||
1828 | int scpu, int dcpu) | ||
1829 | { | ||
1830 | while (tg) { | ||
1831 | __move_group_shares(tg, sd, scpu, dcpu); | ||
1832 | tg = tg->parent; | ||
1833 | } | ||
1834 | } | ||
1835 | |||
1836 | static | ||
1837 | void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) | ||
1838 | { | ||
1839 | unsigned long shares = aggregate(tg, sd)->shares; | ||
1840 | int i; | ||
1841 | |||
1842 | for_each_cpu_mask(i, sd->span) { | ||
1843 | struct rq *rq = cpu_rq(i); | ||
1844 | unsigned long flags; | ||
1845 | |||
1846 | spin_lock_irqsave(&rq->lock, flags); | ||
1847 | __update_group_shares_cpu(tg, sd, i); | ||
1848 | spin_unlock_irqrestore(&rq->lock, flags); | ||
1849 | } | ||
1850 | |||
1851 | aggregate_group_shares(tg, sd); | ||
1852 | |||
1853 | /* | ||
1854 | * ensure we never lose shares due to rounding errors in the | ||
1855 | * above redistribution. | ||
1856 | */ | ||
1857 | shares -= aggregate(tg, sd)->shares; | ||
1858 | if (shares) { | ||
1859 | tg->cfs_rq[sd->first_cpu]->shares += shares; | ||
1860 | aggregate(tg, sd)->shares += shares; | ||
1861 | } | ||
1862 | } | ||
1863 | |||
1864 | /* | ||
1865 | * Calculate the cumulative weight and recursive load of each task group | ||
1866 | * while walking down the tree. | ||
1867 | */ | ||
1868 | static | ||
1869 | void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) | ||
1870 | { | ||
1871 | aggregate_group_weight(tg, sd); | ||
1872 | aggregate_group_shares(tg, sd); | ||
1873 | aggregate_group_load(tg, sd); | ||
1874 | } | ||
1875 | |||
1876 | /* | ||
1877 | * Rebalance the cpu shares while walking back up the tree. | ||
1878 | */ | ||
1879 | static | ||
1880 | void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) | ||
1881 | { | ||
1882 | aggregate_group_set_shares(tg, sd); | ||
1883 | } | ||
1884 | |||
1885 | static DEFINE_PER_CPU(spinlock_t, aggregate_lock); | ||
1886 | |||
1887 | static void __init init_aggregate(void) | ||
1888 | { | ||
1889 | int i; | ||
1890 | |||
1891 | for_each_possible_cpu(i) | ||
1892 | spin_lock_init(&per_cpu(aggregate_lock, i)); | ||
1893 | } | ||
1894 | |||
1895 | static int get_aggregate(struct sched_domain *sd) | ||
1896 | { | ||
1897 | if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) | ||
1898 | return 0; | ||
1899 | |||
1900 | aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); | ||
1901 | return 1; | ||
1902 | } | ||
1903 | |||
1904 | static void put_aggregate(struct sched_domain *sd) | ||
1905 | { | ||
1906 | spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); | ||
1907 | } | ||
1908 | |||
1909 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | ||
1910 | { | ||
1911 | cfs_rq->shares = shares; | ||
1912 | } | ||
1913 | |||
1914 | #else | ||
1915 | |||
1916 | static inline void init_aggregate(void) | ||
1917 | { | ||
1918 | } | ||
1919 | |||
1920 | static inline int get_aggregate(struct sched_domain *sd) | ||
1921 | { | ||
1922 | return 0; | ||
1923 | } | ||
1924 | |||
1925 | static inline void put_aggregate(struct sched_domain *sd) | ||
1926 | { | ||
1927 | } | ||
1928 | #endif | ||
1929 | |||
1930 | #else /* CONFIG_SMP */ | ||
1931 | |||
1932 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1933 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | ||
1934 | { | ||
1935 | } | ||
1936 | #endif | ||
1937 | |||
1249 | #endif /* CONFIG_SMP */ | 1938 | #endif /* CONFIG_SMP */ |
1250 | 1939 | ||
1251 | #include "sched_stats.h" | 1940 | #include "sched_stats.h" |
@@ -1258,26 +1947,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | |||
1258 | 1947 | ||
1259 | #define sched_class_highest (&rt_sched_class) | 1948 | #define sched_class_highest (&rt_sched_class) |
1260 | 1949 | ||
1261 | static inline void inc_load(struct rq *rq, const struct task_struct *p) | 1950 | static void inc_nr_running(struct rq *rq) |
1262 | { | ||
1263 | update_load_add(&rq->load, p->se.load.weight); | ||
1264 | } | ||
1265 | |||
1266 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | ||
1267 | { | ||
1268 | update_load_sub(&rq->load, p->se.load.weight); | ||
1269 | } | ||
1270 | |||
1271 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | ||
1272 | { | 1951 | { |
1273 | rq->nr_running++; | 1952 | rq->nr_running++; |
1274 | inc_load(rq, p); | ||
1275 | } | 1953 | } |
1276 | 1954 | ||
1277 | static void dec_nr_running(struct task_struct *p, struct rq *rq) | 1955 | static void dec_nr_running(struct rq *rq) |
1278 | { | 1956 | { |
1279 | rq->nr_running--; | 1957 | rq->nr_running--; |
1280 | dec_load(rq, p); | ||
1281 | } | 1958 | } |
1282 | 1959 | ||
1283 | static void set_load_weight(struct task_struct *p) | 1960 | static void set_load_weight(struct task_struct *p) |
@@ -1369,7 +2046,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
1369 | rq->nr_uninterruptible--; | 2046 | rq->nr_uninterruptible--; |
1370 | 2047 | ||
1371 | enqueue_task(rq, p, wakeup); | 2048 | enqueue_task(rq, p, wakeup); |
1372 | inc_nr_running(p, rq); | 2049 | inc_nr_running(rq); |
1373 | } | 2050 | } |
1374 | 2051 | ||
1375 | /* | 2052 | /* |
@@ -1381,7 +2058,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | |||
1381 | rq->nr_uninterruptible++; | 2058 | rq->nr_uninterruptible++; |
1382 | 2059 | ||
1383 | dequeue_task(rq, p, sleep); | 2060 | dequeue_task(rq, p, sleep); |
1384 | dec_nr_running(p, rq); | 2061 | dec_nr_running(rq); |
1385 | } | 2062 | } |
1386 | 2063 | ||
1387 | /** | 2064 | /** |
@@ -1438,7 +2115,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
1438 | /* | 2115 | /* |
1439 | * Buddy candidates are cache hot: | 2116 | * Buddy candidates are cache hot: |
1440 | */ | 2117 | */ |
1441 | if (&p->se == cfs_rq_of(&p->se)->next) | 2118 | if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) |
1442 | return 1; | 2119 | return 1; |
1443 | 2120 | ||
1444 | if (p->sched_class != &fair_sched_class) | 2121 | if (p->sched_class != &fair_sched_class) |
@@ -1728,17 +2405,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
1728 | * find_idlest_cpu - find the idlest cpu among the cpus in group. | 2405 | * find_idlest_cpu - find the idlest cpu among the cpus in group. |
1729 | */ | 2406 | */ |
1730 | static int | 2407 | static int |
1731 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | 2408 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, |
2409 | cpumask_t *tmp) | ||
1732 | { | 2410 | { |
1733 | cpumask_t tmp; | ||
1734 | unsigned long load, min_load = ULONG_MAX; | 2411 | unsigned long load, min_load = ULONG_MAX; |
1735 | int idlest = -1; | 2412 | int idlest = -1; |
1736 | int i; | 2413 | int i; |
1737 | 2414 | ||
1738 | /* Traverse only the allowed CPUs */ | 2415 | /* Traverse only the allowed CPUs */ |
1739 | cpus_and(tmp, group->cpumask, p->cpus_allowed); | 2416 | cpus_and(*tmp, group->cpumask, p->cpus_allowed); |
1740 | 2417 | ||
1741 | for_each_cpu_mask(i, tmp) { | 2418 | for_each_cpu_mask(i, *tmp) { |
1742 | load = weighted_cpuload(i); | 2419 | load = weighted_cpuload(i); |
1743 | 2420 | ||
1744 | if (load < min_load || (load == min_load && i == this_cpu)) { | 2421 | if (load < min_load || (load == min_load && i == this_cpu)) { |
@@ -1777,7 +2454,7 @@ static int sched_balance_self(int cpu, int flag) | |||
1777 | } | 2454 | } |
1778 | 2455 | ||
1779 | while (sd) { | 2456 | while (sd) { |
1780 | cpumask_t span; | 2457 | cpumask_t span, tmpmask; |
1781 | struct sched_group *group; | 2458 | struct sched_group *group; |
1782 | int new_cpu, weight; | 2459 | int new_cpu, weight; |
1783 | 2460 | ||
@@ -1793,7 +2470,7 @@ static int sched_balance_self(int cpu, int flag) | |||
1793 | continue; | 2470 | continue; |
1794 | } | 2471 | } |
1795 | 2472 | ||
1796 | new_cpu = find_idlest_cpu(group, t, cpu); | 2473 | new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); |
1797 | if (new_cpu == -1 || new_cpu == cpu) { | 2474 | if (new_cpu == -1 || new_cpu == cpu) { |
1798 | /* Now try balancing at a lower domain level of cpu */ | 2475 | /* Now try balancing at a lower domain level of cpu */ |
1799 | sd = sd->child; | 2476 | sd = sd->child; |
@@ -1839,6 +2516,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1839 | long old_state; | 2516 | long old_state; |
1840 | struct rq *rq; | 2517 | struct rq *rq; |
1841 | 2518 | ||
2519 | if (!sched_feat(SYNC_WAKEUPS)) | ||
2520 | sync = 0; | ||
2521 | |||
1842 | smp_wmb(); | 2522 | smp_wmb(); |
1843 | rq = task_rq_lock(p, &flags); | 2523 | rq = task_rq_lock(p, &flags); |
1844 | old_state = p->state; | 2524 | old_state = p->state; |
@@ -1955,6 +2635,7 @@ static void __sched_fork(struct task_struct *p) | |||
1955 | 2635 | ||
1956 | INIT_LIST_HEAD(&p->rt.run_list); | 2636 | INIT_LIST_HEAD(&p->rt.run_list); |
1957 | p->se.on_rq = 0; | 2637 | p->se.on_rq = 0; |
2638 | INIT_LIST_HEAD(&p->se.group_node); | ||
1958 | 2639 | ||
1959 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2640 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
1960 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 2641 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
@@ -2030,7 +2711,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
2030 | * management (if any): | 2711 | * management (if any): |
2031 | */ | 2712 | */ |
2032 | p->sched_class->task_new(rq, p); | 2713 | p->sched_class->task_new(rq, p); |
2033 | inc_nr_running(p, rq); | 2714 | inc_nr_running(rq); |
2034 | } | 2715 | } |
2035 | check_preempt_curr(rq, p); | 2716 | check_preempt_curr(rq, p); |
2036 | #ifdef CONFIG_SMP | 2717 | #ifdef CONFIG_SMP |
@@ -2674,7 +3355,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2674 | static struct sched_group * | 3355 | static struct sched_group * |
2675 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 3356 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
2676 | unsigned long *imbalance, enum cpu_idle_type idle, | 3357 | unsigned long *imbalance, enum cpu_idle_type idle, |
2677 | int *sd_idle, cpumask_t *cpus, int *balance) | 3358 | int *sd_idle, const cpumask_t *cpus, int *balance) |
2678 | { | 3359 | { |
2679 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 3360 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
2680 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 3361 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
@@ -2975,7 +3656,7 @@ ret: | |||
2975 | */ | 3656 | */ |
2976 | static struct rq * | 3657 | static struct rq * |
2977 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | 3658 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, |
2978 | unsigned long imbalance, cpumask_t *cpus) | 3659 | unsigned long imbalance, const cpumask_t *cpus) |
2979 | { | 3660 | { |
2980 | struct rq *busiest = NULL, *rq; | 3661 | struct rq *busiest = NULL, *rq; |
2981 | unsigned long max_load = 0; | 3662 | unsigned long max_load = 0; |
@@ -3014,14 +3695,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
3014 | */ | 3695 | */ |
3015 | static int load_balance(int this_cpu, struct rq *this_rq, | 3696 | static int load_balance(int this_cpu, struct rq *this_rq, |
3016 | struct sched_domain *sd, enum cpu_idle_type idle, | 3697 | struct sched_domain *sd, enum cpu_idle_type idle, |
3017 | int *balance) | 3698 | int *balance, cpumask_t *cpus) |
3018 | { | 3699 | { |
3019 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | 3700 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; |
3020 | struct sched_group *group; | 3701 | struct sched_group *group; |
3021 | unsigned long imbalance; | 3702 | unsigned long imbalance; |
3022 | struct rq *busiest; | 3703 | struct rq *busiest; |
3023 | cpumask_t cpus = CPU_MASK_ALL; | ||
3024 | unsigned long flags; | 3704 | unsigned long flags; |
3705 | int unlock_aggregate; | ||
3706 | |||
3707 | cpus_setall(*cpus); | ||
3708 | |||
3709 | unlock_aggregate = get_aggregate(sd); | ||
3025 | 3710 | ||
3026 | /* | 3711 | /* |
3027 | * When power savings policy is enabled for the parent domain, idle | 3712 | * When power savings policy is enabled for the parent domain, idle |
@@ -3037,7 +3722,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
3037 | 3722 | ||
3038 | redo: | 3723 | redo: |
3039 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 3724 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
3040 | &cpus, balance); | 3725 | cpus, balance); |
3041 | 3726 | ||
3042 | if (*balance == 0) | 3727 | if (*balance == 0) |
3043 | goto out_balanced; | 3728 | goto out_balanced; |
@@ -3047,7 +3732,7 @@ redo: | |||
3047 | goto out_balanced; | 3732 | goto out_balanced; |
3048 | } | 3733 | } |
3049 | 3734 | ||
3050 | busiest = find_busiest_queue(group, idle, imbalance, &cpus); | 3735 | busiest = find_busiest_queue(group, idle, imbalance, cpus); |
3051 | if (!busiest) { | 3736 | if (!busiest) { |
3052 | schedstat_inc(sd, lb_nobusyq[idle]); | 3737 | schedstat_inc(sd, lb_nobusyq[idle]); |
3053 | goto out_balanced; | 3738 | goto out_balanced; |
@@ -3080,8 +3765,8 @@ redo: | |||
3080 | 3765 | ||
3081 | /* All tasks on this runqueue were pinned by CPU affinity */ | 3766 | /* All tasks on this runqueue were pinned by CPU affinity */ |
3082 | if (unlikely(all_pinned)) { | 3767 | if (unlikely(all_pinned)) { |
3083 | cpu_clear(cpu_of(busiest), cpus); | 3768 | cpu_clear(cpu_of(busiest), *cpus); |
3084 | if (!cpus_empty(cpus)) | 3769 | if (!cpus_empty(*cpus)) |
3085 | goto redo; | 3770 | goto redo; |
3086 | goto out_balanced; | 3771 | goto out_balanced; |
3087 | } | 3772 | } |
@@ -3138,8 +3823,9 @@ redo: | |||
3138 | 3823 | ||
3139 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3824 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
3140 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3825 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
3141 | return -1; | 3826 | ld_moved = -1; |
3142 | return ld_moved; | 3827 | |
3828 | goto out; | ||
3143 | 3829 | ||
3144 | out_balanced: | 3830 | out_balanced: |
3145 | schedstat_inc(sd, lb_balanced[idle]); | 3831 | schedstat_inc(sd, lb_balanced[idle]); |
@@ -3154,8 +3840,13 @@ out_one_pinned: | |||
3154 | 3840 | ||
3155 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3841 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
3156 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3842 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
3157 | return -1; | 3843 | ld_moved = -1; |
3158 | return 0; | 3844 | else |
3845 | ld_moved = 0; | ||
3846 | out: | ||
3847 | if (unlock_aggregate) | ||
3848 | put_aggregate(sd); | ||
3849 | return ld_moved; | ||
3159 | } | 3850 | } |
3160 | 3851 | ||
3161 | /* | 3852 | /* |
@@ -3166,7 +3857,8 @@ out_one_pinned: | |||
3166 | * this_rq is locked. | 3857 | * this_rq is locked. |
3167 | */ | 3858 | */ |
3168 | static int | 3859 | static int |
3169 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | 3860 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, |
3861 | cpumask_t *cpus) | ||
3170 | { | 3862 | { |
3171 | struct sched_group *group; | 3863 | struct sched_group *group; |
3172 | struct rq *busiest = NULL; | 3864 | struct rq *busiest = NULL; |
@@ -3174,7 +3866,8 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
3174 | int ld_moved = 0; | 3866 | int ld_moved = 0; |
3175 | int sd_idle = 0; | 3867 | int sd_idle = 0; |
3176 | int all_pinned = 0; | 3868 | int all_pinned = 0; |
3177 | cpumask_t cpus = CPU_MASK_ALL; | 3869 | |
3870 | cpus_setall(*cpus); | ||
3178 | 3871 | ||
3179 | /* | 3872 | /* |
3180 | * When power savings policy is enabled for the parent domain, idle | 3873 | * When power savings policy is enabled for the parent domain, idle |
@@ -3189,14 +3882,13 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
3189 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); | 3882 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); |
3190 | redo: | 3883 | redo: |
3191 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, | 3884 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, |
3192 | &sd_idle, &cpus, NULL); | 3885 | &sd_idle, cpus, NULL); |
3193 | if (!group) { | 3886 | if (!group) { |
3194 | schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); | 3887 | schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); |
3195 | goto out_balanced; | 3888 | goto out_balanced; |
3196 | } | 3889 | } |
3197 | 3890 | ||
3198 | busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, | 3891 | busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); |
3199 | &cpus); | ||
3200 | if (!busiest) { | 3892 | if (!busiest) { |
3201 | schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); | 3893 | schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); |
3202 | goto out_balanced; | 3894 | goto out_balanced; |
@@ -3218,8 +3910,8 @@ redo: | |||
3218 | spin_unlock(&busiest->lock); | 3910 | spin_unlock(&busiest->lock); |
3219 | 3911 | ||
3220 | if (unlikely(all_pinned)) { | 3912 | if (unlikely(all_pinned)) { |
3221 | cpu_clear(cpu_of(busiest), cpus); | 3913 | cpu_clear(cpu_of(busiest), *cpus); |
3222 | if (!cpus_empty(cpus)) | 3914 | if (!cpus_empty(*cpus)) |
3223 | goto redo; | 3915 | goto redo; |
3224 | } | 3916 | } |
3225 | } | 3917 | } |
@@ -3253,6 +3945,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3253 | struct sched_domain *sd; | 3945 | struct sched_domain *sd; |
3254 | int pulled_task = -1; | 3946 | int pulled_task = -1; |
3255 | unsigned long next_balance = jiffies + HZ; | 3947 | unsigned long next_balance = jiffies + HZ; |
3948 | cpumask_t tmpmask; | ||
3256 | 3949 | ||
3257 | for_each_domain(this_cpu, sd) { | 3950 | for_each_domain(this_cpu, sd) { |
3258 | unsigned long interval; | 3951 | unsigned long interval; |
@@ -3262,8 +3955,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3262 | 3955 | ||
3263 | if (sd->flags & SD_BALANCE_NEWIDLE) | 3956 | if (sd->flags & SD_BALANCE_NEWIDLE) |
3264 | /* If we've pulled tasks over stop searching: */ | 3957 | /* If we've pulled tasks over stop searching: */ |
3265 | pulled_task = load_balance_newidle(this_cpu, | 3958 | pulled_task = load_balance_newidle(this_cpu, this_rq, |
3266 | this_rq, sd); | 3959 | sd, &tmpmask); |
3267 | 3960 | ||
3268 | interval = msecs_to_jiffies(sd->balance_interval); | 3961 | interval = msecs_to_jiffies(sd->balance_interval); |
3269 | if (time_after(next_balance, sd->last_balance + interval)) | 3962 | if (time_after(next_balance, sd->last_balance + interval)) |
@@ -3422,6 +4115,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3422 | /* Earliest time when we have to do rebalance again */ | 4115 | /* Earliest time when we have to do rebalance again */ |
3423 | unsigned long next_balance = jiffies + 60*HZ; | 4116 | unsigned long next_balance = jiffies + 60*HZ; |
3424 | int update_next_balance = 0; | 4117 | int update_next_balance = 0; |
4118 | cpumask_t tmp; | ||
3425 | 4119 | ||
3426 | for_each_domain(cpu, sd) { | 4120 | for_each_domain(cpu, sd) { |
3427 | if (!(sd->flags & SD_LOAD_BALANCE)) | 4121 | if (!(sd->flags & SD_LOAD_BALANCE)) |
@@ -3445,7 +4139,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3445 | } | 4139 | } |
3446 | 4140 | ||
3447 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 4141 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
3448 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 4142 | if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { |
3449 | /* | 4143 | /* |
3450 | * We've pulled tasks over so either we're no | 4144 | * We've pulled tasks over so either we're no |
3451 | * longer idle, or one of our SMT siblings is | 4145 | * longer idle, or one of our SMT siblings is |
@@ -3561,7 +4255,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) | |||
3561 | */ | 4255 | */ |
3562 | int ilb = first_cpu(nohz.cpu_mask); | 4256 | int ilb = first_cpu(nohz.cpu_mask); |
3563 | 4257 | ||
3564 | if (ilb != NR_CPUS) | 4258 | if (ilb < nr_cpu_ids) |
3565 | resched_cpu(ilb); | 4259 | resched_cpu(ilb); |
3566 | } | 4260 | } |
3567 | } | 4261 | } |
@@ -3765,9 +4459,9 @@ void scheduler_tick(void) | |||
3765 | rq->clock_underflows++; | 4459 | rq->clock_underflows++; |
3766 | } | 4460 | } |
3767 | rq->tick_timestamp = rq->clock; | 4461 | rq->tick_timestamp = rq->clock; |
4462 | update_last_tick_seen(rq); | ||
3768 | update_cpu_load(rq); | 4463 | update_cpu_load(rq); |
3769 | curr->sched_class->task_tick(rq, curr, 0); | 4464 | curr->sched_class->task_tick(rq, curr, 0); |
3770 | update_sched_rt_period(rq); | ||
3771 | spin_unlock(&rq->lock); | 4465 | spin_unlock(&rq->lock); |
3772 | 4466 | ||
3773 | #ifdef CONFIG_SMP | 4467 | #ifdef CONFIG_SMP |
@@ -4367,10 +5061,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4367 | goto out_unlock; | 5061 | goto out_unlock; |
4368 | } | 5062 | } |
4369 | on_rq = p->se.on_rq; | 5063 | on_rq = p->se.on_rq; |
4370 | if (on_rq) { | 5064 | if (on_rq) |
4371 | dequeue_task(rq, p, 0); | 5065 | dequeue_task(rq, p, 0); |
4372 | dec_load(rq, p); | ||
4373 | } | ||
4374 | 5066 | ||
4375 | p->static_prio = NICE_TO_PRIO(nice); | 5067 | p->static_prio = NICE_TO_PRIO(nice); |
4376 | set_load_weight(p); | 5068 | set_load_weight(p); |
@@ -4380,7 +5072,6 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4380 | 5072 | ||
4381 | if (on_rq) { | 5073 | if (on_rq) { |
4382 | enqueue_task(rq, p, 0); | 5074 | enqueue_task(rq, p, 0); |
4383 | inc_load(rq, p); | ||
4384 | /* | 5075 | /* |
4385 | * If the task increased its priority or is running and | 5076 | * If the task increased its priority or is running and |
4386 | * lowered its priority, then reschedule its CPU: | 5077 | * lowered its priority, then reschedule its CPU: |
@@ -4602,7 +5293,7 @@ recheck: | |||
4602 | * Do not allow realtime tasks into groups that have no runtime | 5293 | * Do not allow realtime tasks into groups that have no runtime |
4603 | * assigned. | 5294 | * assigned. |
4604 | */ | 5295 | */ |
4605 | if (rt_policy(policy) && task_group(p)->rt_runtime == 0) | 5296 | if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) |
4606 | return -EPERM; | 5297 | return -EPERM; |
4607 | #endif | 5298 | #endif |
4608 | 5299 | ||
@@ -4764,9 +5455,10 @@ out_unlock: | |||
4764 | return retval; | 5455 | return retval; |
4765 | } | 5456 | } |
4766 | 5457 | ||
4767 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) | 5458 | long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) |
4768 | { | 5459 | { |
4769 | cpumask_t cpus_allowed; | 5460 | cpumask_t cpus_allowed; |
5461 | cpumask_t new_mask = *in_mask; | ||
4770 | struct task_struct *p; | 5462 | struct task_struct *p; |
4771 | int retval; | 5463 | int retval; |
4772 | 5464 | ||
@@ -4797,13 +5489,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
4797 | if (retval) | 5489 | if (retval) |
4798 | goto out_unlock; | 5490 | goto out_unlock; |
4799 | 5491 | ||
4800 | cpus_allowed = cpuset_cpus_allowed(p); | 5492 | cpuset_cpus_allowed(p, &cpus_allowed); |
4801 | cpus_and(new_mask, new_mask, cpus_allowed); | 5493 | cpus_and(new_mask, new_mask, cpus_allowed); |
4802 | again: | 5494 | again: |
4803 | retval = set_cpus_allowed(p, new_mask); | 5495 | retval = set_cpus_allowed_ptr(p, &new_mask); |
4804 | 5496 | ||
4805 | if (!retval) { | 5497 | if (!retval) { |
4806 | cpus_allowed = cpuset_cpus_allowed(p); | 5498 | cpuset_cpus_allowed(p, &cpus_allowed); |
4807 | if (!cpus_subset(new_mask, cpus_allowed)) { | 5499 | if (!cpus_subset(new_mask, cpus_allowed)) { |
4808 | /* | 5500 | /* |
4809 | * We must have raced with a concurrent cpuset | 5501 | * We must have raced with a concurrent cpuset |
@@ -4847,7 +5539,7 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, | |||
4847 | if (retval) | 5539 | if (retval) |
4848 | return retval; | 5540 | return retval; |
4849 | 5541 | ||
4850 | return sched_setaffinity(pid, new_mask); | 5542 | return sched_setaffinity(pid, &new_mask); |
4851 | } | 5543 | } |
4852 | 5544 | ||
4853 | /* | 5545 | /* |
@@ -5309,7 +6001,6 @@ static inline void sched_init_granularity(void) | |||
5309 | sysctl_sched_latency = limit; | 6001 | sysctl_sched_latency = limit; |
5310 | 6002 | ||
5311 | sysctl_sched_wakeup_granularity *= factor; | 6003 | sysctl_sched_wakeup_granularity *= factor; |
5312 | sysctl_sched_batch_wakeup_granularity *= factor; | ||
5313 | } | 6004 | } |
5314 | 6005 | ||
5315 | #ifdef CONFIG_SMP | 6006 | #ifdef CONFIG_SMP |
@@ -5338,7 +6029,7 @@ static inline void sched_init_granularity(void) | |||
5338 | * task must not exit() & deallocate itself prematurely. The | 6029 | * task must not exit() & deallocate itself prematurely. The |
5339 | * call is not atomic; no spinlocks may be held. | 6030 | * call is not atomic; no spinlocks may be held. |
5340 | */ | 6031 | */ |
5341 | int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | 6032 | int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) |
5342 | { | 6033 | { |
5343 | struct migration_req req; | 6034 | struct migration_req req; |
5344 | unsigned long flags; | 6035 | unsigned long flags; |
@@ -5346,23 +6037,23 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | |||
5346 | int ret = 0; | 6037 | int ret = 0; |
5347 | 6038 | ||
5348 | rq = task_rq_lock(p, &flags); | 6039 | rq = task_rq_lock(p, &flags); |
5349 | if (!cpus_intersects(new_mask, cpu_online_map)) { | 6040 | if (!cpus_intersects(*new_mask, cpu_online_map)) { |
5350 | ret = -EINVAL; | 6041 | ret = -EINVAL; |
5351 | goto out; | 6042 | goto out; |
5352 | } | 6043 | } |
5353 | 6044 | ||
5354 | if (p->sched_class->set_cpus_allowed) | 6045 | if (p->sched_class->set_cpus_allowed) |
5355 | p->sched_class->set_cpus_allowed(p, &new_mask); | 6046 | p->sched_class->set_cpus_allowed(p, new_mask); |
5356 | else { | 6047 | else { |
5357 | p->cpus_allowed = new_mask; | 6048 | p->cpus_allowed = *new_mask; |
5358 | p->rt.nr_cpus_allowed = cpus_weight(new_mask); | 6049 | p->rt.nr_cpus_allowed = cpus_weight(*new_mask); |
5359 | } | 6050 | } |
5360 | 6051 | ||
5361 | /* Can the task run on the task's current CPU? If so, we're done */ | 6052 | /* Can the task run on the task's current CPU? If so, we're done */ |
5362 | if (cpu_isset(task_cpu(p), new_mask)) | 6053 | if (cpu_isset(task_cpu(p), *new_mask)) |
5363 | goto out; | 6054 | goto out; |
5364 | 6055 | ||
5365 | if (migrate_task(p, any_online_cpu(new_mask), &req)) { | 6056 | if (migrate_task(p, any_online_cpu(*new_mask), &req)) { |
5366 | /* Need help from migration thread: drop lock and wait. */ | 6057 | /* Need help from migration thread: drop lock and wait. */ |
5367 | task_rq_unlock(rq, &flags); | 6058 | task_rq_unlock(rq, &flags); |
5368 | wake_up_process(rq->migration_thread); | 6059 | wake_up_process(rq->migration_thread); |
@@ -5375,7 +6066,7 @@ out: | |||
5375 | 6066 | ||
5376 | return ret; | 6067 | return ret; |
5377 | } | 6068 | } |
5378 | EXPORT_SYMBOL_GPL(set_cpus_allowed); | 6069 | EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); |
5379 | 6070 | ||
5380 | /* | 6071 | /* |
5381 | * Move (not current) task off this cpu, onto dest cpu. We're doing | 6072 | * Move (not current) task off this cpu, onto dest cpu. We're doing |
@@ -5513,12 +6204,14 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
5513 | dest_cpu = any_online_cpu(mask); | 6204 | dest_cpu = any_online_cpu(mask); |
5514 | 6205 | ||
5515 | /* On any allowed CPU? */ | 6206 | /* On any allowed CPU? */ |
5516 | if (dest_cpu == NR_CPUS) | 6207 | if (dest_cpu >= nr_cpu_ids) |
5517 | dest_cpu = any_online_cpu(p->cpus_allowed); | 6208 | dest_cpu = any_online_cpu(p->cpus_allowed); |
5518 | 6209 | ||
5519 | /* No more Mr. Nice Guy. */ | 6210 | /* No more Mr. Nice Guy. */ |
5520 | if (dest_cpu == NR_CPUS) { | 6211 | if (dest_cpu >= nr_cpu_ids) { |
5521 | cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p); | 6212 | cpumask_t cpus_allowed; |
6213 | |||
6214 | cpuset_cpus_allowed_locked(p, &cpus_allowed); | ||
5522 | /* | 6215 | /* |
5523 | * Try to stay on the same cpuset, where the | 6216 | * Try to stay on the same cpuset, where the |
5524 | * current cpuset may be a subset of all cpus. | 6217 | * current cpuset may be a subset of all cpus. |
@@ -5554,7 +6247,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
5554 | */ | 6247 | */ |
5555 | static void migrate_nr_uninterruptible(struct rq *rq_src) | 6248 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
5556 | { | 6249 | { |
5557 | struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); | 6250 | struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); |
5558 | unsigned long flags; | 6251 | unsigned long flags; |
5559 | 6252 | ||
5560 | local_irq_save(flags); | 6253 | local_irq_save(flags); |
@@ -5966,20 +6659,16 @@ void __init migration_init(void) | |||
5966 | 6659 | ||
5967 | #ifdef CONFIG_SMP | 6660 | #ifdef CONFIG_SMP |
5968 | 6661 | ||
5969 | /* Number of possible processor ids */ | ||
5970 | int nr_cpu_ids __read_mostly = NR_CPUS; | ||
5971 | EXPORT_SYMBOL(nr_cpu_ids); | ||
5972 | |||
5973 | #ifdef CONFIG_SCHED_DEBUG | 6662 | #ifdef CONFIG_SCHED_DEBUG |
5974 | 6663 | ||
5975 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) | 6664 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
6665 | cpumask_t *groupmask) | ||
5976 | { | 6666 | { |
5977 | struct sched_group *group = sd->groups; | 6667 | struct sched_group *group = sd->groups; |
5978 | cpumask_t groupmask; | 6668 | char str[256]; |
5979 | char str[NR_CPUS]; | ||
5980 | 6669 | ||
5981 | cpumask_scnprintf(str, NR_CPUS, sd->span); | 6670 | cpulist_scnprintf(str, sizeof(str), sd->span); |
5982 | cpus_clear(groupmask); | 6671 | cpus_clear(*groupmask); |
5983 | 6672 | ||
5984 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); | 6673 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); |
5985 | 6674 | ||
@@ -6023,25 +6712,25 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) | |||
6023 | break; | 6712 | break; |
6024 | } | 6713 | } |
6025 | 6714 | ||
6026 | if (cpus_intersects(groupmask, group->cpumask)) { | 6715 | if (cpus_intersects(*groupmask, group->cpumask)) { |
6027 | printk(KERN_CONT "\n"); | 6716 | printk(KERN_CONT "\n"); |
6028 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 6717 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
6029 | break; | 6718 | break; |
6030 | } | 6719 | } |
6031 | 6720 | ||
6032 | cpus_or(groupmask, groupmask, group->cpumask); | 6721 | cpus_or(*groupmask, *groupmask, group->cpumask); |
6033 | 6722 | ||
6034 | cpumask_scnprintf(str, NR_CPUS, group->cpumask); | 6723 | cpulist_scnprintf(str, sizeof(str), group->cpumask); |
6035 | printk(KERN_CONT " %s", str); | 6724 | printk(KERN_CONT " %s", str); |
6036 | 6725 | ||
6037 | group = group->next; | 6726 | group = group->next; |
6038 | } while (group != sd->groups); | 6727 | } while (group != sd->groups); |
6039 | printk(KERN_CONT "\n"); | 6728 | printk(KERN_CONT "\n"); |
6040 | 6729 | ||
6041 | if (!cpus_equal(sd->span, groupmask)) | 6730 | if (!cpus_equal(sd->span, *groupmask)) |
6042 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | 6731 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); |
6043 | 6732 | ||
6044 | if (sd->parent && !cpus_subset(groupmask, sd->parent->span)) | 6733 | if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) |
6045 | printk(KERN_ERR "ERROR: parent span is not a superset " | 6734 | printk(KERN_ERR "ERROR: parent span is not a superset " |
6046 | "of domain->span\n"); | 6735 | "of domain->span\n"); |
6047 | return 0; | 6736 | return 0; |
@@ -6049,6 +6738,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) | |||
6049 | 6738 | ||
6050 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 6739 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
6051 | { | 6740 | { |
6741 | cpumask_t *groupmask; | ||
6052 | int level = 0; | 6742 | int level = 0; |
6053 | 6743 | ||
6054 | if (!sd) { | 6744 | if (!sd) { |
@@ -6058,14 +6748,21 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
6058 | 6748 | ||
6059 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 6749 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
6060 | 6750 | ||
6751 | groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); | ||
6752 | if (!groupmask) { | ||
6753 | printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); | ||
6754 | return; | ||
6755 | } | ||
6756 | |||
6061 | for (;;) { | 6757 | for (;;) { |
6062 | if (sched_domain_debug_one(sd, cpu, level)) | 6758 | if (sched_domain_debug_one(sd, cpu, level, groupmask)) |
6063 | break; | 6759 | break; |
6064 | level++; | 6760 | level++; |
6065 | sd = sd->parent; | 6761 | sd = sd->parent; |
6066 | if (!sd) | 6762 | if (!sd) |
6067 | break; | 6763 | break; |
6068 | } | 6764 | } |
6765 | kfree(groupmask); | ||
6069 | } | 6766 | } |
6070 | #else | 6767 | #else |
6071 | # define sched_domain_debug(sd, cpu) do { } while (0) | 6768 | # define sched_domain_debug(sd, cpu) do { } while (0) |
@@ -6253,30 +6950,33 @@ __setup("isolcpus=", isolated_cpu_setup); | |||
6253 | * and ->cpu_power to 0. | 6950 | * and ->cpu_power to 0. |
6254 | */ | 6951 | */ |
6255 | static void | 6952 | static void |
6256 | init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, | 6953 | init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, |
6257 | int (*group_fn)(int cpu, const cpumask_t *cpu_map, | 6954 | int (*group_fn)(int cpu, const cpumask_t *cpu_map, |
6258 | struct sched_group **sg)) | 6955 | struct sched_group **sg, |
6956 | cpumask_t *tmpmask), | ||
6957 | cpumask_t *covered, cpumask_t *tmpmask) | ||
6259 | { | 6958 | { |
6260 | struct sched_group *first = NULL, *last = NULL; | 6959 | struct sched_group *first = NULL, *last = NULL; |
6261 | cpumask_t covered = CPU_MASK_NONE; | ||
6262 | int i; | 6960 | int i; |
6263 | 6961 | ||
6264 | for_each_cpu_mask(i, span) { | 6962 | cpus_clear(*covered); |
6963 | |||
6964 | for_each_cpu_mask(i, *span) { | ||
6265 | struct sched_group *sg; | 6965 | struct sched_group *sg; |
6266 | int group = group_fn(i, cpu_map, &sg); | 6966 | int group = group_fn(i, cpu_map, &sg, tmpmask); |
6267 | int j; | 6967 | int j; |
6268 | 6968 | ||
6269 | if (cpu_isset(i, covered)) | 6969 | if (cpu_isset(i, *covered)) |
6270 | continue; | 6970 | continue; |
6271 | 6971 | ||
6272 | sg->cpumask = CPU_MASK_NONE; | 6972 | cpus_clear(sg->cpumask); |
6273 | sg->__cpu_power = 0; | 6973 | sg->__cpu_power = 0; |
6274 | 6974 | ||
6275 | for_each_cpu_mask(j, span) { | 6975 | for_each_cpu_mask(j, *span) { |
6276 | if (group_fn(j, cpu_map, NULL) != group) | 6976 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) |
6277 | continue; | 6977 | continue; |
6278 | 6978 | ||
6279 | cpu_set(j, covered); | 6979 | cpu_set(j, *covered); |
6280 | cpu_set(j, sg->cpumask); | 6980 | cpu_set(j, sg->cpumask); |
6281 | } | 6981 | } |
6282 | if (!first) | 6982 | if (!first) |
@@ -6302,7 +7002,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, | |||
6302 | * | 7002 | * |
6303 | * Should use nodemask_t. | 7003 | * Should use nodemask_t. |
6304 | */ | 7004 | */ |
6305 | static int find_next_best_node(int node, unsigned long *used_nodes) | 7005 | static int find_next_best_node(int node, nodemask_t *used_nodes) |
6306 | { | 7006 | { |
6307 | int i, n, val, min_val, best_node = 0; | 7007 | int i, n, val, min_val, best_node = 0; |
6308 | 7008 | ||
@@ -6316,7 +7016,7 @@ static int find_next_best_node(int node, unsigned long *used_nodes) | |||
6316 | continue; | 7016 | continue; |
6317 | 7017 | ||
6318 | /* Skip already used nodes */ | 7018 | /* Skip already used nodes */ |
6319 | if (test_bit(n, used_nodes)) | 7019 | if (node_isset(n, *used_nodes)) |
6320 | continue; | 7020 | continue; |
6321 | 7021 | ||
6322 | /* Simple min distance search */ | 7022 | /* Simple min distance search */ |
@@ -6328,40 +7028,36 @@ static int find_next_best_node(int node, unsigned long *used_nodes) | |||
6328 | } | 7028 | } |
6329 | } | 7029 | } |
6330 | 7030 | ||
6331 | set_bit(best_node, used_nodes); | 7031 | node_set(best_node, *used_nodes); |
6332 | return best_node; | 7032 | return best_node; |
6333 | } | 7033 | } |
6334 | 7034 | ||
6335 | /** | 7035 | /** |
6336 | * sched_domain_node_span - get a cpumask for a node's sched_domain | 7036 | * sched_domain_node_span - get a cpumask for a node's sched_domain |
6337 | * @node: node whose cpumask we're constructing | 7037 | * @node: node whose cpumask we're constructing |
6338 | * @size: number of nodes to include in this span | ||
6339 | * | 7038 | * |
6340 | * Given a node, construct a good cpumask for its sched_domain to span. It | 7039 | * Given a node, construct a good cpumask for its sched_domain to span. It |
6341 | * should be one that prevents unnecessary balancing, but also spreads tasks | 7040 | * should be one that prevents unnecessary balancing, but also spreads tasks |
6342 | * out optimally. | 7041 | * out optimally. |
6343 | */ | 7042 | */ |
6344 | static cpumask_t sched_domain_node_span(int node) | 7043 | static void sched_domain_node_span(int node, cpumask_t *span) |
6345 | { | 7044 | { |
6346 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); | 7045 | nodemask_t used_nodes; |
6347 | cpumask_t span, nodemask; | 7046 | node_to_cpumask_ptr(nodemask, node); |
6348 | int i; | 7047 | int i; |
6349 | 7048 | ||
6350 | cpus_clear(span); | 7049 | cpus_clear(*span); |
6351 | bitmap_zero(used_nodes, MAX_NUMNODES); | 7050 | nodes_clear(used_nodes); |
6352 | 7051 | ||
6353 | nodemask = node_to_cpumask(node); | 7052 | cpus_or(*span, *span, *nodemask); |
6354 | cpus_or(span, span, nodemask); | 7053 | node_set(node, used_nodes); |
6355 | set_bit(node, used_nodes); | ||
6356 | 7054 | ||
6357 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 7055 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
6358 | int next_node = find_next_best_node(node, used_nodes); | 7056 | int next_node = find_next_best_node(node, &used_nodes); |
6359 | 7057 | ||
6360 | nodemask = node_to_cpumask(next_node); | 7058 | node_to_cpumask_ptr_next(nodemask, next_node); |
6361 | cpus_or(span, span, nodemask); | 7059 | cpus_or(*span, *span, *nodemask); |
6362 | } | 7060 | } |
6363 | |||
6364 | return span; | ||
6365 | } | 7061 | } |
6366 | #endif | 7062 | #endif |
6367 | 7063 | ||
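sched_domain_node_span() now fills a caller-provided mask and tracks used nodes with a nodemask_t, but the construction itself is unchanged: start from the home node and greedily add the nearest unused node until SD_NODES_PER_DOMAIN nodes are in the span. A simplified userspace model, with an invented 4-node distance table:

#include <stdio.h>
#include <limits.h>

#define NODES            4
#define NODES_PER_DOMAIN 3      /* stand-in for SD_NODES_PER_DOMAIN */

static const int dist[NODES][NODES] = {        /* invented distances */
        { 10, 20, 30, 20 },
        { 20, 10, 20, 30 },
        { 30, 20, 10, 20 },
        { 20, 30, 20, 10 },
};

/* nearest node not yet in the span, like find_next_best_node() */
static int next_best(int node, const int *used)
{
        int i, best = 0, best_d = INT_MAX;

        for (i = 0; i < NODES; i++) {
                if (used[i])
                        continue;               /* skip already used nodes */
                if (dist[node][i] < best_d) {
                        best_d = dist[node][i];
                        best = i;
                }
        }
        return best;
}

int main(void)
{
        int used[NODES] = { 0 }, i, node = 0;

        used[node] = 1;
        printf("span of node %d:", node);
        for (i = 1; i < NODES_PER_DOMAIN; i++) {
                int n = next_best(node, used);

                used[n] = 1;
                printf(" +node %d", n);
        }
        printf("\n");
        return 0;
}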
@@ -6375,7 +7071,8 @@ static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | |||
6375 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); | 7071 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); |
6376 | 7072 | ||
6377 | static int | 7073 | static int |
6378 | cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 7074 | cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, |
7075 | cpumask_t *unused) | ||
6379 | { | 7076 | { |
6380 | if (sg) | 7077 | if (sg) |
6381 | *sg = &per_cpu(sched_group_cpus, cpu); | 7078 | *sg = &per_cpu(sched_group_cpus, cpu); |
@@ -6393,19 +7090,22 @@ static DEFINE_PER_CPU(struct sched_group, sched_group_core); | |||
6393 | 7090 | ||
6394 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 7091 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
6395 | static int | 7092 | static int |
6396 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 7093 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, |
7094 | cpumask_t *mask) | ||
6397 | { | 7095 | { |
6398 | int group; | 7096 | int group; |
6399 | cpumask_t mask = per_cpu(cpu_sibling_map, cpu); | 7097 | |
6400 | cpus_and(mask, mask, *cpu_map); | 7098 | *mask = per_cpu(cpu_sibling_map, cpu); |
6401 | group = first_cpu(mask); | 7099 | cpus_and(*mask, *mask, *cpu_map); |
7100 | group = first_cpu(*mask); | ||
6402 | if (sg) | 7101 | if (sg) |
6403 | *sg = &per_cpu(sched_group_core, group); | 7102 | *sg = &per_cpu(sched_group_core, group); |
6404 | return group; | 7103 | return group; |
6405 | } | 7104 | } |
6406 | #elif defined(CONFIG_SCHED_MC) | 7105 | #elif defined(CONFIG_SCHED_MC) |
6407 | static int | 7106 | static int |
6408 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 7107 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, |
7108 | cpumask_t *unused) | ||
6409 | { | 7109 | { |
6410 | if (sg) | 7110 | if (sg) |
6411 | *sg = &per_cpu(sched_group_core, cpu); | 7111 | *sg = &per_cpu(sched_group_core, cpu); |
@@ -6417,17 +7117,18 @@ static DEFINE_PER_CPU(struct sched_domain, phys_domains); | |||
6417 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); | 7117 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); |
6418 | 7118 | ||
6419 | static int | 7119 | static int |
6420 | cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 7120 | cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, |
7121 | cpumask_t *mask) | ||
6421 | { | 7122 | { |
6422 | int group; | 7123 | int group; |
6423 | #ifdef CONFIG_SCHED_MC | 7124 | #ifdef CONFIG_SCHED_MC |
6424 | cpumask_t mask = cpu_coregroup_map(cpu); | 7125 | *mask = cpu_coregroup_map(cpu); |
6425 | cpus_and(mask, mask, *cpu_map); | 7126 | cpus_and(*mask, *mask, *cpu_map); |
6426 | group = first_cpu(mask); | 7127 | group = first_cpu(*mask); |
6427 | #elif defined(CONFIG_SCHED_SMT) | 7128 | #elif defined(CONFIG_SCHED_SMT) |
6428 | cpumask_t mask = per_cpu(cpu_sibling_map, cpu); | 7129 | *mask = per_cpu(cpu_sibling_map, cpu); |
6429 | cpus_and(mask, mask, *cpu_map); | 7130 | cpus_and(*mask, *mask, *cpu_map); |
6430 | group = first_cpu(mask); | 7131 | group = first_cpu(*mask); |
6431 | #else | 7132 | #else |
6432 | group = cpu; | 7133 | group = cpu; |
6433 | #endif | 7134 | #endif |
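Each cpu_to_*_group() helper computes the same thing: a CPU's group is identified by the first CPU of its topology mask intersected with cpu_map, and the intersection now lands in a caller-owned scratch mask instead of a local. A toy model using a 64-bit word as the cpumask; the GCC builtin stands in for first_cpu() and the bit patterns are made up:

#include <stdio.h>

typedef unsigned long mask_t;   /* toy cpumask: one bit per CPU, up to 64 CPUs */

/* group id = first CPU of (topology mask & cpu_map); *scratch is caller-owned,
 * like the new cpumask_t *mask argument */
static int cpu_group_leader(mask_t topo, mask_t cpu_map, mask_t *scratch)
{
        *scratch = topo & cpu_map;                       /* cpus_and()  */
        return *scratch ? __builtin_ctzl(*scratch) : -1; /* first_cpu() */
}

int main(void)
{
        mask_t scratch;
        /* siblings of cpu 3 are {2,3}; cpu_map currently contains {1,2,3} */
        int leader = cpu_group_leader(0xCUL, 0xEUL, &scratch);

        printf("group leader = cpu %d\n", leader);       /* prints 2 */
        return 0;
}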
@@ -6443,19 +7144,19 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | |||
6443 | * gets dynamically allocated. | 7144 | * gets dynamically allocated. |
6444 | */ | 7145 | */ |
6445 | static DEFINE_PER_CPU(struct sched_domain, node_domains); | 7146 | static DEFINE_PER_CPU(struct sched_domain, node_domains); |
6446 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; | 7147 | static struct sched_group ***sched_group_nodes_bycpu; |
6447 | 7148 | ||
6448 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); | 7149 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); |
6449 | static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); | 7150 | static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); |
6450 | 7151 | ||
6451 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, | 7152 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, |
6452 | struct sched_group **sg) | 7153 | struct sched_group **sg, cpumask_t *nodemask) |
6453 | { | 7154 | { |
6454 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); | ||
6455 | int group; | 7155 | int group; |
6456 | 7156 | ||
6457 | cpus_and(nodemask, nodemask, *cpu_map); | 7157 | *nodemask = node_to_cpumask(cpu_to_node(cpu)); |
6458 | group = first_cpu(nodemask); | 7158 | cpus_and(*nodemask, *nodemask, *cpu_map); |
7159 | group = first_cpu(*nodemask); | ||
6459 | 7160 | ||
6460 | if (sg) | 7161 | if (sg) |
6461 | *sg = &per_cpu(sched_group_allnodes, group); | 7162 | *sg = &per_cpu(sched_group_allnodes, group); |
@@ -6491,7 +7192,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
6491 | 7192 | ||
6492 | #ifdef CONFIG_NUMA | 7193 | #ifdef CONFIG_NUMA |
6493 | /* Free memory allocated for various sched_group structures */ | 7194 | /* Free memory allocated for various sched_group structures */ |
6494 | static void free_sched_groups(const cpumask_t *cpu_map) | 7195 | static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) |
6495 | { | 7196 | { |
6496 | int cpu, i; | 7197 | int cpu, i; |
6497 | 7198 | ||
@@ -6503,11 +7204,11 @@ static void free_sched_groups(const cpumask_t *cpu_map) | |||
6503 | continue; | 7204 | continue; |
6504 | 7205 | ||
6505 | for (i = 0; i < MAX_NUMNODES; i++) { | 7206 | for (i = 0; i < MAX_NUMNODES; i++) { |
6506 | cpumask_t nodemask = node_to_cpumask(i); | ||
6507 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 7207 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; |
6508 | 7208 | ||
6509 | cpus_and(nodemask, nodemask, *cpu_map); | 7209 | *nodemask = node_to_cpumask(i); |
6510 | if (cpus_empty(nodemask)) | 7210 | cpus_and(*nodemask, *nodemask, *cpu_map); |
7211 | if (cpus_empty(*nodemask)) | ||
6511 | continue; | 7212 | continue; |
6512 | 7213 | ||
6513 | if (sg == NULL) | 7214 | if (sg == NULL) |
@@ -6525,7 +7226,7 @@ next_sg: | |||
6525 | } | 7226 | } |
6526 | } | 7227 | } |
6527 | #else | 7228 | #else |
6528 | static void free_sched_groups(const cpumask_t *cpu_map) | 7229 | static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) |
6529 | { | 7230 | { |
6530 | } | 7231 | } |
6531 | #endif | 7232 | #endif |
@@ -6583,13 +7284,106 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6583 | } | 7284 | } |
6584 | 7285 | ||
6585 | /* | 7286 | /* |
7287 | * Initializers for schedule domains | ||
7288 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() | ||
7289 | */ | ||
7290 | |||
7291 | #define SD_INIT(sd, type) sd_init_##type(sd) | ||
7292 | #define SD_INIT_FUNC(type) \ | ||
7293 | static noinline void sd_init_##type(struct sched_domain *sd) \ | ||
7294 | { \ | ||
7295 | memset(sd, 0, sizeof(*sd)); \ | ||
7296 | *sd = SD_##type##_INIT; \ | ||
7297 | sd->level = SD_LV_##type; \ | ||
7298 | } | ||
7299 | |||
7300 | SD_INIT_FUNC(CPU) | ||
7301 | #ifdef CONFIG_NUMA | ||
7302 | SD_INIT_FUNC(ALLNODES) | ||
7303 | SD_INIT_FUNC(NODE) | ||
7304 | #endif | ||
7305 | #ifdef CONFIG_SCHED_SMT | ||
7306 | SD_INIT_FUNC(SIBLING) | ||
7307 | #endif | ||
7308 | #ifdef CONFIG_SCHED_MC | ||
7309 | SD_INIT_FUNC(MC) | ||
7310 | #endif | ||
7311 | |||
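The SD_INIT_FUNC() block above generates one small out-of-line initializer per domain type by token pasting, so build_sched_domains() no longer copies a large SD_*_INIT aggregate on its own stack. A self-contained sketch of the same macro shape, with toy fields in place of struct sched_domain:

#include <stdio.h>
#include <string.h>

struct domain { int level; const char *name; };

#define LV_CPU  1
#define LV_NODE 2

/* one non-inlined initializer per domain type, generated by token pasting,
 * so build code only writes D_INIT(&d, CPU) -- same shape as SD_INIT() */
#define D_INIT(sd, type) d_init_##type(sd)
#define D_INIT_FUNC(type)                               \
static void d_init_##type(struct domain *d)             \
{                                                       \
        memset(d, 0, sizeof(*d));                       \
        d->level = LV_##type;                           \
        d->name = #type;                                \
}

D_INIT_FUNC(CPU)
D_INIT_FUNC(NODE)

int main(void)
{
        struct domain d;

        D_INIT(&d, CPU);
        printf("level=%d name=%s\n", d.level, d.name);
        D_INIT(&d, NODE);
        printf("level=%d name=%s\n", d.level, d.name);
        return 0;
}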
7312 | /* | ||
7313 | * To minimize stack usage kmalloc room for cpumasks and share the | ||
7314 | * space as the usage in build_sched_domains() dictates. Used only | ||
7315 | * if the amount of space is significant. | ||
7316 | */ | ||
7317 | struct allmasks { | ||
7318 | cpumask_t tmpmask; /* make this one first */ | ||
7319 | union { | ||
7320 | cpumask_t nodemask; | ||
7321 | cpumask_t this_sibling_map; | ||
7322 | cpumask_t this_core_map; | ||
7323 | }; | ||
7324 | cpumask_t send_covered; | ||
7325 | |||
7326 | #ifdef CONFIG_NUMA | ||
7327 | cpumask_t domainspan; | ||
7328 | cpumask_t covered; | ||
7329 | cpumask_t notcovered; | ||
7330 | #endif | ||
7331 | }; | ||
7332 | |||
7333 | #if NR_CPUS > 128 | ||
7334 | #define SCHED_CPUMASK_ALLOC 1 | ||
7335 | #define SCHED_CPUMASK_FREE(v) kfree(v) | ||
7336 | #define SCHED_CPUMASK_DECLARE(v) struct allmasks *v | ||
7337 | #else | ||
7338 | #define SCHED_CPUMASK_ALLOC 0 | ||
7339 | #define SCHED_CPUMASK_FREE(v) | ||
7340 | #define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v | ||
7341 | #endif | ||
7342 | |||
7343 | #define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ | ||
7344 | ((unsigned long)(a) + offsetof(struct allmasks, v)) | ||
7345 | |||
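struct allmasks plus SCHED_CPUMASK_VAR() replace a pile of on-stack cpumask_t locals with named offsets into one kmalloc'ed block (or an on-stack block when NR_CPUS is small). A userspace sketch of the offsetof() trick, with malloc standing in for kmalloc and a toy mask_t:

#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>

typedef struct { unsigned long bits[16]; } mask_t;   /* ~128-byte toy cpumask */

/* every scratch mask lives in one block; each "variable" is just a pointer
 * computed from its offset, mirroring struct allmasks / SCHED_CPUMASK_VAR() */
struct scratch {
        mask_t tmpmask;
        mask_t nodemask;
        mask_t send_covered;
};

#define SCRATCH_VAR(v, a) mask_t *v = (mask_t *) \
        ((unsigned long)(a) + offsetof(struct scratch, v))

int main(void)
{
        struct scratch *all = malloc(sizeof(*all));   /* one kmalloc-like allocation */

        if (!all)
                return 1;

        SCRATCH_VAR(nodemask, all);
        SCRATCH_VAR(send_covered, all);
        printf("nodemask at +%td, send_covered at +%td of one %zu-byte block\n",
               (char *)nodemask - (char *)all,
               (char *)send_covered - (char *)all, sizeof(*all));
        free(all);
        return 0;
}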
7346 | static int default_relax_domain_level = -1; | ||
7347 | |||
7348 | static int __init setup_relax_domain_level(char *str) | ||
7349 | { | ||
7350 | default_relax_domain_level = simple_strtoul(str, NULL, 0); | ||
7351 | return 1; | ||
7352 | } | ||
7353 | __setup("relax_domain_level=", setup_relax_domain_level); | ||
7354 | |||
7355 | static void set_domain_attribute(struct sched_domain *sd, | ||
7356 | struct sched_domain_attr *attr) | ||
7357 | { | ||
7358 | int request; | ||
7359 | |||
7360 | if (!attr || attr->relax_domain_level < 0) { | ||
7361 | if (default_relax_domain_level < 0) | ||
7362 | return; | ||
7363 | else | ||
7364 | request = default_relax_domain_level; | ||
7365 | } else | ||
7366 | request = attr->relax_domain_level; | ||
7367 | if (request < sd->level) { | ||
7368 | /* turn off idle balance on this domain */ | ||
7369 | sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); | ||
7370 | } else { | ||
7371 | /* turn on idle balance on this domain */ | ||
7372 | sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); | ||
7373 | } | ||
7374 | } | ||
7375 | |||
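set_domain_attribute() compares the requested relax level (from the cpuset attribute or the relax_domain_level= boot parameter) against the domain's level: domains above the requested level get wake/newly-idle balancing switched off, domains at or below it get it switched on. A simplified model of that decision; the real code distinguishes SD_WAKE_IDLE from SD_WAKE_IDLE_FAR, which is collapsed into one flag here:

#include <stdio.h>

#define FLAG_WAKE_IDLE       0x1
#define FLAG_BALANCE_NEWIDLE 0x2

/* same decision as set_domain_attribute(): a requested relax level below this
 * domain's level turns idle/newly-idle balancing off for it */
static int apply_relax(int domain_level, int flags, int requested)
{
        if (requested < 0)
                return flags;                     /* no attribute, no boot param */
        if (requested < domain_level)
                flags &= ~(FLAG_WAKE_IDLE | FLAG_BALANCE_NEWIDLE);
        else
                flags |= FLAG_WAKE_IDLE | FLAG_BALANCE_NEWIDLE;
        return flags;
}

int main(void)
{
        int level;

        /* relax_domain_level=1: only domains at level <= 1 keep idle balancing */
        for (level = 0; level <= 3; level++)
                printf("level %d -> flags 0x%x\n",
                       level, apply_relax(level, FLAG_WAKE_IDLE, 1));
        return 0;
}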
7376 | /* | ||
6586 | * Build sched domains for a given set of cpus and attach the sched domains | 7377 | * Build sched domains for a given set of cpus and attach the sched domains |
6587 | * to the individual cpus | 7378 | * to the individual cpus |
6588 | */ | 7379 | */ |
6589 | static int build_sched_domains(const cpumask_t *cpu_map) | 7380 | static int __build_sched_domains(const cpumask_t *cpu_map, |
7381 | struct sched_domain_attr *attr) | ||
6590 | { | 7382 | { |
6591 | int i; | 7383 | int i; |
6592 | struct root_domain *rd; | 7384 | struct root_domain *rd; |
7385 | SCHED_CPUMASK_DECLARE(allmasks); | ||
7386 | cpumask_t *tmpmask; | ||
6593 | #ifdef CONFIG_NUMA | 7387 | #ifdef CONFIG_NUMA |
6594 | struct sched_group **sched_group_nodes = NULL; | 7388 | struct sched_group **sched_group_nodes = NULL; |
6595 | int sd_allnodes = 0; | 7389 | int sd_allnodes = 0; |
@@ -6603,39 +7397,65 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6603 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 7397 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
6604 | return -ENOMEM; | 7398 | return -ENOMEM; |
6605 | } | 7399 | } |
6606 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | ||
6607 | #endif | 7400 | #endif |
6608 | 7401 | ||
6609 | rd = alloc_rootdomain(); | 7402 | rd = alloc_rootdomain(); |
6610 | if (!rd) { | 7403 | if (!rd) { |
6611 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 7404 | printk(KERN_WARNING "Cannot alloc root domain\n"); |
7405 | #ifdef CONFIG_NUMA | ||
7406 | kfree(sched_group_nodes); | ||
7407 | #endif | ||
6612 | return -ENOMEM; | 7408 | return -ENOMEM; |
6613 | } | 7409 | } |
6614 | 7410 | ||
7411 | #if SCHED_CPUMASK_ALLOC | ||
7412 | /* get space for all scratch cpumask variables */ | ||
7413 | allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); | ||
7414 | if (!allmasks) { | ||
7415 | printk(KERN_WARNING "Cannot alloc cpumask array\n"); | ||
7416 | kfree(rd); | ||
7417 | #ifdef CONFIG_NUMA | ||
7418 | kfree(sched_group_nodes); | ||
7419 | #endif | ||
7420 | return -ENOMEM; | ||
7421 | } | ||
7422 | #endif | ||
7423 | tmpmask = (cpumask_t *)allmasks; | ||
7424 | |||
7425 | |||
7426 | #ifdef CONFIG_NUMA | ||
7427 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | ||
7428 | #endif | ||
7429 | |||
6615 | /* | 7430 | /* |
6616 | * Set up domains for cpus specified by the cpu_map. | 7431 | * Set up domains for cpus specified by the cpu_map. |
6617 | */ | 7432 | */ |
6618 | for_each_cpu_mask(i, *cpu_map) { | 7433 | for_each_cpu_mask(i, *cpu_map) { |
6619 | struct sched_domain *sd = NULL, *p; | 7434 | struct sched_domain *sd = NULL, *p; |
6620 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); | 7435 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
6621 | 7436 | ||
6622 | cpus_and(nodemask, nodemask, *cpu_map); | 7437 | *nodemask = node_to_cpumask(cpu_to_node(i)); |
7438 | cpus_and(*nodemask, *nodemask, *cpu_map); | ||
6623 | 7439 | ||
6624 | #ifdef CONFIG_NUMA | 7440 | #ifdef CONFIG_NUMA |
6625 | if (cpus_weight(*cpu_map) > | 7441 | if (cpus_weight(*cpu_map) > |
6626 | SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | 7442 | SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { |
6627 | sd = &per_cpu(allnodes_domains, i); | 7443 | sd = &per_cpu(allnodes_domains, i); |
6628 | *sd = SD_ALLNODES_INIT; | 7444 | SD_INIT(sd, ALLNODES); |
7445 | set_domain_attribute(sd, attr); | ||
6629 | sd->span = *cpu_map; | 7446 | sd->span = *cpu_map; |
6630 | cpu_to_allnodes_group(i, cpu_map, &sd->groups); | 7447 | sd->first_cpu = first_cpu(sd->span); |
7448 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); | ||
6631 | p = sd; | 7449 | p = sd; |
6632 | sd_allnodes = 1; | 7450 | sd_allnodes = 1; |
6633 | } else | 7451 | } else |
6634 | p = NULL; | 7452 | p = NULL; |
6635 | 7453 | ||
6636 | sd = &per_cpu(node_domains, i); | 7454 | sd = &per_cpu(node_domains, i); |
6637 | *sd = SD_NODE_INIT; | 7455 | SD_INIT(sd, NODE); |
6638 | sd->span = sched_domain_node_span(cpu_to_node(i)); | 7456 | set_domain_attribute(sd, attr); |
7457 | sched_domain_node_span(cpu_to_node(i), &sd->span); | ||
7458 | sd->first_cpu = first_cpu(sd->span); | ||
6639 | sd->parent = p; | 7459 | sd->parent = p; |
6640 | if (p) | 7460 | if (p) |
6641 | p->child = sd; | 7461 | p->child = sd; |
@@ -6644,94 +7464,120 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6644 | 7464 | ||
6645 | p = sd; | 7465 | p = sd; |
6646 | sd = &per_cpu(phys_domains, i); | 7466 | sd = &per_cpu(phys_domains, i); |
6647 | *sd = SD_CPU_INIT; | 7467 | SD_INIT(sd, CPU); |
6648 | sd->span = nodemask; | 7468 | set_domain_attribute(sd, attr); |
7469 | sd->span = *nodemask; | ||
7470 | sd->first_cpu = first_cpu(sd->span); | ||
6649 | sd->parent = p; | 7471 | sd->parent = p; |
6650 | if (p) | 7472 | if (p) |
6651 | p->child = sd; | 7473 | p->child = sd; |
6652 | cpu_to_phys_group(i, cpu_map, &sd->groups); | 7474 | cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); |
6653 | 7475 | ||
6654 | #ifdef CONFIG_SCHED_MC | 7476 | #ifdef CONFIG_SCHED_MC |
6655 | p = sd; | 7477 | p = sd; |
6656 | sd = &per_cpu(core_domains, i); | 7478 | sd = &per_cpu(core_domains, i); |
6657 | *sd = SD_MC_INIT; | 7479 | SD_INIT(sd, MC); |
7480 | set_domain_attribute(sd, attr); | ||
6658 | sd->span = cpu_coregroup_map(i); | 7481 | sd->span = cpu_coregroup_map(i); |
7482 | sd->first_cpu = first_cpu(sd->span); | ||
6659 | cpus_and(sd->span, sd->span, *cpu_map); | 7483 | cpus_and(sd->span, sd->span, *cpu_map); |
6660 | sd->parent = p; | 7484 | sd->parent = p; |
6661 | p->child = sd; | 7485 | p->child = sd; |
6662 | cpu_to_core_group(i, cpu_map, &sd->groups); | 7486 | cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); |
6663 | #endif | 7487 | #endif |
6664 | 7488 | ||
6665 | #ifdef CONFIG_SCHED_SMT | 7489 | #ifdef CONFIG_SCHED_SMT |
6666 | p = sd; | 7490 | p = sd; |
6667 | sd = &per_cpu(cpu_domains, i); | 7491 | sd = &per_cpu(cpu_domains, i); |
6668 | *sd = SD_SIBLING_INIT; | 7492 | SD_INIT(sd, SIBLING); |
7493 | set_domain_attribute(sd, attr); | ||
6669 | sd->span = per_cpu(cpu_sibling_map, i); | 7494 | sd->span = per_cpu(cpu_sibling_map, i); |
7495 | sd->first_cpu = first_cpu(sd->span); | ||
6670 | cpus_and(sd->span, sd->span, *cpu_map); | 7496 | cpus_and(sd->span, sd->span, *cpu_map); |
6671 | sd->parent = p; | 7497 | sd->parent = p; |
6672 | p->child = sd; | 7498 | p->child = sd; |
6673 | cpu_to_cpu_group(i, cpu_map, &sd->groups); | 7499 | cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); |
6674 | #endif | 7500 | #endif |
6675 | } | 7501 | } |
6676 | 7502 | ||
6677 | #ifdef CONFIG_SCHED_SMT | 7503 | #ifdef CONFIG_SCHED_SMT |
6678 | /* Set up CPU (sibling) groups */ | 7504 | /* Set up CPU (sibling) groups */ |
6679 | for_each_cpu_mask(i, *cpu_map) { | 7505 | for_each_cpu_mask(i, *cpu_map) { |
6680 | cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i); | 7506 | SCHED_CPUMASK_VAR(this_sibling_map, allmasks); |
6681 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); | 7507 | SCHED_CPUMASK_VAR(send_covered, allmasks); |
6682 | if (i != first_cpu(this_sibling_map)) | 7508 | |
7509 | *this_sibling_map = per_cpu(cpu_sibling_map, i); | ||
7510 | cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); | ||
7511 | if (i != first_cpu(*this_sibling_map)) | ||
6683 | continue; | 7512 | continue; |
6684 | 7513 | ||
6685 | init_sched_build_groups(this_sibling_map, cpu_map, | 7514 | init_sched_build_groups(this_sibling_map, cpu_map, |
6686 | &cpu_to_cpu_group); | 7515 | &cpu_to_cpu_group, |
7516 | send_covered, tmpmask); | ||
6687 | } | 7517 | } |
6688 | #endif | 7518 | #endif |
6689 | 7519 | ||
6690 | #ifdef CONFIG_SCHED_MC | 7520 | #ifdef CONFIG_SCHED_MC |
6691 | /* Set up multi-core groups */ | 7521 | /* Set up multi-core groups */ |
6692 | for_each_cpu_mask(i, *cpu_map) { | 7522 | for_each_cpu_mask(i, *cpu_map) { |
6693 | cpumask_t this_core_map = cpu_coregroup_map(i); | 7523 | SCHED_CPUMASK_VAR(this_core_map, allmasks); |
6694 | cpus_and(this_core_map, this_core_map, *cpu_map); | 7524 | SCHED_CPUMASK_VAR(send_covered, allmasks); |
6695 | if (i != first_cpu(this_core_map)) | 7525 | |
7526 | *this_core_map = cpu_coregroup_map(i); | ||
7527 | cpus_and(*this_core_map, *this_core_map, *cpu_map); | ||
7528 | if (i != first_cpu(*this_core_map)) | ||
6696 | continue; | 7529 | continue; |
7530 | |||
6697 | init_sched_build_groups(this_core_map, cpu_map, | 7531 | init_sched_build_groups(this_core_map, cpu_map, |
6698 | &cpu_to_core_group); | 7532 | &cpu_to_core_group, |
7533 | send_covered, tmpmask); | ||
6699 | } | 7534 | } |
6700 | #endif | 7535 | #endif |
6701 | 7536 | ||
6702 | /* Set up physical groups */ | 7537 | /* Set up physical groups */ |
6703 | for (i = 0; i < MAX_NUMNODES; i++) { | 7538 | for (i = 0; i < MAX_NUMNODES; i++) { |
6704 | cpumask_t nodemask = node_to_cpumask(i); | 7539 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
7540 | SCHED_CPUMASK_VAR(send_covered, allmasks); | ||
6705 | 7541 | ||
6706 | cpus_and(nodemask, nodemask, *cpu_map); | 7542 | *nodemask = node_to_cpumask(i); |
6707 | if (cpus_empty(nodemask)) | 7543 | cpus_and(*nodemask, *nodemask, *cpu_map); |
7544 | if (cpus_empty(*nodemask)) | ||
6708 | continue; | 7545 | continue; |
6709 | 7546 | ||
6710 | init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); | 7547 | init_sched_build_groups(nodemask, cpu_map, |
7548 | &cpu_to_phys_group, | ||
7549 | send_covered, tmpmask); | ||
6711 | } | 7550 | } |
6712 | 7551 | ||
6713 | #ifdef CONFIG_NUMA | 7552 | #ifdef CONFIG_NUMA |
6714 | /* Set up node groups */ | 7553 | /* Set up node groups */ |
6715 | if (sd_allnodes) | 7554 | if (sd_allnodes) { |
6716 | init_sched_build_groups(*cpu_map, cpu_map, | 7555 | SCHED_CPUMASK_VAR(send_covered, allmasks); |
6717 | &cpu_to_allnodes_group); | 7556 | |
7557 | init_sched_build_groups(cpu_map, cpu_map, | ||
7558 | &cpu_to_allnodes_group, | ||
7559 | send_covered, tmpmask); | ||
7560 | } | ||
6718 | 7561 | ||
6719 | for (i = 0; i < MAX_NUMNODES; i++) { | 7562 | for (i = 0; i < MAX_NUMNODES; i++) { |
6720 | /* Set up node groups */ | 7563 | /* Set up node groups */ |
6721 | struct sched_group *sg, *prev; | 7564 | struct sched_group *sg, *prev; |
6722 | cpumask_t nodemask = node_to_cpumask(i); | 7565 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
6723 | cpumask_t domainspan; | 7566 | SCHED_CPUMASK_VAR(domainspan, allmasks); |
6724 | cpumask_t covered = CPU_MASK_NONE; | 7567 | SCHED_CPUMASK_VAR(covered, allmasks); |
6725 | int j; | 7568 | int j; |
6726 | 7569 | ||
6727 | cpus_and(nodemask, nodemask, *cpu_map); | 7570 | *nodemask = node_to_cpumask(i); |
6728 | if (cpus_empty(nodemask)) { | 7571 | cpus_clear(*covered); |
7572 | |||
7573 | cpus_and(*nodemask, *nodemask, *cpu_map); | ||
7574 | if (cpus_empty(*nodemask)) { | ||
6729 | sched_group_nodes[i] = NULL; | 7575 | sched_group_nodes[i] = NULL; |
6730 | continue; | 7576 | continue; |
6731 | } | 7577 | } |
6732 | 7578 | ||
6733 | domainspan = sched_domain_node_span(i); | 7579 | sched_domain_node_span(i, domainspan); |
6734 | cpus_and(domainspan, domainspan, *cpu_map); | 7580 | cpus_and(*domainspan, *domainspan, *cpu_map); |
6735 | 7581 | ||
6736 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); | 7582 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); |
6737 | if (!sg) { | 7583 | if (!sg) { |
@@ -6740,31 +7586,31 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6740 | goto error; | 7586 | goto error; |
6741 | } | 7587 | } |
6742 | sched_group_nodes[i] = sg; | 7588 | sched_group_nodes[i] = sg; |
6743 | for_each_cpu_mask(j, nodemask) { | 7589 | for_each_cpu_mask(j, *nodemask) { |
6744 | struct sched_domain *sd; | 7590 | struct sched_domain *sd; |
6745 | 7591 | ||
6746 | sd = &per_cpu(node_domains, j); | 7592 | sd = &per_cpu(node_domains, j); |
6747 | sd->groups = sg; | 7593 | sd->groups = sg; |
6748 | } | 7594 | } |
6749 | sg->__cpu_power = 0; | 7595 | sg->__cpu_power = 0; |
6750 | sg->cpumask = nodemask; | 7596 | sg->cpumask = *nodemask; |
6751 | sg->next = sg; | 7597 | sg->next = sg; |
6752 | cpus_or(covered, covered, nodemask); | 7598 | cpus_or(*covered, *covered, *nodemask); |
6753 | prev = sg; | 7599 | prev = sg; |
6754 | 7600 | ||
6755 | for (j = 0; j < MAX_NUMNODES; j++) { | 7601 | for (j = 0; j < MAX_NUMNODES; j++) { |
6756 | cpumask_t tmp, notcovered; | 7602 | SCHED_CPUMASK_VAR(notcovered, allmasks); |
6757 | int n = (i + j) % MAX_NUMNODES; | 7603 | int n = (i + j) % MAX_NUMNODES; |
7604 | node_to_cpumask_ptr(pnodemask, n); | ||
6758 | 7605 | ||
6759 | cpus_complement(notcovered, covered); | 7606 | cpus_complement(*notcovered, *covered); |
6760 | cpus_and(tmp, notcovered, *cpu_map); | 7607 | cpus_and(*tmpmask, *notcovered, *cpu_map); |
6761 | cpus_and(tmp, tmp, domainspan); | 7608 | cpus_and(*tmpmask, *tmpmask, *domainspan); |
6762 | if (cpus_empty(tmp)) | 7609 | if (cpus_empty(*tmpmask)) |
6763 | break; | 7610 | break; |
6764 | 7611 | ||
6765 | nodemask = node_to_cpumask(n); | 7612 | cpus_and(*tmpmask, *tmpmask, *pnodemask); |
6766 | cpus_and(tmp, tmp, nodemask); | 7613 | if (cpus_empty(*tmpmask)) |
6767 | if (cpus_empty(tmp)) | ||
6768 | continue; | 7614 | continue; |
6769 | 7615 | ||
6770 | sg = kmalloc_node(sizeof(struct sched_group), | 7616 | sg = kmalloc_node(sizeof(struct sched_group), |
@@ -6775,9 +7621,9 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6775 | goto error; | 7621 | goto error; |
6776 | } | 7622 | } |
6777 | sg->__cpu_power = 0; | 7623 | sg->__cpu_power = 0; |
6778 | sg->cpumask = tmp; | 7624 | sg->cpumask = *tmpmask; |
6779 | sg->next = prev->next; | 7625 | sg->next = prev->next; |
6780 | cpus_or(covered, covered, tmp); | 7626 | cpus_or(*covered, *covered, *tmpmask); |
6781 | prev->next = sg; | 7627 | prev->next = sg; |
6782 | prev = sg; | 7628 | prev = sg; |
6783 | } | 7629 | } |
@@ -6813,7 +7659,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6813 | if (sd_allnodes) { | 7659 | if (sd_allnodes) { |
6814 | struct sched_group *sg; | 7660 | struct sched_group *sg; |
6815 | 7661 | ||
6816 | cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); | 7662 | cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, |
7663 | tmpmask); | ||
6817 | init_numa_sched_groups_power(sg); | 7664 | init_numa_sched_groups_power(sg); |
6818 | } | 7665 | } |
6819 | #endif | 7666 | #endif |
@@ -6831,17 +7678,26 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6831 | cpu_attach_domain(sd, rd, i); | 7678 | cpu_attach_domain(sd, rd, i); |
6832 | } | 7679 | } |
6833 | 7680 | ||
7681 | SCHED_CPUMASK_FREE((void *)allmasks); | ||
6834 | return 0; | 7682 | return 0; |
6835 | 7683 | ||
6836 | #ifdef CONFIG_NUMA | 7684 | #ifdef CONFIG_NUMA |
6837 | error: | 7685 | error: |
6838 | free_sched_groups(cpu_map); | 7686 | free_sched_groups(cpu_map, tmpmask); |
7687 | SCHED_CPUMASK_FREE((void *)allmasks); | ||
6839 | return -ENOMEM; | 7688 | return -ENOMEM; |
6840 | #endif | 7689 | #endif |
6841 | } | 7690 | } |
6842 | 7691 | ||
7692 | static int build_sched_domains(const cpumask_t *cpu_map) | ||
7693 | { | ||
7694 | return __build_sched_domains(cpu_map, NULL); | ||
7695 | } | ||
7696 | |||
6843 | static cpumask_t *doms_cur; /* current sched domains */ | 7697 | static cpumask_t *doms_cur; /* current sched domains */ |
6844 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ | 7698 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ |
7699 | static struct sched_domain_attr *dattr_cur; /* attribues of custom domains | ||
7700 | in 'doms_cur' */ | ||
6845 | 7701 | ||
6846 | /* | 7702 | /* |
6847 | * Special case: If a kmalloc of a doms_cur partition (array of | 7703 | * Special case: If a kmalloc of a doms_cur partition (array of |
@@ -6869,15 +7725,17 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) | |||
6869 | if (!doms_cur) | 7725 | if (!doms_cur) |
6870 | doms_cur = &fallback_doms; | 7726 | doms_cur = &fallback_doms; |
6871 | cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); | 7727 | cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); |
7728 | dattr_cur = NULL; | ||
6872 | err = build_sched_domains(doms_cur); | 7729 | err = build_sched_domains(doms_cur); |
6873 | register_sched_domain_sysctl(); | 7730 | register_sched_domain_sysctl(); |
6874 | 7731 | ||
6875 | return err; | 7732 | return err; |
6876 | } | 7733 | } |
6877 | 7734 | ||
6878 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | 7735 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map, |
7736 | cpumask_t *tmpmask) | ||
6879 | { | 7737 | { |
6880 | free_sched_groups(cpu_map); | 7738 | free_sched_groups(cpu_map, tmpmask); |
6881 | } | 7739 | } |
6882 | 7740 | ||
6883 | /* | 7741 | /* |
@@ -6886,6 +7744,7 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | |||
6886 | */ | 7744 | */ |
6887 | static void detach_destroy_domains(const cpumask_t *cpu_map) | 7745 | static void detach_destroy_domains(const cpumask_t *cpu_map) |
6888 | { | 7746 | { |
7747 | cpumask_t tmpmask; | ||
6889 | int i; | 7748 | int i; |
6890 | 7749 | ||
6891 | unregister_sched_domain_sysctl(); | 7750 | unregister_sched_domain_sysctl(); |
@@ -6893,7 +7752,23 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
6893 | for_each_cpu_mask(i, *cpu_map) | 7752 | for_each_cpu_mask(i, *cpu_map) |
6894 | cpu_attach_domain(NULL, &def_root_domain, i); | 7753 | cpu_attach_domain(NULL, &def_root_domain, i); |
6895 | synchronize_sched(); | 7754 | synchronize_sched(); |
6896 | arch_destroy_sched_domains(cpu_map); | 7755 | arch_destroy_sched_domains(cpu_map, &tmpmask); |
7756 | } | ||
7757 | |||
7758 | /* handle null as "default" */ | ||
7759 | static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | ||
7760 | struct sched_domain_attr *new, int idx_new) | ||
7761 | { | ||
7762 | struct sched_domain_attr tmp; | ||
7763 | |||
7764 | /* fast path */ | ||
7765 | if (!new && !cur) | ||
7766 | return 1; | ||
7767 | |||
7768 | tmp = SD_ATTR_INIT; | ||
7769 | return !memcmp(cur ? (cur + idx_cur) : &tmp, | ||
7770 | new ? (new + idx_new) : &tmp, | ||
7771 | sizeof(struct sched_domain_attr)); | ||
6897 | } | 7772 | } |
6898 | 7773 | ||
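dattrs_equal() treats a NULL attribute array as "all defaults" so that existing callers of partition_sched_domains(), which pass no attributes, keep comparing equal. A small standalone version of the same comparison, with a one-field struct standing in for sched_domain_attr:

#include <stdio.h>
#include <string.h>

struct dom_attr { int relax_level; };             /* toy sched_domain_attr */

/* NULL attribute arrays compare as "all defaults", like dattrs_equal() */
static int attrs_equal(const struct dom_attr *cur, int i,
                       const struct dom_attr *new, int j)
{
        struct dom_attr def = { .relax_level = -1 };   /* stands in for SD_ATTR_INIT */

        if (!cur && !new)
                return 1;                         /* fast path */
        return !memcmp(cur ? &cur[i] : &def,
                       new ? &new[j] : &def, sizeof(def));
}

int main(void)
{
        struct dom_attr a[1] = { { .relax_level = 2 } };

        printf("%d %d %d\n",
               attrs_equal(NULL, 0, NULL, 0),     /* 1: both default      */
               attrs_equal(a, 0, NULL, 0),        /* 0: 2 vs default (-1) */
               attrs_equal(a, 0, a, 0));          /* 1: same entry        */
        return 0;
}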
6899 | /* | 7774 | /* |
@@ -6917,7 +7792,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
6917 | * | 7792 | * |
6918 | * Call with hotplug lock held | 7793 | * Call with hotplug lock held |
6919 | */ | 7794 | */ |
6920 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) | 7795 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, |
7796 | struct sched_domain_attr *dattr_new) | ||
6921 | { | 7797 | { |
6922 | int i, j; | 7798 | int i, j; |
6923 | 7799 | ||
@@ -6930,12 +7806,14 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) | |||
6930 | ndoms_new = 1; | 7806 | ndoms_new = 1; |
6931 | doms_new = &fallback_doms; | 7807 | doms_new = &fallback_doms; |
6932 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); | 7808 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); |
7809 | dattr_new = NULL; | ||
6933 | } | 7810 | } |
6934 | 7811 | ||
6935 | /* Destroy deleted domains */ | 7812 | /* Destroy deleted domains */ |
6936 | for (i = 0; i < ndoms_cur; i++) { | 7813 | for (i = 0; i < ndoms_cur; i++) { |
6937 | for (j = 0; j < ndoms_new; j++) { | 7814 | for (j = 0; j < ndoms_new; j++) { |
6938 | if (cpus_equal(doms_cur[i], doms_new[j])) | 7815 | if (cpus_equal(doms_cur[i], doms_new[j]) |
7816 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | ||
6939 | goto match1; | 7817 | goto match1; |
6940 | } | 7818 | } |
6941 | /* no match - a current sched domain not in new doms_new[] */ | 7819 | /* no match - a current sched domain not in new doms_new[] */ |
@@ -6947,11 +7825,13 @@ match1: | |||
6947 | /* Build new domains */ | 7825 | /* Build new domains */ |
6948 | for (i = 0; i < ndoms_new; i++) { | 7826 | for (i = 0; i < ndoms_new; i++) { |
6949 | for (j = 0; j < ndoms_cur; j++) { | 7827 | for (j = 0; j < ndoms_cur; j++) { |
6950 | if (cpus_equal(doms_new[i], doms_cur[j])) | 7828 | if (cpus_equal(doms_new[i], doms_cur[j]) |
7829 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | ||
6951 | goto match2; | 7830 | goto match2; |
6952 | } | 7831 | } |
6953 | /* no match - add a new doms_new */ | 7832 | /* no match - add a new doms_new */ |
6954 | build_sched_domains(doms_new + i); | 7833 | __build_sched_domains(doms_new + i, |
7834 | dattr_new ? dattr_new + i : NULL); | ||
6955 | match2: | 7835 | match2: |
6956 | ; | 7836 | ; |
6957 | } | 7837 | } |
@@ -6959,7 +7839,9 @@ match2: | |||
6959 | /* Remember the new sched domains */ | 7839 | /* Remember the new sched domains */ |
6960 | if (doms_cur != &fallback_doms) | 7840 | if (doms_cur != &fallback_doms) |
6961 | kfree(doms_cur); | 7841 | kfree(doms_cur); |
7842 | kfree(dattr_cur); /* kfree(NULL) is safe */ | ||
6962 | doms_cur = doms_new; | 7843 | doms_cur = doms_new; |
7844 | dattr_cur = dattr_new; | ||
6963 | ndoms_cur = ndoms_new; | 7845 | ndoms_cur = ndoms_new; |
6964 | 7846 | ||
6965 | register_sched_domain_sysctl(); | 7847 | register_sched_domain_sysctl(); |
@@ -7086,6 +7968,11 @@ void __init sched_init_smp(void) | |||
7086 | { | 7968 | { |
7087 | cpumask_t non_isolated_cpus; | 7969 | cpumask_t non_isolated_cpus; |
7088 | 7970 | ||
7971 | #if defined(CONFIG_NUMA) | ||
7972 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), | ||
7973 | GFP_KERNEL); | ||
7974 | BUG_ON(sched_group_nodes_bycpu == NULL); | ||
7975 | #endif | ||
7089 | get_online_cpus(); | 7976 | get_online_cpus(); |
7090 | arch_init_sched_domains(&cpu_online_map); | 7977 | arch_init_sched_domains(&cpu_online_map); |
7091 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); | 7978 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); |
@@ -7096,13 +7983,18 @@ void __init sched_init_smp(void) | |||
7096 | hotcpu_notifier(update_sched_domains, 0); | 7983 | hotcpu_notifier(update_sched_domains, 0); |
7097 | 7984 | ||
7098 | /* Move init over to a non-isolated CPU */ | 7985 | /* Move init over to a non-isolated CPU */ |
7099 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 7986 | if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) |
7100 | BUG(); | 7987 | BUG(); |
7101 | sched_init_granularity(); | 7988 | sched_init_granularity(); |
7102 | } | 7989 | } |
7103 | #else | 7990 | #else |
7104 | void __init sched_init_smp(void) | 7991 | void __init sched_init_smp(void) |
7105 | { | 7992 | { |
7993 | #if defined(CONFIG_NUMA) | ||
7994 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), | ||
7995 | GFP_KERNEL); | ||
7996 | BUG_ON(sched_group_nodes_bycpu == NULL); | ||
7997 | #endif | ||
7106 | sched_init_granularity(); | 7998 | sched_init_granularity(); |
7107 | } | 7999 | } |
7108 | #endif /* CONFIG_SMP */ | 8000 | #endif /* CONFIG_SMP */ |
@@ -7117,6 +8009,7 @@ int in_sched_functions(unsigned long addr) | |||
7117 | static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | 8009 | static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) |
7118 | { | 8010 | { |
7119 | cfs_rq->tasks_timeline = RB_ROOT; | 8011 | cfs_rq->tasks_timeline = RB_ROOT; |
8012 | INIT_LIST_HEAD(&cfs_rq->tasks); | ||
7120 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8013 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7121 | cfs_rq->rq = rq; | 8014 | cfs_rq->rq = rq; |
7122 | #endif | 8015 | #endif |
@@ -7146,6 +8039,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
7146 | 8039 | ||
7147 | rt_rq->rt_time = 0; | 8040 | rt_rq->rt_time = 0; |
7148 | rt_rq->rt_throttled = 0; | 8041 | rt_rq->rt_throttled = 0; |
8042 | rt_rq->rt_runtime = 0; | ||
8043 | spin_lock_init(&rt_rq->rt_runtime_lock); | ||
7149 | 8044 | ||
7150 | #ifdef CONFIG_RT_GROUP_SCHED | 8045 | #ifdef CONFIG_RT_GROUP_SCHED |
7151 | rt_rq->rt_nr_boosted = 0; | 8046 | rt_rq->rt_nr_boosted = 0; |
@@ -7154,10 +8049,11 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
7154 | } | 8049 | } |
7155 | 8050 | ||
7156 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8051 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7157 | static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, | 8052 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
7158 | struct cfs_rq *cfs_rq, struct sched_entity *se, | 8053 | struct sched_entity *se, int cpu, int add, |
7159 | int cpu, int add) | 8054 | struct sched_entity *parent) |
7160 | { | 8055 | { |
8056 | struct rq *rq = cpu_rq(cpu); | ||
7161 | tg->cfs_rq[cpu] = cfs_rq; | 8057 | tg->cfs_rq[cpu] = cfs_rq; |
7162 | init_cfs_rq(cfs_rq, rq); | 8058 | init_cfs_rq(cfs_rq, rq); |
7163 | cfs_rq->tg = tg; | 8059 | cfs_rq->tg = tg; |
@@ -7165,45 +8061,132 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, | |||
7165 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 8061 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); |
7166 | 8062 | ||
7167 | tg->se[cpu] = se; | 8063 | tg->se[cpu] = se; |
7168 | se->cfs_rq = &rq->cfs; | 8064 | /* se could be NULL for init_task_group */ |
8065 | if (!se) | ||
8066 | return; | ||
8067 | |||
8068 | if (!parent) | ||
8069 | se->cfs_rq = &rq->cfs; | ||
8070 | else | ||
8071 | se->cfs_rq = parent->my_q; | ||
8072 | |||
7169 | se->my_q = cfs_rq; | 8073 | se->my_q = cfs_rq; |
7170 | se->load.weight = tg->shares; | 8074 | se->load.weight = tg->shares; |
7171 | se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); | 8075 | se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); |
7172 | se->parent = NULL; | 8076 | se->parent = parent; |
7173 | } | 8077 | } |
7174 | #endif | 8078 | #endif |
7175 | 8079 | ||
7176 | #ifdef CONFIG_RT_GROUP_SCHED | 8080 | #ifdef CONFIG_RT_GROUP_SCHED |
7177 | static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, | 8081 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, |
7178 | struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, | 8082 | struct sched_rt_entity *rt_se, int cpu, int add, |
7179 | int cpu, int add) | 8083 | struct sched_rt_entity *parent) |
7180 | { | 8084 | { |
8085 | struct rq *rq = cpu_rq(cpu); | ||
8086 | |||
7181 | tg->rt_rq[cpu] = rt_rq; | 8087 | tg->rt_rq[cpu] = rt_rq; |
7182 | init_rt_rq(rt_rq, rq); | 8088 | init_rt_rq(rt_rq, rq); |
7183 | rt_rq->tg = tg; | 8089 | rt_rq->tg = tg; |
7184 | rt_rq->rt_se = rt_se; | 8090 | rt_rq->rt_se = rt_se; |
8091 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
7185 | if (add) | 8092 | if (add) |
7186 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | 8093 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); |
7187 | 8094 | ||
7188 | tg->rt_se[cpu] = rt_se; | 8095 | tg->rt_se[cpu] = rt_se; |
8096 | if (!rt_se) | ||
8097 | return; | ||
8098 | |||
8099 | if (!parent) | ||
8100 | rt_se->rt_rq = &rq->rt; | ||
8101 | else | ||
8102 | rt_se->rt_rq = parent->my_q; | ||
8103 | |||
7189 | rt_se->rt_rq = &rq->rt; | 8104 | rt_se->rt_rq = &rq->rt; |
7190 | rt_se->my_q = rt_rq; | 8105 | rt_se->my_q = rt_rq; |
7191 | rt_se->parent = NULL; | 8106 | rt_se->parent = parent; |
7192 | INIT_LIST_HEAD(&rt_se->run_list); | 8107 | INIT_LIST_HEAD(&rt_se->run_list); |
7193 | } | 8108 | } |
7194 | #endif | 8109 | #endif |
7195 | 8110 | ||
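The reworked init_tg_cfs_entry()/init_tg_rt_entry() take an explicit parent entity: a group's entity is queued on its parent's my_q, or directly on the root runqueue when there is no parent (and a NULL se is allowed for init_task_group). A toy model of that wiring rule, with invented struct names:

#include <stdio.h>
#include <stddef.h>

/* a group's entity is queued on its parent's own queue (parent->my_q), or on
 * the root queue when it has no parent */
struct runq { const char *name; };
struct entity { struct runq *on_q; struct runq *my_q; };

static void link_entity(struct entity *se, struct runq *own_q,
                        struct runq *root_q, struct entity *parent)
{
        se->my_q = own_q;
        se->on_q = parent ? parent->my_q : root_q;
}

int main(void)
{
        struct runq root = { "rq->cfs" }, qa = { "A->my_q" }, qb = { "B->my_q" };
        struct entity a, b;

        link_entity(&a, &qa, &root, NULL);        /* top-level group A */
        link_entity(&b, &qb, &root, &a);          /* child group B of A */
        printf("A queued on %s, B queued on %s\n", a.on_q->name, b.on_q->name);
        return 0;
}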
7196 | void __init sched_init(void) | 8111 | void __init sched_init(void) |
7197 | { | 8112 | { |
7198 | int highest_cpu = 0; | ||
7199 | int i, j; | 8113 | int i, j; |
8114 | unsigned long alloc_size = 0, ptr; | ||
8115 | |||
8116 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
8117 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | ||
8118 | #endif | ||
8119 | #ifdef CONFIG_RT_GROUP_SCHED | ||
8120 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | ||
8121 | #endif | ||
8122 | #ifdef CONFIG_USER_SCHED | ||
8123 | alloc_size *= 2; | ||
8124 | #endif | ||
8125 | /* | ||
8126 | * As sched_init() is called before page_alloc is setup, | ||
8127 | * we use alloc_bootmem(). | ||
8128 | */ | ||
8129 | if (alloc_size) { | ||
8130 | ptr = (unsigned long)alloc_bootmem_low(alloc_size); | ||
8131 | |||
8132 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
8133 | init_task_group.se = (struct sched_entity **)ptr; | ||
8134 | ptr += nr_cpu_ids * sizeof(void **); | ||
8135 | |||
8136 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; | ||
8137 | ptr += nr_cpu_ids * sizeof(void **); | ||
8138 | |||
8139 | #ifdef CONFIG_USER_SCHED | ||
8140 | root_task_group.se = (struct sched_entity **)ptr; | ||
8141 | ptr += nr_cpu_ids * sizeof(void **); | ||
8142 | |||
8143 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; | ||
8144 | ptr += nr_cpu_ids * sizeof(void **); | ||
8145 | #endif | ||
8146 | #endif | ||
8147 | #ifdef CONFIG_RT_GROUP_SCHED | ||
8148 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | ||
8149 | ptr += nr_cpu_ids * sizeof(void **); | ||
8150 | |||
8151 | init_task_group.rt_rq = (struct rt_rq **)ptr; | ||
8152 | ptr += nr_cpu_ids * sizeof(void **); | ||
8153 | |||
8154 | #ifdef CONFIG_USER_SCHED | ||
8155 | root_task_group.rt_se = (struct sched_rt_entity **)ptr; | ||
8156 | ptr += nr_cpu_ids * sizeof(void **); | ||
8157 | |||
8158 | root_task_group.rt_rq = (struct rt_rq **)ptr; | ||
8159 | ptr += nr_cpu_ids * sizeof(void **); | ||
8160 | #endif | ||
8161 | #endif | ||
8162 | } | ||
7200 | 8163 | ||
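The alloc_size block above makes one early alloc_bootmem_low() allocation and carves it into the per-cpu pointer arrays (se, cfs_rq, rt_se, rt_rq) by advancing a byte cursor. A userspace sketch of the carving, with calloc standing in for the bootmem allocator and only two toy arrays:

#include <stdio.h>
#include <stdlib.h>

/* one early allocation carved into consecutive per-cpu pointer arrays by
 * advancing a cursor; sizes are toy values */
int main(void)
{
        int nr_cpu_ids = 4;
        size_t alloc_size = 2 * nr_cpu_ids * sizeof(void **);
        unsigned long ptr = (unsigned long)calloc(1, alloc_size);

        if (!ptr)
                return 1;

        void **se = (void **)ptr;                 /* first array            */
        ptr += nr_cpu_ids * sizeof(void **);
        void **cfs_rq = (void **)ptr;             /* second array, adjacent */

        printf("se at %p, cfs_rq at %p (stride %zu bytes)\n",
               (void *)se, (void *)cfs_rq, nr_cpu_ids * sizeof(void **));
        free(se);
        return 0;
}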
7201 | #ifdef CONFIG_SMP | 8164 | #ifdef CONFIG_SMP |
8165 | init_aggregate(); | ||
7202 | init_defrootdomain(); | 8166 | init_defrootdomain(); |
7203 | #endif | 8167 | #endif |
7204 | 8168 | ||
8169 | init_rt_bandwidth(&def_rt_bandwidth, | ||
8170 | global_rt_period(), global_rt_runtime()); | ||
8171 | |||
8172 | #ifdef CONFIG_RT_GROUP_SCHED | ||
8173 | init_rt_bandwidth(&init_task_group.rt_bandwidth, | ||
8174 | global_rt_period(), global_rt_runtime()); | ||
8175 | #ifdef CONFIG_USER_SCHED | ||
8176 | init_rt_bandwidth(&root_task_group.rt_bandwidth, | ||
8177 | global_rt_period(), RUNTIME_INF); | ||
8178 | #endif | ||
8179 | #endif | ||
8180 | |||
7205 | #ifdef CONFIG_GROUP_SCHED | 8181 | #ifdef CONFIG_GROUP_SCHED |
7206 | list_add(&init_task_group.list, &task_groups); | 8182 | list_add(&init_task_group.list, &task_groups); |
8183 | INIT_LIST_HEAD(&init_task_group.children); | ||
8184 | |||
8185 | #ifdef CONFIG_USER_SCHED | ||
8186 | INIT_LIST_HEAD(&root_task_group.children); | ||
8187 | init_task_group.parent = &root_task_group; | ||
8188 | list_add(&init_task_group.siblings, &root_task_group.children); | ||
8189 | #endif | ||
7207 | #endif | 8190 | #endif |
7208 | 8191 | ||
7209 | for_each_possible_cpu(i) { | 8192 | for_each_possible_cpu(i) { |
@@ -7214,26 +8197,68 @@ void __init sched_init(void) | |||
7214 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | 8197 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); |
7215 | rq->nr_running = 0; | 8198 | rq->nr_running = 0; |
7216 | rq->clock = 1; | 8199 | rq->clock = 1; |
8200 | update_last_tick_seen(rq); | ||
7217 | init_cfs_rq(&rq->cfs, rq); | 8201 | init_cfs_rq(&rq->cfs, rq); |
7218 | init_rt_rq(&rq->rt, rq); | 8202 | init_rt_rq(&rq->rt, rq); |
7219 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8203 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7220 | init_task_group.shares = init_task_group_load; | 8204 | init_task_group.shares = init_task_group_load; |
7221 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 8205 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7222 | init_tg_cfs_entry(rq, &init_task_group, | 8206 | #ifdef CONFIG_CGROUP_SCHED |
8207 | /* | ||
8208 | * How much cpu bandwidth does init_task_group get? | ||
8209 | * | ||
8210 | * In case of task-groups formed thr' the cgroup filesystem, it | ||
8211 | * gets 100% of the cpu resources in the system. This overall | ||
8212 | * system cpu resource is divided among the tasks of | ||
8213 | * init_task_group and its child task-groups in a fair manner, | ||
8214 | * based on each entity's (task or task-group's) weight | ||
8215 | * (se->load.weight). | ||
8216 | * | ||
8217 | * In other words, if init_task_group has 10 tasks of weight | ||
8218 | * 1024) and two child groups A0 and A1 (of weight 1024 each), | ||
8219 | * then A0's share of the cpu resource is: | ||
8220 | * | ||
8221 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% | ||
8222 | * | ||
8223 | * We achieve this by letting init_task_group's tasks sit | ||
8224 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). | ||
8225 | */ | ||
8226 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); | ||
8227 | #elif defined CONFIG_USER_SCHED | ||
8228 | root_task_group.shares = NICE_0_LOAD; | ||
8229 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); | ||
8230 | /* | ||
8231 | * In case of task-groups formed thr' the user id of tasks, | ||
8232 | * init_task_group represents tasks belonging to root user. | ||
8233 | * Hence it forms a sibling of all subsequent groups formed. | ||
8234 | * In this case, init_task_group gets only a fraction of overall | ||
8235 | * system cpu resource, based on the weight assigned to root | ||
8236 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished | ||
8237 | * by letting tasks of init_task_group sit in a separate cfs_rq | ||
8238 | * (init_cfs_rq) and having one entity represent this group of | ||
8239 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). | ||
8240 | */ | ||
8241 | init_tg_cfs_entry(&init_task_group, | ||
7223 | &per_cpu(init_cfs_rq, i), | 8242 | &per_cpu(init_cfs_rq, i), |
7224 | &per_cpu(init_sched_entity, i), i, 1); | 8243 | &per_cpu(init_sched_entity, i), i, 1, |
8244 | root_task_group.se[i]); | ||
7225 | 8245 | ||
7226 | #endif | 8246 | #endif |
8247 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
8248 | |||
8249 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; | ||
7227 | #ifdef CONFIG_RT_GROUP_SCHED | 8250 | #ifdef CONFIG_RT_GROUP_SCHED |
7228 | init_task_group.rt_runtime = | ||
7229 | sysctl_sched_rt_runtime * NSEC_PER_USEC; | ||
7230 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 8251 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
7231 | init_tg_rt_entry(rq, &init_task_group, | 8252 | #ifdef CONFIG_CGROUP_SCHED |
8253 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); | ||
8254 | #elif defined CONFIG_USER_SCHED | ||
8255 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); | ||
8256 | init_tg_rt_entry(&init_task_group, | ||
7232 | &per_cpu(init_rt_rq, i), | 8257 | &per_cpu(init_rt_rq, i), |
7233 | &per_cpu(init_sched_rt_entity, i), i, 1); | 8258 | &per_cpu(init_sched_rt_entity, i), i, 1, |
8259 | root_task_group.rt_se[i]); | ||
8260 | #endif | ||
7234 | #endif | 8261 | #endif |
7235 | rq->rt_period_expire = 0; | ||
7236 | rq->rt_throttled = 0; | ||
7237 | 8262 | ||
7238 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 8263 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
7239 | rq->cpu_load[j] = 0; | 8264 | rq->cpu_load[j] = 0; |
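A quick check of the arithmetic in the init_task_group bandwidth comment earlier in this hunk: ten weight-1024 tasks plus two weight-1024 child groups on the same cfs_rq give a total weight of 12*1024 = 12288, so one child group's share is 1024/12288, roughly 8.33%, as stated.

#include <stdio.h>

int main(void)
{
        int weight = 1024, ntasks = 10, ngroups = 2;
        int total = ntasks * weight + ngroups * weight;         /* 12288 */

        /* A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) */
        printf("A0 share = %.2f%%\n", 100.0 * weight / total);  /* 8.33% */
        return 0;
}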
@@ -7250,7 +8275,6 @@ void __init sched_init(void) | |||
7250 | #endif | 8275 | #endif |
7251 | init_rq_hrtick(rq); | 8276 | init_rq_hrtick(rq); |
7252 | atomic_set(&rq->nr_iowait, 0); | 8277 | atomic_set(&rq->nr_iowait, 0); |
7253 | highest_cpu = i; | ||
7254 | } | 8278 | } |
7255 | 8279 | ||
7256 | set_load_weight(&init_task); | 8280 | set_load_weight(&init_task); |
@@ -7260,7 +8284,6 @@ void __init sched_init(void) | |||
7260 | #endif | 8284 | #endif |
7261 | 8285 | ||
7262 | #ifdef CONFIG_SMP | 8286 | #ifdef CONFIG_SMP |
7263 | nr_cpu_ids = highest_cpu + 1; | ||
7264 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); | 8287 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); |
7265 | #endif | 8288 | #endif |
7266 | 8289 | ||
@@ -7419,8 +8442,6 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
7419 | 8442 | ||
7420 | #endif | 8443 | #endif |
7421 | 8444 | ||
7422 | #ifdef CONFIG_GROUP_SCHED | ||
7423 | |||
7424 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8445 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7425 | static void free_fair_sched_group(struct task_group *tg) | 8446 | static void free_fair_sched_group(struct task_group *tg) |
7426 | { | 8447 | { |
@@ -7437,17 +8458,18 @@ static void free_fair_sched_group(struct task_group *tg) | |||
7437 | kfree(tg->se); | 8458 | kfree(tg->se); |
7438 | } | 8459 | } |
7439 | 8460 | ||
7440 | static int alloc_fair_sched_group(struct task_group *tg) | 8461 | static |
8462 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
7441 | { | 8463 | { |
7442 | struct cfs_rq *cfs_rq; | 8464 | struct cfs_rq *cfs_rq; |
7443 | struct sched_entity *se; | 8465 | struct sched_entity *se, *parent_se; |
7444 | struct rq *rq; | 8466 | struct rq *rq; |
7445 | int i; | 8467 | int i; |
7446 | 8468 | ||
7447 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); | 8469 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); |
7448 | if (!tg->cfs_rq) | 8470 | if (!tg->cfs_rq) |
7449 | goto err; | 8471 | goto err; |
7450 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); | 8472 | tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); |
7451 | if (!tg->se) | 8473 | if (!tg->se) |
7452 | goto err; | 8474 | goto err; |
7453 | 8475 | ||
@@ -7466,7 +8488,8 @@ static int alloc_fair_sched_group(struct task_group *tg) | |||
7466 | if (!se) | 8488 | if (!se) |
7467 | goto err; | 8489 | goto err; |
7468 | 8490 | ||
7469 | init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); | 8491 | parent_se = parent ? parent->se[i] : NULL; |
8492 | init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se); | ||
7470 | } | 8493 | } |
7471 | 8494 | ||
7472 | return 1; | 8495 | return 1; |
@@ -7490,7 +8513,8 @@ static inline void free_fair_sched_group(struct task_group *tg) | |||
7490 | { | 8513 | { |
7491 | } | 8514 | } |
7492 | 8515 | ||
7493 | static inline int alloc_fair_sched_group(struct task_group *tg) | 8516 | static inline |
8517 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
7494 | { | 8518 | { |
7495 | return 1; | 8519 | return 1; |
7496 | } | 8520 | } |
@@ -7509,6 +8533,8 @@ static void free_rt_sched_group(struct task_group *tg) | |||
7509 | { | 8533 | { |
7510 | int i; | 8534 | int i; |
7511 | 8535 | ||
8536 | destroy_rt_bandwidth(&tg->rt_bandwidth); | ||
8537 | |||
7512 | for_each_possible_cpu(i) { | 8538 | for_each_possible_cpu(i) { |
7513 | if (tg->rt_rq) | 8539 | if (tg->rt_rq) |
7514 | kfree(tg->rt_rq[i]); | 8540 | kfree(tg->rt_rq[i]); |
@@ -7520,21 +8546,23 @@ static void free_rt_sched_group(struct task_group *tg) | |||
7520 | kfree(tg->rt_se); | 8546 | kfree(tg->rt_se); |
7521 | } | 8547 | } |
7522 | 8548 | ||
7523 | static int alloc_rt_sched_group(struct task_group *tg) | 8549 | static |
8550 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
7524 | { | 8551 | { |
7525 | struct rt_rq *rt_rq; | 8552 | struct rt_rq *rt_rq; |
7526 | struct sched_rt_entity *rt_se; | 8553 | struct sched_rt_entity *rt_se, *parent_se; |
7527 | struct rq *rq; | 8554 | struct rq *rq; |
7528 | int i; | 8555 | int i; |
7529 | 8556 | ||
7530 | tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); | 8557 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); |
7531 | if (!tg->rt_rq) | 8558 | if (!tg->rt_rq) |
7532 | goto err; | 8559 | goto err; |
7533 | tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); | 8560 | tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); |
7534 | if (!tg->rt_se) | 8561 | if (!tg->rt_se) |
7535 | goto err; | 8562 | goto err; |
7536 | 8563 | ||
7537 | tg->rt_runtime = 0; | 8564 | init_rt_bandwidth(&tg->rt_bandwidth, |
8565 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | ||
7538 | 8566 | ||
7539 | for_each_possible_cpu(i) { | 8567 | for_each_possible_cpu(i) { |
7540 | rq = cpu_rq(i); | 8568 | rq = cpu_rq(i); |
@@ -7549,7 +8577,8 @@ static int alloc_rt_sched_group(struct task_group *tg) | |||
7549 | if (!rt_se) | 8577 | if (!rt_se) |
7550 | goto err; | 8578 | goto err; |
7551 | 8579 | ||
7552 | init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); | 8580 | parent_se = parent ? parent->rt_se[i] : NULL; |
8581 | init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se); | ||
7553 | } | 8582 | } |
7554 | 8583 | ||
7555 | return 1; | 8584 | return 1; |
@@ -7573,7 +8602,8 @@ static inline void free_rt_sched_group(struct task_group *tg) | |||
7573 | { | 8602 | { |
7574 | } | 8603 | } |
7575 | 8604 | ||
7576 | static inline int alloc_rt_sched_group(struct task_group *tg) | 8605 | static inline |
8606 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
7577 | { | 8607 | { |
7578 | return 1; | 8608 | return 1; |
7579 | } | 8609 | } |
@@ -7587,6 +8617,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | |||
7587 | } | 8617 | } |
7588 | #endif | 8618 | #endif |
7589 | 8619 | ||
8620 | #ifdef CONFIG_GROUP_SCHED | ||
7590 | static void free_sched_group(struct task_group *tg) | 8621 | static void free_sched_group(struct task_group *tg) |
7591 | { | 8622 | { |
7592 | free_fair_sched_group(tg); | 8623 | free_fair_sched_group(tg); |
@@ -7595,7 +8626,7 @@ static void free_sched_group(struct task_group *tg) | |||
7595 | } | 8626 | } |
7596 | 8627 | ||
7597 | /* allocate runqueue etc for a new task group */ | 8628 | /* allocate runqueue etc for a new task group */ |
7598 | struct task_group *sched_create_group(void) | 8629 | struct task_group *sched_create_group(struct task_group *parent) |
7599 | { | 8630 | { |
7600 | struct task_group *tg; | 8631 | struct task_group *tg; |
7601 | unsigned long flags; | 8632 | unsigned long flags; |
@@ -7605,10 +8636,10 @@ struct task_group *sched_create_group(void) | |||
7605 | if (!tg) | 8636 | if (!tg) |
7606 | return ERR_PTR(-ENOMEM); | 8637 | return ERR_PTR(-ENOMEM); |
7607 | 8638 | ||
7608 | if (!alloc_fair_sched_group(tg)) | 8639 | if (!alloc_fair_sched_group(tg, parent)) |
7609 | goto err; | 8640 | goto err; |
7610 | 8641 | ||
7611 | if (!alloc_rt_sched_group(tg)) | 8642 | if (!alloc_rt_sched_group(tg, parent)) |
7612 | goto err; | 8643 | goto err; |
7613 | 8644 | ||
7614 | spin_lock_irqsave(&task_group_lock, flags); | 8645 | spin_lock_irqsave(&task_group_lock, flags); |
@@ -7617,6 +8648,12 @@ struct task_group *sched_create_group(void) | |||
7617 | register_rt_sched_group(tg, i); | 8648 | register_rt_sched_group(tg, i); |
7618 | } | 8649 | } |
7619 | list_add_rcu(&tg->list, &task_groups); | 8650 | list_add_rcu(&tg->list, &task_groups); |
8651 | |||
8652 | WARN_ON(!parent); /* root should already exist */ | ||
8653 | |||
8654 | tg->parent = parent; | ||
8655 | list_add_rcu(&tg->siblings, &parent->children); | ||
8656 | INIT_LIST_HEAD(&tg->children); | ||
7620 | spin_unlock_irqrestore(&task_group_lock, flags); | 8657 | spin_unlock_irqrestore(&task_group_lock, flags); |
7621 | 8658 | ||
7622 | return tg; | 8659 | return tg; |
@@ -7645,6 +8682,7 @@ void sched_destroy_group(struct task_group *tg) | |||
7645 | unregister_rt_sched_group(tg, i); | 8682 | unregister_rt_sched_group(tg, i); |
7646 | } | 8683 | } |
7647 | list_del_rcu(&tg->list); | 8684 | list_del_rcu(&tg->list); |
8685 | list_del_rcu(&tg->siblings); | ||
7648 | spin_unlock_irqrestore(&task_group_lock, flags); | 8686 | spin_unlock_irqrestore(&task_group_lock, flags); |
7649 | 8687 | ||
7650 | /* wait for possible concurrent references to cfs_rqs complete */ | 8688 | /* wait for possible concurrent references to cfs_rqs complete */ |
@@ -7688,16 +8726,14 @@ void sched_move_task(struct task_struct *tsk) | |||
7688 | 8726 | ||
7689 | task_rq_unlock(rq, &flags); | 8727 | task_rq_unlock(rq, &flags); |
7690 | } | 8728 | } |
8729 | #endif | ||
7691 | 8730 | ||
7692 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8731 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7693 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | 8732 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) |
7694 | { | 8733 | { |
7695 | struct cfs_rq *cfs_rq = se->cfs_rq; | 8734 | struct cfs_rq *cfs_rq = se->cfs_rq; |
7696 | struct rq *rq = cfs_rq->rq; | ||
7697 | int on_rq; | 8735 | int on_rq; |
7698 | 8736 | ||
7699 | spin_lock_irq(&rq->lock); | ||
7700 | |||
7701 | on_rq = se->on_rq; | 8737 | on_rq = se->on_rq; |
7702 | if (on_rq) | 8738 | if (on_rq) |
7703 | dequeue_entity(cfs_rq, se, 0); | 8739 | dequeue_entity(cfs_rq, se, 0); |
@@ -7707,8 +8743,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) | |||
7707 | 8743 | ||
7708 | if (on_rq) | 8744 | if (on_rq) |
7709 | enqueue_entity(cfs_rq, se, 0); | 8745 | enqueue_entity(cfs_rq, se, 0); |
8746 | } | ||
7710 | 8747 | ||
7711 | spin_unlock_irq(&rq->lock); | 8748 | static void set_se_shares(struct sched_entity *se, unsigned long shares) |
8749 | { | ||
8750 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
8751 | struct rq *rq = cfs_rq->rq; | ||
8752 | unsigned long flags; | ||
8753 | |||
8754 | spin_lock_irqsave(&rq->lock, flags); | ||
8755 | __set_se_shares(se, shares); | ||
8756 | spin_unlock_irqrestore(&rq->lock, flags); | ||
7712 | } | 8757 | } |
7713 | 8758 | ||
7714 | static DEFINE_MUTEX(shares_mutex); | 8759 | static DEFINE_MUTEX(shares_mutex); |
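
set_se_shares() is split in the usual kernel pattern: a double-underscore helper that assumes the runqueue lock is already held does the dequeue/update/enqueue work, and a thin wrapper takes rq->lock around it (now with irqsave/irqrestore, so the saved interrupt state is restored instead of interrupts being unconditionally re-enabled). A hedged user-space sketch of the same shape, with a pthread mutex standing in for the spinlock:

    /* Sketch of the __helper / locking-wrapper split applied to
     * set_se_shares() above; a pthread mutex stands in for rq->lock. */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long se_weight;

    /* caller must already hold rq_lock */
    static void __set_weight(unsigned long w)
    {
        se_weight = w;    /* the real code dequeues, updates and re-enqueues */
    }

    static void set_weight(unsigned long w)
    {
        pthread_mutex_lock(&rq_lock);    /* spin_lock_irqsave() analogue */
        __set_weight(w);
        pthread_mutex_unlock(&rq_lock);
    }

    int main(void)
    {
        set_weight(1024);
        printf("weight = %lu\n", se_weight);
        return 0;
    }
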
@@ -7719,12 +8764,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
7719 | unsigned long flags; | 8764 | unsigned long flags; |
7720 | 8765 | ||
7721 | /* | 8766 | /* |
8767 | * We can't change the weight of the root cgroup. | ||
8768 | */ | ||
8769 | if (!tg->se[0]) | ||
8770 | return -EINVAL; | ||
8771 | |||
8772 | /* | ||
7722 | * A weight of 0 or 1 can cause arithmetics problems. | 8773 | * A weight of 0 or 1 can cause arithmetics problems. |
7723 | * (The default weight is 1024 - so there's no practical | 8774 | * (The default weight is 1024 - so there's no practical |
7724 | * limitation from this.) | 8775 | * limitation from this.) |
7725 | */ | 8776 | */ |
7726 | if (shares < 2) | 8777 | if (shares < MIN_SHARES) |
7727 | shares = 2; | 8778 | shares = MIN_SHARES; |
7728 | 8779 | ||
7729 | mutex_lock(&shares_mutex); | 8780 | mutex_lock(&shares_mutex); |
7730 | if (tg->shares == shares) | 8781 | if (tg->shares == shares) |
@@ -7733,6 +8784,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
7733 | spin_lock_irqsave(&task_group_lock, flags); | 8784 | spin_lock_irqsave(&task_group_lock, flags); |
7734 | for_each_possible_cpu(i) | 8785 | for_each_possible_cpu(i) |
7735 | unregister_fair_sched_group(tg, i); | 8786 | unregister_fair_sched_group(tg, i); |
8787 | list_del_rcu(&tg->siblings); | ||
7736 | spin_unlock_irqrestore(&task_group_lock, flags); | 8788 | spin_unlock_irqrestore(&task_group_lock, flags); |
7737 | 8789 | ||
7738 | /* wait for any ongoing reference to this group to finish */ | 8790 | /* wait for any ongoing reference to this group to finish */ |
@@ -7743,8 +8795,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
7743 | * w/o tripping rebalance_share or load_balance_fair. | 8795 | * w/o tripping rebalance_share or load_balance_fair. |
7744 | */ | 8796 | */ |
7745 | tg->shares = shares; | 8797 | tg->shares = shares; |
7746 | for_each_possible_cpu(i) | 8798 | for_each_possible_cpu(i) { |
7747 | set_se_shares(tg->se[i], shares); | 8799 | /* |
8800 | * force a rebalance | ||
8801 | */ | ||
8802 | cfs_rq_set_shares(tg->cfs_rq[i], 0); | ||
8803 | set_se_shares(tg->se[i], shares/nr_cpu_ids); | ||
8804 | } | ||
7748 | 8805 | ||
7749 | /* | 8806 | /* |
7750 | * Enable load balance activity on this group, by inserting it back on | 8807 | * Enable load balance activity on this group, by inserting it back on |
@@ -7753,6 +8810,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
7753 | spin_lock_irqsave(&task_group_lock, flags); | 8810 | spin_lock_irqsave(&task_group_lock, flags); |
7754 | for_each_possible_cpu(i) | 8811 | for_each_possible_cpu(i) |
7755 | register_fair_sched_group(tg, i); | 8812 | register_fair_sched_group(tg, i); |
8813 | list_add_rcu(&tg->siblings, &tg->parent->children); | ||
7756 | spin_unlock_irqrestore(&task_group_lock, flags); | 8814 | spin_unlock_irqrestore(&task_group_lock, flags); |
7757 | done: | 8815 | done: |
7758 | mutex_unlock(&shares_mutex); | 8816 | mutex_unlock(&shares_mutex); |
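
With SMP group scheduling each CPU carries its own scheduling entity for the group, so sched_group_set_shares() no longer hands the full weight to every entity: cfs_rq_set_shares(..., 0) forces a rebalance and each per-CPU entity is started from an equal slice, shares/nr_cpu_ids. A tiny worked example (the CPU count and shares value are assumptions):

    /* Worked example of the per-CPU share split; CPU count and shares
     * value are assumptions. */
    #include <stdio.h>

    int main(void)
    {
        unsigned long shares = 1024;     /* the group's total shares */
        unsigned long nr_cpu_ids = 4;    /* assumed machine size */

        for (unsigned long i = 0; i < nr_cpu_ids; i++)
            printf("cpu%lu: se starts at %lu\n", i, shares / nr_cpu_ids);
        return 0;
    }

With the assumed numbers each per-CPU entity starts at 256 out of the group's 1024; the forced rebalance then lets the per-CPU weights diverge again as load shifts.
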
@@ -7779,26 +8837,58 @@ static unsigned long to_ratio(u64 period, u64 runtime) | |||
7779 | return div64_64(runtime << 16, period); | 8837 | return div64_64(runtime << 16, period); |
7780 | } | 8838 | } |
7781 | 8839 | ||
8840 | #ifdef CONFIG_CGROUP_SCHED | ||
8841 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | ||
8842 | { | ||
8843 | struct task_group *tgi, *parent = tg->parent; | ||
8844 | unsigned long total = 0; | ||
8845 | |||
8846 | if (!parent) { | ||
8847 | if (global_rt_period() < period) | ||
8848 | return 0; | ||
8849 | |||
8850 | return to_ratio(period, runtime) < | ||
8851 | to_ratio(global_rt_period(), global_rt_runtime()); | ||
8852 | } | ||
8853 | |||
8854 | if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) | ||
8855 | return 0; | ||
8856 | |||
8857 | rcu_read_lock(); | ||
8858 | list_for_each_entry_rcu(tgi, &parent->children, siblings) { | ||
8859 | if (tgi == tg) | ||
8860 | continue; | ||
8861 | |||
8862 | total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), | ||
8863 | tgi->rt_bandwidth.rt_runtime); | ||
8864 | } | ||
8865 | rcu_read_unlock(); | ||
8866 | |||
8867 | return total + to_ratio(period, runtime) < | ||
8868 | to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), | ||
8869 | parent->rt_bandwidth.rt_runtime); | ||
8870 | } | ||
8871 | #elif defined CONFIG_USER_SCHED | ||
7782 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 8872 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
7783 | { | 8873 | { |
7784 | struct task_group *tgi; | 8874 | struct task_group *tgi; |
7785 | unsigned long total = 0; | 8875 | unsigned long total = 0; |
7786 | unsigned long global_ratio = | 8876 | unsigned long global_ratio = |
7787 | to_ratio(sysctl_sched_rt_period, | 8877 | to_ratio(global_rt_period(), global_rt_runtime()); |
7788 | sysctl_sched_rt_runtime < 0 ? | ||
7789 | RUNTIME_INF : sysctl_sched_rt_runtime); | ||
7790 | 8878 | ||
7791 | rcu_read_lock(); | 8879 | rcu_read_lock(); |
7792 | list_for_each_entry_rcu(tgi, &task_groups, list) { | 8880 | list_for_each_entry_rcu(tgi, &task_groups, list) { |
7793 | if (tgi == tg) | 8881 | if (tgi == tg) |
7794 | continue; | 8882 | continue; |
7795 | 8883 | ||
7796 | total += to_ratio(period, tgi->rt_runtime); | 8884 | total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), |
8885 | tgi->rt_bandwidth.rt_runtime); | ||
7797 | } | 8886 | } |
7798 | rcu_read_unlock(); | 8887 | rcu_read_unlock(); |
7799 | 8888 | ||
7800 | return total + to_ratio(period, runtime) < global_ratio; | 8889 | return total + to_ratio(period, runtime) < global_ratio; |
7801 | } | 8890 | } |
8891 | #endif | ||
7802 | 8892 | ||
7803 | /* Must be called with tasklist_lock held */ | 8893 | /* Must be called with tasklist_lock held */ |
7804 | static inline int tg_has_rt_tasks(struct task_group *tg) | 8894 | static inline int tg_has_rt_tasks(struct task_group *tg) |
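
__rt_schedulable() reduces every bandwidth reservation to a 16.16 fixed-point fraction, to_ratio(period, runtime) = (runtime << 16) / period, and admits a new setting only if its fraction plus those of its siblings stays below the parent's fraction (or, at the top level, the global one). A simplified stand-alone sketch of that test, with a flat array standing in for the RCU-protected sibling list and made-up reservations:

    /* Simplified stand-alone sketch of the RT admission test; the 16.16
     * fixed-point ratio mirrors to_ratio() above, the reservations are
     * made up, and a flat array replaces the RCU sibling list. */
    #include <stdio.h>
    #include <stdint.h>

    static uint64_t to_ratio(uint64_t period, uint64_t runtime)
    {
        return (runtime << 16) / period;
    }

    int main(void)
    {
        /* parent: 950 ms of RT runtime every 1000 ms */
        uint64_t parent = to_ratio(1000000, 950000);
        /* two existing siblings at 300 ms / 1 s each */
        uint64_t siblings = to_ratio(1000000, 300000) + to_ratio(1000000, 300000);
        /* the new group asks for 400 ms / 1 s */
        uint64_t newgrp = to_ratio(1000000, 400000);

        printf("parent=%llu siblings=%llu new=%llu -> %s\n",
               (unsigned long long)parent,
               (unsigned long long)siblings,
               (unsigned long long)newgrp,
               siblings + newgrp < parent ? "admitted" : "rejected");
        return 0;
    }

Here 0.3 + 0.3 + 0.4 exceeds the parent's 0.95, so the request is rejected, which is the case for which the kernel function returns 0.
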
@@ -7811,19 +8901,14 @@ static inline int tg_has_rt_tasks(struct task_group *tg) | |||
7811 | return 0; | 8901 | return 0; |
7812 | } | 8902 | } |
7813 | 8903 | ||
7814 | int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | 8904 | static int tg_set_bandwidth(struct task_group *tg, |
8905 | u64 rt_period, u64 rt_runtime) | ||
7815 | { | 8906 | { |
7816 | u64 rt_runtime, rt_period; | 8907 | int i, err = 0; |
7817 | int err = 0; | ||
7818 | |||
7819 | rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | ||
7820 | rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; | ||
7821 | if (rt_runtime_us == -1) | ||
7822 | rt_runtime = RUNTIME_INF; | ||
7823 | 8908 | ||
7824 | mutex_lock(&rt_constraints_mutex); | 8909 | mutex_lock(&rt_constraints_mutex); |
7825 | read_lock(&tasklist_lock); | 8910 | read_lock(&tasklist_lock); |
7826 | if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) { | 8911 | if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { |
7827 | err = -EBUSY; | 8912 | err = -EBUSY; |
7828 | goto unlock; | 8913 | goto unlock; |
7829 | } | 8914 | } |
@@ -7831,7 +8916,19 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | |||
7831 | err = -EINVAL; | 8916 | err = -EINVAL; |
7832 | goto unlock; | 8917 | goto unlock; |
7833 | } | 8918 | } |
7834 | tg->rt_runtime = rt_runtime; | 8919 | |
8920 | spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); | ||
8921 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); | ||
8922 | tg->rt_bandwidth.rt_runtime = rt_runtime; | ||
8923 | |||
8924 | for_each_possible_cpu(i) { | ||
8925 | struct rt_rq *rt_rq = tg->rt_rq[i]; | ||
8926 | |||
8927 | spin_lock(&rt_rq->rt_runtime_lock); | ||
8928 | rt_rq->rt_runtime = rt_runtime; | ||
8929 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
8930 | } | ||
8931 | spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); | ||
7835 | unlock: | 8932 | unlock: |
7836 | read_unlock(&tasklist_lock); | 8933 | read_unlock(&tasklist_lock); |
7837 | mutex_unlock(&rt_constraints_mutex); | 8934 | mutex_unlock(&rt_constraints_mutex); |
@@ -7839,19 +8936,109 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | |||
7839 | return err; | 8936 | return err; |
7840 | } | 8937 | } |
7841 | 8938 | ||
8939 | int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | ||
8940 | { | ||
8941 | u64 rt_runtime, rt_period; | ||
8942 | |||
8943 | rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
8944 | rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; | ||
8945 | if (rt_runtime_us < 0) | ||
8946 | rt_runtime = RUNTIME_INF; | ||
8947 | |||
8948 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | ||
8949 | } | ||
8950 | |||
7842 | long sched_group_rt_runtime(struct task_group *tg) | 8951 | long sched_group_rt_runtime(struct task_group *tg) |
7843 | { | 8952 | { |
7844 | u64 rt_runtime_us; | 8953 | u64 rt_runtime_us; |
7845 | 8954 | ||
7846 | if (tg->rt_runtime == RUNTIME_INF) | 8955 | if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) |
7847 | return -1; | 8956 | return -1; |
7848 | 8957 | ||
7849 | rt_runtime_us = tg->rt_runtime; | 8958 | rt_runtime_us = tg->rt_bandwidth.rt_runtime; |
7850 | do_div(rt_runtime_us, NSEC_PER_USEC); | 8959 | do_div(rt_runtime_us, NSEC_PER_USEC); |
7851 | return rt_runtime_us; | 8960 | return rt_runtime_us; |
7852 | } | 8961 | } |
8962 | |||
8963 | int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | ||
8964 | { | ||
8965 | u64 rt_runtime, rt_period; | ||
8966 | |||
8967 | rt_period = (u64)rt_period_us * NSEC_PER_USEC; | ||
8968 | rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
8969 | |||
8970 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | ||
8971 | } | ||
8972 | |||
8973 | long sched_group_rt_period(struct task_group *tg) | ||
8974 | { | ||
8975 | u64 rt_period_us; | ||
8976 | |||
8977 | rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
8978 | do_div(rt_period_us, NSEC_PER_USEC); | ||
8979 | return rt_period_us; | ||
8980 | } | ||
8981 | |||
8982 | static int sched_rt_global_constraints(void) | ||
8983 | { | ||
8984 | int ret = 0; | ||
8985 | |||
8986 | mutex_lock(&rt_constraints_mutex); | ||
8987 | if (!__rt_schedulable(NULL, 1, 0)) | ||
8988 | ret = -EINVAL; | ||
8989 | mutex_unlock(&rt_constraints_mutex); | ||
8990 | |||
8991 | return ret; | ||
8992 | } | ||
8993 | #else | ||
8994 | static int sched_rt_global_constraints(void) | ||
8995 | { | ||
8996 | unsigned long flags; | ||
8997 | int i; | ||
8998 | |||
8999 | spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); | ||
9000 | for_each_possible_cpu(i) { | ||
9001 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; | ||
9002 | |||
9003 | spin_lock(&rt_rq->rt_runtime_lock); | ||
9004 | rt_rq->rt_runtime = global_rt_runtime(); | ||
9005 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
9006 | } | ||
9007 | spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); | ||
9008 | |||
9009 | return 0; | ||
9010 | } | ||
7853 | #endif | 9011 | #endif |
7854 | #endif /* CONFIG_GROUP_SCHED */ | 9012 | |
9013 | int sched_rt_handler(struct ctl_table *table, int write, | ||
9014 | struct file *filp, void __user *buffer, size_t *lenp, | ||
9015 | loff_t *ppos) | ||
9016 | { | ||
9017 | int ret; | ||
9018 | int old_period, old_runtime; | ||
9019 | static DEFINE_MUTEX(mutex); | ||
9020 | |||
9021 | mutex_lock(&mutex); | ||
9022 | old_period = sysctl_sched_rt_period; | ||
9023 | old_runtime = sysctl_sched_rt_runtime; | ||
9024 | |||
9025 | ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); | ||
9026 | |||
9027 | if (!ret && write) { | ||
9028 | ret = sched_rt_global_constraints(); | ||
9029 | if (ret) { | ||
9030 | sysctl_sched_rt_period = old_period; | ||
9031 | sysctl_sched_rt_runtime = old_runtime; | ||
9032 | } else { | ||
9033 | def_rt_bandwidth.rt_runtime = global_rt_runtime(); | ||
9034 | def_rt_bandwidth.rt_period = | ||
9035 | ns_to_ktime(global_rt_period()); | ||
9036 | } | ||
9037 | } | ||
9038 | mutex_unlock(&mutex); | ||
9039 | |||
9040 | return ret; | ||
9041 | } | ||
7855 | 9042 | ||
7856 | #ifdef CONFIG_CGROUP_SCHED | 9043 | #ifdef CONFIG_CGROUP_SCHED |
7857 | 9044 | ||
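
sched_rt_handler() follows a common sysctl shape: snapshot the current values, let proc_dointvec() store the user's input, re-run the global constraint check, and restore the snapshot if the check fails; only on success is def_rt_bandwidth refreshed from the new globals. A stand-alone sketch of that save/validate/rollback pattern, with no procfs involved and a deliberately simplified constraint:

    /* Sketch of the save / apply / validate / rollback pattern used by
     * sched_rt_handler(); plain C, simplified constraint, no procfs. */
    #include <stdio.h>

    static int period_us = 1000000;
    static int runtime_us = 950000;

    static int constraints_ok(void)
    {
        /* runtime must fit in the period (the "infinite" case is ignored) */
        return runtime_us <= period_us;
    }

    static int set_runtime(int new_runtime)
    {
        int old_period = period_us;
        int old_runtime = runtime_us;

        runtime_us = new_runtime;           /* proc_dointvec() analogue */
        if (!constraints_ok()) {
            period_us = old_period;         /* roll back on failure */
            runtime_us = old_runtime;
            return -1;
        }
        return 0;
    }

    int main(void)
    {
        printf("set 900000: %d (runtime now %d)\n", set_runtime(900000), runtime_us);
        printf("set 2000000: %d (runtime now %d)\n", set_runtime(2000000), runtime_us);
        return 0;
    }
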
@@ -7865,7 +9052,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) | |||
7865 | static struct cgroup_subsys_state * | 9052 | static struct cgroup_subsys_state * |
7866 | cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | 9053 | cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) |
7867 | { | 9054 | { |
7868 | struct task_group *tg; | 9055 | struct task_group *tg, *parent; |
7869 | 9056 | ||
7870 | if (!cgrp->parent) { | 9057 | if (!cgrp->parent) { |
7871 | /* This is early initialization for the top cgroup */ | 9058 | /* This is early initialization for the top cgroup */ |
@@ -7873,11 +9060,8 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
7873 | return &init_task_group.css; | 9060 | return &init_task_group.css; |
7874 | } | 9061 | } |
7875 | 9062 | ||
7876 | /* we support only 1-level deep hierarchical scheduler atm */ | 9063 | parent = cgroup_tg(cgrp->parent); |
7877 | if (cgrp->parent->parent) | 9064 | tg = sched_create_group(parent); |
7878 | return ERR_PTR(-EINVAL); | ||
7879 | |||
7880 | tg = sched_create_group(); | ||
7881 | if (IS_ERR(tg)) | 9065 | if (IS_ERR(tg)) |
7882 | return ERR_PTR(-ENOMEM); | 9066 | return ERR_PTR(-ENOMEM); |
7883 | 9067 | ||
@@ -7901,7 +9085,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
7901 | { | 9085 | { |
7902 | #ifdef CONFIG_RT_GROUP_SCHED | 9086 | #ifdef CONFIG_RT_GROUP_SCHED |
7903 | /* Don't accept realtime tasks when there is no way for them to run */ | 9087 | /* Don't accept realtime tasks when there is no way for them to run */ |
7904 | if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0) | 9088 | if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) |
7905 | return -EINVAL; | 9089 | return -EINVAL; |
7906 | #else | 9090 | #else |
7907 | /* We don't support RT-tasks being in separate groups */ | 9091 | /* We don't support RT-tasks being in separate groups */ |
@@ -7935,7 +9119,7 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) | |||
7935 | #endif | 9119 | #endif |
7936 | 9120 | ||
7937 | #ifdef CONFIG_RT_GROUP_SCHED | 9121 | #ifdef CONFIG_RT_GROUP_SCHED |
7938 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, | 9122 | static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, |
7939 | struct file *file, | 9123 | struct file *file, |
7940 | const char __user *userbuf, | 9124 | const char __user *userbuf, |
7941 | size_t nbytes, loff_t *unused_ppos) | 9125 | size_t nbytes, loff_t *unused_ppos) |
@@ -7979,6 +9163,17 @@ static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, | |||
7979 | 9163 | ||
7980 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | 9164 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); |
7981 | } | 9165 | } |
9166 | |||
9167 | static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, | ||
9168 | u64 rt_period_us) | ||
9169 | { | ||
9170 | return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); | ||
9171 | } | ||
9172 | |||
9173 | static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) | ||
9174 | { | ||
9175 | return sched_group_rt_period(cgroup_tg(cgrp)); | ||
9176 | } | ||
7982 | #endif | 9177 | #endif |
7983 | 9178 | ||
7984 | static struct cftype cpu_files[] = { | 9179 | static struct cftype cpu_files[] = { |
@@ -7995,6 +9190,11 @@ static struct cftype cpu_files[] = { | |||
7995 | .read = cpu_rt_runtime_read, | 9190 | .read = cpu_rt_runtime_read, |
7996 | .write = cpu_rt_runtime_write, | 9191 | .write = cpu_rt_runtime_write, |
7997 | }, | 9192 | }, |
9193 | { | ||
9194 | .name = "rt_period_us", | ||
9195 | .read_uint = cpu_rt_period_read_uint, | ||
9196 | .write_uint = cpu_rt_period_write_uint, | ||
9197 | }, | ||
7998 | #endif | 9198 | #endif |
7999 | }; | 9199 | }; |
8000 | 9200 | ||
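
The new cftype entry exposes the group's RT period as a per-cgroup file alongside the existing runtime one. From user space it is ordinary file I/O; the sketch below assumes the cpu controller is mounted at /dev/cgroup and that the file appears as cpu.rt_period_us (both the mount point and the exact file name prefix are assumptions, not shown in this patch):

    /* Hedged usage sketch: read and update a group's RT period through the
     * new cgroup file.  The path and "cpu." prefix are assumptions. */
    #include <stdio.h>

    int main(void)
    {
        const char *path = "/dev/cgroup/mygroup/cpu.rt_period_us";
        unsigned long long period;
        FILE *f;

        f = fopen(path, "r+");
        if (!f) {
            perror(path);
            return 1;
        }
        if (fscanf(f, "%llu", &period) == 1)
            printf("current period: %llu us\n", period);

        rewind(f);
        fprintf(f, "%d\n", 500000);    /* request a 500 ms period */
        fclose(f);
        return 0;
    }

sched_group_set_rt_period() above converts the microsecond value to nanoseconds and funnels it through the same tg_set_bandwidth() path as the runtime file.
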
@@ -8035,9 +9235,9 @@ struct cpuacct { | |||
8035 | struct cgroup_subsys cpuacct_subsys; | 9235 | struct cgroup_subsys cpuacct_subsys; |
8036 | 9236 | ||
8037 | /* return cpu accounting group corresponding to this container */ | 9237 | /* return cpu accounting group corresponding to this container */ |
8038 | static inline struct cpuacct *cgroup_ca(struct cgroup *cont) | 9238 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) |
8039 | { | 9239 | { |
8040 | return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id), | 9240 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), |
8041 | struct cpuacct, css); | 9241 | struct cpuacct, css); |
8042 | } | 9242 | } |
8043 | 9243 | ||
@@ -8050,7 +9250,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) | |||
8050 | 9250 | ||
8051 | /* create a new cpu accounting group */ | 9251 | /* create a new cpu accounting group */ |
8052 | static struct cgroup_subsys_state *cpuacct_create( | 9252 | static struct cgroup_subsys_state *cpuacct_create( |
8053 | struct cgroup_subsys *ss, struct cgroup *cont) | 9253 | struct cgroup_subsys *ss, struct cgroup *cgrp) |
8054 | { | 9254 | { |
8055 | struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); | 9255 | struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); |
8056 | 9256 | ||
@@ -8068,18 +9268,18 @@ static struct cgroup_subsys_state *cpuacct_create( | |||
8068 | 9268 | ||
8069 | /* destroy an existing cpu accounting group */ | 9269 | /* destroy an existing cpu accounting group */ |
8070 | static void | 9270 | static void |
8071 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | 9271 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) |
8072 | { | 9272 | { |
8073 | struct cpuacct *ca = cgroup_ca(cont); | 9273 | struct cpuacct *ca = cgroup_ca(cgrp); |
8074 | 9274 | ||
8075 | free_percpu(ca->cpuusage); | 9275 | free_percpu(ca->cpuusage); |
8076 | kfree(ca); | 9276 | kfree(ca); |
8077 | } | 9277 | } |
8078 | 9278 | ||
8079 | /* return total cpu usage (in nanoseconds) of a group */ | 9279 | /* return total cpu usage (in nanoseconds) of a group */ |
8080 | static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) | 9280 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) |
8081 | { | 9281 | { |
8082 | struct cpuacct *ca = cgroup_ca(cont); | 9282 | struct cpuacct *ca = cgroup_ca(cgrp); |
8083 | u64 totalcpuusage = 0; | 9283 | u64 totalcpuusage = 0; |
8084 | int i; | 9284 | int i; |
8085 | 9285 | ||
@@ -8098,16 +9298,40 @@ static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) | |||
8098 | return totalcpuusage; | 9298 | return totalcpuusage; |
8099 | } | 9299 | } |
8100 | 9300 | ||
9301 | static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, | ||
9302 | u64 reset) | ||
9303 | { | ||
9304 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
9305 | int err = 0; | ||
9306 | int i; | ||
9307 | |||
9308 | if (reset) { | ||
9309 | err = -EINVAL; | ||
9310 | goto out; | ||
9311 | } | ||
9312 | |||
9313 | for_each_possible_cpu(i) { | ||
9314 | u64 *cpuusage = percpu_ptr(ca->cpuusage, i); | ||
9315 | |||
9316 | spin_lock_irq(&cpu_rq(i)->lock); | ||
9317 | *cpuusage = 0; | ||
9318 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
9319 | } | ||
9320 | out: | ||
9321 | return err; | ||
9322 | } | ||
9323 | |||
8101 | static struct cftype files[] = { | 9324 | static struct cftype files[] = { |
8102 | { | 9325 | { |
8103 | .name = "usage", | 9326 | .name = "usage", |
8104 | .read_uint = cpuusage_read, | 9327 | .read_uint = cpuusage_read, |
9328 | .write_uint = cpuusage_write, | ||
8105 | }, | 9329 | }, |
8106 | }; | 9330 | }; |
8107 | 9331 | ||
8108 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 9332 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) |
8109 | { | 9333 | { |
8110 | return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); | 9334 | return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); |
8111 | } | 9335 | } |
8112 | 9336 | ||
8113 | /* | 9337 | /* |
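
cpuusage_write() makes the cpuacct usage file write-to-clear: any value other than 0 is rejected with -EINVAL, while a 0 clears every per-CPU counter under that CPU's runqueue lock, and cpuusage_read() keeps summing the per-CPU counters. A small sketch of those semantics (the per-CPU array and its values are made up, and no locking is shown):

    /* Sketch of the cpuacct usage semantics: per-CPU counters summed on
     * read, cleared (and only cleared) on write.  Illustrative only. */
    #include <stdio.h>
    #include <errno.h>
    #include <stdint.h>

    #define NR_CPUS 4
    static uint64_t cpuusage[NR_CPUS] = { 100, 250, 0, 75 };

    static uint64_t usage_read(void)
    {
        uint64_t total = 0;

        for (int i = 0; i < NR_CPUS; i++)
            total += cpuusage[i];
        return total;
    }

    static int usage_write(uint64_t val)
    {
        if (val)                    /* only "0" is a valid reset */
            return -EINVAL;
        for (int i = 0; i < NR_CPUS; i++)
            cpuusage[i] = 0;
        return 0;
    }

    int main(void)
    {
        printf("usage=%llu\n", (unsigned long long)usage_read());
        printf("write 5 -> %d\n", usage_write(5));
        printf("write 0 -> %d, usage=%llu\n", usage_write(0),
               (unsigned long long)usage_read());
        return 0;
    }
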
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index ef358ba07683..f3f4af4b8b0f 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -67,14 +67,24 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
67 | (long long)(p->nvcsw + p->nivcsw), | 67 | (long long)(p->nvcsw + p->nivcsw), |
68 | p->prio); | 68 | p->prio); |
69 | #ifdef CONFIG_SCHEDSTATS | 69 | #ifdef CONFIG_SCHEDSTATS |
70 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n", | 70 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", |
71 | SPLIT_NS(p->se.vruntime), | 71 | SPLIT_NS(p->se.vruntime), |
72 | SPLIT_NS(p->se.sum_exec_runtime), | 72 | SPLIT_NS(p->se.sum_exec_runtime), |
73 | SPLIT_NS(p->se.sum_sleep_runtime)); | 73 | SPLIT_NS(p->se.sum_sleep_runtime)); |
74 | #else | 74 | #else |
75 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", | 75 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", |
76 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); | 76 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
77 | #endif | 77 | #endif |
78 | |||
79 | #ifdef CONFIG_CGROUP_SCHED | ||
80 | { | ||
81 | char path[64]; | ||
82 | |||
83 | cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); | ||
84 | SEQ_printf(m, " %s", path); | ||
85 | } | ||
86 | #endif | ||
87 | SEQ_printf(m, "\n"); | ||
78 | } | 88 | } |
79 | 89 | ||
80 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | 90 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) |
@@ -109,7 +119,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
109 | struct sched_entity *last; | 119 | struct sched_entity *last; |
110 | unsigned long flags; | 120 | unsigned long flags; |
111 | 121 | ||
112 | SEQ_printf(m, "\ncfs_rq\n"); | 122 | #if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED) |
123 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); | ||
124 | #else | ||
125 | char path[128] = ""; | ||
126 | struct cgroup *cgroup = NULL; | ||
127 | struct task_group *tg = cfs_rq->tg; | ||
128 | |||
129 | if (tg) | ||
130 | cgroup = tg->css.cgroup; | ||
131 | |||
132 | if (cgroup) | ||
133 | cgroup_path(cgroup, path, sizeof(path)); | ||
134 | |||
135 | SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); | ||
136 | #endif | ||
113 | 137 | ||
114 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", | 138 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", |
115 | SPLIT_NS(cfs_rq->exec_clock)); | 139 | SPLIT_NS(cfs_rq->exec_clock)); |
@@ -143,6 +167,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
143 | #endif | 167 | #endif |
144 | SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", | 168 | SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", |
145 | cfs_rq->nr_spread_over); | 169 | cfs_rq->nr_spread_over); |
170 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
171 | #ifdef CONFIG_SMP | ||
172 | SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); | ||
173 | #endif | ||
174 | #endif | ||
146 | } | 175 | } |
147 | 176 | ||
148 | static void print_cpu(struct seq_file *m, int cpu) | 177 | static void print_cpu(struct seq_file *m, int cpu) |
@@ -214,7 +243,6 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
214 | PN(sysctl_sched_latency); | 243 | PN(sysctl_sched_latency); |
215 | PN(sysctl_sched_min_granularity); | 244 | PN(sysctl_sched_min_granularity); |
216 | PN(sysctl_sched_wakeup_granularity); | 245 | PN(sysctl_sched_wakeup_granularity); |
217 | PN(sysctl_sched_batch_wakeup_granularity); | ||
218 | PN(sysctl_sched_child_runs_first); | 246 | PN(sysctl_sched_child_runs_first); |
219 | P(sysctl_sched_features); | 247 | P(sysctl_sched_features); |
220 | #undef PN | 248 | #undef PN |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0080968d3e4a..89fa32b4edf2 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -62,24 +62,14 @@ const_debug unsigned int sysctl_sched_child_runs_first = 1; | |||
62 | unsigned int __read_mostly sysctl_sched_compat_yield; | 62 | unsigned int __read_mostly sysctl_sched_compat_yield; |
63 | 63 | ||
64 | /* | 64 | /* |
65 | * SCHED_BATCH wake-up granularity. | ||
66 | * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) | ||
67 | * | ||
68 | * This option delays the preemption effects of decoupled workloads | ||
69 | * and reduces their over-scheduling. Synchronous workloads will still | ||
70 | * have immediate wakeup/sleep latencies. | ||
71 | */ | ||
72 | unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; | ||
73 | |||
74 | /* | ||
75 | * SCHED_OTHER wake-up granularity. | 65 | * SCHED_OTHER wake-up granularity. |
76 | * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) | 66 | * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) |
77 | * | 67 | * |
78 | * This option delays the preemption effects of decoupled workloads | 68 | * This option delays the preemption effects of decoupled workloads |
79 | * and reduces their over-scheduling. Synchronous workloads will still | 69 | * and reduces their over-scheduling. Synchronous workloads will still |
80 | * have immediate wakeup/sleep latencies. | 70 | * have immediate wakeup/sleep latencies. |
81 | */ | 71 | */ |
82 | unsigned int sysctl_sched_wakeup_granularity = 5000000UL; | 72 | unsigned int sysctl_sched_wakeup_granularity = 10000000UL; |
83 | 73 | ||
84 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | 74 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
85 | 75 | ||
@@ -87,6 +77,11 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | |||
87 | * CFS operations on generic schedulable entities: | 77 | * CFS operations on generic schedulable entities: |
88 | */ | 78 | */ |
89 | 79 | ||
80 | static inline struct task_struct *task_of(struct sched_entity *se) | ||
81 | { | ||
82 | return container_of(se, struct task_struct, se); | ||
83 | } | ||
84 | |||
90 | #ifdef CONFIG_FAIR_GROUP_SCHED | 85 | #ifdef CONFIG_FAIR_GROUP_SCHED |
91 | 86 | ||
92 | /* cpu runqueue to which this cfs_rq is attached */ | 87 | /* cpu runqueue to which this cfs_rq is attached */ |
@@ -98,6 +93,54 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
98 | /* An entity is a task if it doesn't "own" a runqueue */ | 93 | /* An entity is a task if it doesn't "own" a runqueue */ |
99 | #define entity_is_task(se) (!se->my_q) | 94 | #define entity_is_task(se) (!se->my_q) |
100 | 95 | ||
96 | /* Walk up scheduling entities hierarchy */ | ||
97 | #define for_each_sched_entity(se) \ | ||
98 | for (; se; se = se->parent) | ||
99 | |||
100 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) | ||
101 | { | ||
102 | return p->se.cfs_rq; | ||
103 | } | ||
104 | |||
105 | /* runqueue on which this entity is (to be) queued */ | ||
106 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | ||
107 | { | ||
108 | return se->cfs_rq; | ||
109 | } | ||
110 | |||
111 | /* runqueue "owned" by this group */ | ||
112 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | ||
113 | { | ||
114 | return grp->my_q; | ||
115 | } | ||
116 | |||
117 | /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on | ||
118 | * another cpu ('this_cpu') | ||
119 | */ | ||
120 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
121 | { | ||
122 | return cfs_rq->tg->cfs_rq[this_cpu]; | ||
123 | } | ||
124 | |||
125 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | ||
126 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | ||
127 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | ||
128 | |||
129 | /* Do the two (enqueued) entities belong to the same group ? */ | ||
130 | static inline int | ||
131 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
132 | { | ||
133 | if (se->cfs_rq == pse->cfs_rq) | ||
134 | return 1; | ||
135 | |||
136 | return 0; | ||
137 | } | ||
138 | |||
139 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
140 | { | ||
141 | return se->parent; | ||
142 | } | ||
143 | |||
101 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 144 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
102 | 145 | ||
103 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | 146 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) |
@@ -107,13 +150,49 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
107 | 150 | ||
108 | #define entity_is_task(se) 1 | 151 | #define entity_is_task(se) 1 |
109 | 152 | ||
110 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 153 | #define for_each_sched_entity(se) \ |
154 | for (; se; se = NULL) | ||
111 | 155 | ||
112 | static inline struct task_struct *task_of(struct sched_entity *se) | 156 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) |
113 | { | 157 | { |
114 | return container_of(se, struct task_struct, se); | 158 | return &task_rq(p)->cfs; |
159 | } | ||
160 | |||
161 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | ||
162 | { | ||
163 | struct task_struct *p = task_of(se); | ||
164 | struct rq *rq = task_rq(p); | ||
165 | |||
166 | return &rq->cfs; | ||
167 | } | ||
168 | |||
169 | /* runqueue "owned" by this group */ | ||
170 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | ||
171 | { | ||
172 | return NULL; | ||
173 | } | ||
174 | |||
175 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
176 | { | ||
177 | return &cpu_rq(this_cpu)->cfs; | ||
178 | } | ||
179 | |||
180 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | ||
181 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | ||
182 | |||
183 | static inline int | ||
184 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
185 | { | ||
186 | return 1; | ||
187 | } | ||
188 | |||
189 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
190 | { | ||
191 | return NULL; | ||
115 | } | 192 | } |
116 | 193 | ||
194 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
195 | |||
117 | 196 | ||
118 | /************************************************************** | 197 | /************************************************************** |
119 | * Scheduling class tree data structure manipulation methods: | 198 | * Scheduling class tree data structure manipulation methods: |
@@ -255,6 +334,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, | |||
255 | #endif | 334 | #endif |
256 | 335 | ||
257 | /* | 336 | /* |
337 | * delta *= w / rw | ||
338 | */ | ||
339 | static inline unsigned long | ||
340 | calc_delta_weight(unsigned long delta, struct sched_entity *se) | ||
341 | { | ||
342 | for_each_sched_entity(se) { | ||
343 | delta = calc_delta_mine(delta, | ||
344 | se->load.weight, &cfs_rq_of(se)->load); | ||
345 | } | ||
346 | |||
347 | return delta; | ||
348 | } | ||
349 | |||
350 | /* | ||
351 | * delta *= rw / w | ||
352 | */ | ||
353 | static inline unsigned long | ||
354 | calc_delta_fair(unsigned long delta, struct sched_entity *se) | ||
355 | { | ||
356 | for_each_sched_entity(se) { | ||
357 | delta = calc_delta_mine(delta, | ||
358 | cfs_rq_of(se)->load.weight, &se->load); | ||
359 | } | ||
360 | |||
361 | return delta; | ||
362 | } | ||
363 | |||
364 | /* | ||
258 | * The idea is to set a period in which each task runs once. | 365 | * The idea is to set a period in which each task runs once. |
259 | * | 366 | * |
260 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch | 367 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch |
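
calc_delta_weight() and calc_delta_fair() scale a time delta along the entity hierarchy, multiplying at every level by w/rw (the entity's weight over its runqueue's total weight) and by rw/w respectively; sched_slice() now uses the former to cut the scheduling period into per-entity slices. A worked example of the w/rw direction, ignoring the kernel's fixed-point inverse-weight machinery and using made-up weights:

    /* Worked example of the per-level w/rw scaling in calc_delta_weight();
     * the hierarchy and all weights are made up, and the kernel's
     * fixed-point inverse-weight machinery is ignored. */
    #include <stdio.h>

    struct level {
        unsigned long weight;       /* se->load.weight */
        unsigned long rq_weight;    /* cfs_rq_of(se)->load.weight */
    };

    static unsigned long long calc_delta_weight(unsigned long long delta,
                                                const struct level *lv, int depth)
    {
        for (int i = 0; i < depth; i++)    /* for_each_sched_entity() */
            delta = delta * lv[i].weight / lv[i].rq_weight;
        return delta;
    }

    int main(void)
    {
        /* a task of weight 1024 on a group runqueue weighing 2048, inside a
         * group entity of weight 512 on a root runqueue weighing 2048 */
        struct level lv[] = { { 1024, 2048 }, { 512, 2048 } };
        unsigned long long slice = calc_delta_weight(20000000ULL, lv, 2);

        printf("20 ms period -> %llu ns slice\n", slice);    /* 2500000 */
        return 0;
    }

The 20 ms period first halves to 10 ms for the task's share of its group runqueue, then shrinks to 2.5 ms for the group entity's quarter share of the root runqueue.
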
@@ -283,29 +390,54 @@ static u64 __sched_period(unsigned long nr_running) | |||
283 | */ | 390 | */ |
284 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) | 391 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) |
285 | { | 392 | { |
286 | return calc_delta_mine(__sched_period(cfs_rq->nr_running), | 393 | return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); |
287 | se->load.weight, &cfs_rq->load); | ||
288 | } | 394 | } |
289 | 395 | ||
290 | /* | 396 | /* |
291 | * We calculate the vruntime slice. | 397 | * We calculate the vruntime slice of a to be inserted task |
292 | * | 398 | * |
293 | * vs = s/w = p/rw | 399 | * vs = s*rw/w = p |
294 | */ | 400 | */ |
295 | static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) | 401 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) |
296 | { | 402 | { |
297 | u64 vslice = __sched_period(nr_running); | 403 | unsigned long nr_running = cfs_rq->nr_running; |
298 | 404 | ||
299 | vslice *= NICE_0_LOAD; | 405 | if (!se->on_rq) |
300 | do_div(vslice, rq_weight); | 406 | nr_running++; |
301 | 407 | ||
302 | return vslice; | 408 | return __sched_period(nr_running); |
303 | } | 409 | } |
304 | 410 | ||
305 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) | 411 | /* |
412 | * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in | ||
413 | * that it favours >=0 over <0. | ||
414 | * | ||
415 | * -20 | | ||
416 | * | | ||
417 | * 0 --------+------- | ||
418 | * .' | ||
419 | * 19 .' | ||
420 | * | ||
421 | */ | ||
422 | static unsigned long | ||
423 | calc_delta_asym(unsigned long delta, struct sched_entity *se) | ||
306 | { | 424 | { |
307 | return __sched_vslice(cfs_rq->load.weight + se->load.weight, | 425 | struct load_weight lw = { |
308 | cfs_rq->nr_running + 1); | 426 | .weight = NICE_0_LOAD, |
427 | .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) | ||
428 | }; | ||
429 | |||
430 | for_each_sched_entity(se) { | ||
431 | struct load_weight *se_lw = &se->load; | ||
432 | |||
433 | if (se->load.weight < NICE_0_LOAD) | ||
434 | se_lw = &lw; | ||
435 | |||
436 | delta = calc_delta_mine(delta, | ||
437 | cfs_rq_of(se)->load.weight, se_lw); | ||
438 | } | ||
439 | |||
440 | return delta; | ||
309 | } | 441 | } |
310 | 442 | ||
311 | /* | 443 | /* |
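
calc_delta_asym() applies the same per-level rw/w scaling as calc_delta_fair(), except that the divisor is clamped: an entity lighter than NICE_0_LOAD is divided by NICE_0_LOAD instead of its own weight, so the result shrinks for heavier-than-default entities but never exceeds what a default-weight entity would get. A single-level numeric sketch of just that clamp (NICE_0_LOAD taken as 1024, the other weights made up, fixed-point details again ignored):

    /* Single-level sketch of the divisor clamp in calc_delta_asym();
     * NICE_0_LOAD is taken as 1024 and the other weights are made up. */
    #include <stdio.h>

    #define NICE_0_LOAD 1024UL

    static unsigned long long delta_asym(unsigned long long delta,
                                         unsigned long rq_weight,
                                         unsigned long se_weight)
    {
        unsigned long divisor = se_weight < NICE_0_LOAD ? NICE_0_LOAD : se_weight;

        return delta * rq_weight / divisor;
    }

    int main(void)
    {
        unsigned long long gran = 10000000ULL;    /* 10 ms base value */
        unsigned long rq_weight = 2048;

        /* weight 2048: divided by its own weight, result 10 ms */
        printf("heavy: %llu\n", delta_asym(gran, rq_weight, 2048));
        /* weight 512: divisor clamped to 1024, result 20 ms instead of 40 ms */
        printf("light: %llu\n", delta_asym(gran, rq_weight, 512));
        return 0;
    }
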
@@ -322,11 +454,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
322 | 454 | ||
323 | curr->sum_exec_runtime += delta_exec; | 455 | curr->sum_exec_runtime += delta_exec; |
324 | schedstat_add(cfs_rq, exec_clock, delta_exec); | 456 | schedstat_add(cfs_rq, exec_clock, delta_exec); |
325 | delta_exec_weighted = delta_exec; | 457 | delta_exec_weighted = calc_delta_fair(delta_exec, curr); |
326 | if (unlikely(curr->load.weight != NICE_0_LOAD)) { | ||
327 | delta_exec_weighted = calc_delta_fair(delta_exec_weighted, | ||
328 | &curr->load); | ||
329 | } | ||
330 | curr->vruntime += delta_exec_weighted; | 458 | curr->vruntime += delta_exec_weighted; |
331 | } | 459 | } |
332 | 460 | ||
@@ -413,20 +541,43 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
413 | * Scheduling class queueing methods: | 541 | * Scheduling class queueing methods: |
414 | */ | 542 | */ |
415 | 543 | ||
544 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | ||
545 | static void | ||
546 | add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) | ||
547 | { | ||
548 | cfs_rq->task_weight += weight; | ||
549 | } | ||
550 | #else | ||
551 | static inline void | ||
552 | add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) | ||
553 | { | ||
554 | } | ||
555 | #endif | ||
556 | |||
416 | static void | 557 | static void |
417 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 558 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
418 | { | 559 | { |
419 | update_load_add(&cfs_rq->load, se->load.weight); | 560 | update_load_add(&cfs_rq->load, se->load.weight); |
561 | if (!parent_entity(se)) | ||
562 | inc_cpu_load(rq_of(cfs_rq), se->load.weight); | ||
563 | if (entity_is_task(se)) | ||
564 | add_cfs_task_weight(cfs_rq, se->load.weight); | ||
420 | cfs_rq->nr_running++; | 565 | cfs_rq->nr_running++; |
421 | se->on_rq = 1; | 566 | se->on_rq = 1; |
567 | list_add(&se->group_node, &cfs_rq->tasks); | ||
422 | } | 568 | } |
423 | 569 | ||
424 | static void | 570 | static void |
425 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 571 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
426 | { | 572 | { |
427 | update_load_sub(&cfs_rq->load, se->load.weight); | 573 | update_load_sub(&cfs_rq->load, se->load.weight); |
574 | if (!parent_entity(se)) | ||
575 | dec_cpu_load(rq_of(cfs_rq), se->load.weight); | ||
576 | if (entity_is_task(se)) | ||
577 | add_cfs_task_weight(cfs_rq, -se->load.weight); | ||
428 | cfs_rq->nr_running--; | 578 | cfs_rq->nr_running--; |
429 | se->on_rq = 0; | 579 | se->on_rq = 0; |
580 | list_del_init(&se->group_node); | ||
430 | } | 581 | } |
431 | 582 | ||
432 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 583 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
@@ -510,8 +661,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
510 | 661 | ||
511 | if (!initial) { | 662 | if (!initial) { |
512 | /* sleeps upto a single latency don't count. */ | 663 | /* sleeps upto a single latency don't count. */ |
513 | if (sched_feat(NEW_FAIR_SLEEPERS)) | 664 | if (sched_feat(NEW_FAIR_SLEEPERS)) { |
514 | vruntime -= sysctl_sched_latency; | 665 | if (sched_feat(NORMALIZED_SLEEPER)) |
666 | vruntime -= calc_delta_weight(sysctl_sched_latency, se); | ||
667 | else | ||
668 | vruntime -= sysctl_sched_latency; | ||
669 | } | ||
515 | 670 | ||
516 | /* ensure we never gain time by being placed backwards. */ | 671 | /* ensure we never gain time by being placed backwards. */ |
517 | vruntime = max_vruntime(se->vruntime, vruntime); | 672 | vruntime = max_vruntime(se->vruntime, vruntime); |
@@ -627,20 +782,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
627 | se->prev_sum_exec_runtime = se->sum_exec_runtime; | 782 | se->prev_sum_exec_runtime = se->sum_exec_runtime; |
628 | } | 783 | } |
629 | 784 | ||
785 | static int | ||
786 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); | ||
787 | |||
630 | static struct sched_entity * | 788 | static struct sched_entity * |
631 | pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) | 789 | pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) |
632 | { | 790 | { |
633 | s64 diff, gran; | ||
634 | |||
635 | if (!cfs_rq->next) | 791 | if (!cfs_rq->next) |
636 | return se; | 792 | return se; |
637 | 793 | ||
638 | diff = cfs_rq->next->vruntime - se->vruntime; | 794 | if (wakeup_preempt_entity(cfs_rq->next, se) != 0) |
639 | if (diff < 0) | ||
640 | return se; | ||
641 | |||
642 | gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load); | ||
643 | if (diff > gran) | ||
644 | return se; | 795 | return se; |
645 | 796 | ||
646 | return cfs_rq->next; | 797 | return cfs_rq->next; |
@@ -708,101 +859,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
708 | * CFS operations on tasks: | 859 | * CFS operations on tasks: |
709 | */ | 860 | */ |
710 | 861 | ||
711 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
712 | |||
713 | /* Walk up scheduling entities hierarchy */ | ||
714 | #define for_each_sched_entity(se) \ | ||
715 | for (; se; se = se->parent) | ||
716 | |||
717 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) | ||
718 | { | ||
719 | return p->se.cfs_rq; | ||
720 | } | ||
721 | |||
722 | /* runqueue on which this entity is (to be) queued */ | ||
723 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | ||
724 | { | ||
725 | return se->cfs_rq; | ||
726 | } | ||
727 | |||
728 | /* runqueue "owned" by this group */ | ||
729 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | ||
730 | { | ||
731 | return grp->my_q; | ||
732 | } | ||
733 | |||
734 | /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on | ||
735 | * another cpu ('this_cpu') | ||
736 | */ | ||
737 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
738 | { | ||
739 | return cfs_rq->tg->cfs_rq[this_cpu]; | ||
740 | } | ||
741 | |||
742 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | ||
743 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | ||
744 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | ||
745 | |||
746 | /* Do the two (enqueued) entities belong to the same group ? */ | ||
747 | static inline int | ||
748 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
749 | { | ||
750 | if (se->cfs_rq == pse->cfs_rq) | ||
751 | return 1; | ||
752 | |||
753 | return 0; | ||
754 | } | ||
755 | |||
756 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
757 | { | ||
758 | return se->parent; | ||
759 | } | ||
760 | |||
761 | #else /* CONFIG_FAIR_GROUP_SCHED */ | ||
762 | |||
763 | #define for_each_sched_entity(se) \ | ||
764 | for (; se; se = NULL) | ||
765 | |||
766 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) | ||
767 | { | ||
768 | return &task_rq(p)->cfs; | ||
769 | } | ||
770 | |||
771 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | ||
772 | { | ||
773 | struct task_struct *p = task_of(se); | ||
774 | struct rq *rq = task_rq(p); | ||
775 | |||
776 | return &rq->cfs; | ||
777 | } | ||
778 | |||
779 | /* runqueue "owned" by this group */ | ||
780 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | ||
781 | { | ||
782 | return NULL; | ||
783 | } | ||
784 | |||
785 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
786 | { | ||
787 | return &cpu_rq(this_cpu)->cfs; | ||
788 | } | ||
789 | |||
790 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | ||
791 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | ||
792 | |||
793 | static inline int | ||
794 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
795 | { | ||
796 | return 1; | ||
797 | } | ||
798 | |||
799 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
800 | { | ||
801 | return NULL; | ||
802 | } | ||
803 | |||
804 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
805 | |||
806 | #ifdef CONFIG_SCHED_HRTICK | 862 | #ifdef CONFIG_SCHED_HRTICK |
807 | static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | 863 | static void hrtick_start_fair(struct rq *rq, struct task_struct *p) |
808 | { | 864 | { |
@@ -916,7 +972,7 @@ static void yield_task_fair(struct rq *rq) | |||
916 | /* | 972 | /* |
917 | * Already in the rightmost position? | 973 | * Already in the rightmost position? |
918 | */ | 974 | */ |
919 | if (unlikely(rightmost->vruntime < se->vruntime)) | 975 | if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) |
920 | return; | 976 | return; |
921 | 977 | ||
922 | /* | 978 | /* |
@@ -955,7 +1011,9 @@ static int wake_idle(int cpu, struct task_struct *p) | |||
955 | return cpu; | 1011 | return cpu; |
956 | 1012 | ||
957 | for_each_domain(cpu, sd) { | 1013 | for_each_domain(cpu, sd) { |
958 | if (sd->flags & SD_WAKE_IDLE) { | 1014 | if ((sd->flags & SD_WAKE_IDLE) |
1015 | || ((sd->flags & SD_WAKE_IDLE_FAR) | ||
1016 | && !task_hot(p, task_rq(p)->clock, sd))) { | ||
959 | cpus_and(tmp, sd->span, p->cpus_allowed); | 1017 | cpus_and(tmp, sd->span, p->cpus_allowed); |
960 | for_each_cpu_mask(i, tmp) { | 1018 | for_each_cpu_mask(i, tmp) { |
961 | if (idle_cpu(i)) { | 1019 | if (idle_cpu(i)) { |
@@ -1099,6 +1157,58 @@ out: | |||
1099 | } | 1157 | } |
1100 | #endif /* CONFIG_SMP */ | 1158 | #endif /* CONFIG_SMP */ |
1101 | 1159 | ||
1160 | static unsigned long wakeup_gran(struct sched_entity *se) | ||
1161 | { | ||
1162 | unsigned long gran = sysctl_sched_wakeup_granularity; | ||
1163 | |||
1164 | /* | ||
1165 | * More easily preempt - nice tasks, while not making it harder for | ||
1166 | * + nice tasks. | ||
1167 | */ | ||
1168 | gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); | ||
1169 | |||
1170 | return gran; | ||
1171 | } | ||
1172 | |||
1173 | /* | ||
1174 | * Should 'se' preempt 'curr'. | ||
1175 | * | ||
1176 | * |s1 | ||
1177 | * |s2 | ||
1178 | * |s3 | ||
1179 | * g | ||
1180 | * |<--->|c | ||
1181 | * | ||
1182 | * w(c, s1) = -1 | ||
1183 | * w(c, s2) = 0 | ||
1184 | * w(c, s3) = 1 | ||
1185 | * | ||
1186 | */ | ||
1187 | static int | ||
1188 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) | ||
1189 | { | ||
1190 | s64 gran, vdiff = curr->vruntime - se->vruntime; | ||
1191 | |||
1192 | if (vdiff < 0) | ||
1193 | return -1; | ||
1194 | |||
1195 | gran = wakeup_gran(curr); | ||
1196 | if (vdiff > gran) | ||
1197 | return 1; | ||
1198 | |||
1199 | return 0; | ||
1200 | } | ||
1201 | |||
1202 | /* return depth at which a sched entity is present in the hierarchy */ | ||
1203 | static inline int depth_se(struct sched_entity *se) | ||
1204 | { | ||
1205 | int depth = 0; | ||
1206 | |||
1207 | for_each_sched_entity(se) | ||
1208 | depth++; | ||
1209 | |||
1210 | return depth; | ||
1211 | } | ||
1102 | 1212 | ||
1103 | /* | 1213 | /* |
1104 | * Preempt the current task with a newly woken task if needed: | 1214 | * Preempt the current task with a newly woken task if needed: |
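
wakeup_preempt_entity() condenses the diagram above into a three-way result on vdiff = curr->vruntime - se->vruntime: -1 when the woken entity already has more vruntime than curr, 0 when the difference is positive but still within the wakeup granularity, and 1 (preempt) when the woken entity is more than one granularity ahead. A stand-alone version with an assumed 1 ms granularity:

    /* Stand-alone version of the three-way preemption test; the 1 ms
     * granularity and the vruntime values are assumptions. */
    #include <stdio.h>
    #include <stdint.h>

    static int wakeup_preempt(int64_t curr_vruntime, int64_t se_vruntime,
                              int64_t gran)
    {
        int64_t vdiff = curr_vruntime - se_vruntime;

        if (vdiff < 0)        /* woken entity has more vruntime: no preemption */
            return -1;
        if (vdiff > gran)     /* more than one granularity ahead: preempt */
            return 1;
        return 0;             /* inside the granularity window: leave curr */
    }

    int main(void)
    {
        int64_t gran = 1000000;    /* 1 ms, illustrative */

        printf("%d\n", wakeup_preempt(5000000, 6000000, gran));    /* -1 */
        printf("%d\n", wakeup_preempt(5000000, 4500000, gran));    /*  0 */
        printf("%d\n", wakeup_preempt(5000000, 2000000, gran));    /*  1 */
        return 0;
    }

pick_next() above reuses the same helper, which is why the open-coded gran/diff comparison could be removed there.
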
@@ -1108,7 +1218,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
1108 | struct task_struct *curr = rq->curr; | 1218 | struct task_struct *curr = rq->curr; |
1109 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1219 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
1110 | struct sched_entity *se = &curr->se, *pse = &p->se; | 1220 | struct sched_entity *se = &curr->se, *pse = &p->se; |
1111 | unsigned long gran; | 1221 | int se_depth, pse_depth; |
1112 | 1222 | ||
1113 | if (unlikely(rt_prio(p->prio))) { | 1223 | if (unlikely(rt_prio(p->prio))) { |
1114 | update_rq_clock(rq); | 1224 | update_rq_clock(rq); |
@@ -1133,20 +1243,33 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
1133 | if (!sched_feat(WAKEUP_PREEMPT)) | 1243 | if (!sched_feat(WAKEUP_PREEMPT)) |
1134 | return; | 1244 | return; |
1135 | 1245 | ||
1136 | while (!is_same_group(se, pse)) { | 1246 | /* |
1247 | * preemption test can be made between sibling entities who are in the | ||
1248 | * same cfs_rq i.e who have a common parent. Walk up the hierarchy of | ||
1249 | * both tasks until we find their ancestors who are siblings of common | ||
1250 | * parent. | ||
1251 | */ | ||
1252 | |||
1253 | /* First walk up until both entities are at same depth */ | ||
1254 | se_depth = depth_se(se); | ||
1255 | pse_depth = depth_se(pse); | ||
1256 | |||
1257 | while (se_depth > pse_depth) { | ||
1258 | se_depth--; | ||
1137 | se = parent_entity(se); | 1259 | se = parent_entity(se); |
1260 | } | ||
1261 | |||
1262 | while (pse_depth > se_depth) { | ||
1263 | pse_depth--; | ||
1138 | pse = parent_entity(pse); | 1264 | pse = parent_entity(pse); |
1139 | } | 1265 | } |
1140 | 1266 | ||
1141 | gran = sysctl_sched_wakeup_granularity; | 1267 | while (!is_same_group(se, pse)) { |
1142 | /* | 1268 | se = parent_entity(se); |
1143 | * More easily preempt - nice tasks, while not making | 1269 | pse = parent_entity(pse); |
1144 | * it harder for + nice tasks. | 1270 | } |
1145 | */ | ||
1146 | if (unlikely(se->load.weight > NICE_0_LOAD)) | ||
1147 | gran = calc_delta_fair(gran, &se->load); | ||
1148 | 1271 | ||
1149 | if (pse->vruntime + gran < se->vruntime) | 1272 | if (wakeup_preempt_entity(se, pse) == 1) |
1150 | resched_task(curr); | 1273 | resched_task(curr); |
1151 | } | 1274 | } |
1152 | 1275 | ||
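
Because groups can now nest arbitrarily deep, the preemption path first measures each entity's depth, walks the deeper one up until both are level, and then walks both up in lockstep; the scheduler stops at the pair of sibling entities that share a cfs_rq, so their vruntimes are comparable, which is one step short of the classic parent-pointer lowest-common-ancestor walk sketched below (the node type and the tree are illustrative):

    /* Generic parent-pointer ancestor walk with the same shape as the hunk
     * above; the node type and the example tree are illustrative. */
    #include <stdio.h>
    #include <stddef.h>

    struct node {
        const char *name;
        struct node *parent;
    };

    static int depth(const struct node *n)
    {
        int d = 0;

        for (; n; n = n->parent)
            d++;
        return d;
    }

    static const struct node *common_ancestor(const struct node *a,
                                              const struct node *b)
    {
        int da = depth(a), db = depth(b);

        while (da > db) { a = a->parent; da--; }    /* level the deeper side */
        while (db > da) { b = b->parent; db--; }
        while (a != b) {                            /* walk up in lockstep */
            a = a->parent;
            b = b->parent;
        }
        return a;
    }

    int main(void)
    {
        struct node root = { "root", NULL };
        struct node g1 = { "g1", &root }, g2 = { "g2", &root };
        struct node t1 = { "t1", &g1 }, t2 = { "t2", &g2 };
        struct node t3 = { "t3", &t1 };    /* one side is deeper */

        printf("%s\n", common_ancestor(&t3, &t2)->name);    /* root */
        return 0;
    }
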
@@ -1197,15 +1320,27 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) | |||
1197 | * the current task: | 1320 | * the current task: |
1198 | */ | 1321 | */ |
1199 | static struct task_struct * | 1322 | static struct task_struct * |
1200 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) | 1323 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) |
1201 | { | 1324 | { |
1202 | struct task_struct *p; | 1325 | struct task_struct *p = NULL; |
1326 | struct sched_entity *se; | ||
1327 | |||
1328 | if (next == &cfs_rq->tasks) | ||
1329 | return NULL; | ||
1330 | |||
1331 | /* Skip over entities that are not tasks */ | ||
1332 | do { | ||
1333 | se = list_entry(next, struct sched_entity, group_node); | ||
1334 | next = next->next; | ||
1335 | } while (next != &cfs_rq->tasks && !entity_is_task(se)); | ||
1203 | 1336 | ||
1204 | if (!curr) | 1337 | if (next == &cfs_rq->tasks) |
1205 | return NULL; | 1338 | return NULL; |
1206 | 1339 | ||
1207 | p = rb_entry(curr, struct task_struct, se.run_node); | 1340 | cfs_rq->balance_iterator = next; |
1208 | cfs_rq->rb_load_balance_curr = rb_next(curr); | 1341 | |
1342 | if (entity_is_task(se)) | ||
1343 | p = task_of(se); | ||
1209 | 1344 | ||
1210 | return p; | 1345 | return p; |
1211 | } | 1346 | } |
@@ -1214,85 +1349,100 @@ static struct task_struct *load_balance_start_fair(void *arg) | |||
1214 | { | 1349 | { |
1215 | struct cfs_rq *cfs_rq = arg; | 1350 | struct cfs_rq *cfs_rq = arg; |
1216 | 1351 | ||
1217 | return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); | 1352 | return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); |
1218 | } | 1353 | } |
1219 | 1354 | ||
1220 | static struct task_struct *load_balance_next_fair(void *arg) | 1355 | static struct task_struct *load_balance_next_fair(void *arg) |
1221 | { | 1356 | { |
1222 | struct cfs_rq *cfs_rq = arg; | 1357 | struct cfs_rq *cfs_rq = arg; |
1223 | 1358 | ||
1224 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); | 1359 | return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); |
1225 | } | 1360 | } |
1226 | 1361 | ||
1227 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1362 | static unsigned long |
1228 | static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | 1363 | __load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1364 | unsigned long max_load_move, struct sched_domain *sd, | ||
1365 | enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, | ||
1366 | struct cfs_rq *cfs_rq) | ||
1229 | { | 1367 | { |
1230 | struct sched_entity *curr; | 1368 | struct rq_iterator cfs_rq_iterator; |
1231 | struct task_struct *p; | ||
1232 | |||
1233 | if (!cfs_rq->nr_running || !first_fair(cfs_rq)) | ||
1234 | return MAX_PRIO; | ||
1235 | |||
1236 | curr = cfs_rq->curr; | ||
1237 | if (!curr) | ||
1238 | curr = __pick_next_entity(cfs_rq); | ||
1239 | 1369 | ||
1240 | p = task_of(curr); | 1370 | cfs_rq_iterator.start = load_balance_start_fair; |
1371 | cfs_rq_iterator.next = load_balance_next_fair; | ||
1372 | cfs_rq_iterator.arg = cfs_rq; | ||
1241 | 1373 | ||
1242 | return p->prio; | 1374 | return balance_tasks(this_rq, this_cpu, busiest, |
1375 | max_load_move, sd, idle, all_pinned, | ||
1376 | this_best_prio, &cfs_rq_iterator); | ||
1243 | } | 1377 | } |
1244 | #endif | ||
1245 | 1378 | ||
1379 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1246 | static unsigned long | 1380 | static unsigned long |
1247 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1381 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1248 | unsigned long max_load_move, | 1382 | unsigned long max_load_move, |
1249 | struct sched_domain *sd, enum cpu_idle_type idle, | 1383 | struct sched_domain *sd, enum cpu_idle_type idle, |
1250 | int *all_pinned, int *this_best_prio) | 1384 | int *all_pinned, int *this_best_prio) |
1251 | { | 1385 | { |
1252 | struct cfs_rq *busy_cfs_rq; | ||
1253 | long rem_load_move = max_load_move; | 1386 | long rem_load_move = max_load_move; |
1254 | struct rq_iterator cfs_rq_iterator; | 1387 | int busiest_cpu = cpu_of(busiest); |
1255 | 1388 | struct task_group *tg; | |
1256 | cfs_rq_iterator.start = load_balance_start_fair; | ||
1257 | cfs_rq_iterator.next = load_balance_next_fair; | ||
1258 | 1389 | ||
1259 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { | 1390 | rcu_read_lock(); |
1260 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1391 | list_for_each_entry(tg, &task_groups, list) { |
1261 | struct cfs_rq *this_cfs_rq; | ||
1262 | long imbalance; | 1392 | long imbalance; |
1263 | unsigned long maxload; | 1393 | unsigned long this_weight, busiest_weight; |
1394 | long rem_load, max_load, moved_load; | ||
1395 | |||
1396 | /* | ||
1397 | * empty group | ||
1398 | */ | ||
1399 | if (!aggregate(tg, sd)->task_weight) | ||
1400 | continue; | ||
1401 | |||
1402 | rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; | ||
1403 | rem_load /= aggregate(tg, sd)->load + 1; | ||
1404 | |||
1405 | this_weight = tg->cfs_rq[this_cpu]->task_weight; | ||
1406 | busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; | ||
1407 | |||
1408 | imbalance = (busiest_weight - this_weight) / 2; | ||
1264 | 1409 | ||
1265 | this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); | 1410 | if (imbalance < 0) |
1411 | imbalance = busiest_weight; | ||
1266 | 1412 | ||
1267 | imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; | 1413 | max_load = max(rem_load, imbalance); |
1268 | /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ | 1414 | moved_load = __load_balance_fair(this_rq, this_cpu, busiest, |
1269 | if (imbalance <= 0) | 1415 | max_load, sd, idle, all_pinned, this_best_prio, |
1416 | tg->cfs_rq[busiest_cpu]); | ||
1417 | |||
1418 | if (!moved_load) | ||
1270 | continue; | 1419 | continue; |
1271 | 1420 | ||
1272 | /* Don't pull more than imbalance/2 */ | 1421 | move_group_shares(tg, sd, busiest_cpu, this_cpu); |
1273 | imbalance /= 2; | ||
1274 | maxload = min(rem_load_move, imbalance); | ||
1275 | 1422 | ||
1276 | *this_best_prio = cfs_rq_best_prio(this_cfs_rq); | 1423 | moved_load *= aggregate(tg, sd)->load; |
1277 | #else | 1424 | moved_load /= aggregate(tg, sd)->rq_weight + 1; |
1278 | # define maxload rem_load_move | ||
1279 | #endif | ||
1280 | /* | ||
1281 | * pass busy_cfs_rq argument into | ||
1282 | * load_balance_[start|next]_fair iterators | ||
1283 | */ | ||
1284 | cfs_rq_iterator.arg = busy_cfs_rq; | ||
1285 | rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, | ||
1286 | maxload, sd, idle, all_pinned, | ||
1287 | this_best_prio, | ||
1288 | &cfs_rq_iterator); | ||
1289 | 1425 | ||
1290 | if (rem_load_move <= 0) | 1426 | rem_load_move -= moved_load; |
1427 | if (rem_load_move < 0) | ||
1291 | break; | 1428 | break; |
1292 | } | 1429 | } |
1430 | rcu_read_unlock(); | ||
1293 | 1431 | ||
1294 | return max_load_move - rem_load_move; | 1432 | return max_load_move - rem_load_move; |
1295 | } | 1433 | } |
1434 | #else | ||
1435 | static unsigned long | ||
1436 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
1437 | unsigned long max_load_move, | ||
1438 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
1439 | int *all_pinned, int *this_best_prio) | ||
1440 | { | ||
1441 | return __load_balance_fair(this_rq, this_cpu, busiest, | ||
1442 | max_load_move, sd, idle, all_pinned, | ||
1443 | this_best_prio, &busiest->cfs); | ||
1444 | } | ||
1445 | #endif | ||
1296 | 1446 | ||
1297 | static int | 1447 | static int |
1298 | move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1448 | move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
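
The rewritten load_balance_fair() budgets per task group rather than per leaf cfs_rq: the remaining global budget is rescaled into the group's own weight units (rem_load_move * rq_weight / (load + 1)), the allowance handed to __load_balance_fair() is the larger of that and half the per-CPU task_weight imbalance, and whatever was actually moved is scaled back into global units before being charged against the budget. A worked run of just that arithmetic with made-up aggregate numbers:

    /* The per-group budget arithmetic from load_balance_fair(), with
     * made-up aggregate numbers; no tasks are actually moved here. */
    #include <stdio.h>

    int main(void)
    {
        long rem_load_move = 2048;    /* global weight still to move */
        long rq_weight = 1024;        /* aggregate(tg, sd)->rq_weight */
        long load = 4096;             /* aggregate(tg, sd)->load */
        long this_weight = 512;       /* tg->cfs_rq[this_cpu]->task_weight */
        long busiest_weight = 1536;   /* tg->cfs_rq[busiest_cpu]->task_weight */

        long rem_load = rem_load_move * rq_weight / (load + 1);
        long imbalance = (busiest_weight - this_weight) / 2;
        long max_load = rem_load > imbalance ? rem_load : imbalance;

        /* pretend __load_balance_fair() moved the whole allowance */
        long moved_load = max_load;

        /* scale the group-local amount back into global weight units */
        moved_load = moved_load * load / (rq_weight + 1);
        rem_load_move -= moved_load;

        printf("rem_load=%ld imbalance=%ld max_load=%ld moved(global)=%ld left=%ld\n",
               rem_load, imbalance, max_load, moved_load, rem_load_move);
        return 0;
    }

The move_group_shares() call after a successful pull (not modelled here) then redistributes the group's per-CPU shares to match the new task placement.
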
@@ -1461,16 +1611,40 @@ static const struct sched_class fair_sched_class = { | |||
1461 | }; | 1611 | }; |
1462 | 1612 | ||
1463 | #ifdef CONFIG_SCHED_DEBUG | 1613 | #ifdef CONFIG_SCHED_DEBUG |
1614 | static void | ||
1615 | print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth) | ||
1616 | { | ||
1617 | struct sched_entity *se; | ||
1618 | |||
1619 | if (!cfs_rq) | ||
1620 | return; | ||
1621 | |||
1622 | list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) { | ||
1623 | int i; | ||
1624 | |||
1625 | for (i = depth; i; i--) | ||
1626 | seq_puts(m, " "); | ||
1627 | |||
1628 | seq_printf(m, "%lu %s %lu\n", | ||
1629 | se->load.weight, | ||
1630 | entity_is_task(se) ? "T" : "G", | ||
1631 | calc_delta_weight(SCHED_LOAD_SCALE, se) | ||
1632 | ); | ||
1633 | if (!entity_is_task(se)) | ||
1634 | print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1); | ||
1635 | } | ||
1636 | } | ||
1637 | |||
1464 | static void print_cfs_stats(struct seq_file *m, int cpu) | 1638 | static void print_cfs_stats(struct seq_file *m, int cpu) |
1465 | { | 1639 | { |
1466 | struct cfs_rq *cfs_rq; | 1640 | struct cfs_rq *cfs_rq; |
1467 | 1641 | ||
1468 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1469 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); | ||
1470 | #endif | ||
1471 | rcu_read_lock(); | 1642 | rcu_read_lock(); |
1472 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) | 1643 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) |
1473 | print_cfs_rq(m, cpu, cfs_rq); | 1644 | print_cfs_rq(m, cpu, cfs_rq); |
1645 | |||
1646 | seq_printf(m, "\nWeight tree:\n"); | ||
1647 | print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1); | ||
1474 | rcu_read_unlock(); | 1648 | rcu_read_unlock(); |
1475 | } | 1649 | } |
1476 | #endif | 1650 | #endif |
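Aside on the sched_fair.c hunks above: __load_balance_fair() now pulls load from a single group's cfs_rq, so the amount it reports is expressed in that group's internal weight units; the two added lines that multiply by aggregate(tg, sd)->load and divide by aggregate(tg, sd)->rq_weight + 1 appear to rescale it back into top-level load before it is subtracted from rem_load_move, with the "+ 1" guarding against a zero runqueue weight. A minimal userspace sketch of that arithmetic, with illustrative names and numbers rather than the kernel's:

#include <stdio.h>

/*
 * Rescale load moved inside a task group into top-level load units.
 * group_load - the group's contribution to the top-level weight
 * rq_weight  - sum of the group's per-cpu runqueue weights
 * The "+ 1" mirrors the hunk above and avoids dividing by zero when
 * the group currently queues no weight at all.
 */
static unsigned long rescale_moved_load(unsigned long moved_load,
                                        unsigned long group_load,
                                        unsigned long rq_weight)
{
        return moved_load * group_load / (rq_weight + 1);
}

int main(void)
{
        /* moved 512 units of group-local weight; the group as a whole is
         * worth 1024 at the top level and queues 2048 weight internally */
        printf("%lu\n", rescale_moved_load(512, 1024, 2048)); /* prints 255 */
        return 0;
}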
diff --git a/kernel/sched_features.h b/kernel/sched_features.h new file mode 100644 index 000000000000..1c7283cb9581 --- /dev/null +++ b/kernel/sched_features.h | |||
@@ -0,0 +1,10 @@ | |||
1 | SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) | ||
2 | SCHED_FEAT(WAKEUP_PREEMPT, 1) | ||
3 | SCHED_FEAT(START_DEBIT, 1) | ||
4 | SCHED_FEAT(AFFINE_WAKEUPS, 1) | ||
5 | SCHED_FEAT(CACHE_HOT_BUDDY, 1) | ||
6 | SCHED_FEAT(SYNC_WAKEUPS, 1) | ||
7 | SCHED_FEAT(HRTICK, 1) | ||
8 | SCHED_FEAT(DOUBLE_TICK, 0) | ||
9 | SCHED_FEAT(NORMALIZED_SLEEPER, 1) | ||
10 | SCHED_FEAT(DEADLINE, 1) | ||
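The new kernel/sched_features.h is an X-macro style list: the file deliberately contains nothing but SCHED_FEAT(name, default) invocations, and the including code defines SCHED_FEAT differently each time it pulls the list in, once to build an enum of feature bits, once to build a name table for a control file such as /debug/sched_features, and so on. A self-contained sketch of the same pattern; it uses a list macro instead of re-including a separate header, and the feature names are made up:

#include <stdio.h>

/* Stand-in for sched_features.h: nothing but FEAT() invocations. */
#define FEATURE_LIST \
        FEAT(FAST_PATH, 1) \
        FEAT(EXTRA_CHECKS, 0) \
        FEAT(VERBOSE, 1)

/* First expansion: an enum of bit positions. */
#define FEAT(name, enabled) FEAT_##name,
enum { FEATURE_LIST NR_FEATURES };
#undef FEAT

/* Second expansion: a printable name table, index-aligned with the enum. */
#define FEAT(name, enabled) #name,
static const char *feat_names[] = { FEATURE_LIST };
#undef FEAT

/* Third expansion: the default feature mask. */
#define FEAT(name, enabled) ((enabled) << FEAT_##name) |
static const unsigned int default_feats = FEATURE_LIST 0;
#undef FEAT

int main(void)
{
        int i;

        for (i = 0; i < NR_FEATURES; i++)
                printf("%s %s\n", feat_names[i],
                       (default_feats & (1u << i)) ? "on" : "off");
        return 0;
}

The point of keeping the list in one place is that adding a feature is a one-line change and the enum, the name table, and the default mask can never drift out of sync.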
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 0a6d2e516420..c2730a5a4f05 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -62,7 +62,12 @@ static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) | |||
62 | if (!rt_rq->tg) | 62 | if (!rt_rq->tg) |
63 | return RUNTIME_INF; | 63 | return RUNTIME_INF; |
64 | 64 | ||
65 | return rt_rq->tg->rt_runtime; | 65 | return rt_rq->rt_runtime; |
66 | } | ||
67 | |||
68 | static inline u64 sched_rt_period(struct rt_rq *rt_rq) | ||
69 | { | ||
70 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); | ||
66 | } | 71 | } |
67 | 72 | ||
68 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | 73 | #define for_each_leaf_rt_rq(rt_rq, rq) \ |
@@ -127,14 +132,39 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) | |||
127 | return p->prio != p->normal_prio; | 132 | return p->prio != p->normal_prio; |
128 | } | 133 | } |
129 | 134 | ||
135 | #ifdef CONFIG_SMP | ||
136 | static inline cpumask_t sched_rt_period_mask(void) | ||
137 | { | ||
138 | return cpu_rq(smp_processor_id())->rd->span; | ||
139 | } | ||
140 | #else | ||
141 | static inline cpumask_t sched_rt_period_mask(void) | ||
142 | { | ||
143 | return cpu_online_map; | ||
144 | } | ||
145 | #endif | ||
146 | |||
147 | static inline | ||
148 | struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) | ||
149 | { | ||
150 | return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; | ||
151 | } | ||
152 | |||
153 | static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | ||
154 | { | ||
155 | return &rt_rq->tg->rt_bandwidth; | ||
156 | } | ||
157 | |||
130 | #else | 158 | #else |
131 | 159 | ||
132 | static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) | 160 | static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) |
133 | { | 161 | { |
134 | if (sysctl_sched_rt_runtime == -1) | 162 | return rt_rq->rt_runtime; |
135 | return RUNTIME_INF; | 163 | } |
136 | 164 | ||
137 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | 165 | static inline u64 sched_rt_period(struct rt_rq *rt_rq) |
166 | { | ||
167 | return ktime_to_ns(def_rt_bandwidth.rt_period); | ||
138 | } | 168 | } |
139 | 169 | ||
140 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | 170 | #define for_each_leaf_rt_rq(rt_rq, rq) \ |
@@ -173,6 +203,102 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq) | |||
173 | { | 203 | { |
174 | return rt_rq->rt_throttled; | 204 | return rt_rq->rt_throttled; |
175 | } | 205 | } |
206 | |||
207 | static inline cpumask_t sched_rt_period_mask(void) | ||
208 | { | ||
209 | return cpu_online_map; | ||
210 | } | ||
211 | |||
212 | static inline | ||
213 | struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) | ||
214 | { | ||
215 | return &cpu_rq(cpu)->rt; | ||
216 | } | ||
217 | |||
218 | static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | ||
219 | { | ||
220 | return &def_rt_bandwidth; | ||
221 | } | ||
222 | |||
223 | #endif | ||
224 | |||
225 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | ||
226 | { | ||
227 | int i, idle = 1; | ||
228 | cpumask_t span; | ||
229 | |||
230 | if (rt_b->rt_runtime == RUNTIME_INF) | ||
231 | return 1; | ||
232 | |||
233 | span = sched_rt_period_mask(); | ||
234 | for_each_cpu_mask(i, span) { | ||
235 | int enqueue = 0; | ||
236 | struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); | ||
237 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
238 | |||
239 | spin_lock(&rq->lock); | ||
240 | if (rt_rq->rt_time) { | ||
241 | u64 runtime; | ||
242 | |||
243 | spin_lock(&rt_rq->rt_runtime_lock); | ||
244 | runtime = rt_rq->rt_runtime; | ||
245 | rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); | ||
246 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { | ||
247 | rt_rq->rt_throttled = 0; | ||
248 | enqueue = 1; | ||
249 | } | ||
250 | if (rt_rq->rt_time || rt_rq->rt_nr_running) | ||
251 | idle = 0; | ||
252 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
253 | } | ||
254 | |||
255 | if (enqueue) | ||
256 | sched_rt_rq_enqueue(rt_rq); | ||
257 | spin_unlock(&rq->lock); | ||
258 | } | ||
259 | |||
260 | return idle; | ||
261 | } | ||
262 | |||
263 | #ifdef CONFIG_SMP | ||
264 | static int balance_runtime(struct rt_rq *rt_rq) | ||
265 | { | ||
266 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | ||
267 | struct root_domain *rd = cpu_rq(smp_processor_id())->rd; | ||
268 | int i, weight, more = 0; | ||
269 | u64 rt_period; | ||
270 | |||
271 | weight = cpus_weight(rd->span); | ||
272 | |||
273 | spin_lock(&rt_b->rt_runtime_lock); | ||
274 | rt_period = ktime_to_ns(rt_b->rt_period); | ||
275 | for_each_cpu_mask(i, rd->span) { | ||
276 | struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); | ||
277 | s64 diff; | ||
278 | |||
279 | if (iter == rt_rq) | ||
280 | continue; | ||
281 | |||
282 | spin_lock(&iter->rt_runtime_lock); | ||
283 | diff = iter->rt_runtime - iter->rt_time; | ||
284 | if (diff > 0) { | ||
285 | do_div(diff, weight); | ||
286 | if (rt_rq->rt_runtime + diff > rt_period) | ||
287 | diff = rt_period - rt_rq->rt_runtime; | ||
288 | iter->rt_runtime -= diff; | ||
289 | rt_rq->rt_runtime += diff; | ||
290 | more = 1; | ||
291 | if (rt_rq->rt_runtime == rt_period) { | ||
292 | spin_unlock(&iter->rt_runtime_lock); | ||
293 | break; | ||
294 | } | ||
295 | } | ||
296 | spin_unlock(&iter->rt_runtime_lock); | ||
297 | } | ||
298 | spin_unlock(&rt_b->rt_runtime_lock); | ||
299 | |||
300 | return more; | ||
301 | } | ||
176 | #endif | 302 | #endif |
177 | 303 | ||
178 | static inline int rt_se_prio(struct sched_rt_entity *rt_se) | 304 | static inline int rt_se_prio(struct sched_rt_entity *rt_se) |
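The balance_runtime() added above lets a throttled rt_rq borrow unused runtime from its siblings in the same root domain: each donor contributes its spare runtime divided by the span weight, and the borrower stops once it holds a full period's worth. A rough userspace simulation of that redistribution, with made-up numbers and the simplifying assumption that nothing runs concurrently (so the per-rq runtime locks are dropped):

#include <stdio.h>

#define NCPU 4

struct rt_rq_sim {
        long long runtime;      /* budget per period, ns */
        long long rt_time;      /* time consumed this period, ns */
};

/* Borrow spare runtime for rqs[self] from the other CPUs, the way the
 * hunk above does: each donor gives spare/weight, and we stop once the
 * borrower's budget reaches a whole period. */
static int balance_runtime_sim(struct rt_rq_sim rqs[], int self,
                               long long rt_period)
{
        int i, more = 0;
        int weight = NCPU;

        for (i = 0; i < NCPU; i++) {
                long long diff;

                if (i == self)
                        continue;

                diff = rqs[i].runtime - rqs[i].rt_time;
                if (diff > 0) {
                        diff /= weight;
                        if (rqs[self].runtime + diff > rt_period)
                                diff = rt_period - rqs[self].runtime;
                        rqs[i].runtime -= diff;
                        rqs[self].runtime += diff;
                        more = 1;
                        if (rqs[self].runtime == rt_period)
                                break;
                }
        }
        return more;
}

int main(void)
{
        /* CPU 0 has exhausted its 950ms budget; CPUs 1-3 are mostly idle. */
        struct rt_rq_sim rqs[NCPU] = {
                { 950000000LL, 950000000LL },
                { 950000000LL, 100000000LL },
                { 950000000LL,         0LL },
                { 950000000LL, 500000000LL },
        };

        if (balance_runtime_sim(rqs, 0, 1000000000LL))
                printf("cpu0 runtime now %lld ns\n", rqs[0].runtime);
        return 0;
}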
@@ -197,12 +323,24 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
197 | if (rt_rq->rt_throttled) | 323 | if (rt_rq->rt_throttled) |
198 | return rt_rq_throttled(rt_rq); | 324 | return rt_rq_throttled(rt_rq); |
199 | 325 | ||
326 | if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) | ||
327 | return 0; | ||
328 | |||
329 | #ifdef CONFIG_SMP | ||
200 | if (rt_rq->rt_time > runtime) { | 330 | if (rt_rq->rt_time > runtime) { |
201 | struct rq *rq = rq_of_rt_rq(rt_rq); | 331 | int more; |
202 | 332 | ||
203 | rq->rt_throttled = 1; | 333 | spin_unlock(&rt_rq->rt_runtime_lock); |
204 | rt_rq->rt_throttled = 1; | 334 | more = balance_runtime(rt_rq); |
335 | spin_lock(&rt_rq->rt_runtime_lock); | ||
205 | 336 | ||
337 | if (more) | ||
338 | runtime = sched_rt_runtime(rt_rq); | ||
339 | } | ||
340 | #endif | ||
341 | |||
342 | if (rt_rq->rt_time > runtime) { | ||
343 | rt_rq->rt_throttled = 1; | ||
206 | if (rt_rq_throttled(rt_rq)) { | 344 | if (rt_rq_throttled(rt_rq)) { |
207 | sched_rt_rq_dequeue(rt_rq); | 345 | sched_rt_rq_dequeue(rt_rq); |
208 | return 1; | 346 | return 1; |
@@ -212,29 +350,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
212 | return 0; | 350 | return 0; |
213 | } | 351 | } |
214 | 352 | ||
215 | static void update_sched_rt_period(struct rq *rq) | ||
216 | { | ||
217 | struct rt_rq *rt_rq; | ||
218 | u64 period; | ||
219 | |||
220 | while (rq->clock > rq->rt_period_expire) { | ||
221 | period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | ||
222 | rq->rt_period_expire += period; | ||
223 | |||
224 | for_each_leaf_rt_rq(rt_rq, rq) { | ||
225 | u64 runtime = sched_rt_runtime(rt_rq); | ||
226 | |||
227 | rt_rq->rt_time -= min(rt_rq->rt_time, runtime); | ||
228 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { | ||
229 | rt_rq->rt_throttled = 0; | ||
230 | sched_rt_rq_enqueue(rt_rq); | ||
231 | } | ||
232 | } | ||
233 | |||
234 | rq->rt_throttled = 0; | ||
235 | } | ||
236 | } | ||
237 | |||
238 | /* | 353 | /* |
239 | * Update the current task's runtime statistics. Skip current tasks that | 354 | * Update the current task's runtime statistics. Skip current tasks that |
240 | * are not in our scheduling class. | 355 | * are not in our scheduling class. |
@@ -259,9 +374,15 @@ static void update_curr_rt(struct rq *rq) | |||
259 | curr->se.exec_start = rq->clock; | 374 | curr->se.exec_start = rq->clock; |
260 | cpuacct_charge(curr, delta_exec); | 375 | cpuacct_charge(curr, delta_exec); |
261 | 376 | ||
262 | rt_rq->rt_time += delta_exec; | 377 | for_each_sched_rt_entity(rt_se) { |
263 | if (sched_rt_runtime_exceeded(rt_rq)) | 378 | rt_rq = rt_rq_of_se(rt_se); |
264 | resched_task(curr); | 379 | |
380 | spin_lock(&rt_rq->rt_runtime_lock); | ||
381 | rt_rq->rt_time += delta_exec; | ||
382 | if (sched_rt_runtime_exceeded(rt_rq)) | ||
383 | resched_task(curr); | ||
384 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
385 | } | ||
265 | } | 386 | } |
266 | 387 | ||
267 | static inline | 388 | static inline |
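With multi-level RT groups, update_curr_rt() above now charges the elapsed execution time to every rt_rq on the task's path to the root, not only the leaf, so throttling can trigger at whichever level exhausts its budget first. A minimal sketch of that upward walk, with plain parent pointers standing in for for_each_sched_rt_entity():

#include <stdio.h>
#include <stddef.h>

struct grp {
        const char *name;
        long long rt_time;      /* time consumed this period, ns */
        struct grp *parent;     /* NULL at the root */
};

/* Charge delta to the group and every ancestor, leaf upwards. */
static void charge_hierarchy(struct grp *g, long long delta)
{
        for (; g; g = g->parent)
                g->rt_time += delta;
}

int main(void)
{
        struct grp root  = { "root",  0, NULL  };
        struct grp user  = { "user",  0, &root };
        struct grp child = { "child", 0, &user };

        charge_hierarchy(&child, 3000000);      /* task ran for 3ms */
        printf("%s=%lld %s=%lld %s=%lld\n",
               child.name, child.rt_time,
               user.name,  user.rt_time,
               root.name,  root.rt_time);
        return 0;
}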
@@ -284,6 +405,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
284 | #ifdef CONFIG_RT_GROUP_SCHED | 405 | #ifdef CONFIG_RT_GROUP_SCHED |
285 | if (rt_se_boosted(rt_se)) | 406 | if (rt_se_boosted(rt_se)) |
286 | rt_rq->rt_nr_boosted++; | 407 | rt_rq->rt_nr_boosted++; |
408 | |||
409 | if (rt_rq->tg) | ||
410 | start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); | ||
411 | #else | ||
412 | start_rt_bandwidth(&def_rt_bandwidth); | ||
287 | #endif | 413 | #endif |
288 | } | 414 | } |
289 | 415 | ||
@@ -353,27 +479,21 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | |||
353 | /* | 479 | /* |
354 | * Because the prio of an upper entry depends on the lower | 480 | * Because the prio of an upper entry depends on the lower |
355 | * entries, we must remove entries top - down. | 481 | * entries, we must remove entries top - down. |
356 | * | ||
357 | * XXX: O(1/2 h^2) because we can only walk up, not down the chain. | ||
358 | * doesn't matter much for now, as h=2 for GROUP_SCHED. | ||
359 | */ | 482 | */ |
360 | static void dequeue_rt_stack(struct task_struct *p) | 483 | static void dequeue_rt_stack(struct task_struct *p) |
361 | { | 484 | { |
362 | struct sched_rt_entity *rt_se, *top_se; | 485 | struct sched_rt_entity *rt_se, *back = NULL; |
363 | 486 | ||
364 | /* | 487 | rt_se = &p->rt; |
365 | * dequeue all, top - down. | 488 | for_each_sched_rt_entity(rt_se) { |
366 | */ | 489 | rt_se->back = back; |
367 | do { | 490 | back = rt_se; |
368 | rt_se = &p->rt; | 491 | } |
369 | top_se = NULL; | 492 | |
370 | for_each_sched_rt_entity(rt_se) { | 493 | for (rt_se = back; rt_se; rt_se = rt_se->back) { |
371 | if (on_rt_rq(rt_se)) | 494 | if (on_rt_rq(rt_se)) |
372 | top_se = rt_se; | 495 | dequeue_rt_entity(rt_se); |
373 | } | 496 | } |
374 | if (top_se) | ||
375 | dequeue_rt_entity(top_se); | ||
376 | } while (top_se); | ||
377 | } | 497 | } |
378 | 498 | ||
379 | /* | 499 | /* |
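The rewritten dequeue_rt_stack() above replaces the old repeated top-down search (hence the removed comment about quadratic cost in the hierarchy height) with a single pass: walk bottom-up once, threading a ->back pointer through the entities, then follow that chain to dequeue top-down. The same trick in a self-contained sketch, using an explicit parent chain:

#include <stdio.h>
#include <stddef.h>

struct ent {
        const char *name;
        struct ent *parent;     /* upward link we can walk */
        struct ent *back;       /* downward link built on the fly */
};

int main(void)
{
        struct ent root = { "root", NULL,  NULL };
        struct ent mid  = { "mid",  &root, NULL };
        struct ent leaf = { "leaf", &mid,  NULL };
        struct ent *e, *back = NULL;

        /* Pass 1: walk up from the leaf, recording the reverse order. */
        for (e = &leaf; e; e = e->parent) {
                e->back = back;
                back = e;
        }

        /* Pass 2: 'back' now names the topmost entity, and following
         * ->back visits the chain top-down, the order a hierarchical
         * dequeue has to happen in. */
        for (e = back; e; e = e->back)
                printf("dequeue %s\n", e->name);

        return 0;
}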
@@ -393,6 +513,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | |||
393 | */ | 513 | */ |
394 | for_each_sched_rt_entity(rt_se) | 514 | for_each_sched_rt_entity(rt_se) |
395 | enqueue_rt_entity(rt_se); | 515 | enqueue_rt_entity(rt_se); |
516 | |||
517 | inc_cpu_load(rq, p->se.load.weight); | ||
396 | } | 518 | } |
397 | 519 | ||
398 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | 520 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) |
@@ -412,6 +534,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | |||
412 | if (rt_rq && rt_rq->rt_nr_running) | 534 | if (rt_rq && rt_rq->rt_nr_running) |
413 | enqueue_rt_entity(rt_se); | 535 | enqueue_rt_entity(rt_se); |
414 | } | 536 | } |
537 | |||
538 | dec_cpu_load(rq, p->se.load.weight); | ||
415 | } | 539 | } |
416 | 540 | ||
417 | /* | 541 | /* |
@@ -1001,7 +1125,8 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1001 | return 0; | 1125 | return 0; |
1002 | } | 1126 | } |
1003 | 1127 | ||
1004 | static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) | 1128 | static void set_cpus_allowed_rt(struct task_struct *p, |
1129 | const cpumask_t *new_mask) | ||
1005 | { | 1130 | { |
1006 | int weight = cpus_weight(*new_mask); | 1131 | int weight = cpus_weight(*new_mask); |
1007 | 1132 | ||
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 5b32433e7ee5..5bae2e0c3ff2 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h | |||
@@ -9,6 +9,11 @@ | |||
9 | static int show_schedstat(struct seq_file *seq, void *v) | 9 | static int show_schedstat(struct seq_file *seq, void *v) |
10 | { | 10 | { |
11 | int cpu; | 11 | int cpu; |
12 | int mask_len = NR_CPUS/32 * 9; | ||
13 | char *mask_str = kmalloc(mask_len, GFP_KERNEL); | ||
14 | |||
15 | if (mask_str == NULL) | ||
16 | return -ENOMEM; | ||
12 | 17 | ||
13 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | 18 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); |
14 | seq_printf(seq, "timestamp %lu\n", jiffies); | 19 | seq_printf(seq, "timestamp %lu\n", jiffies); |
@@ -36,9 +41,8 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
36 | preempt_disable(); | 41 | preempt_disable(); |
37 | for_each_domain(cpu, sd) { | 42 | for_each_domain(cpu, sd) { |
38 | enum cpu_idle_type itype; | 43 | enum cpu_idle_type itype; |
39 | char mask_str[NR_CPUS]; | ||
40 | 44 | ||
41 | cpumask_scnprintf(mask_str, NR_CPUS, sd->span); | 45 | cpumask_scnprintf(mask_str, mask_len, sd->span); |
42 | seq_printf(seq, "domain%d %s", dcount++, mask_str); | 46 | seq_printf(seq, "domain%d %s", dcount++, mask_str); |
43 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; | 47 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; |
44 | itype++) { | 48 | itype++) { |
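The show_schedstat() change above drops the per-domain char mask_str[NR_CPUS] stack array in favour of one kmalloc'ed buffer shared by all domains; on large NR_CPUS builds (4096 is a real configuration) a 4 KB on-stack array is no longer acceptable. The "NR_CPUS/32 * 9" sizing is 9 characters per 32-bit chunk of the mask, 8 hex digits plus a comma or the trailing NUL, as this small check illustrates:

#include <stdio.h>

/* 8 hex digits + ',' (or final '\0') for every 32-bit chunk of the mask. */
static int mask_str_len(int nr_cpus)
{
        return nr_cpus / 32 * 9;
}

int main(void)
{
        /* old on-stack array size vs. the new shared kmalloc size */
        printf("NR_CPUS=128:  stack array %4d bytes, kmalloc %4d bytes\n",
               128, mask_str_len(128));
        printf("NR_CPUS=4096: stack array %4d bytes, kmalloc %4d bytes\n",
               4096, mask_str_len(4096));
        return 0;
}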
diff --git a/kernel/softirq.c b/kernel/softirq.c index 31e9f2a47928..3c44956ee7e2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -356,7 +356,8 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) | |||
356 | /* Tasklets */ | 356 | /* Tasklets */ |
357 | struct tasklet_head | 357 | struct tasklet_head |
358 | { | 358 | { |
359 | struct tasklet_struct *list; | 359 | struct tasklet_struct *head; |
360 | struct tasklet_struct **tail; | ||
360 | }; | 361 | }; |
361 | 362 | ||
362 | /* Some compilers disobey section attribute on statics when not | 363 | /* Some compilers disobey section attribute on statics when not |
@@ -369,8 +370,9 @@ void __tasklet_schedule(struct tasklet_struct *t) | |||
369 | unsigned long flags; | 370 | unsigned long flags; |
370 | 371 | ||
371 | local_irq_save(flags); | 372 | local_irq_save(flags); |
372 | t->next = __get_cpu_var(tasklet_vec).list; | 373 | t->next = NULL; |
373 | __get_cpu_var(tasklet_vec).list = t; | 374 | *__get_cpu_var(tasklet_vec).tail = t; |
375 | __get_cpu_var(tasklet_vec).tail = &(t->next); | ||
374 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | 376 | raise_softirq_irqoff(TASKLET_SOFTIRQ); |
375 | local_irq_restore(flags); | 377 | local_irq_restore(flags); |
376 | } | 378 | } |
@@ -382,8 +384,9 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) | |||
382 | unsigned long flags; | 384 | unsigned long flags; |
383 | 385 | ||
384 | local_irq_save(flags); | 386 | local_irq_save(flags); |
385 | t->next = __get_cpu_var(tasklet_hi_vec).list; | 387 | t->next = NULL; |
386 | __get_cpu_var(tasklet_hi_vec).list = t; | 388 | *__get_cpu_var(tasklet_hi_vec).tail = t; |
389 | __get_cpu_var(tasklet_hi_vec).tail = &(t->next); | ||
387 | raise_softirq_irqoff(HI_SOFTIRQ); | 390 | raise_softirq_irqoff(HI_SOFTIRQ); |
388 | local_irq_restore(flags); | 391 | local_irq_restore(flags); |
389 | } | 392 | } |
@@ -395,8 +398,9 @@ static void tasklet_action(struct softirq_action *a) | |||
395 | struct tasklet_struct *list; | 398 | struct tasklet_struct *list; |
396 | 399 | ||
397 | local_irq_disable(); | 400 | local_irq_disable(); |
398 | list = __get_cpu_var(tasklet_vec).list; | 401 | list = __get_cpu_var(tasklet_vec).head; |
399 | __get_cpu_var(tasklet_vec).list = NULL; | 402 | __get_cpu_var(tasklet_vec).head = NULL; |
403 | __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; | ||
400 | local_irq_enable(); | 404 | local_irq_enable(); |
401 | 405 | ||
402 | while (list) { | 406 | while (list) { |
@@ -416,8 +420,9 @@ static void tasklet_action(struct softirq_action *a) | |||
416 | } | 420 | } |
417 | 421 | ||
418 | local_irq_disable(); | 422 | local_irq_disable(); |
419 | t->next = __get_cpu_var(tasklet_vec).list; | 423 | t->next = NULL; |
420 | __get_cpu_var(tasklet_vec).list = t; | 424 | *__get_cpu_var(tasklet_vec).tail = t; |
425 | __get_cpu_var(tasklet_vec).tail = &(t->next); | ||
421 | __raise_softirq_irqoff(TASKLET_SOFTIRQ); | 426 | __raise_softirq_irqoff(TASKLET_SOFTIRQ); |
422 | local_irq_enable(); | 427 | local_irq_enable(); |
423 | } | 428 | } |
@@ -428,8 +433,9 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
428 | struct tasklet_struct *list; | 433 | struct tasklet_struct *list; |
429 | 434 | ||
430 | local_irq_disable(); | 435 | local_irq_disable(); |
431 | list = __get_cpu_var(tasklet_hi_vec).list; | 436 | list = __get_cpu_var(tasklet_hi_vec).head; |
432 | __get_cpu_var(tasklet_hi_vec).list = NULL; | 437 | __get_cpu_var(tasklet_hi_vec).head = NULL; |
438 | __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; | ||
433 | local_irq_enable(); | 439 | local_irq_enable(); |
434 | 440 | ||
435 | while (list) { | 441 | while (list) { |
@@ -449,8 +455,9 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
449 | } | 455 | } |
450 | 456 | ||
451 | local_irq_disable(); | 457 | local_irq_disable(); |
452 | t->next = __get_cpu_var(tasklet_hi_vec).list; | 458 | t->next = NULL; |
453 | __get_cpu_var(tasklet_hi_vec).list = t; | 459 | *__get_cpu_var(tasklet_hi_vec).tail = t; |
460 | __get_cpu_var(tasklet_hi_vec).tail = &(t->next); | ||
454 | __raise_softirq_irqoff(HI_SOFTIRQ); | 461 | __raise_softirq_irqoff(HI_SOFTIRQ); |
455 | local_irq_enable(); | 462 | local_irq_enable(); |
456 | } | 463 | } |
@@ -487,6 +494,15 @@ EXPORT_SYMBOL(tasklet_kill); | |||
487 | 494 | ||
488 | void __init softirq_init(void) | 495 | void __init softirq_init(void) |
489 | { | 496 | { |
497 | int cpu; | ||
498 | |||
499 | for_each_possible_cpu(cpu) { | ||
500 | per_cpu(tasklet_vec, cpu).tail = | ||
501 | &per_cpu(tasklet_vec, cpu).head; | ||
502 | per_cpu(tasklet_hi_vec, cpu).tail = | ||
503 | &per_cpu(tasklet_hi_vec, cpu).head; | ||
504 | } | ||
505 | |||
490 | open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); | 506 | open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); |
491 | open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); | 507 | open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); |
492 | } | 508 | } |
@@ -555,9 +571,12 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) | |||
555 | return; | 571 | return; |
556 | 572 | ||
557 | /* CPU is dead, so no lock needed. */ | 573 | /* CPU is dead, so no lock needed. */ |
558 | for (i = &per_cpu(tasklet_vec, cpu).list; *i; i = &(*i)->next) { | 574 | for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) { |
559 | if (*i == t) { | 575 | if (*i == t) { |
560 | *i = t->next; | 576 | *i = t->next; |
577 | /* If this was the tail element, move the tail ptr */ | ||
578 | if (*i == NULL) | ||
579 | per_cpu(tasklet_vec, cpu).tail = i; | ||
561 | return; | 580 | return; |
562 | } | 581 | } |
563 | } | 582 | } |
@@ -566,20 +585,20 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) | |||
566 | 585 | ||
567 | static void takeover_tasklets(unsigned int cpu) | 586 | static void takeover_tasklets(unsigned int cpu) |
568 | { | 587 | { |
569 | struct tasklet_struct **i; | ||
570 | |||
571 | /* CPU is dead, so no lock needed. */ | 588 | /* CPU is dead, so no lock needed. */ |
572 | local_irq_disable(); | 589 | local_irq_disable(); |
573 | 590 | ||
574 | /* Find end, append list for that CPU. */ | 591 | /* Find end, append list for that CPU. */ |
575 | for (i = &__get_cpu_var(tasklet_vec).list; *i; i = &(*i)->next); | 592 | *__get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).head; |
576 | *i = per_cpu(tasklet_vec, cpu).list; | 593 | __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; |
577 | per_cpu(tasklet_vec, cpu).list = NULL; | 594 | per_cpu(tasklet_vec, cpu).head = NULL; |
595 | per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; | ||
578 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | 596 | raise_softirq_irqoff(TASKLET_SOFTIRQ); |
579 | 597 | ||
580 | for (i = &__get_cpu_var(tasklet_hi_vec).list; *i; i = &(*i)->next); | 598 | *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; |
581 | *i = per_cpu(tasklet_hi_vec, cpu).list; | 599 | __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; |
582 | per_cpu(tasklet_hi_vec, cpu).list = NULL; | 600 | per_cpu(tasklet_hi_vec, cpu).head = NULL; |
601 | per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; | ||
583 | raise_softirq_irqoff(HI_SOFTIRQ); | 602 | raise_softirq_irqoff(HI_SOFTIRQ); |
584 | 603 | ||
585 | local_irq_enable(); | 604 | local_irq_enable(); |
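The softirq.c changes above convert the per-CPU tasklet lists from a bare head pointer to a head plus a tail pointer-to-pointer, which makes both append (__tasklet_schedule) and whole-list splicing (takeover_tasklets) O(1) with no end-of-list walk. A standalone sketch of the same structure, with hypothetical names:

#include <stdio.h>
#include <stddef.h>

struct node {
        struct node *next;
        int val;
};

struct list {
        struct node *head;
        struct node **tail;     /* points at head, or at the last node's next */
};

static void list_init(struct list *l)
{
        l->head = NULL;
        l->tail = &l->head;
}

/* O(1) append: write through the tail pointer, then advance it. */
static void list_append(struct list *l, struct node *n)
{
        n->next = NULL;
        *l->tail = n;
        l->tail = &n->next;
}

/* O(1) splice of everything on 'src' onto the end of 'dst'. */
static void list_splice_tail(struct list *dst, struct list *src)
{
        *dst->tail = src->head;
        if (src->head)
                dst->tail = src->tail;
        list_init(src);
}

int main(void)
{
        struct list a, b;
        struct node n1 = { NULL, 1 }, n2 = { NULL, 2 }, n3 = { NULL, 3 };
        struct node *n;

        list_init(&a);
        list_init(&b);
        list_append(&a, &n1);
        list_append(&b, &n2);
        list_append(&b, &n3);
        list_splice_tail(&a, &b);       /* a: 1 -> 2 -> 3 */

        for (n = a.head; n; n = n->next)
                printf("%d\n", n->val);
        return 0;
}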
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 6f4e0e13f70c..e1b2a5b1b105 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -35,7 +35,7 @@ static int stopmachine(void *cpu) | |||
35 | int irqs_disabled = 0; | 35 | int irqs_disabled = 0; |
36 | int prepared = 0; | 36 | int prepared = 0; |
37 | 37 | ||
38 | set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); | 38 | set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu)); |
39 | 39 | ||
40 | /* Ack: we are alive */ | 40 | /* Ack: we are alive */ |
41 | smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ | 41 | smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ |
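stop_machine() is one of many call sites in this merge switched from set_cpus_allowed(), which takes a cpumask_t by value, to set_cpus_allowed_ptr(), which takes a pointer: with NR_CPUS=4096 a cpumask_t is 512 bytes, so passing it by value copies half a kilobyte through the stack on every call. A toy illustration of the difference; the mask type and helper names are made up, only the sizes track the kernel's:

#include <stdio.h>
#include <string.h>

#define NR_CPUS 4096
#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

typedef struct { unsigned long bits[NR_CPUS / BITS_PER_LONG]; } mask_t;

/* by value: the whole 512-byte mask lands in the callee's stack frame */
static int count_byval(mask_t m)
{
        int i, n = 0;

        for (i = 0; i < NR_CPUS; i++)
                n += (m.bits[i / BITS_PER_LONG] >> (i % BITS_PER_LONG)) & 1;
        return n;
}

/* by pointer: only a pointer-sized argument crosses the call boundary */
static int count_byptr(const mask_t *m)
{
        int i, n = 0;

        for (i = 0; i < NR_CPUS; i++)
                n += (m->bits[i / BITS_PER_LONG] >> (i % BITS_PER_LONG)) & 1;
        return n;
}

int main(void)
{
        mask_t m;

        memset(&m, 0, sizeof(m));
        m.bits[0] = 0x5;                        /* CPUs 0 and 2 set */
        printf("sizeof(mask_t) = %zu bytes\n", sizeof(mask_t));
        printf("byval=%d byptr=%d\n", count_byval(m), count_byptr(&m));
        return 0;
}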
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b2a2d6889bab..fd3364827ccf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -270,17 +270,6 @@ static struct ctl_table kern_table[] = { | |||
270 | }, | 270 | }, |
271 | { | 271 | { |
272 | .ctl_name = CTL_UNNUMBERED, | 272 | .ctl_name = CTL_UNNUMBERED, |
273 | .procname = "sched_batch_wakeup_granularity_ns", | ||
274 | .data = &sysctl_sched_batch_wakeup_granularity, | ||
275 | .maxlen = sizeof(unsigned int), | ||
276 | .mode = 0644, | ||
277 | .proc_handler = &proc_dointvec_minmax, | ||
278 | .strategy = &sysctl_intvec, | ||
279 | .extra1 = &min_wakeup_granularity_ns, | ||
280 | .extra2 = &max_wakeup_granularity_ns, | ||
281 | }, | ||
282 | { | ||
283 | .ctl_name = CTL_UNNUMBERED, | ||
284 | .procname = "sched_child_runs_first", | 273 | .procname = "sched_child_runs_first", |
285 | .data = &sysctl_sched_child_runs_first, | 274 | .data = &sysctl_sched_child_runs_first, |
286 | .maxlen = sizeof(unsigned int), | 275 | .maxlen = sizeof(unsigned int), |
@@ -318,7 +307,7 @@ static struct ctl_table kern_table[] = { | |||
318 | .data = &sysctl_sched_rt_period, | 307 | .data = &sysctl_sched_rt_period, |
319 | .maxlen = sizeof(unsigned int), | 308 | .maxlen = sizeof(unsigned int), |
320 | .mode = 0644, | 309 | .mode = 0644, |
321 | .proc_handler = &proc_dointvec, | 310 | .proc_handler = &sched_rt_handler, |
322 | }, | 311 | }, |
323 | { | 312 | { |
324 | .ctl_name = CTL_UNNUMBERED, | 313 | .ctl_name = CTL_UNNUMBERED, |
@@ -326,7 +315,7 @@ static struct ctl_table kern_table[] = { | |||
326 | .data = &sysctl_sched_rt_runtime, | 315 | .data = &sysctl_sched_rt_runtime, |
327 | .maxlen = sizeof(int), | 316 | .maxlen = sizeof(int), |
328 | .mode = 0644, | 317 | .mode = 0644, |
329 | .proc_handler = &proc_dointvec, | 318 | .proc_handler = &sched_rt_handler, |
330 | }, | 319 | }, |
331 | { | 320 | { |
332 | .ctl_name = CTL_UNNUMBERED, | 321 | .ctl_name = CTL_UNNUMBERED, |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 69dba0c71727..d358d4e3a958 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -191,7 +191,6 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) | |||
191 | void tick_nohz_stop_sched_tick(void) | 191 | void tick_nohz_stop_sched_tick(void) |
192 | { | 192 | { |
193 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; | 193 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; |
194 | unsigned long rt_jiffies; | ||
195 | struct tick_sched *ts; | 194 | struct tick_sched *ts; |
196 | ktime_t last_update, expires, now; | 195 | ktime_t last_update, expires, now; |
197 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 196 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
@@ -243,10 +242,6 @@ void tick_nohz_stop_sched_tick(void) | |||
243 | next_jiffies = get_next_timer_interrupt(last_jiffies); | 242 | next_jiffies = get_next_timer_interrupt(last_jiffies); |
244 | delta_jiffies = next_jiffies - last_jiffies; | 243 | delta_jiffies = next_jiffies - last_jiffies; |
245 | 244 | ||
246 | rt_jiffies = rt_needs_cpu(cpu); | ||
247 | if (rt_jiffies && rt_jiffies < delta_jiffies) | ||
248 | delta_jiffies = rt_jiffies; | ||
249 | |||
250 | if (rcu_needs_cpu(cpu)) | 245 | if (rcu_needs_cpu(cpu)) |
251 | delta_jiffies = 1; | 246 | delta_jiffies = 1; |
252 | /* | 247 | /* |
diff --git a/kernel/user.c b/kernel/user.c index 7132022a040c..debce602bfdd 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -101,7 +101,7 @@ static int sched_create_user(struct user_struct *up) | |||
101 | { | 101 | { |
102 | int rc = 0; | 102 | int rc = 0; |
103 | 103 | ||
104 | up->tg = sched_create_group(); | 104 | up->tg = sched_create_group(&root_task_group); |
105 | if (IS_ERR(up->tg)) | 105 | if (IS_ERR(up->tg)) |
106 | rc = -ENOMEM; | 106 | rc = -ENOMEM; |
107 | 107 | ||
@@ -193,6 +193,33 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, | |||
193 | 193 | ||
194 | static struct kobj_attribute cpu_rt_runtime_attr = | 194 | static struct kobj_attribute cpu_rt_runtime_attr = |
195 | __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); | 195 | __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); |
196 | |||
197 | static ssize_t cpu_rt_period_show(struct kobject *kobj, | ||
198 | struct kobj_attribute *attr, | ||
199 | char *buf) | ||
200 | { | ||
201 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
202 | |||
203 | return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg)); | ||
204 | } | ||
205 | |||
206 | static ssize_t cpu_rt_period_store(struct kobject *kobj, | ||
207 | struct kobj_attribute *attr, | ||
208 | const char *buf, size_t size) | ||
209 | { | ||
210 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
211 | unsigned long rt_period; | ||
212 | int rc; | ||
213 | |||
214 | sscanf(buf, "%lu", &rt_period); | ||
215 | |||
216 | rc = sched_group_set_rt_period(up->tg, rt_period); | ||
217 | |||
218 | return (rc ? rc : size); | ||
219 | } | ||
220 | |||
221 | static struct kobj_attribute cpu_rt_period_attr = | ||
222 | __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store); | ||
196 | #endif | 223 | #endif |
197 | 224 | ||
198 | /* default attributes per uid directory */ | 225 | /* default attributes per uid directory */ |
@@ -202,6 +229,7 @@ static struct attribute *uids_attributes[] = { | |||
202 | #endif | 229 | #endif |
203 | #ifdef CONFIG_RT_GROUP_SCHED | 230 | #ifdef CONFIG_RT_GROUP_SCHED |
204 | &cpu_rt_runtime_attr.attr, | 231 | &cpu_rt_runtime_attr.attr, |
232 | &cpu_rt_period_attr.attr, | ||
205 | #endif | 233 | #endif |
206 | NULL | 234 | NULL |
207 | }; | 235 | }; |
diff --git a/lib/bitmap.c b/lib/bitmap.c index 2c9242e3fed0..a6939e18d7bb 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c | |||
@@ -316,6 +316,22 @@ int bitmap_scnprintf(char *buf, unsigned int buflen, | |||
316 | EXPORT_SYMBOL(bitmap_scnprintf); | 316 | EXPORT_SYMBOL(bitmap_scnprintf); |
317 | 317 | ||
318 | /** | 318 | /** |
319 | * bitmap_scnprintf_len - return buffer length needed to convert | ||
320 | * bitmap to an ASCII hex string. | ||
321 | * @len: number of bits to be converted | ||
322 | */ | ||
323 | int bitmap_scnprintf_len(unsigned int len) | ||
324 | { | ||
325 | /* we need 9 chars per word for 32 bit words (8 hexdigits + sep/null) */ | ||
326 | int bitslen = ALIGN(len, CHUNKSZ); | ||
327 | int wordlen = CHUNKSZ / 4; | ||
328 | int buflen = (bitslen / wordlen) * (wordlen + 1) * sizeof(char); | ||
329 | |||
330 | return buflen; | ||
331 | } | ||
332 | EXPORT_SYMBOL(bitmap_scnprintf_len); | ||
333 | |||
334 | /** | ||
319 | * __bitmap_parse - convert an ASCII hex string into a bitmap. | 335 | * __bitmap_parse - convert an ASCII hex string into a bitmap. |
320 | * @buf: pointer to buffer containing string. | 336 | * @buf: pointer to buffer containing string. |
321 | * @buflen: buffer size in bytes. If string is smaller than this | 337 | * @buflen: buffer size in bytes. If string is smaller than this |
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index b0012e27fea8..f4026bae6eed 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c | |||
@@ -82,9 +82,10 @@ EXPORT_SYMBOL_GPL(percpu_populate); | |||
82 | int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, | 82 | int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, |
83 | cpumask_t *mask) | 83 | cpumask_t *mask) |
84 | { | 84 | { |
85 | cpumask_t populated = CPU_MASK_NONE; | 85 | cpumask_t populated; |
86 | int cpu; | 86 | int cpu; |
87 | 87 | ||
88 | cpus_clear(populated); | ||
88 | for_each_cpu_mask(cpu, *mask) | 89 | for_each_cpu_mask(cpu, *mask) |
89 | if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { | 90 | if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { |
90 | __percpu_depopulate_mask(__pdata, &populated); | 91 | __percpu_depopulate_mask(__pdata, &populated); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 402a504f1228..32e796af12a1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -2029,6 +2029,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
2029 | int n, val; | 2029 | int n, val; |
2030 | int min_val = INT_MAX; | 2030 | int min_val = INT_MAX; |
2031 | int best_node = -1; | 2031 | int best_node = -1; |
2032 | node_to_cpumask_ptr(tmp, 0); | ||
2032 | 2033 | ||
2033 | /* Use the local node if we haven't already */ | 2034 | /* Use the local node if we haven't already */ |
2034 | if (!node_isset(node, *used_node_mask)) { | 2035 | if (!node_isset(node, *used_node_mask)) { |
@@ -2037,7 +2038,6 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
2037 | } | 2038 | } |
2038 | 2039 | ||
2039 | for_each_node_state(n, N_HIGH_MEMORY) { | 2040 | for_each_node_state(n, N_HIGH_MEMORY) { |
2040 | cpumask_t tmp; | ||
2041 | 2041 | ||
2042 | /* Don't want a node to appear more than once */ | 2042 | /* Don't want a node to appear more than once */ |
2043 | if (node_isset(n, *used_node_mask)) | 2043 | if (node_isset(n, *used_node_mask)) |
@@ -2050,8 +2050,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
2050 | val += (n < node); | 2050 | val += (n < node); |
2051 | 2051 | ||
2052 | /* Give preference to headless and unused nodes */ | 2052 | /* Give preference to headless and unused nodes */ |
2053 | tmp = node_to_cpumask(n); | 2053 | node_to_cpumask_ptr_next(tmp, n); |
2054 | if (!cpus_empty(tmp)) | 2054 | if (!cpus_empty(*tmp)) |
2055 | val += PENALTY_FOR_NODE_WITH_CPUS; | 2055 | val += PENALTY_FOR_NODE_WITH_CPUS; |
2056 | 2056 | ||
2057 | /* Slight preference for less loaded node */ | 2057 | /* Slight preference for less loaded node */ |
diff --git a/mm/pdflush.c b/mm/pdflush.c index 8f6ee073c0e3..0ceacff56457 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c | |||
@@ -187,8 +187,8 @@ static int pdflush(void *dummy) | |||
187 | * This is needed as pdflush's are dynamically created and destroyed. | 187 | * This is needed as pdflush's are dynamically created and destroyed. |
188 | * The boottime pdflush's are easily placed w/o these 2 lines. | 188 | * The boottime pdflush's are easily placed w/o these 2 lines. |
189 | */ | 189 | */ |
190 | cpus_allowed = cpuset_cpus_allowed(current); | 190 | cpuset_cpus_allowed(current, &cpus_allowed); |
191 | set_cpus_allowed(current, cpus_allowed); | 191 | set_cpus_allowed_ptr(current, &cpus_allowed); |
192 | 192 | ||
193 | return __pdflush(&my_work); | 193 | return __pdflush(&my_work); |
194 | } | 194 | } |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -1160,14 +1160,13 @@ static void __cpuinit cpuup_canceled(long cpu) | |||
1160 | struct kmem_cache *cachep; | 1160 | struct kmem_cache *cachep; |
1161 | struct kmem_list3 *l3 = NULL; | 1161 | struct kmem_list3 *l3 = NULL; |
1162 | int node = cpu_to_node(cpu); | 1162 | int node = cpu_to_node(cpu); |
1163 | node_to_cpumask_ptr(mask, node); | ||
1163 | 1164 | ||
1164 | list_for_each_entry(cachep, &cache_chain, next) { | 1165 | list_for_each_entry(cachep, &cache_chain, next) { |
1165 | struct array_cache *nc; | 1166 | struct array_cache *nc; |
1166 | struct array_cache *shared; | 1167 | struct array_cache *shared; |
1167 | struct array_cache **alien; | 1168 | struct array_cache **alien; |
1168 | cpumask_t mask; | ||
1169 | 1169 | ||
1170 | mask = node_to_cpumask(node); | ||
1171 | /* cpu is dead; no one can alloc from it. */ | 1170 | /* cpu is dead; no one can alloc from it. */ |
1172 | nc = cachep->array[cpu]; | 1171 | nc = cachep->array[cpu]; |
1173 | cachep->array[cpu] = NULL; | 1172 | cachep->array[cpu] = NULL; |
@@ -1183,7 +1182,7 @@ static void __cpuinit cpuup_canceled(long cpu) | |||
1183 | if (nc) | 1182 | if (nc) |
1184 | free_block(cachep, nc->entry, nc->avail, node); | 1183 | free_block(cachep, nc->entry, nc->avail, node); |
1185 | 1184 | ||
1186 | if (!cpus_empty(mask)) { | 1185 | if (!cpus_empty(*mask)) { |
1187 | spin_unlock_irq(&l3->list_lock); | 1186 | spin_unlock_irq(&l3->list_lock); |
1188 | goto free_array_cache; | 1187 | goto free_array_cache; |
1189 | } | 1188 | } |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 4046434046e6..f80a5b7c057f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1647,11 +1647,10 @@ static int kswapd(void *p) | |||
1647 | struct reclaim_state reclaim_state = { | 1647 | struct reclaim_state reclaim_state = { |
1648 | .reclaimed_slab = 0, | 1648 | .reclaimed_slab = 0, |
1649 | }; | 1649 | }; |
1650 | cpumask_t cpumask; | 1650 | node_to_cpumask_ptr(cpumask, pgdat->node_id); |
1651 | 1651 | ||
1652 | cpumask = node_to_cpumask(pgdat->node_id); | 1652 | if (!cpus_empty(*cpumask)) |
1653 | if (!cpus_empty(cpumask)) | 1653 | set_cpus_allowed_ptr(tsk, cpumask); |
1654 | set_cpus_allowed(tsk, cpumask); | ||
1655 | current->reclaim_state = &reclaim_state; | 1654 | current->reclaim_state = &reclaim_state; |
1656 | 1655 | ||
1657 | /* | 1656 | /* |
@@ -1880,17 +1879,16 @@ out: | |||
1880 | static int __devinit cpu_callback(struct notifier_block *nfb, | 1879 | static int __devinit cpu_callback(struct notifier_block *nfb, |
1881 | unsigned long action, void *hcpu) | 1880 | unsigned long action, void *hcpu) |
1882 | { | 1881 | { |
1883 | pg_data_t *pgdat; | ||
1884 | cpumask_t mask; | ||
1885 | int nid; | 1882 | int nid; |
1886 | 1883 | ||
1887 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { | 1884 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { |
1888 | for_each_node_state(nid, N_HIGH_MEMORY) { | 1885 | for_each_node_state(nid, N_HIGH_MEMORY) { |
1889 | pgdat = NODE_DATA(nid); | 1886 | pg_data_t *pgdat = NODE_DATA(nid); |
1890 | mask = node_to_cpumask(pgdat->node_id); | 1887 | node_to_cpumask_ptr(mask, pgdat->node_id); |
1891 | if (any_online_cpu(mask) != NR_CPUS) | 1888 | |
1889 | if (any_online_cpu(*mask) < nr_cpu_ids) | ||
1892 | /* One of our CPUs online: restore mask */ | 1890 | /* One of our CPUs online: restore mask */ |
1893 | set_cpus_allowed(pgdat->kswapd, mask); | 1891 | set_cpus_allowed_ptr(pgdat->kswapd, mask); |
1894 | } | 1892 | } |
1895 | } | 1893 | } |
1896 | return NOTIFY_OK; | 1894 | return NOTIFY_OK; |
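In the vmscan.c hunk the test "any_online_cpu(mask) != NR_CPUS" becomes "any_online_cpu(*mask) < nr_cpu_ids": the CPU-search helpers in this series report "not found" as nr_cpu_ids, one past the highest possible CPU id on the running system, which is usually far smaller than the compile-time NR_CPUS. A toy sketch of that convention; the helper below is a stand-in, not the kernel function:

#include <stdio.h>

#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

/* Return the first set bit below nr_cpu_ids, or nr_cpu_ids itself when
 * nothing is set ("no such CPU"), mirroring the sentinel convention. */
static unsigned int first_cpu_in(const unsigned long *mask,
                                 unsigned int nr_cpu_ids)
{
        unsigned int cpu;

        for (cpu = 0; cpu < nr_cpu_ids; cpu++)
                if (mask[cpu / BITS_PER_LONG] & (1UL << (cpu % BITS_PER_LONG)))
                        return cpu;
        return nr_cpu_ids;
}

int main(void)
{
        unsigned long online[2] = { 0, 0 };     /* room for 128 CPUs */
        unsigned int nr_cpu_ids = 8;            /* this box has 8 CPU ids */
        unsigned int cpu;

        online[0] = 0x10;                       /* only CPU 4 is online */
        cpu = first_cpu_in(online, nr_cpu_ids);
        if (cpu < nr_cpu_ids)
                printf("restore affinity to cpu %u\n", cpu);
        else
                printf("no online CPU in mask\n");
        return 0;
}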
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index a290e1523297..090af78d68b5 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c | |||
@@ -301,7 +301,6 @@ static inline int | |||
301 | svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask) | 301 | svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask) |
302 | { | 302 | { |
303 | struct svc_pool_map *m = &svc_pool_map; | 303 | struct svc_pool_map *m = &svc_pool_map; |
304 | unsigned int node; /* or cpu */ | ||
305 | 304 | ||
306 | /* | 305 | /* |
307 | * The caller checks for sv_nrpools > 1, which | 306 | * The caller checks for sv_nrpools > 1, which |
@@ -314,16 +313,23 @@ svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask) | |||
314 | default: | 313 | default: |
315 | return 0; | 314 | return 0; |
316 | case SVC_POOL_PERCPU: | 315 | case SVC_POOL_PERCPU: |
317 | node = m->pool_to[pidx]; | 316 | { |
317 | unsigned int cpu = m->pool_to[pidx]; | ||
318 | |||
318 | *oldmask = current->cpus_allowed; | 319 | *oldmask = current->cpus_allowed; |
319 | set_cpus_allowed(current, cpumask_of_cpu(node)); | 320 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); |
320 | return 1; | 321 | return 1; |
322 | } | ||
321 | case SVC_POOL_PERNODE: | 323 | case SVC_POOL_PERNODE: |
322 | node = m->pool_to[pidx]; | 324 | { |
325 | unsigned int node = m->pool_to[pidx]; | ||
326 | node_to_cpumask_ptr(nodecpumask, node); | ||
327 | |||
323 | *oldmask = current->cpus_allowed; | 328 | *oldmask = current->cpus_allowed; |
324 | set_cpus_allowed(current, node_to_cpumask(node)); | 329 | set_cpus_allowed_ptr(current, nodecpumask); |
325 | return 1; | 330 | return 1; |
326 | } | 331 | } |
332 | } | ||
327 | } | 333 | } |
328 | 334 | ||
329 | /* | 335 | /* |