aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/cpusets.txt72
-rw-r--r--Documentation/scheduler/sched-rt-group.txt188
-rw-r--r--arch/x86/Kconfig3
-rw-r--r--arch/x86/kernel/acpi/cstate.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c28
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c32
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c13
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c20
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c92
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c46
-rw-r--r--arch/x86/kernel/io_apic_64.c2
-rw-r--r--arch/x86/kernel/microcode.c16
-rw-r--r--arch/x86/kernel/reboot.c2
-rw-r--r--arch/x86/kernel/setup.c28
-rw-r--r--arch/x86/mm/numa_64.c3
-rw-r--r--arch/x86/oprofile/nmi_int.c49
-rw-r--r--drivers/acpi/processor_throttling.c10
-rw-r--r--drivers/base/cpu.c48
-rw-r--r--drivers/base/node.c29
-rw-r--r--drivers/base/topology.c41
-rw-r--r--drivers/firmware/dcdbas.c4
-rw-r--r--drivers/pci/pci-driver.c9
-rw-r--r--drivers/pci/pci-sysfs.c20
-rw-r--r--drivers/pci/probe.c27
-rw-r--r--include/asm-alpha/topology.h3
-rw-r--r--include/asm-frv/topology.h4
-rw-r--r--include/asm-generic/topology.h14
-rw-r--r--include/asm-ia64/topology.h7
-rw-r--r--include/asm-powerpc/topology.h3
-rw-r--r--include/asm-sh/topology.h2
-rw-r--r--include/asm-x86/topology.h22
-rw-r--r--include/linux/bitmap.h1
-rw-r--r--include/linux/cpumask.h25
-rw-r--r--include/linux/cpuset.h13
-rw-r--r--include/linux/init_task.h3
-rw-r--r--include/linux/ktime.h6
-rw-r--r--include/linux/sched.h56
-rw-r--r--include/linux/sysdev.h17
-rw-r--r--include/linux/topology.h46
-rw-r--r--init/Kconfig7
-rw-r--r--init/main.c24
-rw-r--r--kernel/compat.c2
-rw-r--r--kernel/cpu.c6
-rw-r--r--kernel/cpuset.c100
-rw-r--r--kernel/irq/chip.c2
-rw-r--r--kernel/kmod.c2
-rw-r--r--kernel/kthread.c1
-rw-r--r--kernel/latencytop.c27
-rw-r--r--kernel/rcupreempt.c4
-rw-r--r--kernel/rcutorture.c15
-rw-r--r--kernel/sched.c1912
-rw-r--r--kernel/sched_debug.c36
-rw-r--r--kernel/sched_fair.c580
-rw-r--r--kernel/sched_features.h10
-rw-r--r--kernel/sched_rt.c227
-rw-r--r--kernel/sched_stats.h8
-rw-r--r--kernel/softirq.c63
-rw-r--r--kernel/stop_machine.c2
-rw-r--r--kernel/sysctl.c15
-rw-r--r--kernel/time/tick-sched.c5
-rw-r--r--kernel/user.c30
-rw-r--r--lib/bitmap.c16
-rw-r--r--mm/allocpercpu.c3
-rw-r--r--mm/page_alloc.c6
-rw-r--r--mm/pdflush.c4
-rw-r--r--mm/slab.c5
-rw-r--r--mm/vmscan.c18
-rw-r--r--net/sunrpc/svc.c16
68 files changed, 3157 insertions, 997 deletions
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index ad2bb3b3acc1..aa854b9b18cd 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -8,6 +8,7 @@ Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
8Modified by Paul Jackson <pj@sgi.com> 8Modified by Paul Jackson <pj@sgi.com>
9Modified by Christoph Lameter <clameter@sgi.com> 9Modified by Christoph Lameter <clameter@sgi.com>
10Modified by Paul Menage <menage@google.com> 10Modified by Paul Menage <menage@google.com>
11Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
11 12
12CONTENTS: 13CONTENTS:
13========= 14=========
@@ -20,7 +21,8 @@ CONTENTS:
20 1.5 What is memory_pressure ? 21 1.5 What is memory_pressure ?
21 1.6 What is memory spread ? 22 1.6 What is memory spread ?
22 1.7 What is sched_load_balance ? 23 1.7 What is sched_load_balance ?
23 1.8 How do I use cpusets ? 24 1.8 What is sched_relax_domain_level ?
25 1.9 How do I use cpusets ?
242. Usage Examples and Syntax 262. Usage Examples and Syntax
25 2.1 Basic Usage 27 2.1 Basic Usage
26 2.2 Adding/removing cpus 28 2.2 Adding/removing cpus
@@ -497,7 +499,73 @@ the cpuset code to update these sched domains, it compares the new
497partition requested with the current, and updates its sched domains, 499partition requested with the current, and updates its sched domains,
498removing the old and adding the new, for each change. 500removing the old and adding the new, for each change.
499 501
5001.8 How do I use cpusets ? 502
5031.8 What is sched_relax_domain_level ?
504--------------------------------------
505
506In sched domain, the scheduler migrates tasks in 2 ways; periodic load
507balance on tick, and at time of some schedule events.
508
509When a task is woken up, the scheduler tries to move the task to an idle CPU.
510For example, if a task A running on CPU X activates another task B
511on the same CPU X, and if CPU Y is X's sibling and is idle,
512then the scheduler migrates task B to CPU Y so that task B can start on
513CPU Y without waiting for task A on CPU X.
514
515And if a CPU runs out of tasks in its runqueue, the CPU tries to pull
516extra tasks from other busy CPUs to help them before it goes
517idle.
518
519Of course it takes some searching cost to find movable tasks and/or
520idle CPUs, so the scheduler might not search all CPUs in the domain
521every time. In fact, on some architectures, the searching ranges on
522events are limited to the same socket or node where the CPU is located,
523while the load balance on tick searches all.
524
525For example, assume CPU Z is relatively far from CPU X. Even if CPU Z
526is idle while CPU X and the siblings are busy, scheduler can't migrate
527woken task B from X to Z since it is out of its searching range.
528As a result, task B on CPU X needs to wait for task A or wait for load balance
529on the next tick. For some applications in special situations, waiting
5301 tick may be too long.
531
532The 'sched_relax_domain_level' file allows you to request changing
533this searching range as you like. This file takes int value which
534indicates size of searching range in levels ideally as follows,
535otherwise initial value -1 that indicates the cpuset has no request.
536
537 -1 : no request. use system default or follow request of others.
538 0 : no search.
539 1 : search siblings (hyperthreads in a core).
540 2 : search cores in a package.
541 3 : search cpus in a node [= system wide on non-NUMA system]
542 ( 4 : search nodes in a chunk of node [on NUMA system] )
543 ( 5~ : search system wide [on NUMA system])
544
545This file is per-cpuset and affects the sched domain that the cpuset
546belongs to. Therefore if the flag 'sched_load_balance' of a cpuset
547is disabled, then 'sched_relax_domain_level' has no effect since
548there is no sched domain belonging to the cpuset.
549
550If multiple cpusets are overlapping and hence they form a single sched
551domain, the largest value among those is used. Be careful, if one
552requests 0 and others are -1 then 0 is used.
553
554Note that modifying this file will have both good and bad effects,
555and whether it is acceptable or not will depend on your situation.
556Don't modify this file if you are not sure.
557
558If your situation is:
559 - The migration costs between each cpu can be assumed considerably
560 small(for you) due to your special application's behavior or
561 special hardware support for CPU cache etc.
562 - The searching cost doesn't have impact(for you) or you can make
563 the searching cost small enough by managing cpuset to compact etc.
564 - Low latency is required even if it sacrifices cache hit rate etc.
565then increasing 'sched_relax_domain_level' would benefit you.
566
567
5681.9 How do I use cpusets ?
501-------------------------- 569--------------------------
502 570
503In order to minimize the impact of cpusets on critical kernel 571In order to minimize the impact of cpusets on critical kernel
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 1c6332f4543c..14f901f639ee 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -1,59 +1,177 @@
1 Real-Time group scheduling
2 --------------------------
1 3
4CONTENTS
5========
2 6
3Real-Time group scheduling. 71. Overview
8 1.1 The problem
9 1.2 The solution
102. The interface
11 2.1 System-wide settings
12 2.2 Default behaviour
13 2.3 Basis for grouping tasks
143. Future plans
4 15
5The problem space:
6 16
7In order to schedule multiple groups of realtime tasks each group must 171. Overview
8be assigned a fixed portion of the CPU time available. Without a minimum 18===========
9guarantee a realtime group can obviously fall short. A fuzzy upper limit
10is of no use since it cannot be relied upon. Which leaves us with just
11the single fixed portion.
12 19
13CPU time is divided by means of specifying how much time can be spent
14running in a given period. Say a frame fixed realtime renderer must
15deliver 25 frames a second, which yields a period of 0.04s. Now say
16it will also have to play some music and respond to input, leaving it
17with around 80% for the graphics. We can then give this group a runtime
18of 0.8 * 0.04s = 0.032s.
19 20
20This way the graphics group will have a 0.04s period with a 0.032s runtime 211.1 The problem
21limit. 22---------------
22 23
23Now if the audio thread needs to refill the DMA buffer every 0.005s, but 24Realtime scheduling is all about determinism, a group has to be able to rely on
24needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s 25the amount of bandwidth (eg. CPU time) being constant. In order to schedule
25= 0.00015s. 26multiple groups of realtime tasks, each group must be assigned a fixed portion
27of the CPU time available. Without a minimum guarantee a realtime group can
28obviously fall short. A fuzzy upper limit is of no use since it cannot be
29relied upon. Which leaves us with just the single fixed portion.
26 30
311.2 The solution
32----------------
27 33
28The Interface: 34CPU time is divided by means of specifying how much time can be spent running
35in a given period. We allocate this "run time" for each realtime group which
36the other realtime groups will not be permitted to use.
29 37
30system wide: 38Any time not allocated to a realtime group will be used to run normal priority
39tasks (SCHED_OTHER). Any allocated run time not used will also be picked up by
40SCHED_OTHER.
31 41
32/proc/sys/kernel/sched_rt_period_ms 42Let's consider an example: a frame fixed realtime renderer must deliver 25
33/proc/sys/kernel/sched_rt_runtime_us 43frames a second, which yields a period of 0.04s per frame. Now say it will also
44have to play some music and respond to input, leaving it with around 80% CPU
45time dedicated for the graphics. We can then give this group a run time of 0.8
46* 0.04s = 0.032s.
34 47
35CONFIG_FAIR_USER_SCHED 48This way the graphics group will have a 0.04s period with a 0.032s run time
49limit. Now if the audio thread needs to refill the DMA buffer every 0.005s, but
50needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s =
510.00015s. So this group can be scheduled with a period of 0.005s and a run time
52of 0.00015s.
36 53
37/sys/kernel/uids/<uid>/cpu_rt_runtime_us 54The remaining CPU time will be used for user input and other tasks. Because
55realtime tasks have explicitly allocated the CPU time they need to perform
56their tasks, buffer underruns in the graphics or audio can be eliminated.
38 57
39or 58NOTE: the above example is not fully implemented as of yet (2.6.25). We still
59lack an EDF scheduler to make non-uniform periods usable.
40 60
41CONFIG_FAIR_CGROUP_SCHED
42 61
43/cgroup/<cgroup>/cpu.rt_runtime_us 622. The Interface
63================
44 64
45[ time is specified in us because the interface is s32; this gives an
46 operating range of ~35m to 1us ]
47 65
48The period takes values in [ 1, INT_MAX ], runtime in [ -1, INT_MAX - 1 ]. 662.1 System wide settings
67------------------------
49 68
50A runtime of -1 specifies runtime == period, ie. no limit. 69The system wide settings are configured under the /proc virtual file system:
51 70
52New groups get the period from /proc/sys/kernel/sched_rt_period_us and 71/proc/sys/kernel/sched_rt_period_us:
53a runtime of 0. 72 The scheduling period that is equivalent to 100% CPU bandwidth
54 73
55Settings are constrained to: 74/proc/sys/kernel/sched_rt_runtime_us:
75 A global limit on how much time realtime scheduling may use. Even without
76 CONFIG_RT_GROUP_SCHED enabled, this will limit time reserved to realtime
77 processes. With CONFIG_RT_GROUP_SCHED it signifies the total bandwidth
78 available to all realtime groups.
79
80 * Time is specified in us because the interface is s32. This gives an
81 operating range from 1us to about 35 minutes.
82 * sched_rt_period_us takes values from 1 to INT_MAX.
83 * sched_rt_runtime_us takes values from -1 to (INT_MAX - 1).
84 * A run time of -1 specifies runtime == period, ie. no limit.
85
86
872.2 Default behaviour
88---------------------
89
90The default values are sched_rt_period_us = 1000000 (1s) and
91sched_rt_runtime_us = 950000 (0.95s). This gives 0.05s to be used by
92SCHED_OTHER (non-RT tasks). These defaults were chosen so that a run-away
93realtime task will not lock up the machine but leave a little time to recover
94it. By setting runtime to -1 you'd get the old behaviour back.
95
96By default all bandwidth is assigned to the root group and new groups get the
97period from /proc/sys/kernel/sched_rt_period_us and a run time of 0. If you
98want to assign bandwidth to another group, reduce the root group's bandwidth
99and assign some or all of the difference to another group.
100
101Realtime group scheduling means you have to assign a portion of total CPU
102bandwidth to the group before it will accept realtime tasks. Therefore you will
103not be able to run realtime tasks as any user other than root until you have
104done that, even if the user has the rights to run processes with realtime
105priority!
106
107
1082.3 Basis for grouping tasks
109----------------------------
110
111There are two compile-time settings for allocating CPU bandwidth. These are
112configured using the "Basis for grouping tasks" multiple choice menu under
113General setup > Group CPU Scheduler:
114
115a. CONFIG_USER_SCHED (aka "Basis for grouping tasks" = "user id")
116
117This lets you use the virtual files under
118"/sys/kernel/uids/<uid>/cpu_rt_runtime_us" to control the CPU time reserved for
119each user.
120
121The other option is:
122
123b. CONFIG_CGROUP_SCHED (aka "Basis for grouping tasks" = "Control groups")
124
125This uses the /cgroup virtual file system and "/cgroup/<cgroup>/cpu.rt_runtime_us"
126to control the CPU time reserved for each control group instead.
127
128For more information on working with control groups, you should read
129Documentation/cgroups.txt as well.
130
131Group settings are checked against the following limits in order to keep the configuration
132schedulable:
56 133
57 \Sum_{i} runtime_{i} / global_period <= global_runtime / global_period 134 \Sum_{i} runtime_{i} / global_period <= global_runtime / global_period
58 135
59in order to keep the configuration schedulable. 136For now, this can be simplified to just the following (but see Future plans):
137
138 \Sum_{i} runtime_{i} <= global_runtime
139
140
1413. Future plans
142===============
143
144There is work in progress to make the scheduling period for each group
145("/sys/kernel/uids/<uid>/cpu_rt_period_us" or
146"/cgroup/<cgroup>/cpu.rt_period_us" respectively) configurable as well.
147
148The constraint on the period is that a subgroup must have a smaller or
149equal period to its parent. But realistically it's not very useful _yet_
150as it's prone to starvation without deadline scheduling.
151
152Consider two sibling groups A and B; both have 50% bandwidth, but A's
153period is twice the length of B's.
154
155* group A: period=100000us, runtime=10000us
156 - this runs for 0.01s once every 0.1s
157
158* group B: period= 50000us, runtime=10000us
159 - this runs for 0.01s twice every 0.1s (or once every 0.05 sec).
160
161This means that currently a while (1) loop in A will run for the full period of
162B and can starve B's tasks (assuming they are of lower priority) for a whole
163period.
164
165The next project will be SCHED_EDF (Earliest Deadline First scheduling) to bring
166full deadline scheduling to the linux kernel. Deadline scheduling the above
167groups and treating end of the period as a deadline will ensure that they both
168get their allocated time.
169
170Implementing SCHED_EDF might take a while to complete. Priority Inheritance is
171the biggest challenge as the current linux PI infrastructure is geared towards
172the limited static priority levels 0-139. With deadline scheduling you need to
173do deadline inheritance (since priority is inversely proportional to the
174deadline delta (deadline - now)).
175
176This means the whole PI machinery will have to be reworked - and that is one of
177the most complex pieces of code we have.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 07cf77113565..87a693cf2bb7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -117,6 +117,9 @@ config ARCH_HAS_CPU_RELAX
117config HAVE_SETUP_PER_CPU_AREA 117config HAVE_SETUP_PER_CPU_AREA
118 def_bool X86_64 || (X86_SMP && !X86_VOYAGER) 118 def_bool X86_64 || (X86_SMP && !X86_VOYAGER)
119 119
120config HAVE_CPUMASK_OF_CPU_MAP
121 def_bool X86_64_SMP
122
120config ARCH_HIBERNATION_POSSIBLE 123config ARCH_HIBERNATION_POSSIBLE
121 def_bool y 124 def_bool y
122 depends on !SMP || !X86_VOYAGER 125 depends on !SMP || !X86_VOYAGER
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 9366fb68d8d8..c2502eb9aa83 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -91,7 +91,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
91 91
92 /* Make sure we are running on right CPU */ 92 /* Make sure we are running on right CPU */
93 saved_mask = current->cpus_allowed; 93 saved_mask = current->cpus_allowed;
94 retval = set_cpus_allowed(current, cpumask_of_cpu(cpu)); 94 retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
95 if (retval) 95 if (retval)
96 return -1; 96 return -1;
97 97
@@ -128,7 +128,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
128 cx->address); 128 cx->address);
129 129
130out: 130out:
131 set_cpus_allowed(current, saved_mask); 131 set_cpus_allowed_ptr(current, &saved_mask);
132 return retval; 132 return retval;
133} 133}
134EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); 134EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index a962dcb9c408..e2d870de837c 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -192,9 +192,9 @@ static void drv_read(struct drv_cmd *cmd)
192 cpumask_t saved_mask = current->cpus_allowed; 192 cpumask_t saved_mask = current->cpus_allowed;
193 cmd->val = 0; 193 cmd->val = 0;
194 194
195 set_cpus_allowed(current, cmd->mask); 195 set_cpus_allowed_ptr(current, &cmd->mask);
196 do_drv_read(cmd); 196 do_drv_read(cmd);
197 set_cpus_allowed(current, saved_mask); 197 set_cpus_allowed_ptr(current, &saved_mask);
198} 198}
199 199
200static void drv_write(struct drv_cmd *cmd) 200static void drv_write(struct drv_cmd *cmd)
@@ -203,30 +203,30 @@ static void drv_write(struct drv_cmd *cmd)
203 unsigned int i; 203 unsigned int i;
204 204
205 for_each_cpu_mask(i, cmd->mask) { 205 for_each_cpu_mask(i, cmd->mask) {
206 set_cpus_allowed(current, cpumask_of_cpu(i)); 206 set_cpus_allowed_ptr(current, &cpumask_of_cpu(i));
207 do_drv_write(cmd); 207 do_drv_write(cmd);
208 } 208 }
209 209
210 set_cpus_allowed(current, saved_mask); 210 set_cpus_allowed_ptr(current, &saved_mask);
211 return; 211 return;
212} 212}
213 213
214static u32 get_cur_val(cpumask_t mask) 214static u32 get_cur_val(const cpumask_t *mask)
215{ 215{
216 struct acpi_processor_performance *perf; 216 struct acpi_processor_performance *perf;
217 struct drv_cmd cmd; 217 struct drv_cmd cmd;
218 218
219 if (unlikely(cpus_empty(mask))) 219 if (unlikely(cpus_empty(*mask)))
220 return 0; 220 return 0;
221 221
222 switch (per_cpu(drv_data, first_cpu(mask))->cpu_feature) { 222 switch (per_cpu(drv_data, first_cpu(*mask))->cpu_feature) {
223 case SYSTEM_INTEL_MSR_CAPABLE: 223 case SYSTEM_INTEL_MSR_CAPABLE:
224 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 224 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
225 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; 225 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
226 break; 226 break;
227 case SYSTEM_IO_CAPABLE: 227 case SYSTEM_IO_CAPABLE:
228 cmd.type = SYSTEM_IO_CAPABLE; 228 cmd.type = SYSTEM_IO_CAPABLE;
229 perf = per_cpu(drv_data, first_cpu(mask))->acpi_data; 229 perf = per_cpu(drv_data, first_cpu(*mask))->acpi_data;
230 cmd.addr.io.port = perf->control_register.address; 230 cmd.addr.io.port = perf->control_register.address;
231 cmd.addr.io.bit_width = perf->control_register.bit_width; 231 cmd.addr.io.bit_width = perf->control_register.bit_width;
232 break; 232 break;
@@ -234,7 +234,7 @@ static u32 get_cur_val(cpumask_t mask)
234 return 0; 234 return 0;
235 } 235 }
236 236
237 cmd.mask = mask; 237 cmd.mask = *mask;
238 238
239 drv_read(&cmd); 239 drv_read(&cmd);
240 240
@@ -271,7 +271,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
271 unsigned int retval; 271 unsigned int retval;
272 272
273 saved_mask = current->cpus_allowed; 273 saved_mask = current->cpus_allowed;
274 set_cpus_allowed(current, cpumask_of_cpu(cpu)); 274 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
275 if (get_cpu() != cpu) { 275 if (get_cpu() != cpu) {
276 /* We were not able to run on requested processor */ 276 /* We were not able to run on requested processor */
277 put_cpu(); 277 put_cpu();
@@ -329,7 +329,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
329 retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; 329 retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100;
330 330
331 put_cpu(); 331 put_cpu();
332 set_cpus_allowed(current, saved_mask); 332 set_cpus_allowed_ptr(current, &saved_mask);
333 333
334 dprintk("cpu %d: performance percent %d\n", cpu, perf_percent); 334 dprintk("cpu %d: performance percent %d\n", cpu, perf_percent);
335 return retval; 335 return retval;
@@ -347,13 +347,13 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
347 return 0; 347 return 0;
348 } 348 }
349 349
350 freq = extract_freq(get_cur_val(cpumask_of_cpu(cpu)), data); 350 freq = extract_freq(get_cur_val(&cpumask_of_cpu(cpu)), data);
351 dprintk("cur freq = %u\n", freq); 351 dprintk("cur freq = %u\n", freq);
352 352
353 return freq; 353 return freq;
354} 354}
355 355
356static unsigned int check_freqs(cpumask_t mask, unsigned int freq, 356static unsigned int check_freqs(const cpumask_t *mask, unsigned int freq,
357 struct acpi_cpufreq_data *data) 357 struct acpi_cpufreq_data *data)
358{ 358{
359 unsigned int cur_freq; 359 unsigned int cur_freq;
@@ -449,7 +449,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
449 drv_write(&cmd); 449 drv_write(&cmd);
450 450
451 if (acpi_pstate_strict) { 451 if (acpi_pstate_strict) {
452 if (!check_freqs(cmd.mask, freqs.new, data)) { 452 if (!check_freqs(&cmd.mask, freqs.new, data)) {
453 dprintk("acpi_cpufreq_target failed (%d)\n", 453 dprintk("acpi_cpufreq_target failed (%d)\n",
454 policy->cpu); 454 policy->cpu);
455 return -EAGAIN; 455 return -EAGAIN;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index c99d59d8ef2e..46d4034d9f37 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -478,12 +478,12 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvi
478 478
479static int check_supported_cpu(unsigned int cpu) 479static int check_supported_cpu(unsigned int cpu)
480{ 480{
481 cpumask_t oldmask = CPU_MASK_ALL; 481 cpumask_t oldmask;
482 u32 eax, ebx, ecx, edx; 482 u32 eax, ebx, ecx, edx;
483 unsigned int rc = 0; 483 unsigned int rc = 0;
484 484
485 oldmask = current->cpus_allowed; 485 oldmask = current->cpus_allowed;
486 set_cpus_allowed(current, cpumask_of_cpu(cpu)); 486 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
487 487
488 if (smp_processor_id() != cpu) { 488 if (smp_processor_id() != cpu) {
489 printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu); 489 printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu);
@@ -528,7 +528,7 @@ static int check_supported_cpu(unsigned int cpu)
528 rc = 1; 528 rc = 1;
529 529
530out: 530out:
531 set_cpus_allowed(current, oldmask); 531 set_cpus_allowed_ptr(current, &oldmask);
532 return rc; 532 return rc;
533} 533}
534 534
@@ -1015,7 +1015,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1015/* Driver entry point to switch to the target frequency */ 1015/* Driver entry point to switch to the target frequency */
1016static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) 1016static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation)
1017{ 1017{
1018 cpumask_t oldmask = CPU_MASK_ALL; 1018 cpumask_t oldmask;
1019 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); 1019 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1020 u32 checkfid; 1020 u32 checkfid;
1021 u32 checkvid; 1021 u32 checkvid;
@@ -1030,7 +1030,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
1030 1030
1031 /* only run on specific CPU from here on */ 1031 /* only run on specific CPU from here on */
1032 oldmask = current->cpus_allowed; 1032 oldmask = current->cpus_allowed;
1033 set_cpus_allowed(current, cpumask_of_cpu(pol->cpu)); 1033 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu));
1034 1034
1035 if (smp_processor_id() != pol->cpu) { 1035 if (smp_processor_id() != pol->cpu) {
1036 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1036 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1085,7 +1085,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
1085 ret = 0; 1085 ret = 0;
1086 1086
1087err_out: 1087err_out:
1088 set_cpus_allowed(current, oldmask); 1088 set_cpus_allowed_ptr(current, &oldmask);
1089 return ret; 1089 return ret;
1090} 1090}
1091 1091
@@ -1104,7 +1104,7 @@ static int powernowk8_verify(struct cpufreq_policy *pol)
1104static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) 1104static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1105{ 1105{
1106 struct powernow_k8_data *data; 1106 struct powernow_k8_data *data;
1107 cpumask_t oldmask = CPU_MASK_ALL; 1107 cpumask_t oldmask;
1108 int rc; 1108 int rc;
1109 1109
1110 if (!cpu_online(pol->cpu)) 1110 if (!cpu_online(pol->cpu))
@@ -1145,7 +1145,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1145 1145
1146 /* only run on specific CPU from here on */ 1146 /* only run on specific CPU from here on */
1147 oldmask = current->cpus_allowed; 1147 oldmask = current->cpus_allowed;
1148 set_cpus_allowed(current, cpumask_of_cpu(pol->cpu)); 1148 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu));
1149 1149
1150 if (smp_processor_id() != pol->cpu) { 1150 if (smp_processor_id() != pol->cpu) {
1151 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1151 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1164,7 +1164,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1164 fidvid_msr_init(); 1164 fidvid_msr_init();
1165 1165
1166 /* run on any CPU again */ 1166 /* run on any CPU again */
1167 set_cpus_allowed(current, oldmask); 1167 set_cpus_allowed_ptr(current, &oldmask);
1168 1168
1169 if (cpu_family == CPU_HW_PSTATE) 1169 if (cpu_family == CPU_HW_PSTATE)
1170 pol->cpus = cpumask_of_cpu(pol->cpu); 1170 pol->cpus = cpumask_of_cpu(pol->cpu);
@@ -1205,7 +1205,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1205 return 0; 1205 return 0;
1206 1206
1207err_out: 1207err_out:
1208 set_cpus_allowed(current, oldmask); 1208 set_cpus_allowed_ptr(current, &oldmask);
1209 powernow_k8_cpu_exit_acpi(data); 1209 powernow_k8_cpu_exit_acpi(data);
1210 1210
1211 kfree(data); 1211 kfree(data);
@@ -1242,10 +1242,11 @@ static unsigned int powernowk8_get (unsigned int cpu)
1242 if (!data) 1242 if (!data)
1243 return -EINVAL; 1243 return -EINVAL;
1244 1244
1245 set_cpus_allowed(current, cpumask_of_cpu(cpu)); 1245 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
1246 if (smp_processor_id() != cpu) { 1246 if (smp_processor_id() != cpu) {
1247 printk(KERN_ERR PFX "limiting to CPU %d failed in powernowk8_get\n", cpu); 1247 printk(KERN_ERR PFX
1248 set_cpus_allowed(current, oldmask); 1248 "limiting to CPU %d failed in powernowk8_get\n", cpu);
1249 set_cpus_allowed_ptr(current, &oldmask);
1249 return 0; 1250 return 0;
1250 } 1251 }
1251 1252
@@ -1253,13 +1254,14 @@ static unsigned int powernowk8_get (unsigned int cpu)
1253 goto out; 1254 goto out;
1254 1255
1255 if (cpu_family == CPU_HW_PSTATE) 1256 if (cpu_family == CPU_HW_PSTATE)
1256 khz = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); 1257 khz = find_khz_freq_from_pstate(data->powernow_table,
1258 data->currpstate);
1257 else 1259 else
1258 khz = find_khz_freq_from_fid(data->currfid); 1260 khz = find_khz_freq_from_fid(data->currfid);
1259 1261
1260 1262
1261out: 1263out:
1262 set_cpus_allowed(current, oldmask); 1264 set_cpus_allowed_ptr(current, &oldmask);
1263 return khz; 1265 return khz;
1264} 1266}
1265 1267
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 3031f1196192..908dd347c67e 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -315,7 +315,7 @@ static unsigned int get_cur_freq(unsigned int cpu)
315 cpumask_t saved_mask; 315 cpumask_t saved_mask;
316 316
317 saved_mask = current->cpus_allowed; 317 saved_mask = current->cpus_allowed;
318 set_cpus_allowed(current, cpumask_of_cpu(cpu)); 318 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
319 if (smp_processor_id() != cpu) 319 if (smp_processor_id() != cpu)
320 return 0; 320 return 0;
321 321
@@ -333,7 +333,7 @@ static unsigned int get_cur_freq(unsigned int cpu)
333 clock_freq = extract_clock(l, cpu, 1); 333 clock_freq = extract_clock(l, cpu, 1);
334 } 334 }
335 335
336 set_cpus_allowed(current, saved_mask); 336 set_cpus_allowed_ptr(current, &saved_mask);
337 return clock_freq; 337 return clock_freq;
338} 338}
339 339
@@ -487,7 +487,7 @@ static int centrino_target (struct cpufreq_policy *policy,
487 else 487 else
488 cpu_set(j, set_mask); 488 cpu_set(j, set_mask);
489 489
490 set_cpus_allowed(current, set_mask); 490 set_cpus_allowed_ptr(current, &set_mask);
491 preempt_disable(); 491 preempt_disable();
492 if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) { 492 if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) {
493 dprintk("couldn't limit to CPUs in this domain\n"); 493 dprintk("couldn't limit to CPUs in this domain\n");
@@ -555,7 +555,8 @@ static int centrino_target (struct cpufreq_policy *policy,
555 555
556 if (!cpus_empty(covered_cpus)) { 556 if (!cpus_empty(covered_cpus)) {
557 for_each_cpu_mask(j, covered_cpus) { 557 for_each_cpu_mask(j, covered_cpus) {
558 set_cpus_allowed(current, cpumask_of_cpu(j)); 558 set_cpus_allowed_ptr(current,
559 &cpumask_of_cpu(j));
559 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); 560 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
560 } 561 }
561 } 562 }
@@ -569,12 +570,12 @@ static int centrino_target (struct cpufreq_policy *policy,
569 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 570 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
570 } 571 }
571 } 572 }
572 set_cpus_allowed(current, saved_mask); 573 set_cpus_allowed_ptr(current, &saved_mask);
573 return 0; 574 return 0;
574 575
575migrate_end: 576migrate_end:
576 preempt_enable(); 577 preempt_enable();
577 set_cpus_allowed(current, saved_mask); 578 set_cpus_allowed_ptr(current, &saved_mask);
578 return 0; 579 return 0;
579} 580}
580 581
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 14d68aa301ee..1b50244b1fdf 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -229,22 +229,22 @@ static unsigned int speedstep_detect_chipset (void)
229 return 0; 229 return 0;
230} 230}
231 231
232static unsigned int _speedstep_get(cpumask_t cpus) 232static unsigned int _speedstep_get(const cpumask_t *cpus)
233{ 233{
234 unsigned int speed; 234 unsigned int speed;
235 cpumask_t cpus_allowed; 235 cpumask_t cpus_allowed;
236 236
237 cpus_allowed = current->cpus_allowed; 237 cpus_allowed = current->cpus_allowed;
238 set_cpus_allowed(current, cpus); 238 set_cpus_allowed_ptr(current, cpus);
239 speed = speedstep_get_processor_frequency(speedstep_processor); 239 speed = speedstep_get_processor_frequency(speedstep_processor);
240 set_cpus_allowed(current, cpus_allowed); 240 set_cpus_allowed_ptr(current, &cpus_allowed);
241 dprintk("detected %u kHz as current frequency\n", speed); 241 dprintk("detected %u kHz as current frequency\n", speed);
242 return speed; 242 return speed;
243} 243}
244 244
245static unsigned int speedstep_get(unsigned int cpu) 245static unsigned int speedstep_get(unsigned int cpu)
246{ 246{
247 return _speedstep_get(cpumask_of_cpu(cpu)); 247 return _speedstep_get(&cpumask_of_cpu(cpu));
248} 248}
249 249
250/** 250/**
@@ -267,7 +267,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
267 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate)) 267 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate))
268 return -EINVAL; 268 return -EINVAL;
269 269
270 freqs.old = _speedstep_get(policy->cpus); 270 freqs.old = _speedstep_get(&policy->cpus);
271 freqs.new = speedstep_freqs[newstate].frequency; 271 freqs.new = speedstep_freqs[newstate].frequency;
272 freqs.cpu = policy->cpu; 272 freqs.cpu = policy->cpu;
273 273
@@ -285,12 +285,12 @@ static int speedstep_target (struct cpufreq_policy *policy,
285 } 285 }
286 286
287 /* switch to physical CPU where state is to be changed */ 287 /* switch to physical CPU where state is to be changed */
288 set_cpus_allowed(current, policy->cpus); 288 set_cpus_allowed_ptr(current, &policy->cpus);
289 289
290 speedstep_set_state(newstate); 290 speedstep_set_state(newstate);
291 291
292 /* allow to be run on all CPUs */ 292 /* allow to be run on all CPUs */
293 set_cpus_allowed(current, cpus_allowed); 293 set_cpus_allowed_ptr(current, &cpus_allowed);
294 294
295 for_each_cpu_mask(i, policy->cpus) { 295 for_each_cpu_mask(i, policy->cpus) {
296 freqs.cpu = i; 296 freqs.cpu = i;
@@ -326,7 +326,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
326#endif 326#endif
327 327
328 cpus_allowed = current->cpus_allowed; 328 cpus_allowed = current->cpus_allowed;
329 set_cpus_allowed(current, policy->cpus); 329 set_cpus_allowed_ptr(current, &policy->cpus);
330 330
331 /* detect low and high frequency and transition latency */ 331 /* detect low and high frequency and transition latency */
332 result = speedstep_get_freqs(speedstep_processor, 332 result = speedstep_get_freqs(speedstep_processor,
@@ -334,12 +334,12 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
334 &speedstep_freqs[SPEEDSTEP_HIGH].frequency, 334 &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
335 &policy->cpuinfo.transition_latency, 335 &policy->cpuinfo.transition_latency,
336 &speedstep_set_state); 336 &speedstep_set_state);
337 set_cpus_allowed(current, cpus_allowed); 337 set_cpus_allowed_ptr(current, &cpus_allowed);
338 if (result) 338 if (result)
339 return result; 339 return result;
340 340
341 /* get current speed setting */ 341 /* get current speed setting */
342 speed = _speedstep_get(policy->cpus); 342 speed = _speedstep_get(&policy->cpus);
343 if (!speed) 343 if (!speed)
344 return -EIO; 344 return -EIO;
345 345
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 1b889860eb73..26d615dcb149 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -129,7 +129,7 @@ struct _cpuid4_info {
129 union _cpuid4_leaf_ebx ebx; 129 union _cpuid4_leaf_ebx ebx;
130 union _cpuid4_leaf_ecx ecx; 130 union _cpuid4_leaf_ecx ecx;
131 unsigned long size; 131 unsigned long size;
132 cpumask_t shared_cpu_map; 132 cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */
133}; 133};
134 134
135unsigned short num_cache_leaves; 135unsigned short num_cache_leaves;
@@ -451,8 +451,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
451} 451}
452 452
453/* pointer to _cpuid4_info array (for each cache leaf) */ 453/* pointer to _cpuid4_info array (for each cache leaf) */
454static struct _cpuid4_info *cpuid4_info[NR_CPUS]; 454static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info);
455#define CPUID4_INFO_IDX(x,y) (&((cpuid4_info[x])[y])) 455#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y]))
456 456
457#ifdef CONFIG_SMP 457#ifdef CONFIG_SMP
458static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) 458static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
@@ -474,7 +474,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
474 if (cpu_data(i).apicid >> index_msb == 474 if (cpu_data(i).apicid >> index_msb ==
475 c->apicid >> index_msb) { 475 c->apicid >> index_msb) {
476 cpu_set(i, this_leaf->shared_cpu_map); 476 cpu_set(i, this_leaf->shared_cpu_map);
477 if (i != cpu && cpuid4_info[i]) { 477 if (i != cpu && per_cpu(cpuid4_info, i)) {
478 sibling_leaf = CPUID4_INFO_IDX(i, index); 478 sibling_leaf = CPUID4_INFO_IDX(i, index);
479 cpu_set(cpu, sibling_leaf->shared_cpu_map); 479 cpu_set(cpu, sibling_leaf->shared_cpu_map);
480 } 480 }
@@ -505,8 +505,8 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
505 for (i = 0; i < num_cache_leaves; i++) 505 for (i = 0; i < num_cache_leaves; i++)
506 cache_remove_shared_cpu_map(cpu, i); 506 cache_remove_shared_cpu_map(cpu, i);
507 507
508 kfree(cpuid4_info[cpu]); 508 kfree(per_cpu(cpuid4_info, cpu));
509 cpuid4_info[cpu] = NULL; 509 per_cpu(cpuid4_info, cpu) = NULL;
510} 510}
511 511
512static int __cpuinit detect_cache_attributes(unsigned int cpu) 512static int __cpuinit detect_cache_attributes(unsigned int cpu)
@@ -519,13 +519,13 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
519 if (num_cache_leaves == 0) 519 if (num_cache_leaves == 0)
520 return -ENOENT; 520 return -ENOENT;
521 521
522 cpuid4_info[cpu] = kzalloc( 522 per_cpu(cpuid4_info, cpu) = kzalloc(
523 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); 523 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
524 if (cpuid4_info[cpu] == NULL) 524 if (per_cpu(cpuid4_info, cpu) == NULL)
525 return -ENOMEM; 525 return -ENOMEM;
526 526
527 oldmask = current->cpus_allowed; 527 oldmask = current->cpus_allowed;
528 retval = set_cpus_allowed(current, cpumask_of_cpu(cpu)); 528 retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
529 if (retval) 529 if (retval)
530 goto out; 530 goto out;
531 531
@@ -542,12 +542,12 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
542 } 542 }
543 cache_shared_cpu_map_setup(cpu, j); 543 cache_shared_cpu_map_setup(cpu, j);
544 } 544 }
545 set_cpus_allowed(current, oldmask); 545 set_cpus_allowed_ptr(current, &oldmask);
546 546
547out: 547out:
548 if (retval) { 548 if (retval) {
549 kfree(cpuid4_info[cpu]); 549 kfree(per_cpu(cpuid4_info, cpu));
550 cpuid4_info[cpu] = NULL; 550 per_cpu(cpuid4_info, cpu) = NULL;
551 } 551 }
552 552
553 return retval; 553 return retval;
@@ -561,7 +561,7 @@ out:
561extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ 561extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
562 562
563/* pointer to kobject for cpuX/cache */ 563/* pointer to kobject for cpuX/cache */
564static struct kobject * cache_kobject[NR_CPUS]; 564static DEFINE_PER_CPU(struct kobject *, cache_kobject);
565 565
566struct _index_kobject { 566struct _index_kobject {
567 struct kobject kobj; 567 struct kobject kobj;
@@ -570,8 +570,8 @@ struct _index_kobject {
570}; 570};
571 571
572/* pointer to array of kobjects for cpuX/cache/indexY */ 572/* pointer to array of kobjects for cpuX/cache/indexY */
573static struct _index_kobject *index_kobject[NR_CPUS]; 573static DEFINE_PER_CPU(struct _index_kobject *, index_kobject);
574#define INDEX_KOBJECT_PTR(x,y) (&((index_kobject[x])[y])) 574#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y]))
575 575
576#define show_one_plus(file_name, object, val) \ 576#define show_one_plus(file_name, object, val) \
577static ssize_t show_##file_name \ 577static ssize_t show_##file_name \
@@ -591,11 +591,32 @@ static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf)
591 return sprintf (buf, "%luK\n", this_leaf->size / 1024); 591 return sprintf (buf, "%luK\n", this_leaf->size / 1024);
592} 592}
593 593
594static ssize_t show_shared_cpu_map(struct _cpuid4_info *this_leaf, char *buf) 594static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
595 int type, char *buf)
595{ 596{
596 char mask_str[NR_CPUS]; 597 ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
597 cpumask_scnprintf(mask_str, NR_CPUS, this_leaf->shared_cpu_map); 598 int n = 0;
598 return sprintf(buf, "%s\n", mask_str); 599
600 if (len > 1) {
601 cpumask_t *mask = &this_leaf->shared_cpu_map;
602
603 n = type?
604 cpulist_scnprintf(buf, len-2, *mask):
605 cpumask_scnprintf(buf, len-2, *mask);
606 buf[n++] = '\n';
607 buf[n] = '\0';
608 }
609 return n;
610}
611
612static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf)
613{
614 return show_shared_cpu_map_func(leaf, 0, buf);
615}
616
617static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf)
618{
619 return show_shared_cpu_map_func(leaf, 1, buf);
599} 620}
600 621
601static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { 622static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
@@ -633,6 +654,7 @@ define_one_ro(ways_of_associativity);
633define_one_ro(number_of_sets); 654define_one_ro(number_of_sets);
634define_one_ro(size); 655define_one_ro(size);
635define_one_ro(shared_cpu_map); 656define_one_ro(shared_cpu_map);
657define_one_ro(shared_cpu_list);
636 658
637static struct attribute * default_attrs[] = { 659static struct attribute * default_attrs[] = {
638 &type.attr, 660 &type.attr,
@@ -643,6 +665,7 @@ static struct attribute * default_attrs[] = {
643 &number_of_sets.attr, 665 &number_of_sets.attr,
644 &size.attr, 666 &size.attr,
645 &shared_cpu_map.attr, 667 &shared_cpu_map.attr,
668 &shared_cpu_list.attr,
646 NULL 669 NULL
647}; 670};
648 671
@@ -684,10 +707,10 @@ static struct kobj_type ktype_percpu_entry = {
684 707
685static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu) 708static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu)
686{ 709{
687 kfree(cache_kobject[cpu]); 710 kfree(per_cpu(cache_kobject, cpu));
688 kfree(index_kobject[cpu]); 711 kfree(per_cpu(index_kobject, cpu));
689 cache_kobject[cpu] = NULL; 712 per_cpu(cache_kobject, cpu) = NULL;
690 index_kobject[cpu] = NULL; 713 per_cpu(index_kobject, cpu) = NULL;
691 free_cache_attributes(cpu); 714 free_cache_attributes(cpu);
692} 715}
693 716
@@ -703,13 +726,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
703 return err; 726 return err;
704 727
705 /* Allocate all required memory */ 728 /* Allocate all required memory */
706 cache_kobject[cpu] = kzalloc(sizeof(struct kobject), GFP_KERNEL); 729 per_cpu(cache_kobject, cpu) =
707 if (unlikely(cache_kobject[cpu] == NULL)) 730 kzalloc(sizeof(struct kobject), GFP_KERNEL);
731 if (unlikely(per_cpu(cache_kobject, cpu) == NULL))
708 goto err_out; 732 goto err_out;
709 733
710 index_kobject[cpu] = kzalloc( 734 per_cpu(index_kobject, cpu) = kzalloc(
711 sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); 735 sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL);
712 if (unlikely(index_kobject[cpu] == NULL)) 736 if (unlikely(per_cpu(index_kobject, cpu) == NULL))
713 goto err_out; 737 goto err_out;
714 738
715 return 0; 739 return 0;
@@ -733,7 +757,8 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
733 if (unlikely(retval < 0)) 757 if (unlikely(retval < 0))
734 return retval; 758 return retval;
735 759
736 retval = kobject_init_and_add(cache_kobject[cpu], &ktype_percpu_entry, 760 retval = kobject_init_and_add(per_cpu(cache_kobject, cpu),
761 &ktype_percpu_entry,
737 &sys_dev->kobj, "%s", "cache"); 762 &sys_dev->kobj, "%s", "cache");
738 if (retval < 0) { 763 if (retval < 0) {
739 cpuid4_cache_sysfs_exit(cpu); 764 cpuid4_cache_sysfs_exit(cpu);
@@ -745,13 +770,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
745 this_object->cpu = cpu; 770 this_object->cpu = cpu;
746 this_object->index = i; 771 this_object->index = i;
747 retval = kobject_init_and_add(&(this_object->kobj), 772 retval = kobject_init_and_add(&(this_object->kobj),
748 &ktype_cache, cache_kobject[cpu], 773 &ktype_cache,
774 per_cpu(cache_kobject, cpu),
749 "index%1lu", i); 775 "index%1lu", i);
750 if (unlikely(retval)) { 776 if (unlikely(retval)) {
751 for (j = 0; j < i; j++) { 777 for (j = 0; j < i; j++) {
752 kobject_put(&(INDEX_KOBJECT_PTR(cpu,j)->kobj)); 778 kobject_put(&(INDEX_KOBJECT_PTR(cpu,j)->kobj));
753 } 779 }
754 kobject_put(cache_kobject[cpu]); 780 kobject_put(per_cpu(cache_kobject, cpu));
755 cpuid4_cache_sysfs_exit(cpu); 781 cpuid4_cache_sysfs_exit(cpu);
756 break; 782 break;
757 } 783 }
@@ -760,7 +786,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
760 if (!retval) 786 if (!retval)
761 cpu_set(cpu, cache_dev_map); 787 cpu_set(cpu, cache_dev_map);
762 788
763 kobject_uevent(cache_kobject[cpu], KOBJ_ADD); 789 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
764 return retval; 790 return retval;
765} 791}
766 792
@@ -769,7 +795,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
769 unsigned int cpu = sys_dev->id; 795 unsigned int cpu = sys_dev->id;
770 unsigned long i; 796 unsigned long i;
771 797
772 if (cpuid4_info[cpu] == NULL) 798 if (per_cpu(cpuid4_info, cpu) == NULL)
773 return; 799 return;
774 if (!cpu_isset(cpu, cache_dev_map)) 800 if (!cpu_isset(cpu, cache_dev_map))
775 return; 801 return;
@@ -777,7 +803,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
777 803
778 for (i = 0; i < num_cache_leaves; i++) 804 for (i = 0; i < num_cache_leaves; i++)
779 kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); 805 kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj));
780 kobject_put(cache_kobject[cpu]); 806 kobject_put(per_cpu(cache_kobject, cpu));
781 cpuid4_cache_sysfs_exit(cpu); 807 cpuid4_cache_sysfs_exit(cpu);
782} 808}
783 809
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 32671da8184e..7c9a813e1193 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -251,18 +251,18 @@ struct threshold_attr {
251 ssize_t(*store) (struct threshold_block *, const char *, size_t count); 251 ssize_t(*store) (struct threshold_block *, const char *, size_t count);
252}; 252};
253 253
254static cpumask_t affinity_set(unsigned int cpu) 254static void affinity_set(unsigned int cpu, cpumask_t *oldmask,
255 cpumask_t *newmask)
255{ 256{
256 cpumask_t oldmask = current->cpus_allowed; 257 *oldmask = current->cpus_allowed;
257 cpumask_t newmask = CPU_MASK_NONE; 258 cpus_clear(*newmask);
258 cpu_set(cpu, newmask); 259 cpu_set(cpu, *newmask);
259 set_cpus_allowed(current, newmask); 260 set_cpus_allowed_ptr(current, newmask);
260 return oldmask;
261} 261}
262 262
263static void affinity_restore(cpumask_t oldmask) 263static void affinity_restore(const cpumask_t *oldmask)
264{ 264{
265 set_cpus_allowed(current, oldmask); 265 set_cpus_allowed_ptr(current, oldmask);
266} 266}
267 267
268#define SHOW_FIELDS(name) \ 268#define SHOW_FIELDS(name) \
@@ -277,15 +277,15 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
277 const char *buf, size_t count) 277 const char *buf, size_t count)
278{ 278{
279 char *end; 279 char *end;
280 cpumask_t oldmask; 280 cpumask_t oldmask, newmask;
281 unsigned long new = simple_strtoul(buf, &end, 0); 281 unsigned long new = simple_strtoul(buf, &end, 0);
282 if (end == buf) 282 if (end == buf)
283 return -EINVAL; 283 return -EINVAL;
284 b->interrupt_enable = !!new; 284 b->interrupt_enable = !!new;
285 285
286 oldmask = affinity_set(b->cpu); 286 affinity_set(b->cpu, &oldmask, &newmask);
287 threshold_restart_bank(b, 0, 0); 287 threshold_restart_bank(b, 0, 0);
288 affinity_restore(oldmask); 288 affinity_restore(&oldmask);
289 289
290 return end - buf; 290 return end - buf;
291} 291}
@@ -294,7 +294,7 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
294 const char *buf, size_t count) 294 const char *buf, size_t count)
295{ 295{
296 char *end; 296 char *end;
297 cpumask_t oldmask; 297 cpumask_t oldmask, newmask;
298 u16 old; 298 u16 old;
299 unsigned long new = simple_strtoul(buf, &end, 0); 299 unsigned long new = simple_strtoul(buf, &end, 0);
300 if (end == buf) 300 if (end == buf)
@@ -306,9 +306,9 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
306 old = b->threshold_limit; 306 old = b->threshold_limit;
307 b->threshold_limit = new; 307 b->threshold_limit = new;
308 308
309 oldmask = affinity_set(b->cpu); 309 affinity_set(b->cpu, &oldmask, &newmask);
310 threshold_restart_bank(b, 0, old); 310 threshold_restart_bank(b, 0, old);
311 affinity_restore(oldmask); 311 affinity_restore(&oldmask);
312 312
313 return end - buf; 313 return end - buf;
314} 314}
@@ -316,10 +316,10 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
316static ssize_t show_error_count(struct threshold_block *b, char *buf) 316static ssize_t show_error_count(struct threshold_block *b, char *buf)
317{ 317{
318 u32 high, low; 318 u32 high, low;
319 cpumask_t oldmask; 319 cpumask_t oldmask, newmask;
320 oldmask = affinity_set(b->cpu); 320 affinity_set(b->cpu, &oldmask, &newmask);
321 rdmsr(b->address, low, high); 321 rdmsr(b->address, low, high);
322 affinity_restore(oldmask); 322 affinity_restore(&oldmask);
323 return sprintf(buf, "%x\n", 323 return sprintf(buf, "%x\n",
324 (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); 324 (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
325} 325}
@@ -327,10 +327,10 @@ static ssize_t show_error_count(struct threshold_block *b, char *buf)
327static ssize_t store_error_count(struct threshold_block *b, 327static ssize_t store_error_count(struct threshold_block *b,
328 const char *buf, size_t count) 328 const char *buf, size_t count)
329{ 329{
330 cpumask_t oldmask; 330 cpumask_t oldmask, newmask;
331 oldmask = affinity_set(b->cpu); 331 affinity_set(b->cpu, &oldmask, &newmask);
332 threshold_restart_bank(b, 1, 0); 332 threshold_restart_bank(b, 1, 0);
333 affinity_restore(oldmask); 333 affinity_restore(&oldmask);
334 return 1; 334 return 1;
335} 335}
336 336
@@ -468,7 +468,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
468{ 468{
469 int i, err = 0; 469 int i, err = 0;
470 struct threshold_bank *b = NULL; 470 struct threshold_bank *b = NULL;
471 cpumask_t oldmask = CPU_MASK_NONE; 471 cpumask_t oldmask, newmask;
472 char name[32]; 472 char name[32];
473 473
474 sprintf(name, "threshold_bank%i", bank); 474 sprintf(name, "threshold_bank%i", bank);
@@ -519,10 +519,10 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
519 519
520 per_cpu(threshold_banks, cpu)[bank] = b; 520 per_cpu(threshold_banks, cpu)[bank] = b;
521 521
522 oldmask = affinity_set(cpu); 522 affinity_set(cpu, &oldmask, &newmask);
523 err = allocate_threshold_blocks(cpu, bank, 0, 523 err = allocate_threshold_blocks(cpu, bank, 0,
524 MSR_IA32_MC0_MISC + bank * 4); 524 MSR_IA32_MC0_MISC + bank * 4);
525 affinity_restore(oldmask); 525 affinity_restore(&oldmask);
526 526
527 if (err) 527 if (err)
528 goto out_free; 528 goto out_free;
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index b54464b26658..9ba11d07920f 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -785,7 +785,7 @@ static void __clear_irq_vector(int irq)
785 per_cpu(vector_irq, cpu)[vector] = -1; 785 per_cpu(vector_irq, cpu)[vector] = -1;
786 786
787 cfg->vector = 0; 787 cfg->vector = 0;
788 cfg->domain = CPU_MASK_NONE; 788 cpus_clear(cfg->domain);
789} 789}
790 790
791void __setup_vector_irq(int cpu) 791void __setup_vector_irq(int cpu)
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 25cf6dee4e56..69729e38b78a 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -402,7 +402,7 @@ static int do_microcode_update (void)
402 402
403 if (!uci->valid) 403 if (!uci->valid)
404 continue; 404 continue;
405 set_cpus_allowed(current, cpumask_of_cpu(cpu)); 405 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
406 error = get_maching_microcode(new_mc, cpu); 406 error = get_maching_microcode(new_mc, cpu);
407 if (error < 0) 407 if (error < 0)
408 goto out; 408 goto out;
@@ -416,7 +416,7 @@ out:
416 vfree(new_mc); 416 vfree(new_mc);
417 if (cursor < 0) 417 if (cursor < 0)
418 error = cursor; 418 error = cursor;
419 set_cpus_allowed(current, old); 419 set_cpus_allowed_ptr(current, &old);
420 return error; 420 return error;
421} 421}
422 422
@@ -579,7 +579,7 @@ static int apply_microcode_check_cpu(int cpu)
579 return 0; 579 return 0;
580 580
581 old = current->cpus_allowed; 581 old = current->cpus_allowed;
582 set_cpus_allowed(current, cpumask_of_cpu(cpu)); 582 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
583 583
584 /* Check if the microcode we have in memory matches the CPU */ 584 /* Check if the microcode we have in memory matches the CPU */
585 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || 585 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
@@ -610,7 +610,7 @@ static int apply_microcode_check_cpu(int cpu)
610 " sig=0x%x, pf=0x%x, rev=0x%x\n", 610 " sig=0x%x, pf=0x%x, rev=0x%x\n",
611 cpu, uci->sig, uci->pf, uci->rev); 611 cpu, uci->sig, uci->pf, uci->rev);
612 612
613 set_cpus_allowed(current, old); 613 set_cpus_allowed_ptr(current, &old);
614 return err; 614 return err;
615} 615}
616 616
@@ -621,13 +621,13 @@ static void microcode_init_cpu(int cpu, int resume)
621 621
622 old = current->cpus_allowed; 622 old = current->cpus_allowed;
623 623
624 set_cpus_allowed(current, cpumask_of_cpu(cpu)); 624 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
625 mutex_lock(&microcode_mutex); 625 mutex_lock(&microcode_mutex);
626 collect_cpu_info(cpu); 626 collect_cpu_info(cpu);
627 if (uci->valid && system_state == SYSTEM_RUNNING && !resume) 627 if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
628 cpu_request_microcode(cpu); 628 cpu_request_microcode(cpu);
629 mutex_unlock(&microcode_mutex); 629 mutex_unlock(&microcode_mutex);
630 set_cpus_allowed(current, old); 630 set_cpus_allowed_ptr(current, &old);
631} 631}
632 632
633static void microcode_fini_cpu(int cpu) 633static void microcode_fini_cpu(int cpu)
@@ -657,14 +657,14 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
657 old = current->cpus_allowed; 657 old = current->cpus_allowed;
658 658
659 get_online_cpus(); 659 get_online_cpus();
660 set_cpus_allowed(current, cpumask_of_cpu(cpu)); 660 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
661 661
662 mutex_lock(&microcode_mutex); 662 mutex_lock(&microcode_mutex);
663 if (uci->valid) 663 if (uci->valid)
664 err = cpu_request_microcode(cpu); 664 err = cpu_request_microcode(cpu);
665 mutex_unlock(&microcode_mutex); 665 mutex_unlock(&microcode_mutex);
666 put_online_cpus(); 666 put_online_cpus();
667 set_cpus_allowed(current, old); 667 set_cpus_allowed_ptr(current, &old);
668 } 668 }
669 if (err) 669 if (err)
670 return err; 670 return err;
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 9692202d3bfb..19c9386ac118 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -420,7 +420,7 @@ static void native_machine_shutdown(void)
420 reboot_cpu_id = smp_processor_id(); 420 reboot_cpu_id = smp_processor_id();
421 421
422 /* Make certain I only run on the appropriate processor */ 422 /* Make certain I only run on the appropriate processor */
423 set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); 423 set_cpus_allowed_ptr(current, &cpumask_of_cpu(reboot_cpu_id));
424 424
425 /* O.K Now that I'm on the appropriate processor, 425 /* O.K Now that I'm on the appropriate processor,
426 * stop all of the others. 426 * stop all of the others.
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ed157c90412e..0d1f44ae6eea 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -54,6 +54,24 @@ static void __init setup_per_cpu_maps(void)
54#endif 54#endif
55} 55}
56 56
57#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
58cpumask_t *cpumask_of_cpu_map __read_mostly;
59EXPORT_SYMBOL(cpumask_of_cpu_map);
60
61/* requires nr_cpu_ids to be initialized */
62static void __init setup_cpumask_of_cpu(void)
63{
64 int i;
65
66 /* alloc_bootmem zeroes memory */
67 cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
68 for (i = 0; i < nr_cpu_ids; i++)
69 cpu_set(i, cpumask_of_cpu_map[i]);
70}
71#else
72static inline void setup_cpumask_of_cpu(void) { }
73#endif
74
57#ifdef CONFIG_X86_32 75#ifdef CONFIG_X86_32
58/* 76/*
59 * Great future not-so-futuristic plan: make i386 and x86_64 do it 77 * Great future not-so-futuristic plan: make i386 and x86_64 do it
@@ -70,7 +88,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
70 */ 88 */
71void __init setup_per_cpu_areas(void) 89void __init setup_per_cpu_areas(void)
72{ 90{
73 int i; 91 int i, highest_cpu = 0;
74 unsigned long size; 92 unsigned long size;
75 93
76#ifdef CONFIG_HOTPLUG_CPU 94#ifdef CONFIG_HOTPLUG_CPU
@@ -104,10 +122,18 @@ void __init setup_per_cpu_areas(void)
104 __per_cpu_offset[i] = ptr - __per_cpu_start; 122 __per_cpu_offset[i] = ptr - __per_cpu_start;
105#endif 123#endif
106 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 124 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
125
126 highest_cpu = i;
107 } 127 }
108 128
129 nr_cpu_ids = highest_cpu + 1;
130 printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
131
109 /* Setup percpu data maps */ 132 /* Setup percpu data maps */
110 setup_per_cpu_maps(); 133 setup_per_cpu_maps();
134
135 /* Setup cpumask_of_cpu map */
136 setup_cpumask_of_cpu();
111} 137}
112 138
113#endif 139#endif
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index cb3170186355..9a6892200b27 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -386,9 +386,10 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
386 * Sets up the system RAM area from start_pfn to end_pfn according to the 386 * Sets up the system RAM area from start_pfn to end_pfn according to the
387 * numa=fake command-line option. 387 * numa=fake command-line option.
388 */ 388 */
389static struct bootnode nodes[MAX_NUMNODES] __initdata;
390
389static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 391static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
390{ 392{
391 struct bootnode nodes[MAX_NUMNODES];
392 u64 size, addr = start_pfn << PAGE_SHIFT; 393 u64 size, addr = start_pfn << PAGE_SHIFT;
393 u64 max_addr = end_pfn << PAGE_SHIFT; 394 u64 max_addr = end_pfn << PAGE_SHIFT;
394 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; 395 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 1f11cf0a307f..cc48d3fde545 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -23,8 +23,8 @@
23#include "op_x86_model.h" 23#include "op_x86_model.h"
24 24
25static struct op_x86_model_spec const *model; 25static struct op_x86_model_spec const *model;
26static struct op_msrs cpu_msrs[NR_CPUS]; 26static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
27static unsigned long saved_lvtpc[NR_CPUS]; 27static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
28 28
29static int nmi_start(void); 29static int nmi_start(void);
30static void nmi_stop(void); 30static void nmi_stop(void);
@@ -89,7 +89,7 @@ static int profile_exceptions_notify(struct notifier_block *self,
89 89
90 switch (val) { 90 switch (val) {
91 case DIE_NMI: 91 case DIE_NMI:
92 if (model->check_ctrs(args->regs, &cpu_msrs[cpu])) 92 if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)))
93 ret = NOTIFY_STOP; 93 ret = NOTIFY_STOP;
94 break; 94 break;
95 default: 95 default:
@@ -126,7 +126,7 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs)
126static void nmi_save_registers(void *dummy) 126static void nmi_save_registers(void *dummy)
127{ 127{
128 int cpu = smp_processor_id(); 128 int cpu = smp_processor_id();
129 struct op_msrs *msrs = &cpu_msrs[cpu]; 129 struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
130 nmi_cpu_save_registers(msrs); 130 nmi_cpu_save_registers(msrs);
131} 131}
132 132
@@ -134,10 +134,10 @@ static void free_msrs(void)
134{ 134{
135 int i; 135 int i;
136 for_each_possible_cpu(i) { 136 for_each_possible_cpu(i) {
137 kfree(cpu_msrs[i].counters); 137 kfree(per_cpu(cpu_msrs, i).counters);
138 cpu_msrs[i].counters = NULL; 138 per_cpu(cpu_msrs, i).counters = NULL;
139 kfree(cpu_msrs[i].controls); 139 kfree(per_cpu(cpu_msrs, i).controls);
140 cpu_msrs[i].controls = NULL; 140 per_cpu(cpu_msrs, i).controls = NULL;
141 } 141 }
142} 142}
143 143
@@ -149,13 +149,15 @@ static int allocate_msrs(void)
149 149
150 int i; 150 int i;
151 for_each_possible_cpu(i) { 151 for_each_possible_cpu(i) {
152 cpu_msrs[i].counters = kmalloc(counters_size, GFP_KERNEL); 152 per_cpu(cpu_msrs, i).counters = kmalloc(counters_size,
153 if (!cpu_msrs[i].counters) { 153 GFP_KERNEL);
154 if (!per_cpu(cpu_msrs, i).counters) {
154 success = 0; 155 success = 0;
155 break; 156 break;
156 } 157 }
157 cpu_msrs[i].controls = kmalloc(controls_size, GFP_KERNEL); 158 per_cpu(cpu_msrs, i).controls = kmalloc(controls_size,
158 if (!cpu_msrs[i].controls) { 159 GFP_KERNEL);
160 if (!per_cpu(cpu_msrs, i).controls) {
159 success = 0; 161 success = 0;
160 break; 162 break;
161 } 163 }
@@ -170,11 +172,11 @@ static int allocate_msrs(void)
170static void nmi_cpu_setup(void *dummy) 172static void nmi_cpu_setup(void *dummy)
171{ 173{
172 int cpu = smp_processor_id(); 174 int cpu = smp_processor_id();
173 struct op_msrs *msrs = &cpu_msrs[cpu]; 175 struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
174 spin_lock(&oprofilefs_lock); 176 spin_lock(&oprofilefs_lock);
175 model->setup_ctrs(msrs); 177 model->setup_ctrs(msrs);
176 spin_unlock(&oprofilefs_lock); 178 spin_unlock(&oprofilefs_lock);
177 saved_lvtpc[cpu] = apic_read(APIC_LVTPC); 179 per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC);
178 apic_write(APIC_LVTPC, APIC_DM_NMI); 180 apic_write(APIC_LVTPC, APIC_DM_NMI);
179} 181}
180 182
@@ -203,13 +205,15 @@ static int nmi_setup(void)
203 */ 205 */
204 206
205 /* Assume saved/restored counters are the same on all CPUs */ 207 /* Assume saved/restored counters are the same on all CPUs */
206 model->fill_in_addresses(&cpu_msrs[0]); 208 model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
207 for_each_possible_cpu(cpu) { 209 for_each_possible_cpu(cpu) {
208 if (cpu != 0) { 210 if (cpu != 0) {
209 memcpy(cpu_msrs[cpu].counters, cpu_msrs[0].counters, 211 memcpy(per_cpu(cpu_msrs, cpu).counters,
212 per_cpu(cpu_msrs, 0).counters,
210 sizeof(struct op_msr) * model->num_counters); 213 sizeof(struct op_msr) * model->num_counters);
211 214
212 memcpy(cpu_msrs[cpu].controls, cpu_msrs[0].controls, 215 memcpy(per_cpu(cpu_msrs, cpu).controls,
216 per_cpu(cpu_msrs, 0).controls,
213 sizeof(struct op_msr) * model->num_controls); 217 sizeof(struct op_msr) * model->num_controls);
214 } 218 }
215 219
@@ -249,7 +253,7 @@ static void nmi_cpu_shutdown(void *dummy)
249{ 253{
250 unsigned int v; 254 unsigned int v;
251 int cpu = smp_processor_id(); 255 int cpu = smp_processor_id();
252 struct op_msrs *msrs = &cpu_msrs[cpu]; 256 struct op_msrs *msrs = &__get_cpu_var(cpu_msrs);
253 257
254 /* restoring APIC_LVTPC can trigger an apic error because the delivery 258 /* restoring APIC_LVTPC can trigger an apic error because the delivery
255 * mode and vector nr combination can be illegal. That's by design: on 259 * mode and vector nr combination can be illegal. That's by design: on
@@ -258,23 +262,24 @@ static void nmi_cpu_shutdown(void *dummy)
258 */ 262 */
259 v = apic_read(APIC_LVTERR); 263 v = apic_read(APIC_LVTERR);
260 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); 264 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
261 apic_write(APIC_LVTPC, saved_lvtpc[cpu]); 265 apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu));
262 apic_write(APIC_LVTERR, v); 266 apic_write(APIC_LVTERR, v);
263 nmi_restore_registers(msrs); 267 nmi_restore_registers(msrs);
264} 268}
265 269
266static void nmi_shutdown(void) 270static void nmi_shutdown(void)
267{ 271{
272 struct op_msrs *msrs = &__get_cpu_var(cpu_msrs);
268 nmi_enabled = 0; 273 nmi_enabled = 0;
269 on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1); 274 on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1);
270 unregister_die_notifier(&profile_exceptions_nb); 275 unregister_die_notifier(&profile_exceptions_nb);
271 model->shutdown(cpu_msrs); 276 model->shutdown(msrs);
272 free_msrs(); 277 free_msrs();
273} 278}
274 279
275static void nmi_cpu_start(void *dummy) 280static void nmi_cpu_start(void *dummy)
276{ 281{
277 struct op_msrs const *msrs = &cpu_msrs[smp_processor_id()]; 282 struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
278 model->start(msrs); 283 model->start(msrs);
279} 284}
280 285
@@ -286,7 +291,7 @@ static int nmi_start(void)
286 291
287static void nmi_cpu_stop(void *dummy) 292static void nmi_cpu_stop(void *dummy)
288{ 293{
289 struct op_msrs const *msrs = &cpu_msrs[smp_processor_id()]; 294 struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
290 model->stop(msrs); 295 model->stop(msrs);
291} 296}
292 297
diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c
index 1b8e592a8241..0bba3a914e86 100644
--- a/drivers/acpi/processor_throttling.c
+++ b/drivers/acpi/processor_throttling.c
@@ -838,10 +838,10 @@ static int acpi_processor_get_throttling(struct acpi_processor *pr)
838 * Migrate task to the cpu pointed by pr. 838 * Migrate task to the cpu pointed by pr.
839 */ 839 */
840 saved_mask = current->cpus_allowed; 840 saved_mask = current->cpus_allowed;
841 set_cpus_allowed(current, cpumask_of_cpu(pr->id)); 841 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pr->id));
842 ret = pr->throttling.acpi_processor_get_throttling(pr); 842 ret = pr->throttling.acpi_processor_get_throttling(pr);
843 /* restore the previous state */ 843 /* restore the previous state */
844 set_cpus_allowed(current, saved_mask); 844 set_cpus_allowed_ptr(current, &saved_mask);
845 845
846 return ret; 846 return ret;
847} 847}
@@ -1025,7 +1025,7 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, int state)
1025 * it can be called only for the cpu pointed by pr. 1025 * it can be called only for the cpu pointed by pr.
1026 */ 1026 */
1027 if (p_throttling->shared_type == DOMAIN_COORD_TYPE_SW_ANY) { 1027 if (p_throttling->shared_type == DOMAIN_COORD_TYPE_SW_ANY) {
1028 set_cpus_allowed(current, cpumask_of_cpu(pr->id)); 1028 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pr->id));
1029 ret = p_throttling->acpi_processor_set_throttling(pr, 1029 ret = p_throttling->acpi_processor_set_throttling(pr,
1030 t_state.target_state); 1030 t_state.target_state);
1031 } else { 1031 } else {
@@ -1056,7 +1056,7 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, int state)
1056 continue; 1056 continue;
1057 } 1057 }
1058 t_state.cpu = i; 1058 t_state.cpu = i;
1059 set_cpus_allowed(current, cpumask_of_cpu(i)); 1059 set_cpus_allowed_ptr(current, &cpumask_of_cpu(i));
1060 ret = match_pr->throttling. 1060 ret = match_pr->throttling.
1061 acpi_processor_set_throttling( 1061 acpi_processor_set_throttling(
1062 match_pr, t_state.target_state); 1062 match_pr, t_state.target_state);
@@ -1074,7 +1074,7 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, int state)
1074 &t_state); 1074 &t_state);
1075 } 1075 }
1076 /* restore the previous state */ 1076 /* restore the previous state */
1077 set_cpus_allowed(current, saved_mask); 1077 set_cpus_allowed_ptr(current, &saved_mask);
1078 return ret; 1078 return ret;
1079} 1079}
1080 1080
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 499b003f9278..2c76afff3b15 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -103,6 +103,51 @@ static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL);
103#endif 103#endif
104 104
105/* 105/*
106 * Print cpu online, possible, present, and system maps
107 */
108static ssize_t print_cpus_map(char *buf, cpumask_t *map)
109{
110 int n = cpulist_scnprintf(buf, PAGE_SIZE-2, *map);
111
112 buf[n++] = '\n';
113 buf[n] = '\0';
114 return n;
115}
116
117#define print_cpus_func(type) \
118static ssize_t print_cpus_##type(struct sysdev_class *class, char *buf) \
119{ \
120 return print_cpus_map(buf, &cpu_##type##_map); \
121} \
122struct sysdev_class_attribute attr_##type##_map = \
123 _SYSDEV_CLASS_ATTR(type, 0444, print_cpus_##type, NULL)
124
125print_cpus_func(online);
126print_cpus_func(possible);
127print_cpus_func(present);
128
129struct sysdev_class_attribute *cpu_state_attr[] = {
130 &attr_online_map,
131 &attr_possible_map,
132 &attr_present_map,
133};
134
135static int cpu_states_init(void)
136{
137 int i;
138 int err = 0;
139
140 for (i = 0; i < ARRAY_SIZE(cpu_state_attr); i++) {
141 int ret;
142 ret = sysdev_class_create_file(&cpu_sysdev_class,
143 cpu_state_attr[i]);
144 if (!err)
145 err = ret;
146 }
147 return err;
148}
149
150/*
106 * register_cpu - Setup a sysfs device for a CPU. 151 * register_cpu - Setup a sysfs device for a CPU.
107 * @cpu - cpu->hotpluggable field set to 1 will generate a control file in 152 * @cpu - cpu->hotpluggable field set to 1 will generate a control file in
108 * sysfs for this CPU. 153 * sysfs for this CPU.
@@ -147,6 +192,9 @@ int __init cpu_dev_init(void)
147 int err; 192 int err;
148 193
149 err = sysdev_class_register(&cpu_sysdev_class); 194 err = sysdev_class_register(&cpu_sysdev_class);
195 if (!err)
196 err = cpu_states_init();
197
150#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 198#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
151 if (!err) 199 if (!err)
152 err = sched_create_sysfs_power_savings_entries(&cpu_sysdev_class); 200 err = sched_create_sysfs_power_savings_entries(&cpu_sysdev_class);
diff --git a/drivers/base/node.c b/drivers/base/node.c
index e59861f18ce5..12fde2d03d69 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -19,21 +19,34 @@ static struct sysdev_class node_class = {
19}; 19};
20 20
21 21
22static ssize_t node_read_cpumap(struct sys_device * dev, char * buf) 22static ssize_t node_read_cpumap(struct sys_device *dev, int type, char *buf)
23{ 23{
24 struct node *node_dev = to_node(dev); 24 struct node *node_dev = to_node(dev);
25 cpumask_t mask = node_to_cpumask(node_dev->sysdev.id); 25 node_to_cpumask_ptr(mask, node_dev->sysdev.id);
26 int len; 26 int len;
27 27
28 /* 2004/06/03: buf currently PAGE_SIZE, need > 1 char per 4 bits. */ 28 /* 2008/04/07: buf currently PAGE_SIZE, need 9 chars per 32 bits. */
29 BUILD_BUG_ON(MAX_NUMNODES/4 > PAGE_SIZE/2); 29 BUILD_BUG_ON((NR_CPUS/32 * 9) > (PAGE_SIZE-1));
30 30
31 len = cpumask_scnprintf(buf, PAGE_SIZE-1, mask); 31 len = type?
32 len += sprintf(buf + len, "\n"); 32 cpulist_scnprintf(buf, PAGE_SIZE-2, *mask):
33 cpumask_scnprintf(buf, PAGE_SIZE-2, *mask);
34 buf[len++] = '\n';
35 buf[len] = '\0';
33 return len; 36 return len;
34} 37}
35 38
36static SYSDEV_ATTR(cpumap, S_IRUGO, node_read_cpumap, NULL); 39static inline ssize_t node_read_cpumask(struct sys_device *dev, char *buf)
40{
41 return node_read_cpumap(dev, 0, buf);
42}
43static inline ssize_t node_read_cpulist(struct sys_device *dev, char *buf)
44{
45 return node_read_cpumap(dev, 1, buf);
46}
47
48static SYSDEV_ATTR(cpumap, S_IRUGO, node_read_cpumask, NULL);
49static SYSDEV_ATTR(cpulist, S_IRUGO, node_read_cpulist, NULL);
37 50
38#define K(x) ((x) << (PAGE_SHIFT - 10)) 51#define K(x) ((x) << (PAGE_SHIFT - 10))
39static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) 52static ssize_t node_read_meminfo(struct sys_device * dev, char * buf)
@@ -149,6 +162,7 @@ int register_node(struct node *node, int num, struct node *parent)
149 162
150 if (!error){ 163 if (!error){
151 sysdev_create_file(&node->sysdev, &attr_cpumap); 164 sysdev_create_file(&node->sysdev, &attr_cpumap);
165 sysdev_create_file(&node->sysdev, &attr_cpulist);
152 sysdev_create_file(&node->sysdev, &attr_meminfo); 166 sysdev_create_file(&node->sysdev, &attr_meminfo);
153 sysdev_create_file(&node->sysdev, &attr_numastat); 167 sysdev_create_file(&node->sysdev, &attr_numastat);
154 sysdev_create_file(&node->sysdev, &attr_distance); 168 sysdev_create_file(&node->sysdev, &attr_distance);
@@ -166,6 +180,7 @@ int register_node(struct node *node, int num, struct node *parent)
166void unregister_node(struct node *node) 180void unregister_node(struct node *node)
167{ 181{
168 sysdev_remove_file(&node->sysdev, &attr_cpumap); 182 sysdev_remove_file(&node->sysdev, &attr_cpumap);
183 sysdev_remove_file(&node->sysdev, &attr_cpulist);
169 sysdev_remove_file(&node->sysdev, &attr_meminfo); 184 sysdev_remove_file(&node->sysdev, &attr_meminfo);
170 sysdev_remove_file(&node->sysdev, &attr_numastat); 185 sysdev_remove_file(&node->sysdev, &attr_numastat);
171 sysdev_remove_file(&node->sysdev, &attr_distance); 186 sysdev_remove_file(&node->sysdev, &attr_distance);
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index e1d3ad4db2f0..fdf4044d2e74 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -40,15 +40,38 @@ static ssize_t show_##name(struct sys_device *dev, char *buf) \
40 return sprintf(buf, "%d\n", topology_##name(cpu)); \ 40 return sprintf(buf, "%d\n", topology_##name(cpu)); \
41} 41}
42 42
43#define define_siblings_show_func(name) \ 43static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf)
44static ssize_t show_##name(struct sys_device *dev, char *buf) \ 44{
45 ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
46 int n = 0;
47
48 if (len > 1) {
49 n = type?
50 cpulist_scnprintf(buf, len-2, *mask):
51 cpumask_scnprintf(buf, len-2, *mask);
52 buf[n++] = '\n';
53 buf[n] = '\0';
54 }
55 return n;
56}
57
58#define define_siblings_show_map(name) \
59static inline ssize_t show_##name(struct sys_device *dev, char *buf) \
45{ \ 60{ \
46 ssize_t len = -1; \
47 unsigned int cpu = dev->id; \ 61 unsigned int cpu = dev->id; \
48 len = cpumask_scnprintf(buf, NR_CPUS+1, topology_##name(cpu)); \ 62 return show_cpumap(0, &(topology_##name(cpu)), buf); \
49 return (len + sprintf(buf + len, "\n")); \
50} 63}
51 64
65#define define_siblings_show_list(name) \
66static inline ssize_t show_##name##_list(struct sys_device *dev, char *buf) \
67{ \
68 unsigned int cpu = dev->id; \
69 return show_cpumap(1, &(topology_##name(cpu)), buf); \
70}
71
72#define define_siblings_show_func(name) \
73 define_siblings_show_map(name); define_siblings_show_list(name)
74
52#ifdef topology_physical_package_id 75#ifdef topology_physical_package_id
53define_id_show_func(physical_package_id); 76define_id_show_func(physical_package_id);
54define_one_ro(physical_package_id); 77define_one_ro(physical_package_id);
@@ -68,7 +91,9 @@ define_one_ro(core_id);
68#ifdef topology_thread_siblings 91#ifdef topology_thread_siblings
69define_siblings_show_func(thread_siblings); 92define_siblings_show_func(thread_siblings);
70define_one_ro(thread_siblings); 93define_one_ro(thread_siblings);
71#define ref_thread_siblings_attr &attr_thread_siblings.attr, 94define_one_ro(thread_siblings_list);
95#define ref_thread_siblings_attr \
96 &attr_thread_siblings.attr, &attr_thread_siblings_list.attr,
72#else 97#else
73#define ref_thread_siblings_attr 98#define ref_thread_siblings_attr
74#endif 99#endif
@@ -76,7 +101,9 @@ define_one_ro(thread_siblings);
76#ifdef topology_core_siblings 101#ifdef topology_core_siblings
77define_siblings_show_func(core_siblings); 102define_siblings_show_func(core_siblings);
78define_one_ro(core_siblings); 103define_one_ro(core_siblings);
79#define ref_core_siblings_attr &attr_core_siblings.attr, 104define_one_ro(core_siblings_list);
105#define ref_core_siblings_attr \
106 &attr_core_siblings.attr, &attr_core_siblings_list.attr,
80#else 107#else
81#define ref_core_siblings_attr 108#define ref_core_siblings_attr
82#endif 109#endif
diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c
index 1636806ec55e..0ffef3b7c6ca 100644
--- a/drivers/firmware/dcdbas.c
+++ b/drivers/firmware/dcdbas.c
@@ -265,7 +265,7 @@ static int smi_request(struct smi_cmd *smi_cmd)
265 265
266 /* SMI requires CPU 0 */ 266 /* SMI requires CPU 0 */
267 old_mask = current->cpus_allowed; 267 old_mask = current->cpus_allowed;
268 set_cpus_allowed(current, cpumask_of_cpu(0)); 268 set_cpus_allowed_ptr(current, &cpumask_of_cpu(0));
269 if (smp_processor_id() != 0) { 269 if (smp_processor_id() != 0) {
270 dev_dbg(&dcdbas_pdev->dev, "%s: failed to get CPU 0\n", 270 dev_dbg(&dcdbas_pdev->dev, "%s: failed to get CPU 0\n",
271 __FUNCTION__); 271 __FUNCTION__);
@@ -285,7 +285,7 @@ static int smi_request(struct smi_cmd *smi_cmd)
285 ); 285 );
286 286
287out: 287out:
288 set_cpus_allowed(current, old_mask); 288 set_cpus_allowed_ptr(current, &old_mask);
289 return ret; 289 return ret;
290} 290}
291 291
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index e571c72e6753..e8d94fafc280 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -182,15 +182,18 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev,
182 struct mempolicy *oldpol; 182 struct mempolicy *oldpol;
183 cpumask_t oldmask = current->cpus_allowed; 183 cpumask_t oldmask = current->cpus_allowed;
184 int node = pcibus_to_node(dev->bus); 184 int node = pcibus_to_node(dev->bus);
185 if (node >= 0 && node_online(node)) 185
186 set_cpus_allowed(current, node_to_cpumask(node)); 186 if (node >= 0) {
187 node_to_cpumask_ptr(nodecpumask, node);
188 set_cpus_allowed_ptr(current, nodecpumask);
189 }
187 /* And set default memory allocation policy */ 190 /* And set default memory allocation policy */
188 oldpol = current->mempolicy; 191 oldpol = current->mempolicy;
189 current->mempolicy = NULL; /* fall back to system default policy */ 192 current->mempolicy = NULL; /* fall back to system default policy */
190#endif 193#endif
191 error = drv->probe(dev, id); 194 error = drv->probe(dev, id);
192#ifdef CONFIG_NUMA 195#ifdef CONFIG_NUMA
193 set_cpus_allowed(current, oldmask); 196 set_cpus_allowed_ptr(current, &oldmask);
194 current->mempolicy = oldpol; 197 current->mempolicy = oldpol;
195#endif 198#endif
196 return error; 199 return error;
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 8dcf1458aa2f..8d9d648daeba 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -73,8 +73,23 @@ static ssize_t local_cpus_show(struct device *dev,
73 73
74 mask = pcibus_to_cpumask(to_pci_dev(dev)->bus); 74 mask = pcibus_to_cpumask(to_pci_dev(dev)->bus);
75 len = cpumask_scnprintf(buf, PAGE_SIZE-2, mask); 75 len = cpumask_scnprintf(buf, PAGE_SIZE-2, mask);
76 strcat(buf,"\n"); 76 buf[len++] = '\n';
77 return 1+len; 77 buf[len] = '\0';
78 return len;
79}
80
81
82static ssize_t local_cpulist_show(struct device *dev,
83 struct device_attribute *attr, char *buf)
84{
85 cpumask_t mask;
86 int len;
87
88 mask = pcibus_to_cpumask(to_pci_dev(dev)->bus);
89 len = cpulist_scnprintf(buf, PAGE_SIZE-2, mask);
90 buf[len++] = '\n';
91 buf[len] = '\0';
92 return len;
78} 93}
79 94
80/* show resources */ 95/* show resources */
@@ -201,6 +216,7 @@ struct device_attribute pci_dev_attrs[] = {
201 __ATTR_RO(class), 216 __ATTR_RO(class),
202 __ATTR_RO(irq), 217 __ATTR_RO(irq),
203 __ATTR_RO(local_cpus), 218 __ATTR_RO(local_cpus),
219 __ATTR_RO(local_cpulist),
204 __ATTR_RO(modalias), 220 __ATTR_RO(modalias),
205#ifdef CONFIG_NUMA 221#ifdef CONFIG_NUMA
206 __ATTR_RO(numa_node), 222 __ATTR_RO(numa_node),
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 2db2e4bb0d1e..4b3011a23eff 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -82,6 +82,7 @@ void pci_remove_legacy_files(struct pci_bus *bus) { return; }
82 * PCI Bus Class Devices 82 * PCI Bus Class Devices
83 */ 83 */
84static ssize_t pci_bus_show_cpuaffinity(struct device *dev, 84static ssize_t pci_bus_show_cpuaffinity(struct device *dev,
85 int type,
85 struct device_attribute *attr, 86 struct device_attribute *attr,
86 char *buf) 87 char *buf)
87{ 88{
@@ -89,12 +90,30 @@ static ssize_t pci_bus_show_cpuaffinity(struct device *dev,
89 cpumask_t cpumask; 90 cpumask_t cpumask;
90 91
91 cpumask = pcibus_to_cpumask(to_pci_bus(dev)); 92 cpumask = pcibus_to_cpumask(to_pci_bus(dev));
92 ret = cpumask_scnprintf(buf, PAGE_SIZE, cpumask); 93 ret = type?
93 if (ret < PAGE_SIZE) 94 cpulist_scnprintf(buf, PAGE_SIZE-2, cpumask):
94 buf[ret++] = '\n'; 95 cpumask_scnprintf(buf, PAGE_SIZE-2, cpumask);
96 buf[ret++] = '\n';
97 buf[ret] = '\0';
95 return ret; 98 return ret;
96} 99}
97DEVICE_ATTR(cpuaffinity, S_IRUGO, pci_bus_show_cpuaffinity, NULL); 100
101static ssize_t inline pci_bus_show_cpumaskaffinity(struct device *dev,
102 struct device_attribute *attr,
103 char *buf)
104{
105 return pci_bus_show_cpuaffinity(dev, 0, attr, buf);
106}
107
108static ssize_t inline pci_bus_show_cpulistaffinity(struct device *dev,
109 struct device_attribute *attr,
110 char *buf)
111{
112 return pci_bus_show_cpuaffinity(dev, 1, attr, buf);
113}
114
115DEVICE_ATTR(cpuaffinity, S_IRUGO, pci_bus_show_cpumaskaffinity, NULL);
116DEVICE_ATTR(cpulistaffinity, S_IRUGO, pci_bus_show_cpulistaffinity, NULL);
98 117
99/* 118/*
100 * PCI Bus Class 119 * PCI Bus Class
diff --git a/include/asm-alpha/topology.h b/include/asm-alpha/topology.h
index 420ccde6b916..149532e162c4 100644
--- a/include/asm-alpha/topology.h
+++ b/include/asm-alpha/topology.h
@@ -41,8 +41,7 @@ static inline cpumask_t node_to_cpumask(int node)
41 41
42#define pcibus_to_cpumask(bus) (cpu_online_map) 42#define pcibus_to_cpumask(bus) (cpu_online_map)
43 43
44#else /* CONFIG_NUMA */
45# include <asm-generic/topology.h>
46#endif /* !CONFIG_NUMA */ 44#endif /* !CONFIG_NUMA */
45# include <asm-generic/topology.h>
47 46
48#endif /* _ASM_ALPHA_TOPOLOGY_H */ 47#endif /* _ASM_ALPHA_TOPOLOGY_H */
diff --git a/include/asm-frv/topology.h b/include/asm-frv/topology.h
index abe7298742ac..942724352705 100644
--- a/include/asm-frv/topology.h
+++ b/include/asm-frv/topology.h
@@ -5,10 +5,8 @@
5 5
6#error NUMA not supported yet 6#error NUMA not supported yet
7 7
8#else /* !CONFIG_NUMA */ 8#endif /* CONFIG_NUMA */
9 9
10#include <asm-generic/topology.h> 10#include <asm-generic/topology.h>
11 11
12#endif /* CONFIG_NUMA */
13
14#endif /* _ASM_TOPOLOGY_H */ 12#endif /* _ASM_TOPOLOGY_H */
diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h
index 342a2a0105c4..a6aea79bca4f 100644
--- a/include/asm-generic/topology.h
+++ b/include/asm-generic/topology.h
@@ -27,6 +27,8 @@
27#ifndef _ASM_GENERIC_TOPOLOGY_H 27#ifndef _ASM_GENERIC_TOPOLOGY_H
28#define _ASM_GENERIC_TOPOLOGY_H 28#define _ASM_GENERIC_TOPOLOGY_H
29 29
30#ifndef CONFIG_NUMA
31
30/* Other architectures wishing to use this simple topology API should fill 32/* Other architectures wishing to use this simple topology API should fill
31 in the below functions as appropriate in their own <asm/topology.h> file. */ 33 in the below functions as appropriate in their own <asm/topology.h> file. */
32#ifndef cpu_to_node 34#ifndef cpu_to_node
@@ -52,4 +54,16 @@
52 ) 54 )
53#endif 55#endif
54 56
57#endif /* CONFIG_NUMA */
58
59/* returns pointer to cpumask for specified node */
60#ifndef node_to_cpumask_ptr
61
62#define node_to_cpumask_ptr(v, node) \
63 cpumask_t _##v = node_to_cpumask(node), *v = &_##v
64
65#define node_to_cpumask_ptr_next(v, node) \
66 _##v = node_to_cpumask(node)
67#endif
68
55#endif /* _ASM_GENERIC_TOPOLOGY_H */ 69#endif /* _ASM_GENERIC_TOPOLOGY_H */
diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h
index 2d67b72b18d0..f2f72ef2a897 100644
--- a/include/asm-ia64/topology.h
+++ b/include/asm-ia64/topology.h
@@ -93,7 +93,7 @@ void build_cpu_to_node_map(void);
93 .cache_nice_tries = 2, \ 93 .cache_nice_tries = 2, \
94 .busy_idx = 3, \ 94 .busy_idx = 3, \
95 .idle_idx = 2, \ 95 .idle_idx = 2, \
96 .newidle_idx = 0, /* unused */ \ 96 .newidle_idx = 2, \
97 .wake_idx = 1, \ 97 .wake_idx = 1, \
98 .forkexec_idx = 1, \ 98 .forkexec_idx = 1, \
99 .flags = SD_LOAD_BALANCE \ 99 .flags = SD_LOAD_BALANCE \
@@ -116,6 +116,11 @@ void build_cpu_to_node_map(void);
116#define smt_capable() (smp_num_siblings > 1) 116#define smt_capable() (smp_num_siblings > 1)
117#endif 117#endif
118 118
119#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
120 CPU_MASK_ALL : \
121 node_to_cpumask(pcibus_to_node(bus)) \
122 )
123
119#include <asm-generic/topology.h> 124#include <asm-generic/topology.h>
120 125
121#endif /* _ASM_IA64_TOPOLOGY_H */ 126#endif /* _ASM_IA64_TOPOLOGY_H */
diff --git a/include/asm-powerpc/topology.h b/include/asm-powerpc/topology.h
index ca23b681ad05..100c6fbfc587 100644
--- a/include/asm-powerpc/topology.h
+++ b/include/asm-powerpc/topology.h
@@ -96,11 +96,10 @@ static inline void sysfs_remove_device_from_node(struct sys_device *dev,
96{ 96{
97} 97}
98 98
99#endif /* CONFIG_NUMA */
99 100
100#include <asm-generic/topology.h> 101#include <asm-generic/topology.h>
101 102
102#endif /* CONFIG_NUMA */
103
104#ifdef CONFIG_SMP 103#ifdef CONFIG_SMP
105#include <asm/cputable.h> 104#include <asm/cputable.h>
106#define smt_capable() (cpu_has_feature(CPU_FTR_SMT)) 105#define smt_capable() (cpu_has_feature(CPU_FTR_SMT))
diff --git a/include/asm-sh/topology.h b/include/asm-sh/topology.h
index f402a3b1cfa4..34cdb28e8f44 100644
--- a/include/asm-sh/topology.h
+++ b/include/asm-sh/topology.h
@@ -16,7 +16,7 @@
16 .cache_nice_tries = 2, \ 16 .cache_nice_tries = 2, \
17 .busy_idx = 3, \ 17 .busy_idx = 3, \
18 .idle_idx = 2, \ 18 .idle_idx = 2, \
19 .newidle_idx = 0, \ 19 .newidle_idx = 2, \
20 .wake_idx = 1, \ 20 .wake_idx = 1, \
21 .forkexec_idx = 1, \ 21 .forkexec_idx = 1, \
22 .flags = SD_LOAD_BALANCE \ 22 .flags = SD_LOAD_BALANCE \
diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h
index 81a29eb08ac4..22073268b481 100644
--- a/include/asm-x86/topology.h
+++ b/include/asm-x86/topology.h
@@ -88,6 +88,17 @@ static inline int cpu_to_node(int cpu)
88#endif 88#endif
89 return per_cpu(x86_cpu_to_node_map, cpu); 89 return per_cpu(x86_cpu_to_node_map, cpu);
90} 90}
91
92#ifdef CONFIG_NUMA
93
94/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
95#define node_to_cpumask_ptr(v, node) \
96 cpumask_t *v = &(node_to_cpumask_map[node])
97
98#define node_to_cpumask_ptr_next(v, node) \
99 v = &(node_to_cpumask_map[node])
100#endif
101
91#endif /* CONFIG_X86_64 */ 102#endif /* CONFIG_X86_64 */
92 103
93/* 104/*
@@ -136,17 +147,13 @@ extern unsigned long node_remap_size[];
136 147
137# define SD_CACHE_NICE_TRIES 2 148# define SD_CACHE_NICE_TRIES 2
138# define SD_IDLE_IDX 2 149# define SD_IDLE_IDX 2
139# define SD_NEWIDLE_IDX 0 150# define SD_NEWIDLE_IDX 2
140# define SD_FORKEXEC_IDX 1 151# define SD_FORKEXEC_IDX 1
141 152
142#endif 153#endif
143 154
144/* sched_domains SD_NODE_INIT for NUMAQ machines */ 155/* sched_domains SD_NODE_INIT for NUMAQ machines */
145#define SD_NODE_INIT (struct sched_domain) { \ 156#define SD_NODE_INIT (struct sched_domain) { \
146 .span = CPU_MASK_NONE, \
147 .parent = NULL, \
148 .child = NULL, \
149 .groups = NULL, \
150 .min_interval = 8, \ 157 .min_interval = 8, \
151 .max_interval = 32, \ 158 .max_interval = 32, \
152 .busy_factor = 32, \ 159 .busy_factor = 32, \
@@ -164,7 +171,6 @@ extern unsigned long node_remap_size[];
164 | SD_WAKE_BALANCE, \ 171 | SD_WAKE_BALANCE, \
165 .last_balance = jiffies, \ 172 .last_balance = jiffies, \
166 .balance_interval = 1, \ 173 .balance_interval = 1, \
167 .nr_balance_failed = 0, \
168} 174}
169 175
170#ifdef CONFIG_X86_64_ACPI_NUMA 176#ifdef CONFIG_X86_64_ACPI_NUMA
@@ -174,10 +180,10 @@ extern int __node_distance(int, int);
174 180
175#else /* CONFIG_NUMA */ 181#else /* CONFIG_NUMA */
176 182
177#include <asm-generic/topology.h>
178
179#endif 183#endif
180 184
185#include <asm-generic/topology.h>
186
181extern cpumask_t cpu_coregroup_map(int cpu); 187extern cpumask_t cpu_coregroup_map(int cpu);
182 188
183#ifdef ENABLE_TOPO_DEFINES 189#ifdef ENABLE_TOPO_DEFINES
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index acad1105d942..1dbe074f1c64 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -108,6 +108,7 @@ extern int __bitmap_weight(const unsigned long *bitmap, int bits);
108 108
109extern int bitmap_scnprintf(char *buf, unsigned int len, 109extern int bitmap_scnprintf(char *buf, unsigned int len,
110 const unsigned long *src, int nbits); 110 const unsigned long *src, int nbits);
111extern int bitmap_scnprintf_len(unsigned int len);
111extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user, 112extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user,
112 unsigned long *dst, int nbits); 113 unsigned long *dst, int nbits);
113extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, 114extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 7047f58306a7..259c8051155d 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -222,8 +222,13 @@ int __next_cpu(int n, const cpumask_t *srcp);
222#define next_cpu(n, src) ({ (void)(src); 1; }) 222#define next_cpu(n, src) ({ (void)(src); 1; })
223#endif 223#endif
224 224
225#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
226extern cpumask_t *cpumask_of_cpu_map;
227#define cpumask_of_cpu(cpu) (cpumask_of_cpu_map[cpu])
228
229#else
225#define cpumask_of_cpu(cpu) \ 230#define cpumask_of_cpu(cpu) \
226({ \ 231(*({ \
227 typeof(_unused_cpumask_arg_) m; \ 232 typeof(_unused_cpumask_arg_) m; \
228 if (sizeof(m) == sizeof(unsigned long)) { \ 233 if (sizeof(m) == sizeof(unsigned long)) { \
229 m.bits[0] = 1UL<<(cpu); \ 234 m.bits[0] = 1UL<<(cpu); \
@@ -231,8 +236,9 @@ int __next_cpu(int n, const cpumask_t *srcp);
231 cpus_clear(m); \ 236 cpus_clear(m); \
232 cpu_set((cpu), m); \ 237 cpu_set((cpu), m); \
233 } \ 238 } \
234 m; \ 239 &m; \
235}) 240}))
241#endif
236 242
237#define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS) 243#define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS)
238 244
@@ -243,6 +249,8 @@ int __next_cpu(int n, const cpumask_t *srcp);
243 [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ 249 [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \
244} } 250} }
245 251
252#define CPU_MASK_ALL_PTR (&CPU_MASK_ALL)
253
246#else 254#else
247 255
248#define CPU_MASK_ALL \ 256#define CPU_MASK_ALL \
@@ -251,6 +259,10 @@ int __next_cpu(int n, const cpumask_t *srcp);
251 [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ 259 [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \
252} } 260} }
253 261
262/* cpu_mask_all is in init/main.c */
263extern cpumask_t cpu_mask_all;
264#define CPU_MASK_ALL_PTR (&cpu_mask_all)
265
254#endif 266#endif
255 267
256#define CPU_MASK_NONE \ 268#define CPU_MASK_NONE \
@@ -273,6 +285,13 @@ static inline int __cpumask_scnprintf(char *buf, int len,
273 return bitmap_scnprintf(buf, len, srcp->bits, nbits); 285 return bitmap_scnprintf(buf, len, srcp->bits, nbits);
274} 286}
275 287
288#define cpumask_scnprintf_len(len) \
289 __cpumask_scnprintf_len((len))
290static inline int __cpumask_scnprintf_len(int len)
291{
292 return bitmap_scnprintf_len(len);
293}
294
276#define cpumask_parse_user(ubuf, ulen, dst) \ 295#define cpumask_parse_user(ubuf, ulen, dst) \
277 __cpumask_parse_user((ubuf), (ulen), &(dst), NR_CPUS) 296 __cpumask_parse_user((ubuf), (ulen), &(dst), NR_CPUS)
278static inline int __cpumask_parse_user(const char __user *buf, int len, 297static inline int __cpumask_parse_user(const char __user *buf, int len,
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 0a26be353cb3..726761e24003 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -20,8 +20,8 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */
20extern int cpuset_init_early(void); 20extern int cpuset_init_early(void);
21extern int cpuset_init(void); 21extern int cpuset_init(void);
22extern void cpuset_init_smp(void); 22extern void cpuset_init_smp(void);
23extern cpumask_t cpuset_cpus_allowed(struct task_struct *p); 23extern void cpuset_cpus_allowed(struct task_struct *p, cpumask_t *mask);
24extern cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p); 24extern void cpuset_cpus_allowed_locked(struct task_struct *p, cpumask_t *mask);
25extern nodemask_t cpuset_mems_allowed(struct task_struct *p); 25extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
26#define cpuset_current_mems_allowed (current->mems_allowed) 26#define cpuset_current_mems_allowed (current->mems_allowed)
27void cpuset_init_current_mems_allowed(void); 27void cpuset_init_current_mems_allowed(void);
@@ -84,13 +84,14 @@ static inline int cpuset_init_early(void) { return 0; }
84static inline int cpuset_init(void) { return 0; } 84static inline int cpuset_init(void) { return 0; }
85static inline void cpuset_init_smp(void) {} 85static inline void cpuset_init_smp(void) {}
86 86
87static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p) 87static inline void cpuset_cpus_allowed(struct task_struct *p, cpumask_t *mask)
88{ 88{
89 return cpu_possible_map; 89 *mask = cpu_possible_map;
90} 90}
91static inline cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p) 91static inline void cpuset_cpus_allowed_locked(struct task_struct *p,
92 cpumask_t *mask)
92{ 93{
93 return cpu_possible_map; 94 *mask = cpu_possible_map;
94} 95}
95 96
96static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) 97static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1f74e1d7415f..37a6f5bc4a92 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -151,6 +151,9 @@ extern struct group_info init_groups;
151 .cpus_allowed = CPU_MASK_ALL, \ 151 .cpus_allowed = CPU_MASK_ALL, \
152 .mm = NULL, \ 152 .mm = NULL, \
153 .active_mm = &init_mm, \ 153 .active_mm = &init_mm, \
154 .se = { \
155 .group_node = LIST_HEAD_INIT(tsk.se.group_node), \
156 }, \
154 .rt = { \ 157 .rt = { \
155 .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ 158 .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \
156 .time_slice = HZ, \ 159 .time_slice = HZ, \
diff --git a/include/linux/ktime.h b/include/linux/ktime.h
index 2cd7fa73d1af..ce5983225be4 100644
--- a/include/linux/ktime.h
+++ b/include/linux/ktime.h
@@ -327,4 +327,10 @@ extern void ktime_get_ts(struct timespec *ts);
327/* Get the real (wall-) time in timespec format: */ 327/* Get the real (wall-) time in timespec format: */
328#define ktime_get_real_ts(ts) getnstimeofday(ts) 328#define ktime_get_real_ts(ts) getnstimeofday(ts)
329 329
330static inline ktime_t ns_to_ktime(u64 ns)
331{
332 static const ktime_t ktime_zero = { .tv64 = 0 };
333 return ktime_add_ns(ktime_zero, ns);
334}
335
330#endif 336#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6a1e7afb099b..be6914014c70 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -704,6 +704,7 @@ enum cpu_idle_type {
704#define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ 704#define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */
705#define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ 705#define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */
706#define SD_SERIALIZE 1024 /* Only a single load balancing instance */ 706#define SD_SERIALIZE 1024 /* Only a single load balancing instance */
707#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */
707 708
708#define BALANCE_FOR_MC_POWER \ 709#define BALANCE_FOR_MC_POWER \
709 (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0) 710 (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
@@ -733,12 +734,31 @@ struct sched_group {
733 u32 reciprocal_cpu_power; 734 u32 reciprocal_cpu_power;
734}; 735};
735 736
737enum sched_domain_level {
738 SD_LV_NONE = 0,
739 SD_LV_SIBLING,
740 SD_LV_MC,
741 SD_LV_CPU,
742 SD_LV_NODE,
743 SD_LV_ALLNODES,
744 SD_LV_MAX
745};
746
747struct sched_domain_attr {
748 int relax_domain_level;
749};
750
751#define SD_ATTR_INIT (struct sched_domain_attr) { \
752 .relax_domain_level = -1, \
753}
754
736struct sched_domain { 755struct sched_domain {
737 /* These fields must be setup */ 756 /* These fields must be setup */
738 struct sched_domain *parent; /* top domain must be null terminated */ 757 struct sched_domain *parent; /* top domain must be null terminated */
739 struct sched_domain *child; /* bottom domain must be null terminated */ 758 struct sched_domain *child; /* bottom domain must be null terminated */
740 struct sched_group *groups; /* the balancing groups of the domain */ 759 struct sched_group *groups; /* the balancing groups of the domain */
741 cpumask_t span; /* span of all CPUs in this domain */ 760 cpumask_t span; /* span of all CPUs in this domain */
761 int first_cpu; /* cache of the first cpu in this domain */
742 unsigned long min_interval; /* Minimum balance interval ms */ 762 unsigned long min_interval; /* Minimum balance interval ms */
743 unsigned long max_interval; /* Maximum balance interval ms */ 763 unsigned long max_interval; /* Maximum balance interval ms */
744 unsigned int busy_factor; /* less balancing by factor if busy */ 764 unsigned int busy_factor; /* less balancing by factor if busy */
@@ -750,6 +770,7 @@ struct sched_domain {
750 unsigned int wake_idx; 770 unsigned int wake_idx;
751 unsigned int forkexec_idx; 771 unsigned int forkexec_idx;
752 int flags; /* See SD_* */ 772 int flags; /* See SD_* */
773 enum sched_domain_level level;
753 774
754 /* Runtime fields. */ 775 /* Runtime fields. */
755 unsigned long last_balance; /* init to jiffies. units in jiffies */ 776 unsigned long last_balance; /* init to jiffies. units in jiffies */
@@ -789,7 +810,8 @@ struct sched_domain {
789#endif 810#endif
790}; 811};
791 812
792extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new); 813extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
814 struct sched_domain_attr *dattr_new);
793extern int arch_reinit_sched_domains(void); 815extern int arch_reinit_sched_domains(void);
794 816
795#endif /* CONFIG_SMP */ 817#endif /* CONFIG_SMP */
@@ -889,7 +911,8 @@ struct sched_class {
889 void (*set_curr_task) (struct rq *rq); 911 void (*set_curr_task) (struct rq *rq);
890 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 912 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
891 void (*task_new) (struct rq *rq, struct task_struct *p); 913 void (*task_new) (struct rq *rq, struct task_struct *p);
892 void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask); 914 void (*set_cpus_allowed)(struct task_struct *p,
915 const cpumask_t *newmask);
893 916
894 void (*join_domain)(struct rq *rq); 917 void (*join_domain)(struct rq *rq);
895 void (*leave_domain)(struct rq *rq); 918 void (*leave_domain)(struct rq *rq);
@@ -923,6 +946,7 @@ struct load_weight {
923struct sched_entity { 946struct sched_entity {
924 struct load_weight load; /* for load-balancing */ 947 struct load_weight load; /* for load-balancing */
925 struct rb_node run_node; 948 struct rb_node run_node;
949 struct list_head group_node;
926 unsigned int on_rq; 950 unsigned int on_rq;
927 951
928 u64 exec_start; 952 u64 exec_start;
@@ -982,6 +1006,7 @@ struct sched_rt_entity {
982 unsigned long timeout; 1006 unsigned long timeout;
983 int nr_cpus_allowed; 1007 int nr_cpus_allowed;
984 1008
1009 struct sched_rt_entity *back;
985#ifdef CONFIG_RT_GROUP_SCHED 1010#ifdef CONFIG_RT_GROUP_SCHED
986 struct sched_rt_entity *parent; 1011 struct sched_rt_entity *parent;
987 /* rq on which this entity is (to be) queued: */ 1012 /* rq on which this entity is (to be) queued: */
@@ -1502,15 +1527,21 @@ static inline void put_task_struct(struct task_struct *t)
1502#define used_math() tsk_used_math(current) 1527#define used_math() tsk_used_math(current)
1503 1528
1504#ifdef CONFIG_SMP 1529#ifdef CONFIG_SMP
1505extern int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask); 1530extern int set_cpus_allowed_ptr(struct task_struct *p,
1531 const cpumask_t *new_mask);
1506#else 1532#else
1507static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) 1533static inline int set_cpus_allowed_ptr(struct task_struct *p,
1534 const cpumask_t *new_mask)
1508{ 1535{
1509 if (!cpu_isset(0, new_mask)) 1536 if (!cpu_isset(0, *new_mask))
1510 return -EINVAL; 1537 return -EINVAL;
1511 return 0; 1538 return 0;
1512} 1539}
1513#endif 1540#endif
1541static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
1542{
1543 return set_cpus_allowed_ptr(p, &new_mask);
1544}
1514 1545
1515extern unsigned long long sched_clock(void); 1546extern unsigned long long sched_clock(void);
1516 1547
@@ -1551,7 +1582,6 @@ static inline void wake_up_idle_cpu(int cpu) { }
1551extern unsigned int sysctl_sched_latency; 1582extern unsigned int sysctl_sched_latency;
1552extern unsigned int sysctl_sched_min_granularity; 1583extern unsigned int sysctl_sched_min_granularity;
1553extern unsigned int sysctl_sched_wakeup_granularity; 1584extern unsigned int sysctl_sched_wakeup_granularity;
1554extern unsigned int sysctl_sched_batch_wakeup_granularity;
1555extern unsigned int sysctl_sched_child_runs_first; 1585extern unsigned int sysctl_sched_child_runs_first;
1556extern unsigned int sysctl_sched_features; 1586extern unsigned int sysctl_sched_features;
1557extern unsigned int sysctl_sched_migration_cost; 1587extern unsigned int sysctl_sched_migration_cost;
@@ -1564,6 +1594,10 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
1564extern unsigned int sysctl_sched_rt_period; 1594extern unsigned int sysctl_sched_rt_period;
1565extern int sysctl_sched_rt_runtime; 1595extern int sysctl_sched_rt_runtime;
1566 1596
1597int sched_rt_handler(struct ctl_table *table, int write,
1598 struct file *filp, void __user *buffer, size_t *lenp,
1599 loff_t *ppos);
1600
1567extern unsigned int sysctl_sched_compat_yield; 1601extern unsigned int sysctl_sched_compat_yield;
1568 1602
1569#ifdef CONFIG_RT_MUTEXES 1603#ifdef CONFIG_RT_MUTEXES
@@ -2031,7 +2065,7 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm)
2031} 2065}
2032#endif 2066#endif
2033 2067
2034extern long sched_setaffinity(pid_t pid, cpumask_t new_mask); 2068extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
2035extern long sched_getaffinity(pid_t pid, cpumask_t *mask); 2069extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
2036 2070
2037extern int sched_mc_power_savings, sched_smt_power_savings; 2071extern int sched_mc_power_savings, sched_smt_power_savings;
@@ -2041,8 +2075,11 @@ extern void normalize_rt_tasks(void);
2041#ifdef CONFIG_GROUP_SCHED 2075#ifdef CONFIG_GROUP_SCHED
2042 2076
2043extern struct task_group init_task_group; 2077extern struct task_group init_task_group;
2078#ifdef CONFIG_USER_SCHED
2079extern struct task_group root_task_group;
2080#endif
2044 2081
2045extern struct task_group *sched_create_group(void); 2082extern struct task_group *sched_create_group(struct task_group *parent);
2046extern void sched_destroy_group(struct task_group *tg); 2083extern void sched_destroy_group(struct task_group *tg);
2047extern void sched_move_task(struct task_struct *tsk); 2084extern void sched_move_task(struct task_struct *tsk);
2048#ifdef CONFIG_FAIR_GROUP_SCHED 2085#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -2053,6 +2090,9 @@ extern unsigned long sched_group_shares(struct task_group *tg);
2053extern int sched_group_set_rt_runtime(struct task_group *tg, 2090extern int sched_group_set_rt_runtime(struct task_group *tg,
2054 long rt_runtime_us); 2091 long rt_runtime_us);
2055extern long sched_group_rt_runtime(struct task_group *tg); 2092extern long sched_group_rt_runtime(struct task_group *tg);
2093extern int sched_group_set_rt_period(struct task_group *tg,
2094 long rt_period_us);
2095extern long sched_group_rt_period(struct task_group *tg);
2056#endif 2096#endif
2057#endif 2097#endif
2058 2098
diff --git a/include/linux/sysdev.h b/include/linux/sysdev.h
index f752e73bf977..f2767bc6b735 100644
--- a/include/linux/sysdev.h
+++ b/include/linux/sysdev.h
@@ -45,12 +45,16 @@ struct sysdev_class_attribute {
45 ssize_t (*store)(struct sysdev_class *, const char *, size_t); 45 ssize_t (*store)(struct sysdev_class *, const char *, size_t);
46}; 46};
47 47
48#define SYSDEV_CLASS_ATTR(_name,_mode,_show,_store) \ 48#define _SYSDEV_CLASS_ATTR(_name,_mode,_show,_store) \
49struct sysdev_class_attribute attr_##_name = { \ 49{ \
50 .attr = {.name = __stringify(_name), .mode = _mode }, \ 50 .attr = {.name = __stringify(_name), .mode = _mode }, \
51 .show = _show, \ 51 .show = _show, \
52 .store = _store, \ 52 .store = _store, \
53}; 53}
54
55#define SYSDEV_CLASS_ATTR(_name,_mode,_show,_store) \
56 struct sysdev_class_attribute attr_##_name = \
57 _SYSDEV_CLASS_ATTR(_name,_mode,_show,_store)
54 58
55 59
56extern int sysdev_class_register(struct sysdev_class *); 60extern int sysdev_class_register(struct sysdev_class *);
@@ -100,15 +104,16 @@ struct sysdev_attribute {
100}; 104};
101 105
102 106
103#define _SYSDEV_ATTR(_name,_mode,_show,_store) \ 107#define _SYSDEV_ATTR(_name, _mode, _show, _store) \
104{ \ 108{ \
105 .attr = { .name = __stringify(_name), .mode = _mode }, \ 109 .attr = { .name = __stringify(_name), .mode = _mode }, \
106 .show = _show, \ 110 .show = _show, \
107 .store = _store, \ 111 .store = _store, \
108} 112}
109 113
110#define SYSDEV_ATTR(_name,_mode,_show,_store) \ 114#define SYSDEV_ATTR(_name, _mode, _show, _store) \
111struct sysdev_attribute attr_##_name = _SYSDEV_ATTR(_name,_mode,_show,_store); 115 struct sysdev_attribute attr_##_name = \
116 _SYSDEV_ATTR(_name, _mode, _show, _store);
112 117
113extern int sysdev_create_file(struct sys_device *, struct sysdev_attribute *); 118extern int sysdev_create_file(struct sys_device *, struct sysdev_attribute *);
114extern void sysdev_remove_file(struct sys_device *, struct sysdev_attribute *); 119extern void sysdev_remove_file(struct sys_device *, struct sysdev_attribute *);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index bd14f8b30f09..4bb7074a2c3a 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -38,16 +38,15 @@
38#endif 38#endif
39 39
40#ifndef nr_cpus_node 40#ifndef nr_cpus_node
41#define nr_cpus_node(node) \ 41#define nr_cpus_node(node) \
42 ({ \ 42 ({ \
43 cpumask_t __tmp__; \ 43 node_to_cpumask_ptr(__tmp__, node); \
44 __tmp__ = node_to_cpumask(node); \ 44 cpus_weight(*__tmp__); \
45 cpus_weight(__tmp__); \
46 }) 45 })
47#endif 46#endif
48 47
49#define for_each_node_with_cpus(node) \ 48#define for_each_node_with_cpus(node) \
50 for_each_online_node(node) \ 49 for_each_online_node(node) \
51 if (nr_cpus_node(node)) 50 if (nr_cpus_node(node))
52 51
53void arch_update_cpu_topology(void); 52void arch_update_cpu_topology(void);
@@ -80,7 +79,9 @@ void arch_update_cpu_topology(void);
80 * by defining their own arch-specific initializer in include/asm/topology.h. 79 * by defining their own arch-specific initializer in include/asm/topology.h.
81 * A definition there will automagically override these default initializers 80 * A definition there will automagically override these default initializers
82 * and allow arch-specific performance tuning of sched_domains. 81 * and allow arch-specific performance tuning of sched_domains.
82 * (Only non-zero and non-null fields need be specified.)
83 */ 83 */
84
84#ifdef CONFIG_SCHED_SMT 85#ifdef CONFIG_SCHED_SMT
85/* MCD - Do we really need this? It is always on if CONFIG_SCHED_SMT is, 86/* MCD - Do we really need this? It is always on if CONFIG_SCHED_SMT is,
86 * so can't we drop this in favor of CONFIG_SCHED_SMT? 87 * so can't we drop this in favor of CONFIG_SCHED_SMT?
@@ -89,20 +90,10 @@ void arch_update_cpu_topology(void);
89/* Common values for SMT siblings */ 90/* Common values for SMT siblings */
90#ifndef SD_SIBLING_INIT 91#ifndef SD_SIBLING_INIT
91#define SD_SIBLING_INIT (struct sched_domain) { \ 92#define SD_SIBLING_INIT (struct sched_domain) { \
92 .span = CPU_MASK_NONE, \
93 .parent = NULL, \
94 .child = NULL, \
95 .groups = NULL, \
96 .min_interval = 1, \ 93 .min_interval = 1, \
97 .max_interval = 2, \ 94 .max_interval = 2, \
98 .busy_factor = 64, \ 95 .busy_factor = 64, \
99 .imbalance_pct = 110, \ 96 .imbalance_pct = 110, \
100 .cache_nice_tries = 0, \
101 .busy_idx = 0, \
102 .idle_idx = 0, \
103 .newidle_idx = 0, \
104 .wake_idx = 0, \
105 .forkexec_idx = 0, \
106 .flags = SD_LOAD_BALANCE \ 97 .flags = SD_LOAD_BALANCE \
107 | SD_BALANCE_NEWIDLE \ 98 | SD_BALANCE_NEWIDLE \
108 | SD_BALANCE_FORK \ 99 | SD_BALANCE_FORK \
@@ -112,7 +103,6 @@ void arch_update_cpu_topology(void);
112 | SD_SHARE_CPUPOWER, \ 103 | SD_SHARE_CPUPOWER, \
113 .last_balance = jiffies, \ 104 .last_balance = jiffies, \
114 .balance_interval = 1, \ 105 .balance_interval = 1, \
115 .nr_balance_failed = 0, \
116} 106}
117#endif 107#endif
118#endif /* CONFIG_SCHED_SMT */ 108#endif /* CONFIG_SCHED_SMT */
@@ -121,18 +111,12 @@ void arch_update_cpu_topology(void);
121/* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ 111/* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */
122#ifndef SD_MC_INIT 112#ifndef SD_MC_INIT
123#define SD_MC_INIT (struct sched_domain) { \ 113#define SD_MC_INIT (struct sched_domain) { \
124 .span = CPU_MASK_NONE, \
125 .parent = NULL, \
126 .child = NULL, \
127 .groups = NULL, \
128 .min_interval = 1, \ 114 .min_interval = 1, \
129 .max_interval = 4, \ 115 .max_interval = 4, \
130 .busy_factor = 64, \ 116 .busy_factor = 64, \
131 .imbalance_pct = 125, \ 117 .imbalance_pct = 125, \
132 .cache_nice_tries = 1, \ 118 .cache_nice_tries = 1, \
133 .busy_idx = 2, \ 119 .busy_idx = 2, \
134 .idle_idx = 0, \
135 .newidle_idx = 0, \
136 .wake_idx = 1, \ 120 .wake_idx = 1, \
137 .forkexec_idx = 1, \ 121 .forkexec_idx = 1, \
138 .flags = SD_LOAD_BALANCE \ 122 .flags = SD_LOAD_BALANCE \
@@ -144,7 +128,6 @@ void arch_update_cpu_topology(void);
144 | BALANCE_FOR_MC_POWER, \ 128 | BALANCE_FOR_MC_POWER, \
145 .last_balance = jiffies, \ 129 .last_balance = jiffies, \
146 .balance_interval = 1, \ 130 .balance_interval = 1, \
147 .nr_balance_failed = 0, \
148} 131}
149#endif 132#endif
150#endif /* CONFIG_SCHED_MC */ 133#endif /* CONFIG_SCHED_MC */
@@ -152,10 +135,6 @@ void arch_update_cpu_topology(void);
152/* Common values for CPUs */ 135/* Common values for CPUs */
153#ifndef SD_CPU_INIT 136#ifndef SD_CPU_INIT
154#define SD_CPU_INIT (struct sched_domain) { \ 137#define SD_CPU_INIT (struct sched_domain) { \
155 .span = CPU_MASK_NONE, \
156 .parent = NULL, \
157 .child = NULL, \
158 .groups = NULL, \
159 .min_interval = 1, \ 138 .min_interval = 1, \
160 .max_interval = 4, \ 139 .max_interval = 4, \
161 .busy_factor = 64, \ 140 .busy_factor = 64, \
@@ -174,16 +153,11 @@ void arch_update_cpu_topology(void);
174 | BALANCE_FOR_PKG_POWER,\ 153 | BALANCE_FOR_PKG_POWER,\
175 .last_balance = jiffies, \ 154 .last_balance = jiffies, \
176 .balance_interval = 1, \ 155 .balance_interval = 1, \
177 .nr_balance_failed = 0, \
178} 156}
179#endif 157#endif
180 158
181/* sched_domains SD_ALLNODES_INIT for NUMA machines */ 159/* sched_domains SD_ALLNODES_INIT for NUMA machines */
182#define SD_ALLNODES_INIT (struct sched_domain) { \ 160#define SD_ALLNODES_INIT (struct sched_domain) { \
183 .span = CPU_MASK_NONE, \
184 .parent = NULL, \
185 .child = NULL, \
186 .groups = NULL, \
187 .min_interval = 64, \ 161 .min_interval = 64, \
188 .max_interval = 64*num_online_cpus(), \ 162 .max_interval = 64*num_online_cpus(), \
189 .busy_factor = 128, \ 163 .busy_factor = 128, \
@@ -191,14 +165,10 @@ void arch_update_cpu_topology(void);
191 .cache_nice_tries = 1, \ 165 .cache_nice_tries = 1, \
192 .busy_idx = 3, \ 166 .busy_idx = 3, \
193 .idle_idx = 3, \ 167 .idle_idx = 3, \
194 .newidle_idx = 0, /* unused */ \
195 .wake_idx = 0, /* unused */ \
196 .forkexec_idx = 0, /* unused */ \
197 .flags = SD_LOAD_BALANCE \ 168 .flags = SD_LOAD_BALANCE \
198 | SD_SERIALIZE, \ 169 | SD_SERIALIZE, \
199 .last_balance = jiffies, \ 170 .last_balance = jiffies, \
200 .balance_interval = 64, \ 171 .balance_interval = 64, \
201 .nr_balance_failed = 0, \
202} 172}
203 173
204#ifdef CONFIG_NUMA 174#ifdef CONFIG_NUMA
diff --git a/init/Kconfig b/init/Kconfig
index 7fccf09bb95a..ba3a389fab94 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -328,6 +328,13 @@ config RT_GROUP_SCHED
328 depends on EXPERIMENTAL 328 depends on EXPERIMENTAL
329 depends on GROUP_SCHED 329 depends on GROUP_SCHED
330 default n 330 default n
331 help
332 This feature lets you explicitly allocate real CPU bandwidth
333 to users or control groups (depending on the "Basis for grouping tasks"
334 setting below). If enabled, it will also make it impossible to
335 schedule realtime tasks for non-root users until you allocate
336 realtime bandwidth for them.
337 See Documentation/scheduler/sched-rt-group.txt for more information.
331 338
332choice 339choice
333 depends on GROUP_SCHED 340 depends on GROUP_SCHED
diff --git a/init/main.c b/init/main.c
index 99ce94930b09..833a67df1f7e 100644
--- a/init/main.c
+++ b/init/main.c
@@ -359,10 +359,31 @@ static void __init smp_init(void)
359#endif 359#endif
360 360
361static inline void setup_per_cpu_areas(void) { } 361static inline void setup_per_cpu_areas(void) { }
362static inline void setup_nr_cpu_ids(void) { }
362static inline void smp_prepare_cpus(unsigned int maxcpus) { } 363static inline void smp_prepare_cpus(unsigned int maxcpus) { }
363 364
364#else 365#else
365 366
367#if NR_CPUS > BITS_PER_LONG
368cpumask_t cpu_mask_all __read_mostly = CPU_MASK_ALL;
369EXPORT_SYMBOL(cpu_mask_all);
370#endif
371
372/* Setup number of possible processor ids */
373int nr_cpu_ids __read_mostly = NR_CPUS;
374EXPORT_SYMBOL(nr_cpu_ids);
375
376/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
377static void __init setup_nr_cpu_ids(void)
378{
379 int cpu, highest_cpu = 0;
380
381 for_each_possible_cpu(cpu)
382 highest_cpu = cpu;
383
384 nr_cpu_ids = highest_cpu + 1;
385}
386
366#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA 387#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
367unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; 388unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
368 389
@@ -537,6 +558,7 @@ asmlinkage void __init start_kernel(void)
537 setup_command_line(command_line); 558 setup_command_line(command_line);
538 unwind_setup(); 559 unwind_setup();
539 setup_per_cpu_areas(); 560 setup_per_cpu_areas();
561 setup_nr_cpu_ids();
540 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ 562 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
541 563
542 /* 564 /*
@@ -811,7 +833,7 @@ static int __init kernel_init(void * unused)
811 /* 833 /*
812 * init can run on any cpu. 834 * init can run on any cpu.
813 */ 835 */
814 set_cpus_allowed(current, CPU_MASK_ALL); 836 set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR);
815 /* 837 /*
816 * Tell the world that we're going to be the grim 838 * Tell the world that we're going to be the grim
817 * reaper of innocent orphaned children. 839 * reaper of innocent orphaned children.
diff --git a/kernel/compat.c b/kernel/compat.c
index 9c48abfcd4a5..e1ef04870c2a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -445,7 +445,7 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
445 if (retval) 445 if (retval)
446 return retval; 446 return retval;
447 447
448 return sched_setaffinity(pid, new_mask); 448 return sched_setaffinity(pid, &new_mask);
449} 449}
450 450
451asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, 451asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2eff3f63abed..2011ad8d2697 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -232,9 +232,9 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
232 232
233 /* Ensure that we are not runnable on dying cpu */ 233 /* Ensure that we are not runnable on dying cpu */
234 old_allowed = current->cpus_allowed; 234 old_allowed = current->cpus_allowed;
235 tmp = CPU_MASK_ALL; 235 cpus_setall(tmp);
236 cpu_clear(cpu, tmp); 236 cpu_clear(cpu, tmp);
237 set_cpus_allowed(current, tmp); 237 set_cpus_allowed_ptr(current, &tmp);
238 238
239 p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); 239 p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
240 240
@@ -268,7 +268,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
268out_thread: 268out_thread:
269 err = kthread_stop(p); 269 err = kthread_stop(p);
270out_allowed: 270out_allowed:
271 set_cpus_allowed(current, old_allowed); 271 set_cpus_allowed_ptr(current, &old_allowed);
272out_release: 272out_release:
273 cpu_hotplug_done(); 273 cpu_hotplug_done();
274 return err; 274 return err;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a1b61f414228..8b35fbd8292f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -98,6 +98,9 @@ struct cpuset {
98 /* partition number for rebuild_sched_domains() */ 98 /* partition number for rebuild_sched_domains() */
99 int pn; 99 int pn;
100 100
101 /* for custom sched domain */
102 int relax_domain_level;
103
101 /* used for walking a cpuset hierarchy */ 104 /* used for walking a cpuset hierarchy */
102 struct list_head stack_list; 105 struct list_head stack_list;
103}; 106};
@@ -478,6 +481,16 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
478 return cpus_intersects(a->cpus_allowed, b->cpus_allowed); 481 return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
479} 482}
480 483
484static void
485update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
486{
487 if (!dattr)
488 return;
489 if (dattr->relax_domain_level < c->relax_domain_level)
490 dattr->relax_domain_level = c->relax_domain_level;
491 return;
492}
493
481/* 494/*
482 * rebuild_sched_domains() 495 * rebuild_sched_domains()
483 * 496 *
@@ -553,12 +566,14 @@ static void rebuild_sched_domains(void)
553 int csn; /* how many cpuset ptrs in csa so far */ 566 int csn; /* how many cpuset ptrs in csa so far */
554 int i, j, k; /* indices for partition finding loops */ 567 int i, j, k; /* indices for partition finding loops */
555 cpumask_t *doms; /* resulting partition; i.e. sched domains */ 568 cpumask_t *doms; /* resulting partition; i.e. sched domains */
569 struct sched_domain_attr *dattr; /* attributes for custom domains */
556 int ndoms; /* number of sched domains in result */ 570 int ndoms; /* number of sched domains in result */
557 int nslot; /* next empty doms[] cpumask_t slot */ 571 int nslot; /* next empty doms[] cpumask_t slot */
558 572
559 q = NULL; 573 q = NULL;
560 csa = NULL; 574 csa = NULL;
561 doms = NULL; 575 doms = NULL;
576 dattr = NULL;
562 577
563 /* Special case for the 99% of systems with one, full, sched domain */ 578 /* Special case for the 99% of systems with one, full, sched domain */
564 if (is_sched_load_balance(&top_cpuset)) { 579 if (is_sched_load_balance(&top_cpuset)) {
@@ -566,6 +581,11 @@ static void rebuild_sched_domains(void)
566 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 581 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
567 if (!doms) 582 if (!doms)
568 goto rebuild; 583 goto rebuild;
584 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
585 if (dattr) {
586 *dattr = SD_ATTR_INIT;
587 update_domain_attr(dattr, &top_cpuset);
588 }
569 *doms = top_cpuset.cpus_allowed; 589 *doms = top_cpuset.cpus_allowed;
570 goto rebuild; 590 goto rebuild;
571 } 591 }
@@ -622,6 +642,7 @@ restart:
622 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 642 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
623 if (!doms) 643 if (!doms)
624 goto rebuild; 644 goto rebuild;
645 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
625 646
626 for (nslot = 0, i = 0; i < csn; i++) { 647 for (nslot = 0, i = 0; i < csn; i++) {
627 struct cpuset *a = csa[i]; 648 struct cpuset *a = csa[i];
@@ -644,12 +665,15 @@ restart:
644 } 665 }
645 666
646 cpus_clear(*dp); 667 cpus_clear(*dp);
668 if (dattr)
669 *(dattr + nslot) = SD_ATTR_INIT;
647 for (j = i; j < csn; j++) { 670 for (j = i; j < csn; j++) {
648 struct cpuset *b = csa[j]; 671 struct cpuset *b = csa[j];
649 672
650 if (apn == b->pn) { 673 if (apn == b->pn) {
651 cpus_or(*dp, *dp, b->cpus_allowed); 674 cpus_or(*dp, *dp, b->cpus_allowed);
652 b->pn = -1; 675 b->pn = -1;
676 update_domain_attr(dattr, b);
653 } 677 }
654 } 678 }
655 nslot++; 679 nslot++;
@@ -660,7 +684,7 @@ restart:
660rebuild: 684rebuild:
661 /* Have scheduler rebuild sched domains */ 685 /* Have scheduler rebuild sched domains */
662 get_online_cpus(); 686 get_online_cpus();
663 partition_sched_domains(ndoms, doms); 687 partition_sched_domains(ndoms, doms, dattr);
664 put_online_cpus(); 688 put_online_cpus();
665 689
666done: 690done:
@@ -668,6 +692,7 @@ done:
668 kfifo_free(q); 692 kfifo_free(q);
669 kfree(csa); 693 kfree(csa);
670 /* Don't kfree(doms) -- partition_sched_domains() does that. */ 694 /* Don't kfree(doms) -- partition_sched_domains() does that. */
695 /* Don't kfree(dattr) -- partition_sched_domains() does that. */
671} 696}
672 697
673static inline int started_after_time(struct task_struct *t1, 698static inline int started_after_time(struct task_struct *t1,
@@ -729,7 +754,7 @@ int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
729 */ 754 */
730void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) 755void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
731{ 756{
732 set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed); 757 set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed));
733} 758}
734 759
735/** 760/**
@@ -1011,6 +1036,21 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
1011 return 0; 1036 return 0;
1012} 1037}
1013 1038
1039static int update_relax_domain_level(struct cpuset *cs, char *buf)
1040{
1041 int val = simple_strtol(buf, NULL, 10);
1042
1043 if (val < 0)
1044 val = -1;
1045
1046 if (val != cs->relax_domain_level) {
1047 cs->relax_domain_level = val;
1048 rebuild_sched_domains();
1049 }
1050
1051 return 0;
1052}
1053
1014/* 1054/*
1015 * update_flag - read a 0 or a 1 in a file and update associated flag 1055 * update_flag - read a 0 or a 1 in a file and update associated flag
1016 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, 1056 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
@@ -1178,7 +1218,7 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1178 1218
1179 mutex_lock(&callback_mutex); 1219 mutex_lock(&callback_mutex);
1180 guarantee_online_cpus(cs, &cpus); 1220 guarantee_online_cpus(cs, &cpus);
1181 set_cpus_allowed(tsk, cpus); 1221 set_cpus_allowed_ptr(tsk, &cpus);
1182 mutex_unlock(&callback_mutex); 1222 mutex_unlock(&callback_mutex);
1183 1223
1184 from = oldcs->mems_allowed; 1224 from = oldcs->mems_allowed;
@@ -1202,6 +1242,7 @@ typedef enum {
1202 FILE_CPU_EXCLUSIVE, 1242 FILE_CPU_EXCLUSIVE,
1203 FILE_MEM_EXCLUSIVE, 1243 FILE_MEM_EXCLUSIVE,
1204 FILE_SCHED_LOAD_BALANCE, 1244 FILE_SCHED_LOAD_BALANCE,
1245 FILE_SCHED_RELAX_DOMAIN_LEVEL,
1205 FILE_MEMORY_PRESSURE_ENABLED, 1246 FILE_MEMORY_PRESSURE_ENABLED,
1206 FILE_MEMORY_PRESSURE, 1247 FILE_MEMORY_PRESSURE,
1207 FILE_SPREAD_PAGE, 1248 FILE_SPREAD_PAGE,
@@ -1256,6 +1297,9 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
1256 case FILE_SCHED_LOAD_BALANCE: 1297 case FILE_SCHED_LOAD_BALANCE:
1257 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); 1298 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer);
1258 break; 1299 break;
1300 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1301 retval = update_relax_domain_level(cs, buffer);
1302 break;
1259 case FILE_MEMORY_MIGRATE: 1303 case FILE_MEMORY_MIGRATE:
1260 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); 1304 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
1261 break; 1305 break;
@@ -1354,6 +1398,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont,
1354 case FILE_SCHED_LOAD_BALANCE: 1398 case FILE_SCHED_LOAD_BALANCE:
1355 *s++ = is_sched_load_balance(cs) ? '1' : '0'; 1399 *s++ = is_sched_load_balance(cs) ? '1' : '0';
1356 break; 1400 break;
1401 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1402 s += sprintf(s, "%d", cs->relax_domain_level);
1403 break;
1357 case FILE_MEMORY_MIGRATE: 1404 case FILE_MEMORY_MIGRATE:
1358 *s++ = is_memory_migrate(cs) ? '1' : '0'; 1405 *s++ = is_memory_migrate(cs) ? '1' : '0';
1359 break; 1406 break;
@@ -1424,6 +1471,13 @@ static struct cftype cft_sched_load_balance = {
1424 .private = FILE_SCHED_LOAD_BALANCE, 1471 .private = FILE_SCHED_LOAD_BALANCE,
1425}; 1472};
1426 1473
1474static struct cftype cft_sched_relax_domain_level = {
1475 .name = "sched_relax_domain_level",
1476 .read = cpuset_common_file_read,
1477 .write = cpuset_common_file_write,
1478 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
1479};
1480
1427static struct cftype cft_memory_migrate = { 1481static struct cftype cft_memory_migrate = {
1428 .name = "memory_migrate", 1482 .name = "memory_migrate",
1429 .read = cpuset_common_file_read, 1483 .read = cpuset_common_file_read,
@@ -1475,6 +1529,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1475 return err; 1529 return err;
1476 if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0) 1530 if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
1477 return err; 1531 return err;
1532 if ((err = cgroup_add_file(cont, ss,
1533 &cft_sched_relax_domain_level)) < 0)
1534 return err;
1478 if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0) 1535 if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
1479 return err; 1536 return err;
1480 if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0) 1537 if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
@@ -1555,10 +1612,11 @@ static struct cgroup_subsys_state *cpuset_create(
1555 if (is_spread_slab(parent)) 1612 if (is_spread_slab(parent))
1556 set_bit(CS_SPREAD_SLAB, &cs->flags); 1613 set_bit(CS_SPREAD_SLAB, &cs->flags);
1557 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1614 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1558 cs->cpus_allowed = CPU_MASK_NONE; 1615 cpus_clear(cs->cpus_allowed);
1559 cs->mems_allowed = NODE_MASK_NONE; 1616 nodes_clear(cs->mems_allowed);
1560 cs->mems_generation = cpuset_mems_generation++; 1617 cs->mems_generation = cpuset_mems_generation++;
1561 fmeter_init(&cs->fmeter); 1618 fmeter_init(&cs->fmeter);
1619 cs->relax_domain_level = -1;
1562 1620
1563 cs->parent = parent; 1621 cs->parent = parent;
1564 number_of_cpusets++; 1622 number_of_cpusets++;
@@ -1625,12 +1683,13 @@ int __init cpuset_init(void)
1625{ 1683{
1626 int err = 0; 1684 int err = 0;
1627 1685
1628 top_cpuset.cpus_allowed = CPU_MASK_ALL; 1686 cpus_setall(top_cpuset.cpus_allowed);
1629 top_cpuset.mems_allowed = NODE_MASK_ALL; 1687 nodes_setall(top_cpuset.mems_allowed);
1630 1688
1631 fmeter_init(&top_cpuset.fmeter); 1689 fmeter_init(&top_cpuset.fmeter);
1632 top_cpuset.mems_generation = cpuset_mems_generation++; 1690 top_cpuset.mems_generation = cpuset_mems_generation++;
1633 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); 1691 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1692 top_cpuset.relax_domain_level = -1;
1634 1693
1635 err = register_filesystem(&cpuset_fs_type); 1694 err = register_filesystem(&cpuset_fs_type);
1636 if (err < 0) 1695 if (err < 0)
@@ -1844,6 +1903,7 @@ void __init cpuset_init_smp(void)
1844 1903
1845 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 1904 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
1846 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 1905 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
1906 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
1847 * 1907 *
1848 * Description: Returns the cpumask_t cpus_allowed of the cpuset 1908 * Description: Returns the cpumask_t cpus_allowed of the cpuset
1849 * attached to the specified @tsk. Guaranteed to return some non-empty 1909 * attached to the specified @tsk. Guaranteed to return some non-empty
@@ -1851,35 +1911,27 @@ void __init cpuset_init_smp(void)
1851 * tasks cpuset. 1911 * tasks cpuset.
1852 **/ 1912 **/
1853 1913
1854cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) 1914void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask)
1855{ 1915{
1856 cpumask_t mask;
1857
1858 mutex_lock(&callback_mutex); 1916 mutex_lock(&callback_mutex);
1859 mask = cpuset_cpus_allowed_locked(tsk); 1917 cpuset_cpus_allowed_locked(tsk, pmask);
1860 mutex_unlock(&callback_mutex); 1918 mutex_unlock(&callback_mutex);
1861
1862 return mask;
1863} 1919}
1864 1920
1865/** 1921/**
1866 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. 1922 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
1867 * Must be called with callback_mutex held. 1923 * Must be called with callback_mutex held.
1868 **/ 1924 **/
1869cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) 1925void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask)
1870{ 1926{
1871 cpumask_t mask;
1872
1873 task_lock(tsk); 1927 task_lock(tsk);
1874 guarantee_online_cpus(task_cs(tsk), &mask); 1928 guarantee_online_cpus(task_cs(tsk), pmask);
1875 task_unlock(tsk); 1929 task_unlock(tsk);
1876
1877 return mask;
1878} 1930}
1879 1931
1880void cpuset_init_current_mems_allowed(void) 1932void cpuset_init_current_mems_allowed(void)
1881{ 1933{
1882 current->mems_allowed = NODE_MASK_ALL; 1934 nodes_setall(current->mems_allowed);
1883} 1935}
1884 1936
1885/** 1937/**
@@ -2261,8 +2313,16 @@ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2261 m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, 2313 m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count,
2262 task->cpus_allowed); 2314 task->cpus_allowed);
2263 seq_printf(m, "\n"); 2315 seq_printf(m, "\n");
2316 seq_printf(m, "Cpus_allowed_list:\t");
2317 m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count,
2318 task->cpus_allowed);
2319 seq_printf(m, "\n");
2264 seq_printf(m, "Mems_allowed:\t"); 2320 seq_printf(m, "Mems_allowed:\t");
2265 m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, 2321 m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count,
2266 task->mems_allowed); 2322 task->mems_allowed);
2267 seq_printf(m, "\n"); 2323 seq_printf(m, "\n");
2324 seq_printf(m, "Mems_allowed_list:\t");
2325 m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count,
2326 task->mems_allowed);
2327 seq_printf(m, "\n");
2268} 2328}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index fdb3fbe2b0c4..964964baefa2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -47,7 +47,7 @@ void dynamic_irq_init(unsigned int irq)
47 desc->irq_count = 0; 47 desc->irq_count = 0;
48 desc->irqs_unhandled = 0; 48 desc->irqs_unhandled = 0;
49#ifdef CONFIG_SMP 49#ifdef CONFIG_SMP
50 desc->affinity = CPU_MASK_ALL; 50 cpus_setall(desc->affinity);
51#endif 51#endif
52 spin_unlock_irqrestore(&desc->lock, flags); 52 spin_unlock_irqrestore(&desc->lock, flags);
53} 53}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 22be3ff3f363..e2764047ec03 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -165,7 +165,7 @@ static int ____call_usermodehelper(void *data)
165 } 165 }
166 166
167 /* We can run anywhere, unlike our parent keventd(). */ 167 /* We can run anywhere, unlike our parent keventd(). */
168 set_cpus_allowed(current, CPU_MASK_ALL); 168 set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR);
169 169
170 /* 170 /*
171 * Our parent is keventd, which runs with elevated scheduling priority. 171 * Our parent is keventd, which runs with elevated scheduling priority.
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 0ac887882f90..25241d6ec8cd 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
180 wait_task_inactive(k); 180 wait_task_inactive(k);
181 set_task_cpu(k, cpu); 181 set_task_cpu(k, cpu);
182 k->cpus_allowed = cpumask_of_cpu(cpu); 182 k->cpus_allowed = cpumask_of_cpu(cpu);
183 k->rt.nr_cpus_allowed = 1;
183} 184}
184EXPORT_SYMBOL(kthread_bind); 185EXPORT_SYMBOL(kthread_bind);
185 186
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index b4e3c85abe74..7c74dab0d21b 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -64,8 +64,8 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
64 return; 64 return;
65 65
66 for (i = 0; i < MAXLR; i++) { 66 for (i = 0; i < MAXLR; i++) {
67 int q; 67 int q, same = 1;
68 int same = 1; 68
69 /* Nothing stored: */ 69 /* Nothing stored: */
70 if (!latency_record[i].backtrace[0]) { 70 if (!latency_record[i].backtrace[0]) {
71 if (firstnonnull > i) 71 if (firstnonnull > i)
@@ -73,12 +73,15 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
73 continue; 73 continue;
74 } 74 }
75 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { 75 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
76 if (latency_record[i].backtrace[q] != 76 unsigned long record = lat->backtrace[q];
77 lat->backtrace[q]) 77
78 if (latency_record[i].backtrace[q] != record) {
78 same = 0; 79 same = 0;
79 if (same && lat->backtrace[q] == 0)
80 break; 80 break;
81 if (same && lat->backtrace[q] == ULONG_MAX) 81 }
82
83 /* 0 and ULONG_MAX entries mean end of backtrace: */
84 if (record == 0 || record == ULONG_MAX)
82 break; 85 break;
83 } 86 }
84 if (same) { 87 if (same) {
@@ -143,14 +146,18 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
143 for (i = 0; i < LT_SAVECOUNT ; i++) { 146 for (i = 0; i < LT_SAVECOUNT ; i++) {
144 struct latency_record *mylat; 147 struct latency_record *mylat;
145 int same = 1; 148 int same = 1;
149
146 mylat = &tsk->latency_record[i]; 150 mylat = &tsk->latency_record[i];
147 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { 151 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
148 if (mylat->backtrace[q] != 152 unsigned long record = lat.backtrace[q];
149 lat.backtrace[q]) 153
154 if (mylat->backtrace[q] != record) {
150 same = 0; 155 same = 0;
151 if (same && lat.backtrace[q] == 0)
152 break; 156 break;
153 if (same && lat.backtrace[q] == ULONG_MAX) 157 }
158
159 /* 0 and ULONG_MAX entries mean end of backtrace: */
160 if (record == 0 || record == ULONG_MAX)
154 break; 161 break;
155 } 162 }
156 if (same) { 163 if (same) {
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index e9517014b57c..e1cdf196a515 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1007,10 +1007,10 @@ void __synchronize_sched(void)
1007 if (sched_getaffinity(0, &oldmask) < 0) 1007 if (sched_getaffinity(0, &oldmask) < 0)
1008 oldmask = cpu_possible_map; 1008 oldmask = cpu_possible_map;
1009 for_each_online_cpu(cpu) { 1009 for_each_online_cpu(cpu) {
1010 sched_setaffinity(0, cpumask_of_cpu(cpu)); 1010 sched_setaffinity(0, &cpumask_of_cpu(cpu));
1011 schedule(); 1011 schedule();
1012 } 1012 }
1013 sched_setaffinity(0, oldmask); 1013 sched_setaffinity(0, &oldmask);
1014} 1014}
1015EXPORT_SYMBOL_GPL(__synchronize_sched); 1015EXPORT_SYMBOL_GPL(__synchronize_sched);
1016 1016
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index fd599829e72a..47894f919d4e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -723,9 +723,10 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
723 */ 723 */
724static void rcu_torture_shuffle_tasks(void) 724static void rcu_torture_shuffle_tasks(void)
725{ 725{
726 cpumask_t tmp_mask = CPU_MASK_ALL; 726 cpumask_t tmp_mask;
727 int i; 727 int i;
728 728
729 cpus_setall(tmp_mask);
729 get_online_cpus(); 730 get_online_cpus();
730 731
731 /* No point in shuffling if there is only one online CPU (ex: UP) */ 732 /* No point in shuffling if there is only one online CPU (ex: UP) */
@@ -737,25 +738,27 @@ static void rcu_torture_shuffle_tasks(void)
737 if (rcu_idle_cpu != -1) 738 if (rcu_idle_cpu != -1)
738 cpu_clear(rcu_idle_cpu, tmp_mask); 739 cpu_clear(rcu_idle_cpu, tmp_mask);
739 740
740 set_cpus_allowed(current, tmp_mask); 741 set_cpus_allowed_ptr(current, &tmp_mask);
741 742
742 if (reader_tasks) { 743 if (reader_tasks) {
743 for (i = 0; i < nrealreaders; i++) 744 for (i = 0; i < nrealreaders; i++)
744 if (reader_tasks[i]) 745 if (reader_tasks[i])
745 set_cpus_allowed(reader_tasks[i], tmp_mask); 746 set_cpus_allowed_ptr(reader_tasks[i],
747 &tmp_mask);
746 } 748 }
747 749
748 if (fakewriter_tasks) { 750 if (fakewriter_tasks) {
749 for (i = 0; i < nfakewriters; i++) 751 for (i = 0; i < nfakewriters; i++)
750 if (fakewriter_tasks[i]) 752 if (fakewriter_tasks[i])
751 set_cpus_allowed(fakewriter_tasks[i], tmp_mask); 753 set_cpus_allowed_ptr(fakewriter_tasks[i],
754 &tmp_mask);
752 } 755 }
753 756
754 if (writer_task) 757 if (writer_task)
755 set_cpus_allowed(writer_task, tmp_mask); 758 set_cpus_allowed_ptr(writer_task, &tmp_mask);
756 759
757 if (stats_task) 760 if (stats_task)
758 set_cpus_allowed(stats_task, tmp_mask); 761 set_cpus_allowed_ptr(stats_task, &tmp_mask);
759 762
760 if (rcu_idle_cpu == -1) 763 if (rcu_idle_cpu == -1)
761 rcu_idle_cpu = num_online_cpus() - 1; 764 rcu_idle_cpu = num_online_cpus() - 1;
diff --git a/kernel/sched.c b/kernel/sched.c
index 8dcdec6fe0fe..57ba7ea9b744 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -66,6 +66,10 @@
66#include <linux/unistd.h> 66#include <linux/unistd.h>
67#include <linux/pagemap.h> 67#include <linux/pagemap.h>
68#include <linux/hrtimer.h> 68#include <linux/hrtimer.h>
69#include <linux/tick.h>
70#include <linux/bootmem.h>
71#include <linux/debugfs.h>
72#include <linux/ctype.h>
69 73
70#include <asm/tlb.h> 74#include <asm/tlb.h>
71#include <asm/irq_regs.h> 75#include <asm/irq_regs.h>
@@ -114,6 +118,11 @@ unsigned long long __attribute__((weak)) sched_clock(void)
114 */ 118 */
115#define DEF_TIMESLICE (100 * HZ / 1000) 119#define DEF_TIMESLICE (100 * HZ / 1000)
116 120
121/*
122 * single value that denotes runtime == period, ie unlimited time.
123 */
124#define RUNTIME_INF ((u64)~0ULL)
125
117#ifdef CONFIG_SMP 126#ifdef CONFIG_SMP
118/* 127/*
119 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) 128 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -155,6 +164,84 @@ struct rt_prio_array {
155 struct list_head queue[MAX_RT_PRIO]; 164 struct list_head queue[MAX_RT_PRIO];
156}; 165};
157 166
167struct rt_bandwidth {
168 /* nests inside the rq lock: */
169 spinlock_t rt_runtime_lock;
170 ktime_t rt_period;
171 u64 rt_runtime;
172 struct hrtimer rt_period_timer;
173};
174
175static struct rt_bandwidth def_rt_bandwidth;
176
177static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
178
179static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
180{
181 struct rt_bandwidth *rt_b =
182 container_of(timer, struct rt_bandwidth, rt_period_timer);
183 ktime_t now;
184 int overrun;
185 int idle = 0;
186
187 for (;;) {
188 now = hrtimer_cb_get_time(timer);
189 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
190
191 if (!overrun)
192 break;
193
194 idle = do_sched_rt_period_timer(rt_b, overrun);
195 }
196
197 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
198}
199
200static
201void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
202{
203 rt_b->rt_period = ns_to_ktime(period);
204 rt_b->rt_runtime = runtime;
205
206 spin_lock_init(&rt_b->rt_runtime_lock);
207
208 hrtimer_init(&rt_b->rt_period_timer,
209 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
210 rt_b->rt_period_timer.function = sched_rt_period_timer;
211 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
212}
213
214static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
215{
216 ktime_t now;
217
218 if (rt_b->rt_runtime == RUNTIME_INF)
219 return;
220
221 if (hrtimer_active(&rt_b->rt_period_timer))
222 return;
223
224 spin_lock(&rt_b->rt_runtime_lock);
225 for (;;) {
226 if (hrtimer_active(&rt_b->rt_period_timer))
227 break;
228
229 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
230 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
231 hrtimer_start(&rt_b->rt_period_timer,
232 rt_b->rt_period_timer.expires,
233 HRTIMER_MODE_ABS);
234 }
235 spin_unlock(&rt_b->rt_runtime_lock);
236}
237
#ifdef CONFIG_RT_GROUP_SCHED
/* Tear down @rt_b by cancelling its period timer. */
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}
#endif
244
158#ifdef CONFIG_GROUP_SCHED 245#ifdef CONFIG_GROUP_SCHED
159 246
160#include <linux/cgroup.h> 247#include <linux/cgroup.h>
@@ -181,29 +268,39 @@ struct task_group {
181 struct sched_rt_entity **rt_se; 268 struct sched_rt_entity **rt_se;
182 struct rt_rq **rt_rq; 269 struct rt_rq **rt_rq;
183 270
184 u64 rt_runtime; 271 struct rt_bandwidth rt_bandwidth;
185#endif 272#endif
186 273
187 struct rcu_head rcu; 274 struct rcu_head rcu;
188 struct list_head list; 275 struct list_head list;
276
277 struct task_group *parent;
278 struct list_head siblings;
279 struct list_head children;
189}; 280};
190 281
282#ifdef CONFIG_USER_SCHED
283
284/*
285 * Root task group.
286 * Every UID task group (including init_task_group aka UID-0) will
287 * be a child to this group.
288 */
289struct task_group root_task_group;
290
191#ifdef CONFIG_FAIR_GROUP_SCHED 291#ifdef CONFIG_FAIR_GROUP_SCHED
192/* Default task group's sched entity on each cpu */ 292/* Default task group's sched entity on each cpu */
193static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 293static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
194/* Default task group's cfs_rq on each cpu */ 294/* Default task group's cfs_rq on each cpu */
195static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 295static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
196
197static struct sched_entity *init_sched_entity_p[NR_CPUS];
198static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
199#endif 296#endif
200 297
201#ifdef CONFIG_RT_GROUP_SCHED 298#ifdef CONFIG_RT_GROUP_SCHED
202static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 299static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
203static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 300static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
204 301#endif
205static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; 302#else
206static struct rt_rq *init_rt_rq_p[NR_CPUS]; 303#define root_task_group init_task_group
207#endif 304#endif
208 305
209/* task_group_lock serializes add/remove of task groups and also changes to 306/* task_group_lock serializes add/remove of task groups and also changes to
@@ -221,23 +318,15 @@ static DEFINE_MUTEX(doms_cur_mutex);
221# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 318# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
222#endif 319#endif
223 320
321#define MIN_SHARES 2
322
224static int init_task_group_load = INIT_TASK_GROUP_LOAD; 323static int init_task_group_load = INIT_TASK_GROUP_LOAD;
225#endif 324#endif
226 325
227/* Default task group. 326/* Default task group.
228 * Every task in system belong to this group at bootup. 327 * Every task in system belong to this group at bootup.
229 */ 328 */
230struct task_group init_task_group = { 329struct task_group init_task_group;
231#ifdef CONFIG_FAIR_GROUP_SCHED
232 .se = init_sched_entity_p,
233 .cfs_rq = init_cfs_rq_p,
234#endif
235
236#ifdef CONFIG_RT_GROUP_SCHED
237 .rt_se = init_sched_rt_entity_p,
238 .rt_rq = init_rt_rq_p,
239#endif
240};
241 330
242/* return group to which a task belongs */ 331/* return group to which a task belongs */
243static inline struct task_group *task_group(struct task_struct *p) 332static inline struct task_group *task_group(struct task_struct *p)
@@ -297,8 +386,12 @@ struct cfs_rq {
297 386
298 struct rb_root tasks_timeline; 387 struct rb_root tasks_timeline;
299 struct rb_node *rb_leftmost; 388 struct rb_node *rb_leftmost;
300 struct rb_node *rb_load_balance_curr; 389
301 /* 'curr' points to currently running entity on this cfs_rq. 390 struct list_head tasks;
391 struct list_head *balance_iterator;
392
393 /*
394 * 'curr' points to currently running entity on this cfs_rq.
302 * It is set to NULL otherwise (i.e when none are currently running). 395 * It is set to NULL otherwise (i.e when none are currently running).
303 */ 396 */
304 struct sched_entity *curr, *next; 397 struct sched_entity *curr, *next;
@@ -318,6 +411,43 @@ struct cfs_rq {
318 */ 411 */
319 struct list_head leaf_cfs_rq_list; 412 struct list_head leaf_cfs_rq_list;
320 struct task_group *tg; /* group that "owns" this runqueue */ 413 struct task_group *tg; /* group that "owns" this runqueue */
414
415#ifdef CONFIG_SMP
416 unsigned long task_weight;
417 unsigned long shares;
418 /*
419 * We need space to build a sched_domain wide view of the full task
420 * group tree, in order to avoid depending on dynamic memory allocation
421 * during the load balancing we place this in the per cpu task group
422 * hierarchy. This limits the load balancing to one instance per cpu,
423 * but more should not be needed anyway.
424 */
425 struct aggregate_struct {
426 /*
427 * load = weight(cpus) * f(tg)
428 *
429 * Where f(tg) is the recursive weight fraction assigned to
430 * this group.
431 */
432 unsigned long load;
433
434 /*
435 * part of the group weight distributed to this span.
436 */
437 unsigned long shares;
438
439 /*
440 * The sum of all runqueue weights within this span.
441 */
442 unsigned long rq_weight;
443
444 /*
445 * Weight contributed by tasks; this is the part we can
446 * influence by moving tasks around.
447 */
448 unsigned long task_weight;
449 } aggregate;
450#endif
321#endif 451#endif
322}; 452};
323 453
@@ -334,6 +464,9 @@ struct rt_rq {
334#endif 464#endif
335 int rt_throttled; 465 int rt_throttled;
336 u64 rt_time; 466 u64 rt_time;
467 u64 rt_runtime;
468 /* Nests inside the rq lock: */
469 spinlock_t rt_runtime_lock;
337 470
338#ifdef CONFIG_RT_GROUP_SCHED 471#ifdef CONFIG_RT_GROUP_SCHED
339 unsigned long rt_nr_boosted; 472 unsigned long rt_nr_boosted;
@@ -396,6 +529,7 @@ struct rq {
396 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 529 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
397 unsigned char idle_at_tick; 530 unsigned char idle_at_tick;
398#ifdef CONFIG_NO_HZ 531#ifdef CONFIG_NO_HZ
532 unsigned long last_tick_seen;
399 unsigned char in_nohz_recently; 533 unsigned char in_nohz_recently;
400#endif 534#endif
401 /* capture load from *all* tasks on this cpu: */ 535 /* capture load from *all* tasks on this cpu: */
@@ -405,8 +539,6 @@ struct rq {
405 539
406 struct cfs_rq cfs; 540 struct cfs_rq cfs;
407 struct rt_rq rt; 541 struct rt_rq rt;
408 u64 rt_period_expire;
409 int rt_throttled;
410 542
411#ifdef CONFIG_FAIR_GROUP_SCHED 543#ifdef CONFIG_FAIR_GROUP_SCHED
412 /* list of leaf cfs_rq on this cpu: */ 544 /* list of leaf cfs_rq on this cpu: */
@@ -499,6 +631,32 @@ static inline int cpu_of(struct rq *rq)
499#endif 631#endif
500} 632}
501 633
634#ifdef CONFIG_NO_HZ
635static inline bool nohz_on(int cpu)
636{
637 return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE;
638}
639
640static inline u64 max_skipped_ticks(struct rq *rq)
641{
642 return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1;
643}
644
645static inline void update_last_tick_seen(struct rq *rq)
646{
647 rq->last_tick_seen = jiffies;
648}
649#else
650static inline u64 max_skipped_ticks(struct rq *rq)
651{
652 return 1;
653}
654
655static inline void update_last_tick_seen(struct rq *rq)
656{
657}
658#endif
659
502/* 660/*
503 * Update the per-runqueue clock, as finegrained as the platform can give 661 * Update the per-runqueue clock, as finegrained as the platform can give
504 * us, but without assuming monotonicity, etc.: 662 * us, but without assuming monotonicity, etc.:
@@ -523,9 +681,12 @@ static void __update_rq_clock(struct rq *rq)
523 /* 681 /*
524 * Catch too large forward jumps too: 682 * Catch too large forward jumps too:
525 */ 683 */
526 if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { 684 u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC;
527 if (clock < rq->tick_timestamp + TICK_NSEC) 685 u64 max_time = rq->tick_timestamp + max_jump;
528 clock = rq->tick_timestamp + TICK_NSEC; 686
687 if (unlikely(clock + delta > max_time)) {
688 if (clock < max_time)
689 clock = max_time;
529 else 690 else
530 clock++; 691 clock++;
531 rq->clock_overflows++; 692 rq->clock_overflows++;
@@ -561,23 +722,6 @@ static void update_rq_clock(struct rq *rq)
561#define task_rq(p) cpu_rq(task_cpu(p)) 722#define task_rq(p) cpu_rq(task_cpu(p))
562#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 723#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
563 724
564unsigned long rt_needs_cpu(int cpu)
565{
566 struct rq *rq = cpu_rq(cpu);
567 u64 delta;
568
569 if (!rq->rt_throttled)
570 return 0;
571
572 if (rq->clock > rq->rt_period_expire)
573 return 1;
574
575 delta = rq->rt_period_expire - rq->clock;
576 do_div(delta, NSEC_PER_SEC / HZ);
577
578 return (unsigned long)delta;
579}
580
581/* 725/*
582 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 726 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
583 */ 727 */
@@ -590,22 +734,137 @@ unsigned long rt_needs_cpu(int cpu)
590/* 734/*
591 * Debugging: various feature bits 735 * Debugging: various feature bits
592 */ 736 */
737
738#define SCHED_FEAT(name, enabled) \
739 __SCHED_FEAT_##name ,
740
593enum { 741enum {
594 SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, 742#include "sched_features.h"
595 SCHED_FEAT_WAKEUP_PREEMPT = 2,
596 SCHED_FEAT_START_DEBIT = 4,
597 SCHED_FEAT_HRTICK = 8,
598 SCHED_FEAT_DOUBLE_TICK = 16,
599}; 743};
600 744
745#undef SCHED_FEAT
746
747#define SCHED_FEAT(name, enabled) \
748 (1UL << __SCHED_FEAT_##name) * enabled |
749
601const_debug unsigned int sysctl_sched_features = 750const_debug unsigned int sysctl_sched_features =
602 SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | 751#include "sched_features.h"
603 SCHED_FEAT_WAKEUP_PREEMPT * 1 | 752 0;
604 SCHED_FEAT_START_DEBIT * 1 | 753
605 SCHED_FEAT_HRTICK * 1 | 754#undef SCHED_FEAT
606 SCHED_FEAT_DOUBLE_TICK * 0; 755
756#ifdef CONFIG_SCHED_DEBUG
757#define SCHED_FEAT(name, enabled) \
758 #name ,
759
760__read_mostly char *sched_feat_names[] = {
761#include "sched_features.h"
762 NULL
763};
764
765#undef SCHED_FEAT
766
767int sched_feat_open(struct inode *inode, struct file *filp)
768{
769 filp->private_data = inode->i_private;
770 return 0;
771}
772
773static ssize_t
774sched_feat_read(struct file *filp, char __user *ubuf,
775 size_t cnt, loff_t *ppos)
776{
777 char *buf;
778 int r = 0;
779 int len = 0;
780 int i;
607 781
608#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) 782 for (i = 0; sched_feat_names[i]; i++) {
783 len += strlen(sched_feat_names[i]);
784 len += 4;
785 }
786
787 buf = kmalloc(len + 2, GFP_KERNEL);
788 if (!buf)
789 return -ENOMEM;
790
791 for (i = 0; sched_feat_names[i]; i++) {
792 if (sysctl_sched_features & (1UL << i))
793 r += sprintf(buf + r, "%s ", sched_feat_names[i]);
794 else
795 r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
796 }
797
798 r += sprintf(buf + r, "\n");
799 WARN_ON(r >= len + 2);
800
801 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
802
803 kfree(buf);
804
805 return r;
806}
807
808static ssize_t
809sched_feat_write(struct file *filp, const char __user *ubuf,
810 size_t cnt, loff_t *ppos)
811{
812 char buf[64];
813 char *cmp = buf;
814 int neg = 0;
815 int i;
816
817 if (cnt > 63)
818 cnt = 63;
819
820 if (copy_from_user(&buf, ubuf, cnt))
821 return -EFAULT;
822
823 buf[cnt] = 0;
824
825 if (strncmp(buf, "NO_", 3) == 0) {
826 neg = 1;
827 cmp += 3;
828 }
829
830 for (i = 0; sched_feat_names[i]; i++) {
831 int len = strlen(sched_feat_names[i]);
832
833 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
834 if (neg)
835 sysctl_sched_features &= ~(1UL << i);
836 else
837 sysctl_sched_features |= (1UL << i);
838 break;
839 }
840 }
841
842 if (!sched_feat_names[i])
843 return -EINVAL;
844
845 filp->f_pos += cnt;
846
847 return cnt;
848}
849
850static struct file_operations sched_feat_fops = {
851 .open = sched_feat_open,
852 .read = sched_feat_read,
853 .write = sched_feat_write,
854};
855
856static __init int sched_init_debug(void)
857{
858 debugfs_create_file("sched_features", 0644, NULL, NULL,
859 &sched_feat_fops);
860
861 return 0;
862}
863late_initcall(sched_init_debug);
864
865#endif
866
867#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
609 868
610/* 869/*
611 * Number of tasks to iterate in a single balance run. 870 * Number of tasks to iterate in a single balance run.
@@ -627,16 +886,52 @@ static __read_mostly int scheduler_running;
627 */ 886 */
628int sysctl_sched_rt_runtime = 950000; 887int sysctl_sched_rt_runtime = 950000;
629 888
630/* 889static inline u64 global_rt_period(void)
631 * single value that denotes runtime == period, ie unlimited time. 890{
632 */ 891 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
633#define RUNTIME_INF ((u64)~0ULL) 892}
893
894static inline u64 global_rt_runtime(void)
895{
896 if (sysctl_sched_rt_period < 0)
897 return RUNTIME_INF;
898
899 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
900}
901
902static const unsigned long long time_sync_thresh = 100000;
903
904static DEFINE_PER_CPU(unsigned long long, time_offset);
905static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
634 906
635/* 907/*
636 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 908 * Global lock which we take every now and then to synchronize
637 * clock constructed from sched_clock(): 909 * the CPUs time. This method is not warp-safe, but it's good
910 * enough to synchronize slowly diverging time sources and thus
911 * it's good enough for tracing:
638 */ 912 */
639unsigned long long cpu_clock(int cpu) 913static DEFINE_SPINLOCK(time_sync_lock);
914static unsigned long long prev_global_time;
915
916static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
917{
918 unsigned long flags;
919
920 spin_lock_irqsave(&time_sync_lock, flags);
921
922 if (time < prev_global_time) {
923 per_cpu(time_offset, cpu) += prev_global_time - time;
924 time = prev_global_time;
925 } else {
926 prev_global_time = time;
927 }
928
929 spin_unlock_irqrestore(&time_sync_lock, flags);
930
931 return time;
932}
933
934static unsigned long long __cpu_clock(int cpu)
640{ 935{
641 unsigned long long now; 936 unsigned long long now;
642 unsigned long flags; 937 unsigned long flags;
@@ -657,6 +952,24 @@ unsigned long long cpu_clock(int cpu)
657 952
658 return now; 953 return now;
659} 954}
955
956/*
957 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
958 * clock constructed from sched_clock():
959 */
960unsigned long long cpu_clock(int cpu)
961{
962 unsigned long long prev_cpu_time, time, delta_time;
963
964 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
965 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
966 delta_time = time-prev_cpu_time;
967
968 if (unlikely(delta_time > time_sync_thresh))
969 time = __sync_cpu_clock(time, cpu);
970
971 return time;
972}
660EXPORT_SYMBOL_GPL(cpu_clock); 973EXPORT_SYMBOL_GPL(cpu_clock);
661 974
662#ifndef prepare_arch_switch 975#ifndef prepare_arch_switch
@@ -1116,6 +1429,9 @@ static void __resched_task(struct task_struct *p, int tif_bit)
1116 */ 1429 */
1117#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1430#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1118 1431
1432/*
1433 * delta *= weight / lw
1434 */
1119static unsigned long 1435static unsigned long
1120calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1436calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1121 struct load_weight *lw) 1437 struct load_weight *lw)
@@ -1138,12 +1454,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1138 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1454 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1139} 1455}
1140 1456
1141static inline unsigned long
1142calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
1143{
1144 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
1145}
1146
1147static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1457static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1148{ 1458{
1149 lw->weight += inc; 1459 lw->weight += inc;
@@ -1241,11 +1551,390 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1241static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1551static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1242#endif 1552#endif
1243 1553
1554static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1555{
1556 update_load_add(&rq->load, load);
1557}
1558
1559static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1560{
1561 update_load_sub(&rq->load, load);
1562}
1563
1244#ifdef CONFIG_SMP 1564#ifdef CONFIG_SMP
1245static unsigned long source_load(int cpu, int type); 1565static unsigned long source_load(int cpu, int type);
1246static unsigned long target_load(int cpu, int type); 1566static unsigned long target_load(int cpu, int type);
1247static unsigned long cpu_avg_load_per_task(int cpu); 1567static unsigned long cpu_avg_load_per_task(int cpu);
1248static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1568static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1569
1570#ifdef CONFIG_FAIR_GROUP_SCHED
1571
1572/*
1573 * Group load balancing.
1574 *
1575 * We calculate a few balance domain wide aggregate numbers; load and weight.
1576 * Given the pictures below, and assuming each item has equal weight:
1577 *
1578 *         root          1 - thread
1579 *        / | \          A - group
1580 *       A  1  B
1581 *      /|\   / \
1582 *     C 2 D 3   4
1583 *     |   |
1584 *     5   6
1585 *
1586 * load:
1587 * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
1588 * which equals 1/9-th of the total load.
1589 *
1590 * shares:
1591 * The weight of this group on the selected cpus.
1592 *
1593 * rq_weight:
1594 * Direct sum of the rq weights of all the cpus, e.g. A would get 3 while
1595 * B would get 2.
1596 *
1597 * task_weight:
1598 * Part of the rq_weight contributed by tasks; all groups except B would
1599 * get 1, B gets 2.
1600 */
1601
1602static inline struct aggregate_struct *
1603aggregate(struct task_group *tg, struct sched_domain *sd)
1604{
1605 return &tg->cfs_rq[sd->first_cpu]->aggregate;
1606}
1607
1608typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
1609
1610/*
1611 * Iterate the full tree, calling @down when first entering a node and @up when
1612 * leaving it for the final time.
1613 */
1614static
1615void aggregate_walk_tree(aggregate_func down, aggregate_func up,
1616 struct sched_domain *sd)
1617{
1618 struct task_group *parent, *child;
1619
1620 rcu_read_lock();
1621 parent = &root_task_group;
1622down:
1623 (*down)(parent, sd);
1624 list_for_each_entry_rcu(child, &parent->children, siblings) {
1625 parent = child;
1626 goto down;
1627
1628up:
1629 continue;
1630 }
1631 (*up)(parent, sd);
1632
1633 child = parent;
1634 parent = parent->parent;
1635 if (parent)
1636 goto up;
1637 rcu_read_unlock();
1638}
1639
1640/*
1641 * Calculate the aggregate runqueue weight.
1642 */
1643static
1644void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
1645{
1646 unsigned long rq_weight = 0;
1647 unsigned long task_weight = 0;
1648 int i;
1649
1650 for_each_cpu_mask(i, sd->span) {
1651 rq_weight += tg->cfs_rq[i]->load.weight;
1652 task_weight += tg->cfs_rq[i]->task_weight;
1653 }
1654
1655 aggregate(tg, sd)->rq_weight = rq_weight;
1656 aggregate(tg, sd)->task_weight = task_weight;
1657}
1658
1659/*
1660 * Redistribute tg->shares amongst all tg->cfs_rq[]s.
1661 */
1662static void __aggregate_redistribute_shares(struct task_group *tg)
1663{
1664 int i, max_cpu = smp_processor_id();
1665 unsigned long rq_weight = 0;
1666 unsigned long shares, max_shares = 0, shares_rem = tg->shares;
1667
1668 for_each_possible_cpu(i)
1669 rq_weight += tg->cfs_rq[i]->load.weight;
1670
1671 for_each_possible_cpu(i) {
1672 /*
1673 * divide shares proportional to the rq_weights.
1674 */
1675 shares = tg->shares * tg->cfs_rq[i]->load.weight;
1676 shares /= rq_weight + 1;
1677
1678 tg->cfs_rq[i]->shares = shares;
1679
1680 if (shares > max_shares) {
1681 max_shares = shares;
1682 max_cpu = i;
1683 }
1684 shares_rem -= shares;
1685 }
1686
1687 /*
1688 * Ensure it all adds up to tg->shares; we can lose a few
1689 * due to rounding down when computing the per-cpu shares.
1690 */
1691 if (shares_rem)
1692 tg->cfs_rq[max_cpu]->shares += shares_rem;
1693}
1694
1695/*
1696 * Compute the weight of this group on the given cpus.
1697 */
1698static
1699void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
1700{
1701 unsigned long shares = 0;
1702 int i;
1703
1704again:
1705 for_each_cpu_mask(i, sd->span)
1706 shares += tg->cfs_rq[i]->shares;
1707
1708 /*
1709 * When the span doesn't have any shares assigned, but does have
1710 * tasks to run do a machine wide rebalance (should be rare).
1711 */
1712 if (unlikely(!shares && aggregate(tg, sd)->rq_weight)) {
1713 __aggregate_redistribute_shares(tg);
1714 goto again;
1715 }
1716
1717 aggregate(tg, sd)->shares = shares;
1718}
1719
1720/*
1721 * Compute the load fraction assigned to this group, relies on the aggregate
1722 * weight and this group's parent's load, i.e. top-down.
1723 */
1724static
1725void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
1726{
1727 unsigned long load;
1728
1729 if (!tg->parent) {
1730 int i;
1731
1732 load = 0;
1733 for_each_cpu_mask(i, sd->span)
1734 load += cpu_rq(i)->load.weight;
1735
1736 } else {
1737 load = aggregate(tg->parent, sd)->load;
1738
1739 /*
1740 * shares is our weight in the parent's rq so
1741 * shares/parent->rq_weight gives our fraction of the load
1742 */
1743 load *= aggregate(tg, sd)->shares;
1744 load /= aggregate(tg->parent, sd)->rq_weight + 1;
1745 }
1746
1747 aggregate(tg, sd)->load = load;
1748}
1749
1750static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1751
1752/*
1753 * Calculate and set the cpu's group shares.
1754 */
1755static void
1756__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
1757 int tcpu)
1758{
1759 int boost = 0;
1760 unsigned long shares;
1761 unsigned long rq_weight;
1762
1763 if (!tg->se[tcpu])
1764 return;
1765
1766 rq_weight = tg->cfs_rq[tcpu]->load.weight;
1767
1768 /*
1769 * If there are currently no tasks on the cpu pretend there is one of
1770 * average load so that when a new task gets to run here it will not
1771 * get delayed by group starvation.
1772 */
1773 if (!rq_weight) {
1774 boost = 1;
1775 rq_weight = NICE_0_LOAD;
1776 }
1777
1778 /*
1779 * \Sum shares * rq_weight
1780 * shares = -----------------------
1781 * \Sum rq_weight
1782 *
1783 */
1784 shares = aggregate(tg, sd)->shares * rq_weight;
1785 shares /= aggregate(tg, sd)->rq_weight + 1;
1786
1787 /*
1788 * record the actual number of shares, not the boosted amount.
1789 */
1790 tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
1791
1792 if (shares < MIN_SHARES)
1793 shares = MIN_SHARES;
1794
1795 __set_se_shares(tg->se[tcpu], shares);
1796}
1797
1798/*
1799 * Re-adjust the weights on the cpu the task came from and on the cpu the
1800 * task went to.
1801 */
1802static void
1803__move_group_shares(struct task_group *tg, struct sched_domain *sd,
1804 int scpu, int dcpu)
1805{
1806 unsigned long shares;
1807
1808 shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
1809
1810 __update_group_shares_cpu(tg, sd, scpu);
1811 __update_group_shares_cpu(tg, sd, dcpu);
1812
1813 /*
1814 * ensure we never lose shares due to rounding errors in the
1815 * above redistribution.
1816 */
1817 shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
1818 if (shares)
1819 tg->cfs_rq[dcpu]->shares += shares;
1820}
1821
1822/*
1823 * Because changing a group's shares changes the weight of the super-group
1824 * we need to walk up the tree and change all shares until we hit the root.
1825 */
1826static void
1827move_group_shares(struct task_group *tg, struct sched_domain *sd,
1828 int scpu, int dcpu)
1829{
1830 while (tg) {
1831 __move_group_shares(tg, sd, scpu, dcpu);
1832 tg = tg->parent;
1833 }
1834}
1835
1836static
1837void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
1838{
1839 unsigned long shares = aggregate(tg, sd)->shares;
1840 int i;
1841
1842 for_each_cpu_mask(i, sd->span) {
1843 struct rq *rq = cpu_rq(i);
1844 unsigned long flags;
1845
1846 spin_lock_irqsave(&rq->lock, flags);
1847 __update_group_shares_cpu(tg, sd, i);
1848 spin_unlock_irqrestore(&rq->lock, flags);
1849 }
1850
1851 aggregate_group_shares(tg, sd);
1852
1853 /*
1854 * ensure we never lose shares due to rounding errors in the
1855 * above redistribution.
1856 */
1857 shares -= aggregate(tg, sd)->shares;
1858 if (shares) {
1859 tg->cfs_rq[sd->first_cpu]->shares += shares;
1860 aggregate(tg, sd)->shares += shares;
1861 }
1862}
1863
1864/*
1865 * Calculate the accumulative weight and recursive load of each task group
1866 * while walking down the tree.
1867 */
1868static
1869void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
1870{
1871 aggregate_group_weight(tg, sd);
1872 aggregate_group_shares(tg, sd);
1873 aggregate_group_load(tg, sd);
1874}
1875
1876/*
1877 * Rebalance the cpu shares while walking back up the tree.
1878 */
1879static
1880void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
1881{
1882 aggregate_group_set_shares(tg, sd);
1883}
1884
1885static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
1886
1887static void __init init_aggregate(void)
1888{
1889 int i;
1890
1891 for_each_possible_cpu(i)
1892 spin_lock_init(&per_cpu(aggregate_lock, i));
1893}
1894
1895static int get_aggregate(struct sched_domain *sd)
1896{
1897 if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
1898 return 0;
1899
1900 aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
1901 return 1;
1902}
1903
1904static void put_aggregate(struct sched_domain *sd)
1905{
1906 spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
1907}
1908
1909static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1910{
1911 cfs_rq->shares = shares;
1912}
1913
1914#else
1915
1916static inline void init_aggregate(void)
1917{
1918}
1919
1920static inline int get_aggregate(struct sched_domain *sd)
1921{
1922 return 0;
1923}
1924
1925static inline void put_aggregate(struct sched_domain *sd)
1926{
1927}
1928#endif
1929
1930#else /* CONFIG_SMP */
1931
1932#ifdef CONFIG_FAIR_GROUP_SCHED
1933static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1934{
1935}
1936#endif
1937
1249#endif /* CONFIG_SMP */ 1938#endif /* CONFIG_SMP */
1250 1939
1251#include "sched_stats.h" 1940#include "sched_stats.h"
@@ -1258,26 +1947,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1258 1947
1259#define sched_class_highest (&rt_sched_class) 1948#define sched_class_highest (&rt_sched_class)
1260 1949
1261static inline void inc_load(struct rq *rq, const struct task_struct *p) 1950static void inc_nr_running(struct rq *rq)
1262{
1263 update_load_add(&rq->load, p->se.load.weight);
1264}
1265
1266static inline void dec_load(struct rq *rq, const struct task_struct *p)
1267{
1268 update_load_sub(&rq->load, p->se.load.weight);
1269}
1270
1271static void inc_nr_running(struct task_struct *p, struct rq *rq)
1272{ 1951{
1273 rq->nr_running++; 1952 rq->nr_running++;
1274 inc_load(rq, p);
1275} 1953}
1276 1954
1277static void dec_nr_running(struct task_struct *p, struct rq *rq) 1955static void dec_nr_running(struct rq *rq)
1278{ 1956{
1279 rq->nr_running--; 1957 rq->nr_running--;
1280 dec_load(rq, p);
1281} 1958}
1282 1959
1283static void set_load_weight(struct task_struct *p) 1960static void set_load_weight(struct task_struct *p)
@@ -1369,7 +2046,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1369 rq->nr_uninterruptible--; 2046 rq->nr_uninterruptible--;
1370 2047
1371 enqueue_task(rq, p, wakeup); 2048 enqueue_task(rq, p, wakeup);
1372 inc_nr_running(p, rq); 2049 inc_nr_running(rq);
1373} 2050}
1374 2051
1375/* 2052/*
@@ -1381,7 +2058,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1381 rq->nr_uninterruptible++; 2058 rq->nr_uninterruptible++;
1382 2059
1383 dequeue_task(rq, p, sleep); 2060 dequeue_task(rq, p, sleep);
1384 dec_nr_running(p, rq); 2061 dec_nr_running(rq);
1385} 2062}
1386 2063
1387/** 2064/**
@@ -1438,7 +2115,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1438 /* 2115 /*
1439 * Buddy candidates are cache hot: 2116 * Buddy candidates are cache hot:
1440 */ 2117 */
1441 if (&p->se == cfs_rq_of(&p->se)->next) 2118 if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
1442 return 1; 2119 return 1;
1443 2120
1444 if (p->sched_class != &fair_sched_class) 2121 if (p->sched_class != &fair_sched_class)
@@ -1728,17 +2405,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1728 * find_idlest_cpu - find the idlest cpu among the cpus in group. 2405 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1729 */ 2406 */
1730static int 2407static int
1731find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 2408find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
2409 cpumask_t *tmp)
1732{ 2410{
1733 cpumask_t tmp;
1734 unsigned long load, min_load = ULONG_MAX; 2411 unsigned long load, min_load = ULONG_MAX;
1735 int idlest = -1; 2412 int idlest = -1;
1736 int i; 2413 int i;
1737 2414
1738 /* Traverse only the allowed CPUs */ 2415 /* Traverse only the allowed CPUs */
1739 cpus_and(tmp, group->cpumask, p->cpus_allowed); 2416 cpus_and(*tmp, group->cpumask, p->cpus_allowed);
1740 2417
1741 for_each_cpu_mask(i, tmp) { 2418 for_each_cpu_mask(i, *tmp) {
1742 load = weighted_cpuload(i); 2419 load = weighted_cpuload(i);
1743 2420
1744 if (load < min_load || (load == min_load && i == this_cpu)) { 2421 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -1777,7 +2454,7 @@ static int sched_balance_self(int cpu, int flag)
1777 } 2454 }
1778 2455
1779 while (sd) { 2456 while (sd) {
1780 cpumask_t span; 2457 cpumask_t span, tmpmask;
1781 struct sched_group *group; 2458 struct sched_group *group;
1782 int new_cpu, weight; 2459 int new_cpu, weight;
1783 2460
@@ -1793,7 +2470,7 @@ static int sched_balance_self(int cpu, int flag)
1793 continue; 2470 continue;
1794 } 2471 }
1795 2472
1796 new_cpu = find_idlest_cpu(group, t, cpu); 2473 new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
1797 if (new_cpu == -1 || new_cpu == cpu) { 2474 if (new_cpu == -1 || new_cpu == cpu) {
1798 /* Now try balancing at a lower domain level of cpu */ 2475 /* Now try balancing at a lower domain level of cpu */
1799 sd = sd->child; 2476 sd = sd->child;
@@ -1839,6 +2516,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1839 long old_state; 2516 long old_state;
1840 struct rq *rq; 2517 struct rq *rq;
1841 2518
2519 if (!sched_feat(SYNC_WAKEUPS))
2520 sync = 0;
2521
1842 smp_wmb(); 2522 smp_wmb();
1843 rq = task_rq_lock(p, &flags); 2523 rq = task_rq_lock(p, &flags);
1844 old_state = p->state; 2524 old_state = p->state;
@@ -1955,6 +2635,7 @@ static void __sched_fork(struct task_struct *p)
1955 2635
1956 INIT_LIST_HEAD(&p->rt.run_list); 2636 INIT_LIST_HEAD(&p->rt.run_list);
1957 p->se.on_rq = 0; 2637 p->se.on_rq = 0;
2638 INIT_LIST_HEAD(&p->se.group_node);
1958 2639
1959#ifdef CONFIG_PREEMPT_NOTIFIERS 2640#ifdef CONFIG_PREEMPT_NOTIFIERS
1960 INIT_HLIST_HEAD(&p->preempt_notifiers); 2641 INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2030,7 +2711,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2030 * management (if any): 2711 * management (if any):
2031 */ 2712 */
2032 p->sched_class->task_new(rq, p); 2713 p->sched_class->task_new(rq, p);
2033 inc_nr_running(p, rq); 2714 inc_nr_running(rq);
2034 } 2715 }
2035 check_preempt_curr(rq, p); 2716 check_preempt_curr(rq, p);
2036#ifdef CONFIG_SMP 2717#ifdef CONFIG_SMP
@@ -2674,7 +3355,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2674static struct sched_group * 3355static struct sched_group *
2675find_busiest_group(struct sched_domain *sd, int this_cpu, 3356find_busiest_group(struct sched_domain *sd, int this_cpu,
2676 unsigned long *imbalance, enum cpu_idle_type idle, 3357 unsigned long *imbalance, enum cpu_idle_type idle,
2677 int *sd_idle, cpumask_t *cpus, int *balance) 3358 int *sd_idle, const cpumask_t *cpus, int *balance)
2678{ 3359{
2679 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 3360 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2680 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 3361 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2975,7 +3656,7 @@ ret:
2975 */ 3656 */
2976static struct rq * 3657static struct rq *
2977find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 3658find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2978 unsigned long imbalance, cpumask_t *cpus) 3659 unsigned long imbalance, const cpumask_t *cpus)
2979{ 3660{
2980 struct rq *busiest = NULL, *rq; 3661 struct rq *busiest = NULL, *rq;
2981 unsigned long max_load = 0; 3662 unsigned long max_load = 0;
@@ -3014,14 +3695,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3014 */ 3695 */
3015static int load_balance(int this_cpu, struct rq *this_rq, 3696static int load_balance(int this_cpu, struct rq *this_rq,
3016 struct sched_domain *sd, enum cpu_idle_type idle, 3697 struct sched_domain *sd, enum cpu_idle_type idle,
3017 int *balance) 3698 int *balance, cpumask_t *cpus)
3018{ 3699{
3019 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3700 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3020 struct sched_group *group; 3701 struct sched_group *group;
3021 unsigned long imbalance; 3702 unsigned long imbalance;
3022 struct rq *busiest; 3703 struct rq *busiest;
3023 cpumask_t cpus = CPU_MASK_ALL;
3024 unsigned long flags; 3704 unsigned long flags;
3705 int unlock_aggregate;
3706
3707 cpus_setall(*cpus);
3708
3709 unlock_aggregate = get_aggregate(sd);
3025 3710
3026 /* 3711 /*
3027 * When power savings policy is enabled for the parent domain, idle 3712 * When power savings policy is enabled for the parent domain, idle
@@ -3037,7 +3722,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3037 3722
3038redo: 3723redo:
3039 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3724 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3040 &cpus, balance); 3725 cpus, balance);
3041 3726
3042 if (*balance == 0) 3727 if (*balance == 0)
3043 goto out_balanced; 3728 goto out_balanced;
@@ -3047,7 +3732,7 @@ redo:
3047 goto out_balanced; 3732 goto out_balanced;
3048 } 3733 }
3049 3734
3050 busiest = find_busiest_queue(group, idle, imbalance, &cpus); 3735 busiest = find_busiest_queue(group, idle, imbalance, cpus);
3051 if (!busiest) { 3736 if (!busiest) {
3052 schedstat_inc(sd, lb_nobusyq[idle]); 3737 schedstat_inc(sd, lb_nobusyq[idle]);
3053 goto out_balanced; 3738 goto out_balanced;
@@ -3080,8 +3765,8 @@ redo:
3080 3765
3081 /* All tasks on this runqueue were pinned by CPU affinity */ 3766 /* All tasks on this runqueue were pinned by CPU affinity */
3082 if (unlikely(all_pinned)) { 3767 if (unlikely(all_pinned)) {
3083 cpu_clear(cpu_of(busiest), cpus); 3768 cpu_clear(cpu_of(busiest), *cpus);
3084 if (!cpus_empty(cpus)) 3769 if (!cpus_empty(*cpus))
3085 goto redo; 3770 goto redo;
3086 goto out_balanced; 3771 goto out_balanced;
3087 } 3772 }
@@ -3138,8 +3823,9 @@ redo:
3138 3823
3139 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3824 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3140 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3825 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3141 return -1; 3826 ld_moved = -1;
3142 return ld_moved; 3827
3828 goto out;
3143 3829
3144out_balanced: 3830out_balanced:
3145 schedstat_inc(sd, lb_balanced[idle]); 3831 schedstat_inc(sd, lb_balanced[idle]);
@@ -3154,8 +3840,13 @@ out_one_pinned:
3154 3840
3155 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3841 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3156 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3842 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3157 return -1; 3843 ld_moved = -1;
3158 return 0; 3844 else
3845 ld_moved = 0;
3846out:
3847 if (unlock_aggregate)
3848 put_aggregate(sd);
3849 return ld_moved;
3159} 3850}
3160 3851
3161/* 3852/*
@@ -3166,7 +3857,8 @@ out_one_pinned:
3166 * this_rq is locked. 3857 * this_rq is locked.
3167 */ 3858 */
3168static int 3859static int
3169load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) 3860load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3861 cpumask_t *cpus)
3170{ 3862{
3171 struct sched_group *group; 3863 struct sched_group *group;
3172 struct rq *busiest = NULL; 3864 struct rq *busiest = NULL;
@@ -3174,7 +3866,8 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
3174 int ld_moved = 0; 3866 int ld_moved = 0;
3175 int sd_idle = 0; 3867 int sd_idle = 0;
3176 int all_pinned = 0; 3868 int all_pinned = 0;
3177 cpumask_t cpus = CPU_MASK_ALL; 3869
3870 cpus_setall(*cpus);
3178 3871
3179 /* 3872 /*
3180 * When power savings policy is enabled for the parent domain, idle 3873 * When power savings policy is enabled for the parent domain, idle
@@ -3189,14 +3882,13 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
3189 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); 3882 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
3190redo: 3883redo:
3191 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 3884 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
3192 &sd_idle, &cpus, NULL); 3885 &sd_idle, cpus, NULL);
3193 if (!group) { 3886 if (!group) {
3194 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); 3887 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
3195 goto out_balanced; 3888 goto out_balanced;
3196 } 3889 }
3197 3890
3198 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, 3891 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
3199 &cpus);
3200 if (!busiest) { 3892 if (!busiest) {
3201 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); 3893 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
3202 goto out_balanced; 3894 goto out_balanced;
@@ -3218,8 +3910,8 @@ redo:
3218 spin_unlock(&busiest->lock); 3910 spin_unlock(&busiest->lock);
3219 3911
3220 if (unlikely(all_pinned)) { 3912 if (unlikely(all_pinned)) {
3221 cpu_clear(cpu_of(busiest), cpus); 3913 cpu_clear(cpu_of(busiest), *cpus);
3222 if (!cpus_empty(cpus)) 3914 if (!cpus_empty(*cpus))
3223 goto redo; 3915 goto redo;
3224 } 3916 }
3225 } 3917 }
@@ -3253,6 +3945,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3253 struct sched_domain *sd; 3945 struct sched_domain *sd;
3254 int pulled_task = -1; 3946 int pulled_task = -1;
3255 unsigned long next_balance = jiffies + HZ; 3947 unsigned long next_balance = jiffies + HZ;
3948 cpumask_t tmpmask;
3256 3949
3257 for_each_domain(this_cpu, sd) { 3950 for_each_domain(this_cpu, sd) {
3258 unsigned long interval; 3951 unsigned long interval;
@@ -3262,8 +3955,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3262 3955
3263 if (sd->flags & SD_BALANCE_NEWIDLE) 3956 if (sd->flags & SD_BALANCE_NEWIDLE)
3264 /* If we've pulled tasks over stop searching: */ 3957 /* If we've pulled tasks over stop searching: */
3265 pulled_task = load_balance_newidle(this_cpu, 3958 pulled_task = load_balance_newidle(this_cpu, this_rq,
3266 this_rq, sd); 3959 sd, &tmpmask);
3267 3960
3268 interval = msecs_to_jiffies(sd->balance_interval); 3961 interval = msecs_to_jiffies(sd->balance_interval);
3269 if (time_after(next_balance, sd->last_balance + interval)) 3962 if (time_after(next_balance, sd->last_balance + interval))
@@ -3422,6 +4115,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3422 /* Earliest time when we have to do rebalance again */ 4115 /* Earliest time when we have to do rebalance again */
3423 unsigned long next_balance = jiffies + 60*HZ; 4116 unsigned long next_balance = jiffies + 60*HZ;
3424 int update_next_balance = 0; 4117 int update_next_balance = 0;
4118 cpumask_t tmp;
3425 4119
3426 for_each_domain(cpu, sd) { 4120 for_each_domain(cpu, sd) {
3427 if (!(sd->flags & SD_LOAD_BALANCE)) 4121 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3445,7 +4139,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3445 } 4139 }
3446 4140
3447 if (time_after_eq(jiffies, sd->last_balance + interval)) { 4141 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3448 if (load_balance(cpu, rq, sd, idle, &balance)) { 4142 if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
3449 /* 4143 /*
3450 * We've pulled tasks over so either we're no 4144 * We've pulled tasks over so either we're no
3451 * longer idle, or one of our SMT siblings is 4145 * longer idle, or one of our SMT siblings is
@@ -3561,7 +4255,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
3561 */ 4255 */
3562 int ilb = first_cpu(nohz.cpu_mask); 4256 int ilb = first_cpu(nohz.cpu_mask);
3563 4257
3564 if (ilb != NR_CPUS) 4258 if (ilb < nr_cpu_ids)
3565 resched_cpu(ilb); 4259 resched_cpu(ilb);
3566 } 4260 }
3567 } 4261 }
@@ -3765,9 +4459,9 @@ void scheduler_tick(void)
3765 rq->clock_underflows++; 4459 rq->clock_underflows++;
3766 } 4460 }
3767 rq->tick_timestamp = rq->clock; 4461 rq->tick_timestamp = rq->clock;
4462 update_last_tick_seen(rq);
3768 update_cpu_load(rq); 4463 update_cpu_load(rq);
3769 curr->sched_class->task_tick(rq, curr, 0); 4464 curr->sched_class->task_tick(rq, curr, 0);
3770 update_sched_rt_period(rq);
3771 spin_unlock(&rq->lock); 4465 spin_unlock(&rq->lock);
3772 4466
3773#ifdef CONFIG_SMP 4467#ifdef CONFIG_SMP
@@ -4367,10 +5061,8 @@ void set_user_nice(struct task_struct *p, long nice)
4367 goto out_unlock; 5061 goto out_unlock;
4368 } 5062 }
4369 on_rq = p->se.on_rq; 5063 on_rq = p->se.on_rq;
4370 if (on_rq) { 5064 if (on_rq)
4371 dequeue_task(rq, p, 0); 5065 dequeue_task(rq, p, 0);
4372 dec_load(rq, p);
4373 }
4374 5066
4375 p->static_prio = NICE_TO_PRIO(nice); 5067 p->static_prio = NICE_TO_PRIO(nice);
4376 set_load_weight(p); 5068 set_load_weight(p);
@@ -4380,7 +5072,6 @@ void set_user_nice(struct task_struct *p, long nice)
4380 5072
4381 if (on_rq) { 5073 if (on_rq) {
4382 enqueue_task(rq, p, 0); 5074 enqueue_task(rq, p, 0);
4383 inc_load(rq, p);
4384 /* 5075 /*
4385 * If the task increased its priority or is running and 5076 * If the task increased its priority or is running and
4386 * lowered its priority, then reschedule its CPU: 5077 * lowered its priority, then reschedule its CPU:
@@ -4602,7 +5293,7 @@ recheck:
4602 * Do not allow realtime tasks into groups that have no runtime 5293 * Do not allow realtime tasks into groups that have no runtime
4603 * assigned. 5294 * assigned.
4604 */ 5295 */
4605 if (rt_policy(policy) && task_group(p)->rt_runtime == 0) 5296 if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
4606 return -EPERM; 5297 return -EPERM;
4607#endif 5298#endif
4608 5299
@@ -4764,9 +5455,10 @@ out_unlock:
4764 return retval; 5455 return retval;
4765} 5456}
4766 5457
4767long sched_setaffinity(pid_t pid, cpumask_t new_mask) 5458long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
4768{ 5459{
4769 cpumask_t cpus_allowed; 5460 cpumask_t cpus_allowed;
5461 cpumask_t new_mask = *in_mask;
4770 struct task_struct *p; 5462 struct task_struct *p;
4771 int retval; 5463 int retval;
4772 5464
@@ -4797,13 +5489,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4797 if (retval) 5489 if (retval)
4798 goto out_unlock; 5490 goto out_unlock;
4799 5491
4800 cpus_allowed = cpuset_cpus_allowed(p); 5492 cpuset_cpus_allowed(p, &cpus_allowed);
4801 cpus_and(new_mask, new_mask, cpus_allowed); 5493 cpus_and(new_mask, new_mask, cpus_allowed);
4802 again: 5494 again:
4803 retval = set_cpus_allowed(p, new_mask); 5495 retval = set_cpus_allowed_ptr(p, &new_mask);
4804 5496
4805 if (!retval) { 5497 if (!retval) {
4806 cpus_allowed = cpuset_cpus_allowed(p); 5498 cpuset_cpus_allowed(p, &cpus_allowed);
4807 if (!cpus_subset(new_mask, cpus_allowed)) { 5499 if (!cpus_subset(new_mask, cpus_allowed)) {
4808 /* 5500 /*
4809 * We must have raced with a concurrent cpuset 5501 * We must have raced with a concurrent cpuset
@@ -4847,7 +5539,7 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4847 if (retval) 5539 if (retval)
4848 return retval; 5540 return retval;
4849 5541
4850 return sched_setaffinity(pid, new_mask); 5542 return sched_setaffinity(pid, &new_mask);
4851} 5543}
4852 5544
4853/* 5545/*
@@ -5309,7 +6001,6 @@ static inline void sched_init_granularity(void)
5309 sysctl_sched_latency = limit; 6001 sysctl_sched_latency = limit;
5310 6002
5311 sysctl_sched_wakeup_granularity *= factor; 6003 sysctl_sched_wakeup_granularity *= factor;
5312 sysctl_sched_batch_wakeup_granularity *= factor;
5313} 6004}
5314 6005
5315#ifdef CONFIG_SMP 6006#ifdef CONFIG_SMP
@@ -5338,7 +6029,7 @@ static inline void sched_init_granularity(void)
5338 * task must not exit() & deallocate itself prematurely. The 6029 * task must not exit() & deallocate itself prematurely. The
5339 * call is not atomic; no spinlocks may be held. 6030 * call is not atomic; no spinlocks may be held.
5340 */ 6031 */
5341int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) 6032int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5342{ 6033{
5343 struct migration_req req; 6034 struct migration_req req;
5344 unsigned long flags; 6035 unsigned long flags;
@@ -5346,23 +6037,23 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
5346 int ret = 0; 6037 int ret = 0;
5347 6038
5348 rq = task_rq_lock(p, &flags); 6039 rq = task_rq_lock(p, &flags);
5349 if (!cpus_intersects(new_mask, cpu_online_map)) { 6040 if (!cpus_intersects(*new_mask, cpu_online_map)) {
5350 ret = -EINVAL; 6041 ret = -EINVAL;
5351 goto out; 6042 goto out;
5352 } 6043 }
5353 6044
5354 if (p->sched_class->set_cpus_allowed) 6045 if (p->sched_class->set_cpus_allowed)
5355 p->sched_class->set_cpus_allowed(p, &new_mask); 6046 p->sched_class->set_cpus_allowed(p, new_mask);
5356 else { 6047 else {
5357 p->cpus_allowed = new_mask; 6048 p->cpus_allowed = *new_mask;
5358 p->rt.nr_cpus_allowed = cpus_weight(new_mask); 6049 p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
5359 } 6050 }
5360 6051
5361 /* Can the task run on the task's current CPU? If so, we're done */ 6052 /* Can the task run on the task's current CPU? If so, we're done */
5362 if (cpu_isset(task_cpu(p), new_mask)) 6053 if (cpu_isset(task_cpu(p), *new_mask))
5363 goto out; 6054 goto out;
5364 6055
5365 if (migrate_task(p, any_online_cpu(new_mask), &req)) { 6056 if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
5366 /* Need help from migration thread: drop lock and wait. */ 6057 /* Need help from migration thread: drop lock and wait. */
5367 task_rq_unlock(rq, &flags); 6058 task_rq_unlock(rq, &flags);
5368 wake_up_process(rq->migration_thread); 6059 wake_up_process(rq->migration_thread);
@@ -5375,7 +6066,7 @@ out:
5375 6066
5376 return ret; 6067 return ret;
5377} 6068}
5378EXPORT_SYMBOL_GPL(set_cpus_allowed); 6069EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
5379 6070
5380/* 6071/*
5381 * Move (not current) task off this cpu, onto dest cpu. We're doing 6072 * Move (not current) task off this cpu, onto dest cpu. We're doing
@@ -5513,12 +6204,14 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5513 dest_cpu = any_online_cpu(mask); 6204 dest_cpu = any_online_cpu(mask);
5514 6205
5515 /* On any allowed CPU? */ 6206 /* On any allowed CPU? */
5516 if (dest_cpu == NR_CPUS) 6207 if (dest_cpu >= nr_cpu_ids)
5517 dest_cpu = any_online_cpu(p->cpus_allowed); 6208 dest_cpu = any_online_cpu(p->cpus_allowed);
5518 6209
5519 /* No more Mr. Nice Guy. */ 6210 /* No more Mr. Nice Guy. */
5520 if (dest_cpu == NR_CPUS) { 6211 if (dest_cpu >= nr_cpu_ids) {
5521 cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p); 6212 cpumask_t cpus_allowed;
6213
6214 cpuset_cpus_allowed_locked(p, &cpus_allowed);
5522 /* 6215 /*
5523 * Try to stay on the same cpuset, where the 6216 * Try to stay on the same cpuset, where the
5524 * current cpuset may be a subset of all cpus. 6217 * current cpuset may be a subset of all cpus.
@@ -5554,7 +6247,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5554 */ 6247 */
5555static void migrate_nr_uninterruptible(struct rq *rq_src) 6248static void migrate_nr_uninterruptible(struct rq *rq_src)
5556{ 6249{
5557 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); 6250 struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
5558 unsigned long flags; 6251 unsigned long flags;
5559 6252
5560 local_irq_save(flags); 6253 local_irq_save(flags);
@@ -5966,20 +6659,16 @@ void __init migration_init(void)
5966 6659
5967#ifdef CONFIG_SMP 6660#ifdef CONFIG_SMP
5968 6661
5969/* Number of possible processor ids */
5970int nr_cpu_ids __read_mostly = NR_CPUS;
5971EXPORT_SYMBOL(nr_cpu_ids);
5972
5973#ifdef CONFIG_SCHED_DEBUG 6662#ifdef CONFIG_SCHED_DEBUG
5974 6663
5975static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) 6664static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6665 cpumask_t *groupmask)
5976{ 6666{
5977 struct sched_group *group = sd->groups; 6667 struct sched_group *group = sd->groups;
5978 cpumask_t groupmask; 6668 char str[256];
5979 char str[NR_CPUS];
5980 6669
5981 cpumask_scnprintf(str, NR_CPUS, sd->span); 6670 cpulist_scnprintf(str, sizeof(str), sd->span);
5982 cpus_clear(groupmask); 6671 cpus_clear(*groupmask);
5983 6672
5984 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6673 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5985 6674
@@ -6023,25 +6712,25 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level)
6023 break; 6712 break;
6024 } 6713 }
6025 6714
6026 if (cpus_intersects(groupmask, group->cpumask)) { 6715 if (cpus_intersects(*groupmask, group->cpumask)) {
6027 printk(KERN_CONT "\n"); 6716 printk(KERN_CONT "\n");
6028 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6717 printk(KERN_ERR "ERROR: repeated CPUs\n");
6029 break; 6718 break;
6030 } 6719 }
6031 6720
6032 cpus_or(groupmask, groupmask, group->cpumask); 6721 cpus_or(*groupmask, *groupmask, group->cpumask);
6033 6722
6034 cpumask_scnprintf(str, NR_CPUS, group->cpumask); 6723 cpulist_scnprintf(str, sizeof(str), group->cpumask);
6035 printk(KERN_CONT " %s", str); 6724 printk(KERN_CONT " %s", str);
6036 6725
6037 group = group->next; 6726 group = group->next;
6038 } while (group != sd->groups); 6727 } while (group != sd->groups);
6039 printk(KERN_CONT "\n"); 6728 printk(KERN_CONT "\n");
6040 6729
6041 if (!cpus_equal(sd->span, groupmask)) 6730 if (!cpus_equal(sd->span, *groupmask))
6042 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6731 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6043 6732
6044 if (sd->parent && !cpus_subset(groupmask, sd->parent->span)) 6733 if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
6045 printk(KERN_ERR "ERROR: parent span is not a superset " 6734 printk(KERN_ERR "ERROR: parent span is not a superset "
6046 "of domain->span\n"); 6735 "of domain->span\n");
6047 return 0; 6736 return 0;
@@ -6049,6 +6738,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level)
6049 6738
6050static void sched_domain_debug(struct sched_domain *sd, int cpu) 6739static void sched_domain_debug(struct sched_domain *sd, int cpu)
6051{ 6740{
6741 cpumask_t *groupmask;
6052 int level = 0; 6742 int level = 0;
6053 6743
6054 if (!sd) { 6744 if (!sd) {
@@ -6058,14 +6748,21 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6058 6748
6059 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6749 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6060 6750
6751 groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
6752 if (!groupmask) {
6753 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6754 return;
6755 }
6756
6061 for (;;) { 6757 for (;;) {
6062 if (sched_domain_debug_one(sd, cpu, level)) 6758 if (sched_domain_debug_one(sd, cpu, level, groupmask))
6063 break; 6759 break;
6064 level++; 6760 level++;
6065 sd = sd->parent; 6761 sd = sd->parent;
6066 if (!sd) 6762 if (!sd)
6067 break; 6763 break;
6068 } 6764 }
6765 kfree(groupmask);
6069} 6766}
6070#else 6767#else
6071# define sched_domain_debug(sd, cpu) do { } while (0) 6768# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6253,30 +6950,33 @@ __setup("isolcpus=", isolated_cpu_setup);
6253 * and ->cpu_power to 0. 6950 * and ->cpu_power to 0.
6254 */ 6951 */
6255static void 6952static void
6256init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, 6953init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
6257 int (*group_fn)(int cpu, const cpumask_t *cpu_map, 6954 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
6258 struct sched_group **sg)) 6955 struct sched_group **sg,
6956 cpumask_t *tmpmask),
6957 cpumask_t *covered, cpumask_t *tmpmask)
6259{ 6958{
6260 struct sched_group *first = NULL, *last = NULL; 6959 struct sched_group *first = NULL, *last = NULL;
6261 cpumask_t covered = CPU_MASK_NONE;
6262 int i; 6960 int i;
6263 6961
6264 for_each_cpu_mask(i, span) { 6962 cpus_clear(*covered);
6963
6964 for_each_cpu_mask(i, *span) {
6265 struct sched_group *sg; 6965 struct sched_group *sg;
6266 int group = group_fn(i, cpu_map, &sg); 6966 int group = group_fn(i, cpu_map, &sg, tmpmask);
6267 int j; 6967 int j;
6268 6968
6269 if (cpu_isset(i, covered)) 6969 if (cpu_isset(i, *covered))
6270 continue; 6970 continue;
6271 6971
6272 sg->cpumask = CPU_MASK_NONE; 6972 cpus_clear(sg->cpumask);
6273 sg->__cpu_power = 0; 6973 sg->__cpu_power = 0;
6274 6974
6275 for_each_cpu_mask(j, span) { 6975 for_each_cpu_mask(j, *span) {
6276 if (group_fn(j, cpu_map, NULL) != group) 6976 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6277 continue; 6977 continue;
6278 6978
6279 cpu_set(j, covered); 6979 cpu_set(j, *covered);
6280 cpu_set(j, sg->cpumask); 6980 cpu_set(j, sg->cpumask);
6281 } 6981 }
6282 if (!first) 6982 if (!first)
@@ -6302,7 +7002,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
6302 * 7002 *
6303 * Should use nodemask_t. 7003 * Should use nodemask_t.
6304 */ 7004 */
6305static int find_next_best_node(int node, unsigned long *used_nodes) 7005static int find_next_best_node(int node, nodemask_t *used_nodes)
6306{ 7006{
6307 int i, n, val, min_val, best_node = 0; 7007 int i, n, val, min_val, best_node = 0;
6308 7008
@@ -6316,7 +7016,7 @@ static int find_next_best_node(int node, unsigned long *used_nodes)
6316 continue; 7016 continue;
6317 7017
6318 /* Skip already used nodes */ 7018 /* Skip already used nodes */
6319 if (test_bit(n, used_nodes)) 7019 if (node_isset(n, *used_nodes))
6320 continue; 7020 continue;
6321 7021
6322 /* Simple min distance search */ 7022 /* Simple min distance search */
@@ -6328,40 +7028,36 @@ static int find_next_best_node(int node, unsigned long *used_nodes)
6328 } 7028 }
6329 } 7029 }
6330 7030
6331 set_bit(best_node, used_nodes); 7031 node_set(best_node, *used_nodes);
6332 return best_node; 7032 return best_node;
6333} 7033}
6334 7034
6335/** 7035/**
6336 * sched_domain_node_span - get a cpumask for a node's sched_domain 7036 * sched_domain_node_span - get a cpumask for a node's sched_domain
6337 * @node: node whose cpumask we're constructing 7037 * @node: node whose cpumask we're constructing
6338 * @size: number of nodes to include in this span
6339 * 7038 *
6340 * Given a node, construct a good cpumask for its sched_domain to span. It 7039 * Given a node, construct a good cpumask for its sched_domain to span. It
6341 * should be one that prevents unnecessary balancing, but also spreads tasks 7040 * should be one that prevents unnecessary balancing, but also spreads tasks
6342 * out optimally. 7041 * out optimally.
6343 */ 7042 */
6344static cpumask_t sched_domain_node_span(int node) 7043static void sched_domain_node_span(int node, cpumask_t *span)
6345{ 7044{
6346 DECLARE_BITMAP(used_nodes, MAX_NUMNODES); 7045 nodemask_t used_nodes;
6347 cpumask_t span, nodemask; 7046 node_to_cpumask_ptr(nodemask, node);
6348 int i; 7047 int i;
6349 7048
6350 cpus_clear(span); 7049 cpus_clear(*span);
6351 bitmap_zero(used_nodes, MAX_NUMNODES); 7050 nodes_clear(used_nodes);
6352 7051
6353 nodemask = node_to_cpumask(node); 7052 cpus_or(*span, *span, *nodemask);
6354 cpus_or(span, span, nodemask); 7053 node_set(node, used_nodes);
6355 set_bit(node, used_nodes);
6356 7054
6357 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 7055 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6358 int next_node = find_next_best_node(node, used_nodes); 7056 int next_node = find_next_best_node(node, &used_nodes);
6359 7057
6360 nodemask = node_to_cpumask(next_node); 7058 node_to_cpumask_ptr_next(nodemask, next_node);
6361 cpus_or(span, span, nodemask); 7059 cpus_or(*span, *span, *nodemask);
6362 } 7060 }
6363
6364 return span;
6365} 7061}
6366#endif 7062#endif
6367 7063
@@ -6375,7 +7071,8 @@ static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
6375static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); 7071static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
6376 7072
6377static int 7073static int
6378cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) 7074cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7075 cpumask_t *unused)
6379{ 7076{
6380 if (sg) 7077 if (sg)
6381 *sg = &per_cpu(sched_group_cpus, cpu); 7078 *sg = &per_cpu(sched_group_cpus, cpu);
@@ -6393,19 +7090,22 @@ static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6393 7090
6394#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 7091#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6395static int 7092static int
6396cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) 7093cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7094 cpumask_t *mask)
6397{ 7095{
6398 int group; 7096 int group;
6399 cpumask_t mask = per_cpu(cpu_sibling_map, cpu); 7097
6400 cpus_and(mask, mask, *cpu_map); 7098 *mask = per_cpu(cpu_sibling_map, cpu);
6401 group = first_cpu(mask); 7099 cpus_and(*mask, *mask, *cpu_map);
7100 group = first_cpu(*mask);
6402 if (sg) 7101 if (sg)
6403 *sg = &per_cpu(sched_group_core, group); 7102 *sg = &per_cpu(sched_group_core, group);
6404 return group; 7103 return group;
6405} 7104}
6406#elif defined(CONFIG_SCHED_MC) 7105#elif defined(CONFIG_SCHED_MC)
6407static int 7106static int
6408cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) 7107cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7108 cpumask_t *unused)
6409{ 7109{
6410 if (sg) 7110 if (sg)
6411 *sg = &per_cpu(sched_group_core, cpu); 7111 *sg = &per_cpu(sched_group_core, cpu);
@@ -6417,17 +7117,18 @@ static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6417static DEFINE_PER_CPU(struct sched_group, sched_group_phys); 7117static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
6418 7118
6419static int 7119static int
6420cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) 7120cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7121 cpumask_t *mask)
6421{ 7122{
6422 int group; 7123 int group;
6423#ifdef CONFIG_SCHED_MC 7124#ifdef CONFIG_SCHED_MC
6424 cpumask_t mask = cpu_coregroup_map(cpu); 7125 *mask = cpu_coregroup_map(cpu);
6425 cpus_and(mask, mask, *cpu_map); 7126 cpus_and(*mask, *mask, *cpu_map);
6426 group = first_cpu(mask); 7127 group = first_cpu(*mask);
6427#elif defined(CONFIG_SCHED_SMT) 7128#elif defined(CONFIG_SCHED_SMT)
6428 cpumask_t mask = per_cpu(cpu_sibling_map, cpu); 7129 *mask = per_cpu(cpu_sibling_map, cpu);
6429 cpus_and(mask, mask, *cpu_map); 7130 cpus_and(*mask, *mask, *cpu_map);
6430 group = first_cpu(mask); 7131 group = first_cpu(*mask);
6431#else 7132#else
6432 group = cpu; 7133 group = cpu;
6433#endif 7134#endif
@@ -6443,19 +7144,19 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
6443 * gets dynamically allocated. 7144 * gets dynamically allocated.
6444 */ 7145 */
6445static DEFINE_PER_CPU(struct sched_domain, node_domains); 7146static DEFINE_PER_CPU(struct sched_domain, node_domains);
6446static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; 7147static struct sched_group ***sched_group_nodes_bycpu;
6447 7148
6448static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 7149static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
6449static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); 7150static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
6450 7151
6451static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, 7152static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
6452 struct sched_group **sg) 7153 struct sched_group **sg, cpumask_t *nodemask)
6453{ 7154{
6454 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
6455 int group; 7155 int group;
6456 7156
6457 cpus_and(nodemask, nodemask, *cpu_map); 7157 *nodemask = node_to_cpumask(cpu_to_node(cpu));
6458 group = first_cpu(nodemask); 7158 cpus_and(*nodemask, *nodemask, *cpu_map);
7159 group = first_cpu(*nodemask);
6459 7160
6460 if (sg) 7161 if (sg)
6461 *sg = &per_cpu(sched_group_allnodes, group); 7162 *sg = &per_cpu(sched_group_allnodes, group);
@@ -6491,7 +7192,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
6491 7192
6492#ifdef CONFIG_NUMA 7193#ifdef CONFIG_NUMA
6493/* Free memory allocated for various sched_group structures */ 7194/* Free memory allocated for various sched_group structures */
6494static void free_sched_groups(const cpumask_t *cpu_map) 7195static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6495{ 7196{
6496 int cpu, i; 7197 int cpu, i;
6497 7198
@@ -6503,11 +7204,11 @@ static void free_sched_groups(const cpumask_t *cpu_map)
6503 continue; 7204 continue;
6504 7205
6505 for (i = 0; i < MAX_NUMNODES; i++) { 7206 for (i = 0; i < MAX_NUMNODES; i++) {
6506 cpumask_t nodemask = node_to_cpumask(i);
6507 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7207 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6508 7208
6509 cpus_and(nodemask, nodemask, *cpu_map); 7209 *nodemask = node_to_cpumask(i);
6510 if (cpus_empty(nodemask)) 7210 cpus_and(*nodemask, *nodemask, *cpu_map);
7211 if (cpus_empty(*nodemask))
6511 continue; 7212 continue;
6512 7213
6513 if (sg == NULL) 7214 if (sg == NULL)
@@ -6525,7 +7226,7 @@ next_sg:
6525 } 7226 }
6526} 7227}
6527#else 7228#else
6528static void free_sched_groups(const cpumask_t *cpu_map) 7229static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6529{ 7230{
6530} 7231}
6531#endif 7232#endif
@@ -6583,13 +7284,106 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6583} 7284}
6584 7285
6585/* 7286/*
7287 * Initializers for schedule domains
7288 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7289 */
7290
7291#define SD_INIT(sd, type) sd_init_##type(sd)
7292#define SD_INIT_FUNC(type) \
7293static noinline void sd_init_##type(struct sched_domain *sd) \
7294{ \
7295 memset(sd, 0, sizeof(*sd)); \
7296 *sd = SD_##type##_INIT; \
7297 sd->level = SD_LV_##type; \
7298}
7299
7300SD_INIT_FUNC(CPU)
7301#ifdef CONFIG_NUMA
7302 SD_INIT_FUNC(ALLNODES)
7303 SD_INIT_FUNC(NODE)
7304#endif
7305#ifdef CONFIG_SCHED_SMT
7306 SD_INIT_FUNC(SIBLING)
7307#endif
7308#ifdef CONFIG_SCHED_MC
7309 SD_INIT_FUNC(MC)
7310#endif
7311
7312/*
7313 * To minimize stack usage kmalloc room for cpumasks and share the
7314 * space as the usage in build_sched_domains() dictates. Used only
7315 * if the amount of space is significant.
7316 */
7317struct allmasks {
7318 cpumask_t tmpmask; /* make this one first */
7319 union {
7320 cpumask_t nodemask;
7321 cpumask_t this_sibling_map;
7322 cpumask_t this_core_map;
7323 };
7324 cpumask_t send_covered;
7325
7326#ifdef CONFIG_NUMA
7327 cpumask_t domainspan;
7328 cpumask_t covered;
7329 cpumask_t notcovered;
7330#endif
7331};
7332
7333#if NR_CPUS > 128
7334#define SCHED_CPUMASK_ALLOC 1
7335#define SCHED_CPUMASK_FREE(v) kfree(v)
7336#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
7337#else
7338#define SCHED_CPUMASK_ALLOC 0
7339#define SCHED_CPUMASK_FREE(v)
7340#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
7341#endif
7342
7343#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
7344 ((unsigned long)(a) + offsetof(struct allmasks, v))
7345
7346static int default_relax_domain_level = -1;
7347
7348static int __init setup_relax_domain_level(char *str)
7349{
7350 default_relax_domain_level = simple_strtoul(str, NULL, 0);
7351 return 1;
7352}
7353__setup("relax_domain_level=", setup_relax_domain_level);
7354
7355static void set_domain_attribute(struct sched_domain *sd,
7356 struct sched_domain_attr *attr)
7357{
7358 int request;
7359
7360 if (!attr || attr->relax_domain_level < 0) {
7361 if (default_relax_domain_level < 0)
7362 return;
7363 else
7364 request = default_relax_domain_level;
7365 } else
7366 request = attr->relax_domain_level;
7367 if (request < sd->level) {
7368 /* turn off idle balance on this domain */
7369 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
7370 } else {
7371 /* turn on idle balance on this domain */
7372 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
7373 }
7374}
7375
7376/*
6586 * Build sched domains for a given set of cpus and attach the sched domains 7377 * Build sched domains for a given set of cpus and attach the sched domains
6587 * to the individual cpus 7378 * to the individual cpus
6588 */ 7379 */
6589static int build_sched_domains(const cpumask_t *cpu_map) 7380static int __build_sched_domains(const cpumask_t *cpu_map,
7381 struct sched_domain_attr *attr)
6590{ 7382{
6591 int i; 7383 int i;
6592 struct root_domain *rd; 7384 struct root_domain *rd;
7385 SCHED_CPUMASK_DECLARE(allmasks);
7386 cpumask_t *tmpmask;
6593#ifdef CONFIG_NUMA 7387#ifdef CONFIG_NUMA
6594 struct sched_group **sched_group_nodes = NULL; 7388 struct sched_group **sched_group_nodes = NULL;
6595 int sd_allnodes = 0; 7389 int sd_allnodes = 0;
@@ -6603,39 +7397,65 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6603 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7397 printk(KERN_WARNING "Can not alloc sched group node list\n");
6604 return -ENOMEM; 7398 return -ENOMEM;
6605 } 7399 }
6606 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
6607#endif 7400#endif
6608 7401
6609 rd = alloc_rootdomain(); 7402 rd = alloc_rootdomain();
6610 if (!rd) { 7403 if (!rd) {
6611 printk(KERN_WARNING "Cannot alloc root domain\n"); 7404 printk(KERN_WARNING "Cannot alloc root domain\n");
7405#ifdef CONFIG_NUMA
7406 kfree(sched_group_nodes);
7407#endif
6612 return -ENOMEM; 7408 return -ENOMEM;
6613 } 7409 }
6614 7410
7411#if SCHED_CPUMASK_ALLOC
7412 /* get space for all scratch cpumask variables */
7413 allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
7414 if (!allmasks) {
7415 printk(KERN_WARNING "Cannot alloc cpumask array\n");
7416 kfree(rd);
7417#ifdef CONFIG_NUMA
7418 kfree(sched_group_nodes);
7419#endif
7420 return -ENOMEM;
7421 }
7422#endif
7423 tmpmask = (cpumask_t *)allmasks;
7424
7425
7426#ifdef CONFIG_NUMA
7427 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
7428#endif
7429
6615 /* 7430 /*
6616 * Set up domains for cpus specified by the cpu_map. 7431 * Set up domains for cpus specified by the cpu_map.
6617 */ 7432 */
6618 for_each_cpu_mask(i, *cpu_map) { 7433 for_each_cpu_mask(i, *cpu_map) {
6619 struct sched_domain *sd = NULL, *p; 7434 struct sched_domain *sd = NULL, *p;
6620 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 7435 SCHED_CPUMASK_VAR(nodemask, allmasks);
6621 7436
6622 cpus_and(nodemask, nodemask, *cpu_map); 7437 *nodemask = node_to_cpumask(cpu_to_node(i));
7438 cpus_and(*nodemask, *nodemask, *cpu_map);
6623 7439
6624#ifdef CONFIG_NUMA 7440#ifdef CONFIG_NUMA
6625 if (cpus_weight(*cpu_map) > 7441 if (cpus_weight(*cpu_map) >
6626 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 7442 SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
6627 sd = &per_cpu(allnodes_domains, i); 7443 sd = &per_cpu(allnodes_domains, i);
6628 *sd = SD_ALLNODES_INIT; 7444 SD_INIT(sd, ALLNODES);
7445 set_domain_attribute(sd, attr);
6629 sd->span = *cpu_map; 7446 sd->span = *cpu_map;
6630 cpu_to_allnodes_group(i, cpu_map, &sd->groups); 7447 sd->first_cpu = first_cpu(sd->span);
7448 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
6631 p = sd; 7449 p = sd;
6632 sd_allnodes = 1; 7450 sd_allnodes = 1;
6633 } else 7451 } else
6634 p = NULL; 7452 p = NULL;
6635 7453
6636 sd = &per_cpu(node_domains, i); 7454 sd = &per_cpu(node_domains, i);
6637 *sd = SD_NODE_INIT; 7455 SD_INIT(sd, NODE);
6638 sd->span = sched_domain_node_span(cpu_to_node(i)); 7456 set_domain_attribute(sd, attr);
7457 sched_domain_node_span(cpu_to_node(i), &sd->span);
7458 sd->first_cpu = first_cpu(sd->span);
6639 sd->parent = p; 7459 sd->parent = p;
6640 if (p) 7460 if (p)
6641 p->child = sd; 7461 p->child = sd;
@@ -6644,94 +7464,120 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6644 7464
6645 p = sd; 7465 p = sd;
6646 sd = &per_cpu(phys_domains, i); 7466 sd = &per_cpu(phys_domains, i);
6647 *sd = SD_CPU_INIT; 7467 SD_INIT(sd, CPU);
6648 sd->span = nodemask; 7468 set_domain_attribute(sd, attr);
7469 sd->span = *nodemask;
7470 sd->first_cpu = first_cpu(sd->span);
6649 sd->parent = p; 7471 sd->parent = p;
6650 if (p) 7472 if (p)
6651 p->child = sd; 7473 p->child = sd;
6652 cpu_to_phys_group(i, cpu_map, &sd->groups); 7474 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
6653 7475
6654#ifdef CONFIG_SCHED_MC 7476#ifdef CONFIG_SCHED_MC
6655 p = sd; 7477 p = sd;
6656 sd = &per_cpu(core_domains, i); 7478 sd = &per_cpu(core_domains, i);
6657 *sd = SD_MC_INIT; 7479 SD_INIT(sd, MC);
7480 set_domain_attribute(sd, attr);
6658 sd->span = cpu_coregroup_map(i); 7481 sd->span = cpu_coregroup_map(i);
7482 sd->first_cpu = first_cpu(sd->span);
6659 cpus_and(sd->span, sd->span, *cpu_map); 7483 cpus_and(sd->span, sd->span, *cpu_map);
6660 sd->parent = p; 7484 sd->parent = p;
6661 p->child = sd; 7485 p->child = sd;
6662 cpu_to_core_group(i, cpu_map, &sd->groups); 7486 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
6663#endif 7487#endif
6664 7488
6665#ifdef CONFIG_SCHED_SMT 7489#ifdef CONFIG_SCHED_SMT
6666 p = sd; 7490 p = sd;
6667 sd = &per_cpu(cpu_domains, i); 7491 sd = &per_cpu(cpu_domains, i);
6668 *sd = SD_SIBLING_INIT; 7492 SD_INIT(sd, SIBLING);
7493 set_domain_attribute(sd, attr);
6669 sd->span = per_cpu(cpu_sibling_map, i); 7494 sd->span = per_cpu(cpu_sibling_map, i);
7495 sd->first_cpu = first_cpu(sd->span);
6670 cpus_and(sd->span, sd->span, *cpu_map); 7496 cpus_and(sd->span, sd->span, *cpu_map);
6671 sd->parent = p; 7497 sd->parent = p;
6672 p->child = sd; 7498 p->child = sd;
6673 cpu_to_cpu_group(i, cpu_map, &sd->groups); 7499 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
6674#endif 7500#endif
6675 } 7501 }
6676 7502
6677#ifdef CONFIG_SCHED_SMT 7503#ifdef CONFIG_SCHED_SMT
6678 /* Set up CPU (sibling) groups */ 7504 /* Set up CPU (sibling) groups */
6679 for_each_cpu_mask(i, *cpu_map) { 7505 for_each_cpu_mask(i, *cpu_map) {
6680 cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i); 7506 SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
6681 cpus_and(this_sibling_map, this_sibling_map, *cpu_map); 7507 SCHED_CPUMASK_VAR(send_covered, allmasks);
6682 if (i != first_cpu(this_sibling_map)) 7508
7509 *this_sibling_map = per_cpu(cpu_sibling_map, i);
7510 cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
7511 if (i != first_cpu(*this_sibling_map))
6683 continue; 7512 continue;
6684 7513
6685 init_sched_build_groups(this_sibling_map, cpu_map, 7514 init_sched_build_groups(this_sibling_map, cpu_map,
6686 &cpu_to_cpu_group); 7515 &cpu_to_cpu_group,
7516 send_covered, tmpmask);
6687 } 7517 }
6688#endif 7518#endif
6689 7519
6690#ifdef CONFIG_SCHED_MC 7520#ifdef CONFIG_SCHED_MC
6691 /* Set up multi-core groups */ 7521 /* Set up multi-core groups */
6692 for_each_cpu_mask(i, *cpu_map) { 7522 for_each_cpu_mask(i, *cpu_map) {
6693 cpumask_t this_core_map = cpu_coregroup_map(i); 7523 SCHED_CPUMASK_VAR(this_core_map, allmasks);
6694 cpus_and(this_core_map, this_core_map, *cpu_map); 7524 SCHED_CPUMASK_VAR(send_covered, allmasks);
6695 if (i != first_cpu(this_core_map)) 7525
7526 *this_core_map = cpu_coregroup_map(i);
7527 cpus_and(*this_core_map, *this_core_map, *cpu_map);
7528 if (i != first_cpu(*this_core_map))
6696 continue; 7529 continue;
7530
6697 init_sched_build_groups(this_core_map, cpu_map, 7531 init_sched_build_groups(this_core_map, cpu_map,
6698 &cpu_to_core_group); 7532 &cpu_to_core_group,
7533 send_covered, tmpmask);
6699 } 7534 }
6700#endif 7535#endif
6701 7536
6702 /* Set up physical groups */ 7537 /* Set up physical groups */
6703 for (i = 0; i < MAX_NUMNODES; i++) { 7538 for (i = 0; i < MAX_NUMNODES; i++) {
6704 cpumask_t nodemask = node_to_cpumask(i); 7539 SCHED_CPUMASK_VAR(nodemask, allmasks);
7540 SCHED_CPUMASK_VAR(send_covered, allmasks);
6705 7541
6706 cpus_and(nodemask, nodemask, *cpu_map); 7542 *nodemask = node_to_cpumask(i);
6707 if (cpus_empty(nodemask)) 7543 cpus_and(*nodemask, *nodemask, *cpu_map);
7544 if (cpus_empty(*nodemask))
6708 continue; 7545 continue;
6709 7546
6710 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); 7547 init_sched_build_groups(nodemask, cpu_map,
7548 &cpu_to_phys_group,
7549 send_covered, tmpmask);
6711 } 7550 }
6712 7551
6713#ifdef CONFIG_NUMA 7552#ifdef CONFIG_NUMA
6714 /* Set up node groups */ 7553 /* Set up node groups */
6715 if (sd_allnodes) 7554 if (sd_allnodes) {
6716 init_sched_build_groups(*cpu_map, cpu_map, 7555 SCHED_CPUMASK_VAR(send_covered, allmasks);
6717 &cpu_to_allnodes_group); 7556
7557 init_sched_build_groups(cpu_map, cpu_map,
7558 &cpu_to_allnodes_group,
7559 send_covered, tmpmask);
7560 }
6718 7561
6719 for (i = 0; i < MAX_NUMNODES; i++) { 7562 for (i = 0; i < MAX_NUMNODES; i++) {
6720 /* Set up node groups */ 7563 /* Set up node groups */
6721 struct sched_group *sg, *prev; 7564 struct sched_group *sg, *prev;
6722 cpumask_t nodemask = node_to_cpumask(i); 7565 SCHED_CPUMASK_VAR(nodemask, allmasks);
6723 cpumask_t domainspan; 7566 SCHED_CPUMASK_VAR(domainspan, allmasks);
6724 cpumask_t covered = CPU_MASK_NONE; 7567 SCHED_CPUMASK_VAR(covered, allmasks);
6725 int j; 7568 int j;
6726 7569
6727 cpus_and(nodemask, nodemask, *cpu_map); 7570 *nodemask = node_to_cpumask(i);
6728 if (cpus_empty(nodemask)) { 7571 cpus_clear(*covered);
7572
7573 cpus_and(*nodemask, *nodemask, *cpu_map);
7574 if (cpus_empty(*nodemask)) {
6729 sched_group_nodes[i] = NULL; 7575 sched_group_nodes[i] = NULL;
6730 continue; 7576 continue;
6731 } 7577 }
6732 7578
6733 domainspan = sched_domain_node_span(i); 7579 sched_domain_node_span(i, domainspan);
6734 cpus_and(domainspan, domainspan, *cpu_map); 7580 cpus_and(*domainspan, *domainspan, *cpu_map);
6735 7581
6736 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); 7582 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6737 if (!sg) { 7583 if (!sg) {
@@ -6740,31 +7586,31 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6740 goto error; 7586 goto error;
6741 } 7587 }
6742 sched_group_nodes[i] = sg; 7588 sched_group_nodes[i] = sg;
6743 for_each_cpu_mask(j, nodemask) { 7589 for_each_cpu_mask(j, *nodemask) {
6744 struct sched_domain *sd; 7590 struct sched_domain *sd;
6745 7591
6746 sd = &per_cpu(node_domains, j); 7592 sd = &per_cpu(node_domains, j);
6747 sd->groups = sg; 7593 sd->groups = sg;
6748 } 7594 }
6749 sg->__cpu_power = 0; 7595 sg->__cpu_power = 0;
6750 sg->cpumask = nodemask; 7596 sg->cpumask = *nodemask;
6751 sg->next = sg; 7597 sg->next = sg;
6752 cpus_or(covered, covered, nodemask); 7598 cpus_or(*covered, *covered, *nodemask);
6753 prev = sg; 7599 prev = sg;
6754 7600
6755 for (j = 0; j < MAX_NUMNODES; j++) { 7601 for (j = 0; j < MAX_NUMNODES; j++) {
6756 cpumask_t tmp, notcovered; 7602 SCHED_CPUMASK_VAR(notcovered, allmasks);
6757 int n = (i + j) % MAX_NUMNODES; 7603 int n = (i + j) % MAX_NUMNODES;
7604 node_to_cpumask_ptr(pnodemask, n);
6758 7605
6759 cpus_complement(notcovered, covered); 7606 cpus_complement(*notcovered, *covered);
6760 cpus_and(tmp, notcovered, *cpu_map); 7607 cpus_and(*tmpmask, *notcovered, *cpu_map);
6761 cpus_and(tmp, tmp, domainspan); 7608 cpus_and(*tmpmask, *tmpmask, *domainspan);
6762 if (cpus_empty(tmp)) 7609 if (cpus_empty(*tmpmask))
6763 break; 7610 break;
6764 7611
6765 nodemask = node_to_cpumask(n); 7612 cpus_and(*tmpmask, *tmpmask, *pnodemask);
6766 cpus_and(tmp, tmp, nodemask); 7613 if (cpus_empty(*tmpmask))
6767 if (cpus_empty(tmp))
6768 continue; 7614 continue;
6769 7615
6770 sg = kmalloc_node(sizeof(struct sched_group), 7616 sg = kmalloc_node(sizeof(struct sched_group),
@@ -6775,9 +7621,9 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6775 goto error; 7621 goto error;
6776 } 7622 }
6777 sg->__cpu_power = 0; 7623 sg->__cpu_power = 0;
6778 sg->cpumask = tmp; 7624 sg->cpumask = *tmpmask;
6779 sg->next = prev->next; 7625 sg->next = prev->next;
6780 cpus_or(covered, covered, tmp); 7626 cpus_or(*covered, *covered, *tmpmask);
6781 prev->next = sg; 7627 prev->next = sg;
6782 prev = sg; 7628 prev = sg;
6783 } 7629 }
@@ -6813,7 +7659,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6813 if (sd_allnodes) { 7659 if (sd_allnodes) {
6814 struct sched_group *sg; 7660 struct sched_group *sg;
6815 7661
6816 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); 7662 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
7663 tmpmask);
6817 init_numa_sched_groups_power(sg); 7664 init_numa_sched_groups_power(sg);
6818 } 7665 }
6819#endif 7666#endif
@@ -6831,17 +7678,26 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6831 cpu_attach_domain(sd, rd, i); 7678 cpu_attach_domain(sd, rd, i);
6832 } 7679 }
6833 7680
7681 SCHED_CPUMASK_FREE((void *)allmasks);
6834 return 0; 7682 return 0;
6835 7683
6836#ifdef CONFIG_NUMA 7684#ifdef CONFIG_NUMA
6837error: 7685error:
6838 free_sched_groups(cpu_map); 7686 free_sched_groups(cpu_map, tmpmask);
7687 SCHED_CPUMASK_FREE((void *)allmasks);
6839 return -ENOMEM; 7688 return -ENOMEM;
6840#endif 7689#endif
6841} 7690}
6842 7691
7692static int build_sched_domains(const cpumask_t *cpu_map)
7693{
7694 return __build_sched_domains(cpu_map, NULL);
7695}
7696
6843static cpumask_t *doms_cur; /* current sched domains */ 7697static cpumask_t *doms_cur; /* current sched domains */
6844static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7698static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7699static struct sched_domain_attr *dattr_cur; /* attribues of custom domains
7700 in 'doms_cur' */
6845 7701
6846/* 7702/*
6847 * Special case: If a kmalloc of a doms_cur partition (array of 7703 * Special case: If a kmalloc of a doms_cur partition (array of
@@ -6869,15 +7725,17 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
6869 if (!doms_cur) 7725 if (!doms_cur)
6870 doms_cur = &fallback_doms; 7726 doms_cur = &fallback_doms;
6871 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); 7727 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
7728 dattr_cur = NULL;
6872 err = build_sched_domains(doms_cur); 7729 err = build_sched_domains(doms_cur);
6873 register_sched_domain_sysctl(); 7730 register_sched_domain_sysctl();
6874 7731
6875 return err; 7732 return err;
6876} 7733}
6877 7734
6878static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 7735static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
7736 cpumask_t *tmpmask)
6879{ 7737{
6880 free_sched_groups(cpu_map); 7738 free_sched_groups(cpu_map, tmpmask);
6881} 7739}
6882 7740
6883/* 7741/*
@@ -6886,6 +7744,7 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6886 */ 7744 */
6887static void detach_destroy_domains(const cpumask_t *cpu_map) 7745static void detach_destroy_domains(const cpumask_t *cpu_map)
6888{ 7746{
7747 cpumask_t tmpmask;
6889 int i; 7748 int i;
6890 7749
6891 unregister_sched_domain_sysctl(); 7750 unregister_sched_domain_sysctl();
@@ -6893,7 +7752,23 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6893 for_each_cpu_mask(i, *cpu_map) 7752 for_each_cpu_mask(i, *cpu_map)
6894 cpu_attach_domain(NULL, &def_root_domain, i); 7753 cpu_attach_domain(NULL, &def_root_domain, i);
6895 synchronize_sched(); 7754 synchronize_sched();
6896 arch_destroy_sched_domains(cpu_map); 7755 arch_destroy_sched_domains(cpu_map, &tmpmask);
7756}
7757
7758/* handle null as "default" */
7759static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7760 struct sched_domain_attr *new, int idx_new)
7761{
7762 struct sched_domain_attr tmp;
7763
7764 /* fast path */
7765 if (!new && !cur)
7766 return 1;
7767
7768 tmp = SD_ATTR_INIT;
7769 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7770 new ? (new + idx_new) : &tmp,
7771 sizeof(struct sched_domain_attr));
6897} 7772}
6898 7773
6899/* 7774/*
@@ -6917,7 +7792,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6917 * 7792 *
6918 * Call with hotplug lock held 7793 * Call with hotplug lock held
6919 */ 7794 */
6920void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) 7795void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7796 struct sched_domain_attr *dattr_new)
6921{ 7797{
6922 int i, j; 7798 int i, j;
6923 7799
@@ -6930,12 +7806,14 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
6930 ndoms_new = 1; 7806 ndoms_new = 1;
6931 doms_new = &fallback_doms; 7807 doms_new = &fallback_doms;
6932 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7808 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7809 dattr_new = NULL;
6933 } 7810 }
6934 7811
6935 /* Destroy deleted domains */ 7812 /* Destroy deleted domains */
6936 for (i = 0; i < ndoms_cur; i++) { 7813 for (i = 0; i < ndoms_cur; i++) {
6937 for (j = 0; j < ndoms_new; j++) { 7814 for (j = 0; j < ndoms_new; j++) {
6938 if (cpus_equal(doms_cur[i], doms_new[j])) 7815 if (cpus_equal(doms_cur[i], doms_new[j])
7816 && dattrs_equal(dattr_cur, i, dattr_new, j))
6939 goto match1; 7817 goto match1;
6940 } 7818 }
6941 /* no match - a current sched domain not in new doms_new[] */ 7819 /* no match - a current sched domain not in new doms_new[] */
@@ -6947,11 +7825,13 @@ match1:
6947 /* Build new domains */ 7825 /* Build new domains */
6948 for (i = 0; i < ndoms_new; i++) { 7826 for (i = 0; i < ndoms_new; i++) {
6949 for (j = 0; j < ndoms_cur; j++) { 7827 for (j = 0; j < ndoms_cur; j++) {
6950 if (cpus_equal(doms_new[i], doms_cur[j])) 7828 if (cpus_equal(doms_new[i], doms_cur[j])
7829 && dattrs_equal(dattr_new, i, dattr_cur, j))
6951 goto match2; 7830 goto match2;
6952 } 7831 }
6953 /* no match - add a new doms_new */ 7832 /* no match - add a new doms_new */
6954 build_sched_domains(doms_new + i); 7833 __build_sched_domains(doms_new + i,
7834 dattr_new ? dattr_new + i : NULL);
6955match2: 7835match2:
6956 ; 7836 ;
6957 } 7837 }
@@ -6959,7 +7839,9 @@ match2:
6959 /* Remember the new sched domains */ 7839 /* Remember the new sched domains */
6960 if (doms_cur != &fallback_doms) 7840 if (doms_cur != &fallback_doms)
6961 kfree(doms_cur); 7841 kfree(doms_cur);
7842 kfree(dattr_cur); /* kfree(NULL) is safe */
6962 doms_cur = doms_new; 7843 doms_cur = doms_new;
7844 dattr_cur = dattr_new;
6963 ndoms_cur = ndoms_new; 7845 ndoms_cur = ndoms_new;
6964 7846
6965 register_sched_domain_sysctl(); 7847 register_sched_domain_sysctl();
@@ -7086,6 +7968,11 @@ void __init sched_init_smp(void)
7086{ 7968{
7087 cpumask_t non_isolated_cpus; 7969 cpumask_t non_isolated_cpus;
7088 7970
7971#if defined(CONFIG_NUMA)
7972 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7973 GFP_KERNEL);
7974 BUG_ON(sched_group_nodes_bycpu == NULL);
7975#endif
7089 get_online_cpus(); 7976 get_online_cpus();
7090 arch_init_sched_domains(&cpu_online_map); 7977 arch_init_sched_domains(&cpu_online_map);
7091 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 7978 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
@@ -7096,13 +7983,18 @@ void __init sched_init_smp(void)
7096 hotcpu_notifier(update_sched_domains, 0); 7983 hotcpu_notifier(update_sched_domains, 0);
7097 7984
7098 /* Move init over to a non-isolated CPU */ 7985 /* Move init over to a non-isolated CPU */
7099 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 7986 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
7100 BUG(); 7987 BUG();
7101 sched_init_granularity(); 7988 sched_init_granularity();
7102} 7989}
7103#else 7990#else
7104void __init sched_init_smp(void) 7991void __init sched_init_smp(void)
7105{ 7992{
7993#if defined(CONFIG_NUMA)
7994 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7995 GFP_KERNEL);
7996 BUG_ON(sched_group_nodes_bycpu == NULL);
7997#endif
7106 sched_init_granularity(); 7998 sched_init_granularity();
7107} 7999}
7108#endif /* CONFIG_SMP */ 8000#endif /* CONFIG_SMP */
@@ -7117,6 +8009,7 @@ int in_sched_functions(unsigned long addr)
7117static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 8009static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7118{ 8010{
7119 cfs_rq->tasks_timeline = RB_ROOT; 8011 cfs_rq->tasks_timeline = RB_ROOT;
8012 INIT_LIST_HEAD(&cfs_rq->tasks);
7120#ifdef CONFIG_FAIR_GROUP_SCHED 8013#ifdef CONFIG_FAIR_GROUP_SCHED
7121 cfs_rq->rq = rq; 8014 cfs_rq->rq = rq;
7122#endif 8015#endif
@@ -7146,6 +8039,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7146 8039
7147 rt_rq->rt_time = 0; 8040 rt_rq->rt_time = 0;
7148 rt_rq->rt_throttled = 0; 8041 rt_rq->rt_throttled = 0;
8042 rt_rq->rt_runtime = 0;
8043 spin_lock_init(&rt_rq->rt_runtime_lock);
7149 8044
7150#ifdef CONFIG_RT_GROUP_SCHED 8045#ifdef CONFIG_RT_GROUP_SCHED
7151 rt_rq->rt_nr_boosted = 0; 8046 rt_rq->rt_nr_boosted = 0;
@@ -7154,10 +8049,11 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7154} 8049}
7155 8050
7156#ifdef CONFIG_FAIR_GROUP_SCHED 8051#ifdef CONFIG_FAIR_GROUP_SCHED
7157static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, 8052static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7158 struct cfs_rq *cfs_rq, struct sched_entity *se, 8053 struct sched_entity *se, int cpu, int add,
7159 int cpu, int add) 8054 struct sched_entity *parent)
7160{ 8055{
8056 struct rq *rq = cpu_rq(cpu);
7161 tg->cfs_rq[cpu] = cfs_rq; 8057 tg->cfs_rq[cpu] = cfs_rq;
7162 init_cfs_rq(cfs_rq, rq); 8058 init_cfs_rq(cfs_rq, rq);
7163 cfs_rq->tg = tg; 8059 cfs_rq->tg = tg;
@@ -7165,45 +8061,132 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
7165 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 8061 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7166 8062
7167 tg->se[cpu] = se; 8063 tg->se[cpu] = se;
7168 se->cfs_rq = &rq->cfs; 8064 /* se could be NULL for init_task_group */
8065 if (!se)
8066 return;
8067
8068 if (!parent)
8069 se->cfs_rq = &rq->cfs;
8070 else
8071 se->cfs_rq = parent->my_q;
8072
7169 se->my_q = cfs_rq; 8073 se->my_q = cfs_rq;
7170 se->load.weight = tg->shares; 8074 se->load.weight = tg->shares;
7171 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); 8075 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
7172 se->parent = NULL; 8076 se->parent = parent;
7173} 8077}
7174#endif 8078#endif
7175 8079
7176#ifdef CONFIG_RT_GROUP_SCHED 8080#ifdef CONFIG_RT_GROUP_SCHED
7177static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, 8081static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7178 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, 8082 struct sched_rt_entity *rt_se, int cpu, int add,
7179 int cpu, int add) 8083 struct sched_rt_entity *parent)
7180{ 8084{
8085 struct rq *rq = cpu_rq(cpu);
8086
7181 tg->rt_rq[cpu] = rt_rq; 8087 tg->rt_rq[cpu] = rt_rq;
7182 init_rt_rq(rt_rq, rq); 8088 init_rt_rq(rt_rq, rq);
7183 rt_rq->tg = tg; 8089 rt_rq->tg = tg;
7184 rt_rq->rt_se = rt_se; 8090 rt_rq->rt_se = rt_se;
8091 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7185 if (add) 8092 if (add)
7186 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 8093 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7187 8094
7188 tg->rt_se[cpu] = rt_se; 8095 tg->rt_se[cpu] = rt_se;
8096 if (!rt_se)
8097 return;
8098
8099 if (!parent)
8100 rt_se->rt_rq = &rq->rt;
8101 else
8102 rt_se->rt_rq = parent->my_q;
8103
7189 rt_se->rt_rq = &rq->rt; 8104 rt_se->rt_rq = &rq->rt;
7190 rt_se->my_q = rt_rq; 8105 rt_se->my_q = rt_rq;
7191 rt_se->parent = NULL; 8106 rt_se->parent = parent;
7192 INIT_LIST_HEAD(&rt_se->run_list); 8107 INIT_LIST_HEAD(&rt_se->run_list);
7193} 8108}
7194#endif 8109#endif
7195 8110
7196void __init sched_init(void) 8111void __init sched_init(void)
7197{ 8112{
7198 int highest_cpu = 0;
7199 int i, j; 8113 int i, j;
8114 unsigned long alloc_size = 0, ptr;
8115
8116#ifdef CONFIG_FAIR_GROUP_SCHED
8117 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8118#endif
8119#ifdef CONFIG_RT_GROUP_SCHED
8120 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8121#endif
8122#ifdef CONFIG_USER_SCHED
8123 alloc_size *= 2;
8124#endif
8125 /*
8126 * As sched_init() is called before page_alloc is setup,
8127 * we use alloc_bootmem().
8128 */
8129 if (alloc_size) {
8130 ptr = (unsigned long)alloc_bootmem_low(alloc_size);
8131
8132#ifdef CONFIG_FAIR_GROUP_SCHED
8133 init_task_group.se = (struct sched_entity **)ptr;
8134 ptr += nr_cpu_ids * sizeof(void **);
8135
8136 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
8137 ptr += nr_cpu_ids * sizeof(void **);
8138
8139#ifdef CONFIG_USER_SCHED
8140 root_task_group.se = (struct sched_entity **)ptr;
8141 ptr += nr_cpu_ids * sizeof(void **);
8142
8143 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8144 ptr += nr_cpu_ids * sizeof(void **);
8145#endif
8146#endif
8147#ifdef CONFIG_RT_GROUP_SCHED
8148 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
8149 ptr += nr_cpu_ids * sizeof(void **);
8150
8151 init_task_group.rt_rq = (struct rt_rq **)ptr;
8152 ptr += nr_cpu_ids * sizeof(void **);
8153
8154#ifdef CONFIG_USER_SCHED
8155 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
8156 ptr += nr_cpu_ids * sizeof(void **);
8157
8158 root_task_group.rt_rq = (struct rt_rq **)ptr;
8159 ptr += nr_cpu_ids * sizeof(void **);
8160#endif
8161#endif
8162 }
7200 8163
7201#ifdef CONFIG_SMP 8164#ifdef CONFIG_SMP
8165 init_aggregate();
7202 init_defrootdomain(); 8166 init_defrootdomain();
7203#endif 8167#endif
7204 8168
8169 init_rt_bandwidth(&def_rt_bandwidth,
8170 global_rt_period(), global_rt_runtime());
8171
8172#ifdef CONFIG_RT_GROUP_SCHED
8173 init_rt_bandwidth(&init_task_group.rt_bandwidth,
8174 global_rt_period(), global_rt_runtime());
8175#ifdef CONFIG_USER_SCHED
8176 init_rt_bandwidth(&root_task_group.rt_bandwidth,
8177 global_rt_period(), RUNTIME_INF);
8178#endif
8179#endif
8180
7205#ifdef CONFIG_GROUP_SCHED 8181#ifdef CONFIG_GROUP_SCHED
7206 list_add(&init_task_group.list, &task_groups); 8182 list_add(&init_task_group.list, &task_groups);
8183 INIT_LIST_HEAD(&init_task_group.children);
8184
8185#ifdef CONFIG_USER_SCHED
8186 INIT_LIST_HEAD(&root_task_group.children);
8187 init_task_group.parent = &root_task_group;
8188 list_add(&init_task_group.siblings, &root_task_group.children);
8189#endif
7207#endif 8190#endif
7208 8191
7209 for_each_possible_cpu(i) { 8192 for_each_possible_cpu(i) {
@@ -7214,26 +8197,68 @@ void __init sched_init(void)
7214 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 8197 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
7215 rq->nr_running = 0; 8198 rq->nr_running = 0;
7216 rq->clock = 1; 8199 rq->clock = 1;
8200 update_last_tick_seen(rq);
7217 init_cfs_rq(&rq->cfs, rq); 8201 init_cfs_rq(&rq->cfs, rq);
7218 init_rt_rq(&rq->rt, rq); 8202 init_rt_rq(&rq->rt, rq);
7219#ifdef CONFIG_FAIR_GROUP_SCHED 8203#ifdef CONFIG_FAIR_GROUP_SCHED
7220 init_task_group.shares = init_task_group_load; 8204 init_task_group.shares = init_task_group_load;
7221 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 8205 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7222 init_tg_cfs_entry(rq, &init_task_group, 8206#ifdef CONFIG_CGROUP_SCHED
8207 /*
8208 * How much cpu bandwidth does init_task_group get?
8209 *
8210 * In case of task-groups formed through the cgroup filesystem, it
8211 * gets 100% of the cpu resources in the system. This overall
8212 * system cpu resource is divided among the tasks of
8213 * init_task_group and its child task-groups in a fair manner,
8214 * based on each entity's (task or task-group's) weight
8215 * (se->load.weight).
8216 *
8217 * In other words, if init_task_group has 10 tasks (of weight
8218 * 1024) and two child groups A0 and A1 (of weight 1024 each),
8219 * then A0's share of the cpu resource is:
8220 *
8221 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8222 *
8223 * We achieve this by letting init_task_group's tasks sit
8224 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
8225 */
8226 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
8227#elif defined CONFIG_USER_SCHED
8228 root_task_group.shares = NICE_0_LOAD;
8229 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
8230 /*
8231 * In case of task-groups formed through the user id of tasks,
8232 * init_task_group represents tasks belonging to root user.
8233 * Hence it forms a sibling of all subsequent groups formed.
8234 * In this case, init_task_group gets only a fraction of overall
8235 * system cpu resource, based on the weight assigned to root
8236 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
8237 * by letting tasks of init_task_group sit in a separate cfs_rq
8238 * (init_cfs_rq) and having one entity represent this group of
8239 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
8240 */
8241 init_tg_cfs_entry(&init_task_group,
7223 &per_cpu(init_cfs_rq, i), 8242 &per_cpu(init_cfs_rq, i),
7224 &per_cpu(init_sched_entity, i), i, 1); 8243 &per_cpu(init_sched_entity, i), i, 1,
8244 root_task_group.se[i]);
7225 8245
7226#endif 8246#endif
8247#endif /* CONFIG_FAIR_GROUP_SCHED */
8248
8249 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7227#ifdef CONFIG_RT_GROUP_SCHED 8250#ifdef CONFIG_RT_GROUP_SCHED
7228 init_task_group.rt_runtime =
7229 sysctl_sched_rt_runtime * NSEC_PER_USEC;
7230 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 8251 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7231 init_tg_rt_entry(rq, &init_task_group, 8252#ifdef CONFIG_CGROUP_SCHED
8253 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8254#elif defined CONFIG_USER_SCHED
8255 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
8256 init_tg_rt_entry(&init_task_group,
7232 &per_cpu(init_rt_rq, i), 8257 &per_cpu(init_rt_rq, i),
7233 &per_cpu(init_sched_rt_entity, i), i, 1); 8258 &per_cpu(init_sched_rt_entity, i), i, 1,
8259 root_task_group.rt_se[i]);
8260#endif
7234#endif 8261#endif
7235 rq->rt_period_expire = 0;
7236 rq->rt_throttled = 0;
7237 8262
7238 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 8263 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7239 rq->cpu_load[j] = 0; 8264 rq->cpu_load[j] = 0;
@@ -7250,7 +8275,6 @@ void __init sched_init(void)
7250#endif 8275#endif
7251 init_rq_hrtick(rq); 8276 init_rq_hrtick(rq);
7252 atomic_set(&rq->nr_iowait, 0); 8277 atomic_set(&rq->nr_iowait, 0);
7253 highest_cpu = i;
7254 } 8278 }
7255 8279
7256 set_load_weight(&init_task); 8280 set_load_weight(&init_task);
@@ -7260,7 +8284,6 @@ void __init sched_init(void)
7260#endif 8284#endif
7261 8285
7262#ifdef CONFIG_SMP 8286#ifdef CONFIG_SMP
7263 nr_cpu_ids = highest_cpu + 1;
7264 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); 8287 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
7265#endif 8288#endif
7266 8289
@@ -7419,8 +8442,6 @@ void set_curr_task(int cpu, struct task_struct *p)
7419 8442
7420#endif 8443#endif
7421 8444
7422#ifdef CONFIG_GROUP_SCHED
7423
7424#ifdef CONFIG_FAIR_GROUP_SCHED 8445#ifdef CONFIG_FAIR_GROUP_SCHED
7425static void free_fair_sched_group(struct task_group *tg) 8446static void free_fair_sched_group(struct task_group *tg)
7426{ 8447{
@@ -7437,17 +8458,18 @@ static void free_fair_sched_group(struct task_group *tg)
7437 kfree(tg->se); 8458 kfree(tg->se);
7438} 8459}
7439 8460
7440static int alloc_fair_sched_group(struct task_group *tg) 8461static
8462int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7441{ 8463{
7442 struct cfs_rq *cfs_rq; 8464 struct cfs_rq *cfs_rq;
7443 struct sched_entity *se; 8465 struct sched_entity *se, *parent_se;
7444 struct rq *rq; 8466 struct rq *rq;
7445 int i; 8467 int i;
7446 8468
7447 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); 8469 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
7448 if (!tg->cfs_rq) 8470 if (!tg->cfs_rq)
7449 goto err; 8471 goto err;
7450 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); 8472 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
7451 if (!tg->se) 8473 if (!tg->se)
7452 goto err; 8474 goto err;
7453 8475
@@ -7466,7 +8488,8 @@ static int alloc_fair_sched_group(struct task_group *tg)
7466 if (!se) 8488 if (!se)
7467 goto err; 8489 goto err;
7468 8490
7469 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); 8491 parent_se = parent ? parent->se[i] : NULL;
8492 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
7470 } 8493 }
7471 8494
7472 return 1; 8495 return 1;
@@ -7490,7 +8513,8 @@ static inline void free_fair_sched_group(struct task_group *tg)
7490{ 8513{
7491} 8514}
7492 8515
7493static inline int alloc_fair_sched_group(struct task_group *tg) 8516static inline
8517int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7494{ 8518{
7495 return 1; 8519 return 1;
7496} 8520}
@@ -7509,6 +8533,8 @@ static void free_rt_sched_group(struct task_group *tg)
7509{ 8533{
7510 int i; 8534 int i;
7511 8535
8536 destroy_rt_bandwidth(&tg->rt_bandwidth);
8537
7512 for_each_possible_cpu(i) { 8538 for_each_possible_cpu(i) {
7513 if (tg->rt_rq) 8539 if (tg->rt_rq)
7514 kfree(tg->rt_rq[i]); 8540 kfree(tg->rt_rq[i]);
@@ -7520,21 +8546,23 @@ static void free_rt_sched_group(struct task_group *tg)
7520 kfree(tg->rt_se); 8546 kfree(tg->rt_se);
7521} 8547}
7522 8548
7523static int alloc_rt_sched_group(struct task_group *tg) 8549static
8550int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
7524{ 8551{
7525 struct rt_rq *rt_rq; 8552 struct rt_rq *rt_rq;
7526 struct sched_rt_entity *rt_se; 8553 struct sched_rt_entity *rt_se, *parent_se;
7527 struct rq *rq; 8554 struct rq *rq;
7528 int i; 8555 int i;
7529 8556
7530 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); 8557 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
7531 if (!tg->rt_rq) 8558 if (!tg->rt_rq)
7532 goto err; 8559 goto err;
7533 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); 8560 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
7534 if (!tg->rt_se) 8561 if (!tg->rt_se)
7535 goto err; 8562 goto err;
7536 8563
7537 tg->rt_runtime = 0; 8564 init_rt_bandwidth(&tg->rt_bandwidth,
8565 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
7538 8566
7539 for_each_possible_cpu(i) { 8567 for_each_possible_cpu(i) {
7540 rq = cpu_rq(i); 8568 rq = cpu_rq(i);
@@ -7549,7 +8577,8 @@ static int alloc_rt_sched_group(struct task_group *tg)
7549 if (!rt_se) 8577 if (!rt_se)
7550 goto err; 8578 goto err;
7551 8579
7552 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); 8580 parent_se = parent ? parent->rt_se[i] : NULL;
8581 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
7553 } 8582 }
7554 8583
7555 return 1; 8584 return 1;
@@ -7573,7 +8602,8 @@ static inline void free_rt_sched_group(struct task_group *tg)
7573{ 8602{
7574} 8603}
7575 8604
7576static inline int alloc_rt_sched_group(struct task_group *tg) 8605static inline
8606int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
7577{ 8607{
7578 return 1; 8608 return 1;
7579} 8609}
@@ -7587,6 +8617,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
7587} 8617}
7588#endif 8618#endif
7589 8619
8620#ifdef CONFIG_GROUP_SCHED
7590static void free_sched_group(struct task_group *tg) 8621static void free_sched_group(struct task_group *tg)
7591{ 8622{
7592 free_fair_sched_group(tg); 8623 free_fair_sched_group(tg);
@@ -7595,7 +8626,7 @@ static void free_sched_group(struct task_group *tg)
7595} 8626}
7596 8627
7597/* allocate runqueue etc for a new task group */ 8628/* allocate runqueue etc for a new task group */
7598struct task_group *sched_create_group(void) 8629struct task_group *sched_create_group(struct task_group *parent)
7599{ 8630{
7600 struct task_group *tg; 8631 struct task_group *tg;
7601 unsigned long flags; 8632 unsigned long flags;
@@ -7605,10 +8636,10 @@ struct task_group *sched_create_group(void)
7605 if (!tg) 8636 if (!tg)
7606 return ERR_PTR(-ENOMEM); 8637 return ERR_PTR(-ENOMEM);
7607 8638
7608 if (!alloc_fair_sched_group(tg)) 8639 if (!alloc_fair_sched_group(tg, parent))
7609 goto err; 8640 goto err;
7610 8641
7611 if (!alloc_rt_sched_group(tg)) 8642 if (!alloc_rt_sched_group(tg, parent))
7612 goto err; 8643 goto err;
7613 8644
7614 spin_lock_irqsave(&task_group_lock, flags); 8645 spin_lock_irqsave(&task_group_lock, flags);
@@ -7617,6 +8648,12 @@ struct task_group *sched_create_group(void)
7617 register_rt_sched_group(tg, i); 8648 register_rt_sched_group(tg, i);
7618 } 8649 }
7619 list_add_rcu(&tg->list, &task_groups); 8650 list_add_rcu(&tg->list, &task_groups);
8651
8652 WARN_ON(!parent); /* root should already exist */
8653
8654 tg->parent = parent;
8655 list_add_rcu(&tg->siblings, &parent->children);
8656 INIT_LIST_HEAD(&tg->children);
7620 spin_unlock_irqrestore(&task_group_lock, flags); 8657 spin_unlock_irqrestore(&task_group_lock, flags);
7621 8658
7622 return tg; 8659 return tg;
@@ -7645,6 +8682,7 @@ void sched_destroy_group(struct task_group *tg)
7645 unregister_rt_sched_group(tg, i); 8682 unregister_rt_sched_group(tg, i);
7646 } 8683 }
7647 list_del_rcu(&tg->list); 8684 list_del_rcu(&tg->list);
8685 list_del_rcu(&tg->siblings);
7648 spin_unlock_irqrestore(&task_group_lock, flags); 8686 spin_unlock_irqrestore(&task_group_lock, flags);
7649 8687
7650 /* wait for possible concurrent references to cfs_rqs complete */ 8688 /* wait for possible concurrent references to cfs_rqs complete */
@@ -7688,16 +8726,14 @@ void sched_move_task(struct task_struct *tsk)
7688 8726
7689 task_rq_unlock(rq, &flags); 8727 task_rq_unlock(rq, &flags);
7690} 8728}
8729#endif
7691 8730
7692#ifdef CONFIG_FAIR_GROUP_SCHED 8731#ifdef CONFIG_FAIR_GROUP_SCHED
7693static void set_se_shares(struct sched_entity *se, unsigned long shares) 8732static void __set_se_shares(struct sched_entity *se, unsigned long shares)
7694{ 8733{
7695 struct cfs_rq *cfs_rq = se->cfs_rq; 8734 struct cfs_rq *cfs_rq = se->cfs_rq;
7696 struct rq *rq = cfs_rq->rq;
7697 int on_rq; 8735 int on_rq;
7698 8736
7699 spin_lock_irq(&rq->lock);
7700
7701 on_rq = se->on_rq; 8737 on_rq = se->on_rq;
7702 if (on_rq) 8738 if (on_rq)
7703 dequeue_entity(cfs_rq, se, 0); 8739 dequeue_entity(cfs_rq, se, 0);
@@ -7707,8 +8743,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
7707 8743
7708 if (on_rq) 8744 if (on_rq)
7709 enqueue_entity(cfs_rq, se, 0); 8745 enqueue_entity(cfs_rq, se, 0);
8746}
7710 8747
7711 spin_unlock_irq(&rq->lock); 8748static void set_se_shares(struct sched_entity *se, unsigned long shares)
8749{
8750 struct cfs_rq *cfs_rq = se->cfs_rq;
8751 struct rq *rq = cfs_rq->rq;
8752 unsigned long flags;
8753
8754 spin_lock_irqsave(&rq->lock, flags);
8755 __set_se_shares(se, shares);
8756 spin_unlock_irqrestore(&rq->lock, flags);
7712} 8757}
7713 8758
7714static DEFINE_MUTEX(shares_mutex); 8759static DEFINE_MUTEX(shares_mutex);
@@ -7719,12 +8764,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7719 unsigned long flags; 8764 unsigned long flags;
7720 8765
7721 /* 8766 /*
8767 * We can't change the weight of the root cgroup.
8768 */
8769 if (!tg->se[0])
8770 return -EINVAL;
8771
8772 /*
7722 * A weight of 0 or 1 can cause arithmetic problems. 8773 * A weight of 0 or 1 can cause arithmetic problems.
7723 * (The default weight is 1024 - so there's no practical 8774 * (The default weight is 1024 - so there's no practical
7724 * limitation from this.) 8775 * limitation from this.)
7725 */ 8776 */
7726 if (shares < 2) 8777 if (shares < MIN_SHARES)
7727 shares = 2; 8778 shares = MIN_SHARES;
7728 8779
7729 mutex_lock(&shares_mutex); 8780 mutex_lock(&shares_mutex);
7730 if (tg->shares == shares) 8781 if (tg->shares == shares)
@@ -7733,6 +8784,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7733 spin_lock_irqsave(&task_group_lock, flags); 8784 spin_lock_irqsave(&task_group_lock, flags);
7734 for_each_possible_cpu(i) 8785 for_each_possible_cpu(i)
7735 unregister_fair_sched_group(tg, i); 8786 unregister_fair_sched_group(tg, i);
8787 list_del_rcu(&tg->siblings);
7736 spin_unlock_irqrestore(&task_group_lock, flags); 8788 spin_unlock_irqrestore(&task_group_lock, flags);
7737 8789
7738 /* wait for any ongoing reference to this group to finish */ 8790 /* wait for any ongoing reference to this group to finish */
@@ -7743,8 +8795,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7743 * w/o tripping rebalance_share or load_balance_fair. 8795 * w/o tripping rebalance_share or load_balance_fair.
7744 */ 8796 */
7745 tg->shares = shares; 8797 tg->shares = shares;
7746 for_each_possible_cpu(i) 8798 for_each_possible_cpu(i) {
7747 set_se_shares(tg->se[i], shares); 8799 /*
8800 * force a rebalance
8801 */
8802 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8803 set_se_shares(tg->se[i], shares/nr_cpu_ids);
8804 }
7748 8805
7749 /* 8806 /*
7750 * Enable load balance activity on this group, by inserting it back on 8807 * Enable load balance activity on this group, by inserting it back on
@@ -7753,6 +8810,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7753 spin_lock_irqsave(&task_group_lock, flags); 8810 spin_lock_irqsave(&task_group_lock, flags);
7754 for_each_possible_cpu(i) 8811 for_each_possible_cpu(i)
7755 register_fair_sched_group(tg, i); 8812 register_fair_sched_group(tg, i);
8813 list_add_rcu(&tg->siblings, &tg->parent->children);
7756 spin_unlock_irqrestore(&task_group_lock, flags); 8814 spin_unlock_irqrestore(&task_group_lock, flags);
7757done: 8815done:
7758 mutex_unlock(&shares_mutex); 8816 mutex_unlock(&shares_mutex);
@@ -7779,26 +8837,58 @@ static unsigned long to_ratio(u64 period, u64 runtime)
7779 return div64_64(runtime << 16, period); 8837 return div64_64(runtime << 16, period);
7780} 8838}
7781 8839
8840#ifdef CONFIG_CGROUP_SCHED
8841static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8842{
8843 struct task_group *tgi, *parent = tg->parent;
8844 unsigned long total = 0;
8845
8846 if (!parent) {
8847 if (global_rt_period() < period)
8848 return 0;
8849
8850 return to_ratio(period, runtime) <
8851 to_ratio(global_rt_period(), global_rt_runtime());
8852 }
8853
8854 if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
8855 return 0;
8856
8857 rcu_read_lock();
8858 list_for_each_entry_rcu(tgi, &parent->children, siblings) {
8859 if (tgi == tg)
8860 continue;
8861
8862 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
8863 tgi->rt_bandwidth.rt_runtime);
8864 }
8865 rcu_read_unlock();
8866
8867 return total + to_ratio(period, runtime) <
8868 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
8869 parent->rt_bandwidth.rt_runtime);
8870}
8871#elif defined CONFIG_USER_SCHED
7782static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8872static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7783{ 8873{
7784 struct task_group *tgi; 8874 struct task_group *tgi;
7785 unsigned long total = 0; 8875 unsigned long total = 0;
7786 unsigned long global_ratio = 8876 unsigned long global_ratio =
7787 to_ratio(sysctl_sched_rt_period, 8877 to_ratio(global_rt_period(), global_rt_runtime());
7788 sysctl_sched_rt_runtime < 0 ?
7789 RUNTIME_INF : sysctl_sched_rt_runtime);
7790 8878
7791 rcu_read_lock(); 8879 rcu_read_lock();
7792 list_for_each_entry_rcu(tgi, &task_groups, list) { 8880 list_for_each_entry_rcu(tgi, &task_groups, list) {
7793 if (tgi == tg) 8881 if (tgi == tg)
7794 continue; 8882 continue;
7795 8883
7796 total += to_ratio(period, tgi->rt_runtime); 8884 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
8885 tgi->rt_bandwidth.rt_runtime);
7797 } 8886 }
7798 rcu_read_unlock(); 8887 rcu_read_unlock();
7799 8888
7800 return total + to_ratio(period, runtime) < global_ratio; 8889 return total + to_ratio(period, runtime) < global_ratio;
7801} 8890}
8891#endif
7802 8892
7803/* Must be called with tasklist_lock held */ 8893/* Must be called with tasklist_lock held */
7804static inline int tg_has_rt_tasks(struct task_group *tg) 8894static inline int tg_has_rt_tasks(struct task_group *tg)
@@ -7811,19 +8901,14 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
7811 return 0; 8901 return 0;
7812} 8902}
7813 8903
7814int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 8904static int tg_set_bandwidth(struct task_group *tg,
8905 u64 rt_period, u64 rt_runtime)
7815{ 8906{
7816 u64 rt_runtime, rt_period; 8907 int i, err = 0;
7817 int err = 0;
7818
7819 rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
7820 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7821 if (rt_runtime_us == -1)
7822 rt_runtime = RUNTIME_INF;
7823 8908
7824 mutex_lock(&rt_constraints_mutex); 8909 mutex_lock(&rt_constraints_mutex);
7825 read_lock(&tasklist_lock); 8910 read_lock(&tasklist_lock);
7826 if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) { 8911 if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
7827 err = -EBUSY; 8912 err = -EBUSY;
7828 goto unlock; 8913 goto unlock;
7829 } 8914 }
@@ -7831,7 +8916,19 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7831 err = -EINVAL; 8916 err = -EINVAL;
7832 goto unlock; 8917 goto unlock;
7833 } 8918 }
7834 tg->rt_runtime = rt_runtime; 8919
8920 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8921 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8922 tg->rt_bandwidth.rt_runtime = rt_runtime;
8923
8924 for_each_possible_cpu(i) {
8925 struct rt_rq *rt_rq = tg->rt_rq[i];
8926
8927 spin_lock(&rt_rq->rt_runtime_lock);
8928 rt_rq->rt_runtime = rt_runtime;
8929 spin_unlock(&rt_rq->rt_runtime_lock);
8930 }
8931 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7835 unlock: 8932 unlock:
7836 read_unlock(&tasklist_lock); 8933 read_unlock(&tasklist_lock);
7837 mutex_unlock(&rt_constraints_mutex); 8934 mutex_unlock(&rt_constraints_mutex);
@@ -7839,19 +8936,109 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7839 return err; 8936 return err;
7840} 8937}
7841 8938
8939int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8940{
8941 u64 rt_runtime, rt_period;
8942
8943 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8944 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8945 if (rt_runtime_us < 0)
8946 rt_runtime = RUNTIME_INF;
8947
8948 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8949}
8950
7842long sched_group_rt_runtime(struct task_group *tg) 8951long sched_group_rt_runtime(struct task_group *tg)
7843{ 8952{
7844 u64 rt_runtime_us; 8953 u64 rt_runtime_us;
7845 8954
7846 if (tg->rt_runtime == RUNTIME_INF) 8955 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
7847 return -1; 8956 return -1;
7848 8957
7849 rt_runtime_us = tg->rt_runtime; 8958 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
7850 do_div(rt_runtime_us, NSEC_PER_USEC); 8959 do_div(rt_runtime_us, NSEC_PER_USEC);
7851 return rt_runtime_us; 8960 return rt_runtime_us;
7852} 8961}
8962
8963int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8964{
8965 u64 rt_runtime, rt_period;
8966
8967 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8968 rt_runtime = tg->rt_bandwidth.rt_runtime;
8969
8970 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8971}
8972
8973long sched_group_rt_period(struct task_group *tg)
8974{
8975 u64 rt_period_us;
8976
8977 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
8978 do_div(rt_period_us, NSEC_PER_USEC);
8979 return rt_period_us;
8980}
8981
8982static int sched_rt_global_constraints(void)
8983{
8984 int ret = 0;
8985
8986 mutex_lock(&rt_constraints_mutex);
8987 if (!__rt_schedulable(NULL, 1, 0))
8988 ret = -EINVAL;
8989 mutex_unlock(&rt_constraints_mutex);
8990
8991 return ret;
8992}
8993#else
8994static int sched_rt_global_constraints(void)
8995{
8996 unsigned long flags;
8997 int i;
8998
8999 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
9000 for_each_possible_cpu(i) {
9001 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
9002
9003 spin_lock(&rt_rq->rt_runtime_lock);
9004 rt_rq->rt_runtime = global_rt_runtime();
9005 spin_unlock(&rt_rq->rt_runtime_lock);
9006 }
9007 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
9008
9009 return 0;
9010}
7853#endif 9011#endif
7854#endif /* CONFIG_GROUP_SCHED */ 9012
9013int sched_rt_handler(struct ctl_table *table, int write,
9014 struct file *filp, void __user *buffer, size_t *lenp,
9015 loff_t *ppos)
9016{
9017 int ret;
9018 int old_period, old_runtime;
9019 static DEFINE_MUTEX(mutex);
9020
9021 mutex_lock(&mutex);
9022 old_period = sysctl_sched_rt_period;
9023 old_runtime = sysctl_sched_rt_runtime;
9024
9025 ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
9026
9027 if (!ret && write) {
9028 ret = sched_rt_global_constraints();
9029 if (ret) {
9030 sysctl_sched_rt_period = old_period;
9031 sysctl_sched_rt_runtime = old_runtime;
9032 } else {
9033 def_rt_bandwidth.rt_runtime = global_rt_runtime();
9034 def_rt_bandwidth.rt_period =
9035 ns_to_ktime(global_rt_period());
9036 }
9037 }
9038 mutex_unlock(&mutex);
9039
9040 return ret;
9041}
7855 9042
7856#ifdef CONFIG_CGROUP_SCHED 9043#ifdef CONFIG_CGROUP_SCHED
7857 9044
@@ -7865,7 +9052,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7865static struct cgroup_subsys_state * 9052static struct cgroup_subsys_state *
7866cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) 9053cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7867{ 9054{
7868 struct task_group *tg; 9055 struct task_group *tg, *parent;
7869 9056
7870 if (!cgrp->parent) { 9057 if (!cgrp->parent) {
7871 /* This is early initialization for the top cgroup */ 9058 /* This is early initialization for the top cgroup */
@@ -7873,11 +9060,8 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7873 return &init_task_group.css; 9060 return &init_task_group.css;
7874 } 9061 }
7875 9062
7876 /* we support only 1-level deep hierarchical scheduler atm */ 9063 parent = cgroup_tg(cgrp->parent);
7877 if (cgrp->parent->parent) 9064 tg = sched_create_group(parent);
7878 return ERR_PTR(-EINVAL);
7879
7880 tg = sched_create_group();
7881 if (IS_ERR(tg)) 9065 if (IS_ERR(tg))
7882 return ERR_PTR(-ENOMEM); 9066 return ERR_PTR(-ENOMEM);
7883 9067
@@ -7901,7 +9085,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7901{ 9085{
7902#ifdef CONFIG_RT_GROUP_SCHED 9086#ifdef CONFIG_RT_GROUP_SCHED
7903 /* Don't accept realtime tasks when there is no way for them to run */ 9087 /* Don't accept realtime tasks when there is no way for them to run */
7904 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0) 9088 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
7905 return -EINVAL; 9089 return -EINVAL;
7906#else 9090#else
7907 /* We don't support RT-tasks being in separate groups */ 9091 /* We don't support RT-tasks being in separate groups */
@@ -7935,7 +9119,7 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
7935#endif 9119#endif
7936 9120
7937#ifdef CONFIG_RT_GROUP_SCHED 9121#ifdef CONFIG_RT_GROUP_SCHED
7938static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 9122static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
7939 struct file *file, 9123 struct file *file,
7940 const char __user *userbuf, 9124 const char __user *userbuf,
7941 size_t nbytes, loff_t *unused_ppos) 9125 size_t nbytes, loff_t *unused_ppos)
@@ -7979,6 +9163,17 @@ static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
7979 9163
7980 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 9164 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
7981} 9165}
9166
9167static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
9168 u64 rt_period_us)
9169{
9170 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
9171}
9172
9173static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
9174{
9175 return sched_group_rt_period(cgroup_tg(cgrp));
9176}
7982#endif 9177#endif
7983 9178
7984static struct cftype cpu_files[] = { 9179static struct cftype cpu_files[] = {
@@ -7995,6 +9190,11 @@ static struct cftype cpu_files[] = {
7995 .read = cpu_rt_runtime_read, 9190 .read = cpu_rt_runtime_read,
7996 .write = cpu_rt_runtime_write, 9191 .write = cpu_rt_runtime_write,
7997 }, 9192 },
9193 {
9194 .name = "rt_period_us",
9195 .read_uint = cpu_rt_period_read_uint,
9196 .write_uint = cpu_rt_period_write_uint,
9197 },
7998#endif 9198#endif
7999}; 9199};
8000 9200
@@ -8035,9 +9235,9 @@ struct cpuacct {
8035struct cgroup_subsys cpuacct_subsys; 9235struct cgroup_subsys cpuacct_subsys;
8036 9236
8037/* return cpu accounting group corresponding to this container */ 9237/* return cpu accounting group corresponding to this container */
8038static inline struct cpuacct *cgroup_ca(struct cgroup *cont) 9238static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
8039{ 9239{
8040 return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id), 9240 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
8041 struct cpuacct, css); 9241 struct cpuacct, css);
8042} 9242}
8043 9243
@@ -8050,7 +9250,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk)
8050 9250
8051/* create a new cpu accounting group */ 9251/* create a new cpu accounting group */
8052static struct cgroup_subsys_state *cpuacct_create( 9252static struct cgroup_subsys_state *cpuacct_create(
8053 struct cgroup_subsys *ss, struct cgroup *cont) 9253 struct cgroup_subsys *ss, struct cgroup *cgrp)
8054{ 9254{
8055 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 9255 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
8056 9256
@@ -8068,18 +9268,18 @@ static struct cgroup_subsys_state *cpuacct_create(
8068 9268
8069/* destroy an existing cpu accounting group */ 9269/* destroy an existing cpu accounting group */
8070static void 9270static void
8071cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 9271cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
8072{ 9272{
8073 struct cpuacct *ca = cgroup_ca(cont); 9273 struct cpuacct *ca = cgroup_ca(cgrp);
8074 9274
8075 free_percpu(ca->cpuusage); 9275 free_percpu(ca->cpuusage);
8076 kfree(ca); 9276 kfree(ca);
8077} 9277}
8078 9278
8079/* return total cpu usage (in nanoseconds) of a group */ 9279/* return total cpu usage (in nanoseconds) of a group */
8080static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) 9280static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8081{ 9281{
8082 struct cpuacct *ca = cgroup_ca(cont); 9282 struct cpuacct *ca = cgroup_ca(cgrp);
8083 u64 totalcpuusage = 0; 9283 u64 totalcpuusage = 0;
8084 int i; 9284 int i;
8085 9285
@@ -8098,16 +9298,40 @@ static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft)
8098 return totalcpuusage; 9298 return totalcpuusage;
8099} 9299}
8100 9300
9301static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9302 u64 reset)
9303{
9304 struct cpuacct *ca = cgroup_ca(cgrp);
9305 int err = 0;
9306 int i;
9307
9308 if (reset) {
9309 err = -EINVAL;
9310 goto out;
9311 }
9312
9313 for_each_possible_cpu(i) {
9314 u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
9315
9316 spin_lock_irq(&cpu_rq(i)->lock);
9317 *cpuusage = 0;
9318 spin_unlock_irq(&cpu_rq(i)->lock);
9319 }
9320out:
9321 return err;
9322}
9323
8101static struct cftype files[] = { 9324static struct cftype files[] = {
8102 { 9325 {
8103 .name = "usage", 9326 .name = "usage",
8104 .read_uint = cpuusage_read, 9327 .read_uint = cpuusage_read,
9328 .write_uint = cpuusage_write,
8105 }, 9329 },
8106}; 9330};
8107 9331
8108static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont) 9332static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
8109{ 9333{
8110 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); 9334 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
8111} 9335}
8112 9336
8113/* 9337/*
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index ef358ba07683..f3f4af4b8b0f 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -67,14 +67,24 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
67 (long long)(p->nvcsw + p->nivcsw), 67 (long long)(p->nvcsw + p->nivcsw),
68 p->prio); 68 p->prio);
69#ifdef CONFIG_SCHEDSTATS 69#ifdef CONFIG_SCHEDSTATS
70 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n", 70 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
71 SPLIT_NS(p->se.vruntime), 71 SPLIT_NS(p->se.vruntime),
72 SPLIT_NS(p->se.sum_exec_runtime), 72 SPLIT_NS(p->se.sum_exec_runtime),
73 SPLIT_NS(p->se.sum_sleep_runtime)); 73 SPLIT_NS(p->se.sum_sleep_runtime));
74#else 74#else
75 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", 75 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
76 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 76 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
77#endif 77#endif
78
79#ifdef CONFIG_CGROUP_SCHED
80 {
81 char path[64];
82
83 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
84 SEQ_printf(m, " %s", path);
85 }
86#endif
87 SEQ_printf(m, "\n");
78} 88}
79 89
80static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) 90static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
@@ -109,7 +119,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
109 struct sched_entity *last; 119 struct sched_entity *last;
110 unsigned long flags; 120 unsigned long flags;
111 121
112 SEQ_printf(m, "\ncfs_rq\n"); 122#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED)
123 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
124#else
125 char path[128] = "";
126 struct cgroup *cgroup = NULL;
127 struct task_group *tg = cfs_rq->tg;
128
129 if (tg)
130 cgroup = tg->css.cgroup;
131
132 if (cgroup)
133 cgroup_path(cgroup, path, sizeof(path));
134
135 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
136#endif
113 137
114 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
115 SPLIT_NS(cfs_rq->exec_clock)); 139 SPLIT_NS(cfs_rq->exec_clock));
@@ -143,6 +167,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
143#endif 167#endif
144 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", 168 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
145 cfs_rq->nr_spread_over); 169 cfs_rq->nr_spread_over);
170#ifdef CONFIG_FAIR_GROUP_SCHED
171#ifdef CONFIG_SMP
172 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
173#endif
174#endif
146} 175}
147 176
148static void print_cpu(struct seq_file *m, int cpu) 177static void print_cpu(struct seq_file *m, int cpu)
@@ -214,7 +243,6 @@ static int sched_debug_show(struct seq_file *m, void *v)
214 PN(sysctl_sched_latency); 243 PN(sysctl_sched_latency);
215 PN(sysctl_sched_min_granularity); 244 PN(sysctl_sched_min_granularity);
216 PN(sysctl_sched_wakeup_granularity); 245 PN(sysctl_sched_wakeup_granularity);
217 PN(sysctl_sched_batch_wakeup_granularity);
218 PN(sysctl_sched_child_runs_first); 246 PN(sysctl_sched_child_runs_first);
219 P(sysctl_sched_features); 247 P(sysctl_sched_features);
220#undef PN 248#undef PN
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0080968d3e4a..89fa32b4edf2 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -62,24 +62,14 @@ const_debug unsigned int sysctl_sched_child_runs_first = 1;
62unsigned int __read_mostly sysctl_sched_compat_yield; 62unsigned int __read_mostly sysctl_sched_compat_yield;
63 63
64/* 64/*
65 * SCHED_BATCH wake-up granularity.
66 * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
67 *
68 * This option delays the preemption effects of decoupled workloads
69 * and reduces their over-scheduling. Synchronous workloads will still
70 * have immediate wakeup/sleep latencies.
71 */
72unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
73
74/*
75 * SCHED_OTHER wake-up granularity. 65 * SCHED_OTHER wake-up granularity.
76 * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) 66 * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
77 * 67 *
78 * This option delays the preemption effects of decoupled workloads 68 * This option delays the preemption effects of decoupled workloads
79 * and reduces their over-scheduling. Synchronous workloads will still 69 * and reduces their over-scheduling. Synchronous workloads will still
80 * have immediate wakeup/sleep latencies. 70 * have immediate wakeup/sleep latencies.
81 */ 71 */
82unsigned int sysctl_sched_wakeup_granularity = 5000000UL; 72unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
83 73
84const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 74const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
85 75
@@ -87,6 +77,11 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
87 * CFS operations on generic schedulable entities: 77 * CFS operations on generic schedulable entities:
88 */ 78 */
89 79
80static inline struct task_struct *task_of(struct sched_entity *se)
81{
82 return container_of(se, struct task_struct, se);
83}
84
90#ifdef CONFIG_FAIR_GROUP_SCHED 85#ifdef CONFIG_FAIR_GROUP_SCHED
91 86
92/* cpu runqueue to which this cfs_rq is attached */ 87/* cpu runqueue to which this cfs_rq is attached */
@@ -98,6 +93,54 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
98/* An entity is a task if it doesn't "own" a runqueue */ 93/* An entity is a task if it doesn't "own" a runqueue */
99#define entity_is_task(se) (!se->my_q) 94#define entity_is_task(se) (!se->my_q)
100 95
96/* Walk up scheduling entities hierarchy */
97#define for_each_sched_entity(se) \
98 for (; se; se = se->parent)
99
100static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
101{
102 return p->se.cfs_rq;
103}
104
105/* runqueue on which this entity is (to be) queued */
106static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
107{
108 return se->cfs_rq;
109}
110
111/* runqueue "owned" by this group */
112static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
113{
114 return grp->my_q;
115}
116
117/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
118 * another cpu ('this_cpu')
119 */
120static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
121{
122 return cfs_rq->tg->cfs_rq[this_cpu];
123}
124
125/* Iterate thr' all leaf cfs_rq's on a runqueue */
126#define for_each_leaf_cfs_rq(rq, cfs_rq) \
127 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
128
129/* Do the two (enqueued) entities belong to the same group ? */
130static inline int
131is_same_group(struct sched_entity *se, struct sched_entity *pse)
132{
133 if (se->cfs_rq == pse->cfs_rq)
134 return 1;
135
136 return 0;
137}
138
139static inline struct sched_entity *parent_entity(struct sched_entity *se)
140{
141 return se->parent;
142}
143
101#else /* CONFIG_FAIR_GROUP_SCHED */ 144#else /* CONFIG_FAIR_GROUP_SCHED */
102 145
103static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 146static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -107,13 +150,49 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
107 150
108#define entity_is_task(se) 1 151#define entity_is_task(se) 1
109 152
110#endif /* CONFIG_FAIR_GROUP_SCHED */ 153#define for_each_sched_entity(se) \
154 for (; se; se = NULL)
111 155
112static inline struct task_struct *task_of(struct sched_entity *se) 156static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
113{ 157{
114 return container_of(se, struct task_struct, se); 158 return &task_rq(p)->cfs;
159}
160
161static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
162{
163 struct task_struct *p = task_of(se);
164 struct rq *rq = task_rq(p);
165
166 return &rq->cfs;
167}
168
169/* runqueue "owned" by this group */
170static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
171{
172 return NULL;
173}
174
175static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
176{
177 return &cpu_rq(this_cpu)->cfs;
178}
179
180#define for_each_leaf_cfs_rq(rq, cfs_rq) \
181 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
182
183static inline int
184is_same_group(struct sched_entity *se, struct sched_entity *pse)
185{
186 return 1;
187}
188
189static inline struct sched_entity *parent_entity(struct sched_entity *se)
190{
191 return NULL;
115} 192}
116 193
194#endif /* CONFIG_FAIR_GROUP_SCHED */
195
117 196
118/************************************************************** 197/**************************************************************
119 * Scheduling class tree data structure manipulation methods: 198 * Scheduling class tree data structure manipulation methods:
@@ -255,6 +334,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
255#endif 334#endif
256 335
257/* 336/*
337 * delta *= w / rw
338 */
339static inline unsigned long
340calc_delta_weight(unsigned long delta, struct sched_entity *se)
341{
342 for_each_sched_entity(se) {
343 delta = calc_delta_mine(delta,
344 se->load.weight, &cfs_rq_of(se)->load);
345 }
346
347 return delta;
348}
349
350/*
351 * delta *= rw / w
352 */
353static inline unsigned long
354calc_delta_fair(unsigned long delta, struct sched_entity *se)
355{
356 for_each_sched_entity(se) {
357 delta = calc_delta_mine(delta,
358 cfs_rq_of(se)->load.weight, &se->load);
359 }
360
361 return delta;
362}
363
364/*
258 * The idea is to set a period in which each task runs once. 365 * The idea is to set a period in which each task runs once.
259 * 366 *
260 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch 367 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
@@ -283,29 +390,54 @@ static u64 __sched_period(unsigned long nr_running)
283 */ 390 */
284static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 391static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
285{ 392{
286 return calc_delta_mine(__sched_period(cfs_rq->nr_running), 393 return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
287 se->load.weight, &cfs_rq->load);
288} 394}
289 395
290/* 396/*
291 * We calculate the vruntime slice. 397 * We calculate the vruntime slice of a to be inserted task
292 * 398 *
293 * vs = s/w = p/rw 399 * vs = s*rw/w = p
294 */ 400 */
295static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) 401static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
296{ 402{
297 u64 vslice = __sched_period(nr_running); 403 unsigned long nr_running = cfs_rq->nr_running;
298 404
299 vslice *= NICE_0_LOAD; 405 if (!se->on_rq)
300 do_div(vslice, rq_weight); 406 nr_running++;
301 407
302 return vslice; 408 return __sched_period(nr_running);
303} 409}
304 410
305static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) 411/*
412 * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
413 * that it favours >=0 over <0.
414 *
415 * -20 |
416 * |
417 * 0 --------+-------
418 * .'
419 * 19 .'
420 *
421 */
422static unsigned long
423calc_delta_asym(unsigned long delta, struct sched_entity *se)
306{ 424{
307 return __sched_vslice(cfs_rq->load.weight + se->load.weight, 425 struct load_weight lw = {
308 cfs_rq->nr_running + 1); 426 .weight = NICE_0_LOAD,
427 .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
428 };
429
430 for_each_sched_entity(se) {
431 struct load_weight *se_lw = &se->load;
432
433 if (se->load.weight < NICE_0_LOAD)
434 se_lw = &lw;
435
436 delta = calc_delta_mine(delta,
437 cfs_rq_of(se)->load.weight, se_lw);
438 }
439
440 return delta;
309} 441}
310 442
311/* 443/*
@@ -322,11 +454,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
322 454
323 curr->sum_exec_runtime += delta_exec; 455 curr->sum_exec_runtime += delta_exec;
324 schedstat_add(cfs_rq, exec_clock, delta_exec); 456 schedstat_add(cfs_rq, exec_clock, delta_exec);
325 delta_exec_weighted = delta_exec; 457 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
326 if (unlikely(curr->load.weight != NICE_0_LOAD)) {
327 delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
328 &curr->load);
329 }
330 curr->vruntime += delta_exec_weighted; 458 curr->vruntime += delta_exec_weighted;
331} 459}
332 460
@@ -413,20 +541,43 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
413 * Scheduling class queueing methods: 541 * Scheduling class queueing methods:
414 */ 542 */
415 543
544#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
545static void
546add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
547{
548 cfs_rq->task_weight += weight;
549}
550#else
551static inline void
552add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
553{
554}
555#endif
556
416static void 557static void
417account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 558account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
418{ 559{
419 update_load_add(&cfs_rq->load, se->load.weight); 560 update_load_add(&cfs_rq->load, se->load.weight);
561 if (!parent_entity(se))
562 inc_cpu_load(rq_of(cfs_rq), se->load.weight);
563 if (entity_is_task(se))
564 add_cfs_task_weight(cfs_rq, se->load.weight);
420 cfs_rq->nr_running++; 565 cfs_rq->nr_running++;
421 se->on_rq = 1; 566 se->on_rq = 1;
567 list_add(&se->group_node, &cfs_rq->tasks);
422} 568}
423 569
424static void 570static void
425account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 571account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
426{ 572{
427 update_load_sub(&cfs_rq->load, se->load.weight); 573 update_load_sub(&cfs_rq->load, se->load.weight);
574 if (!parent_entity(se))
575 dec_cpu_load(rq_of(cfs_rq), se->load.weight);
576 if (entity_is_task(se))
577 add_cfs_task_weight(cfs_rq, -se->load.weight);
428 cfs_rq->nr_running--; 578 cfs_rq->nr_running--;
429 se->on_rq = 0; 579 se->on_rq = 0;
580 list_del_init(&se->group_node);
430} 581}
431 582
432static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 583static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -510,8 +661,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
510 661
511 if (!initial) { 662 if (!initial) {
512 /* sleeps upto a single latency don't count. */ 663 /* sleeps upto a single latency don't count. */
513 if (sched_feat(NEW_FAIR_SLEEPERS)) 664 if (sched_feat(NEW_FAIR_SLEEPERS)) {
514 vruntime -= sysctl_sched_latency; 665 if (sched_feat(NORMALIZED_SLEEPER))
666 vruntime -= calc_delta_weight(sysctl_sched_latency, se);
667 else
668 vruntime -= sysctl_sched_latency;
669 }
515 670
516 /* ensure we never gain time by being placed backwards. */ 671 /* ensure we never gain time by being placed backwards. */
517 vruntime = max_vruntime(se->vruntime, vruntime); 672 vruntime = max_vruntime(se->vruntime, vruntime);
@@ -627,20 +782,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
627 se->prev_sum_exec_runtime = se->sum_exec_runtime; 782 se->prev_sum_exec_runtime = se->sum_exec_runtime;
628} 783}
629 784
785static int
786wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
787
630static struct sched_entity * 788static struct sched_entity *
631pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) 789pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
632{ 790{
633 s64 diff, gran;
634
635 if (!cfs_rq->next) 791 if (!cfs_rq->next)
636 return se; 792 return se;
637 793
638 diff = cfs_rq->next->vruntime - se->vruntime; 794 if (wakeup_preempt_entity(cfs_rq->next, se) != 0)
639 if (diff < 0)
640 return se;
641
642 gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load);
643 if (diff > gran)
644 return se; 795 return se;
645 796
646 return cfs_rq->next; 797 return cfs_rq->next;
@@ -708,101 +859,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
708 * CFS operations on tasks: 859 * CFS operations on tasks:
709 */ 860 */
710 861
711#ifdef CONFIG_FAIR_GROUP_SCHED
712
713/* Walk up scheduling entities hierarchy */
714#define for_each_sched_entity(se) \
715 for (; se; se = se->parent)
716
717static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
718{
719 return p->se.cfs_rq;
720}
721
722/* runqueue on which this entity is (to be) queued */
723static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
724{
725 return se->cfs_rq;
726}
727
728/* runqueue "owned" by this group */
729static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
730{
731 return grp->my_q;
732}
733
734/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
735 * another cpu ('this_cpu')
736 */
737static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
738{
739 return cfs_rq->tg->cfs_rq[this_cpu];
740}
741
742/* Iterate thr' all leaf cfs_rq's on a runqueue */
743#define for_each_leaf_cfs_rq(rq, cfs_rq) \
744 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
745
746/* Do the two (enqueued) entities belong to the same group ? */
747static inline int
748is_same_group(struct sched_entity *se, struct sched_entity *pse)
749{
750 if (se->cfs_rq == pse->cfs_rq)
751 return 1;
752
753 return 0;
754}
755
756static inline struct sched_entity *parent_entity(struct sched_entity *se)
757{
758 return se->parent;
759}
760
761#else /* CONFIG_FAIR_GROUP_SCHED */
762
763#define for_each_sched_entity(se) \
764 for (; se; se = NULL)
765
766static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
767{
768 return &task_rq(p)->cfs;
769}
770
771static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
772{
773 struct task_struct *p = task_of(se);
774 struct rq *rq = task_rq(p);
775
776 return &rq->cfs;
777}
778
779/* runqueue "owned" by this group */
780static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
781{
782 return NULL;
783}
784
785static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
786{
787 return &cpu_rq(this_cpu)->cfs;
788}
789
790#define for_each_leaf_cfs_rq(rq, cfs_rq) \
791 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
792
793static inline int
794is_same_group(struct sched_entity *se, struct sched_entity *pse)
795{
796 return 1;
797}
798
799static inline struct sched_entity *parent_entity(struct sched_entity *se)
800{
801 return NULL;
802}
803
804#endif /* CONFIG_FAIR_GROUP_SCHED */
805
806#ifdef CONFIG_SCHED_HRTICK 862#ifdef CONFIG_SCHED_HRTICK
807static void hrtick_start_fair(struct rq *rq, struct task_struct *p) 863static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
808{ 864{
@@ -916,7 +972,7 @@ static void yield_task_fair(struct rq *rq)
916 /* 972 /*
917 * Already in the rightmost position? 973 * Already in the rightmost position?
918 */ 974 */
919 if (unlikely(rightmost->vruntime < se->vruntime)) 975 if (unlikely(!rightmost || rightmost->vruntime < se->vruntime))
920 return; 976 return;
921 977
922 /* 978 /*
@@ -955,7 +1011,9 @@ static int wake_idle(int cpu, struct task_struct *p)
955 return cpu; 1011 return cpu;
956 1012
957 for_each_domain(cpu, sd) { 1013 for_each_domain(cpu, sd) {
958 if (sd->flags & SD_WAKE_IDLE) { 1014 if ((sd->flags & SD_WAKE_IDLE)
1015 || ((sd->flags & SD_WAKE_IDLE_FAR)
1016 && !task_hot(p, task_rq(p)->clock, sd))) {
959 cpus_and(tmp, sd->span, p->cpus_allowed); 1017 cpus_and(tmp, sd->span, p->cpus_allowed);
960 for_each_cpu_mask(i, tmp) { 1018 for_each_cpu_mask(i, tmp) {
961 if (idle_cpu(i)) { 1019 if (idle_cpu(i)) {
@@ -1099,6 +1157,58 @@ out:
1099} 1157}
1100#endif /* CONFIG_SMP */ 1158#endif /* CONFIG_SMP */
1101 1159
1160static unsigned long wakeup_gran(struct sched_entity *se)
1161{
1162 unsigned long gran = sysctl_sched_wakeup_granularity;
1163
1164 /*
1165 * More easily preempt - nice tasks, while not making it harder for
1166 * + nice tasks.
1167 */
1168 gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
1169
1170 return gran;
1171}
1172
1173/*
1174 * Should 'se' preempt 'curr'.
1175 *
1176 * |s1
1177 * |s2
1178 * |s3
1179 * g
1180 * |<--->|c
1181 *
1182 * w(c, s1) = -1
1183 * w(c, s2) = 0
1184 * w(c, s3) = 1
1185 *
1186 */
1187static int
1188wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1189{
1190 s64 gran, vdiff = curr->vruntime - se->vruntime;
1191
1192 if (vdiff < 0)
1193 return -1;
1194
1195 gran = wakeup_gran(curr);
1196 if (vdiff > gran)
1197 return 1;
1198
1199 return 0;
1200}
1201
1202/* return depth at which a sched entity is present in the hierarchy */
1203static inline int depth_se(struct sched_entity *se)
1204{
1205 int depth = 0;
1206
1207 for_each_sched_entity(se)
1208 depth++;
1209
1210 return depth;
1211}
1102 1212
1103/* 1213/*
1104 * Preempt the current task with a newly woken task if needed: 1214 * Preempt the current task with a newly woken task if needed:
@@ -1108,7 +1218,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1108 struct task_struct *curr = rq->curr; 1218 struct task_struct *curr = rq->curr;
1109 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1219 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1110 struct sched_entity *se = &curr->se, *pse = &p->se; 1220 struct sched_entity *se = &curr->se, *pse = &p->se;
1111 unsigned long gran; 1221 int se_depth, pse_depth;
1112 1222
1113 if (unlikely(rt_prio(p->prio))) { 1223 if (unlikely(rt_prio(p->prio))) {
1114 update_rq_clock(rq); 1224 update_rq_clock(rq);
@@ -1133,20 +1243,33 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1133 if (!sched_feat(WAKEUP_PREEMPT)) 1243 if (!sched_feat(WAKEUP_PREEMPT))
1134 return; 1244 return;
1135 1245
1136 while (!is_same_group(se, pse)) { 1246 /*
1247 * preemption test can be made between sibling entities who are in the
1248 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
1249 * both tasks until we find their ancestors who are siblings of common
1250 * parent.
1251 */
1252
1253 /* First walk up until both entities are at same depth */
1254 se_depth = depth_se(se);
1255 pse_depth = depth_se(pse);
1256
1257 while (se_depth > pse_depth) {
1258 se_depth--;
1137 se = parent_entity(se); 1259 se = parent_entity(se);
1260 }
1261
1262 while (pse_depth > se_depth) {
1263 pse_depth--;
1138 pse = parent_entity(pse); 1264 pse = parent_entity(pse);
1139 } 1265 }
1140 1266
1141 gran = sysctl_sched_wakeup_granularity; 1267 while (!is_same_group(se, pse)) {
1142 /* 1268 se = parent_entity(se);
1143 * More easily preempt - nice tasks, while not making 1269 pse = parent_entity(pse);
1144 * it harder for + nice tasks. 1270 }
1145 */
1146 if (unlikely(se->load.weight > NICE_0_LOAD))
1147 gran = calc_delta_fair(gran, &se->load);
1148 1271
1149 if (pse->vruntime + gran < se->vruntime) 1272 if (wakeup_preempt_entity(se, pse) == 1)
1150 resched_task(curr); 1273 resched_task(curr);
1151} 1274}
1152 1275
@@ -1197,15 +1320,27 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1197 * the current task: 1320 * the current task:
1198 */ 1321 */
1199static struct task_struct * 1322static struct task_struct *
1200__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) 1323__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
1201{ 1324{
1202 struct task_struct *p; 1325 struct task_struct *p = NULL;
1326 struct sched_entity *se;
1327
1328 if (next == &cfs_rq->tasks)
1329 return NULL;
1330
1331 /* Skip over entities that are not tasks */
1332 do {
1333 se = list_entry(next, struct sched_entity, group_node);
1334 next = next->next;
1335 } while (next != &cfs_rq->tasks && !entity_is_task(se));
1203 1336
1204 if (!curr) 1337 if (next == &cfs_rq->tasks)
1205 return NULL; 1338 return NULL;
1206 1339
1207 p = rb_entry(curr, struct task_struct, se.run_node); 1340 cfs_rq->balance_iterator = next;
1208 cfs_rq->rb_load_balance_curr = rb_next(curr); 1341
1342 if (entity_is_task(se))
1343 p = task_of(se);
1209 1344
1210 return p; 1345 return p;
1211} 1346}
@@ -1214,85 +1349,100 @@ static struct task_struct *load_balance_start_fair(void *arg)
1214{ 1349{
1215 struct cfs_rq *cfs_rq = arg; 1350 struct cfs_rq *cfs_rq = arg;
1216 1351
1217 return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); 1352 return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next);
1218} 1353}
1219 1354
1220static struct task_struct *load_balance_next_fair(void *arg) 1355static struct task_struct *load_balance_next_fair(void *arg)
1221{ 1356{
1222 struct cfs_rq *cfs_rq = arg; 1357 struct cfs_rq *cfs_rq = arg;
1223 1358
1224 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); 1359 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
1225} 1360}
1226 1361
1227#ifdef CONFIG_FAIR_GROUP_SCHED 1362static unsigned long
1228static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) 1363__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1364 unsigned long max_load_move, struct sched_domain *sd,
1365 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
1366 struct cfs_rq *cfs_rq)
1229{ 1367{
1230 struct sched_entity *curr; 1368 struct rq_iterator cfs_rq_iterator;
1231 struct task_struct *p;
1232
1233 if (!cfs_rq->nr_running || !first_fair(cfs_rq))
1234 return MAX_PRIO;
1235
1236 curr = cfs_rq->curr;
1237 if (!curr)
1238 curr = __pick_next_entity(cfs_rq);
1239 1369
1240 p = task_of(curr); 1370 cfs_rq_iterator.start = load_balance_start_fair;
1371 cfs_rq_iterator.next = load_balance_next_fair;
1372 cfs_rq_iterator.arg = cfs_rq;
1241 1373
1242 return p->prio; 1374 return balance_tasks(this_rq, this_cpu, busiest,
1375 max_load_move, sd, idle, all_pinned,
1376 this_best_prio, &cfs_rq_iterator);
1243} 1377}
1244#endif
1245 1378
1379#ifdef CONFIG_FAIR_GROUP_SCHED
1246static unsigned long 1380static unsigned long
1247load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1381load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1248 unsigned long max_load_move, 1382 unsigned long max_load_move,
1249 struct sched_domain *sd, enum cpu_idle_type idle, 1383 struct sched_domain *sd, enum cpu_idle_type idle,
1250 int *all_pinned, int *this_best_prio) 1384 int *all_pinned, int *this_best_prio)
1251{ 1385{
1252 struct cfs_rq *busy_cfs_rq;
1253 long rem_load_move = max_load_move; 1386 long rem_load_move = max_load_move;
1254 struct rq_iterator cfs_rq_iterator; 1387 int busiest_cpu = cpu_of(busiest);
1255 1388 struct task_group *tg;
1256 cfs_rq_iterator.start = load_balance_start_fair;
1257 cfs_rq_iterator.next = load_balance_next_fair;
1258 1389
1259 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 1390 rcu_read_lock();
1260#ifdef CONFIG_FAIR_GROUP_SCHED 1391 list_for_each_entry(tg, &task_groups, list) {
1261 struct cfs_rq *this_cfs_rq;
1262 long imbalance; 1392 long imbalance;
1263 unsigned long maxload; 1393 unsigned long this_weight, busiest_weight;
1394 long rem_load, max_load, moved_load;
1395
1396 /*
1397 * empty group
1398 */
1399 if (!aggregate(tg, sd)->task_weight)
1400 continue;
1401
1402 rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
1403 rem_load /= aggregate(tg, sd)->load + 1;
1404
1405 this_weight = tg->cfs_rq[this_cpu]->task_weight;
1406 busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
1407
1408 imbalance = (busiest_weight - this_weight) / 2;
1264 1409
1265 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); 1410 if (imbalance < 0)
1411 imbalance = busiest_weight;
1266 1412
1267 imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; 1413 max_load = max(rem_load, imbalance);
1268 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ 1414 moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
1269 if (imbalance <= 0) 1415 max_load, sd, idle, all_pinned, this_best_prio,
1416 tg->cfs_rq[busiest_cpu]);
1417
1418 if (!moved_load)
1270 continue; 1419 continue;
1271 1420
1272 /* Don't pull more than imbalance/2 */ 1421 move_group_shares(tg, sd, busiest_cpu, this_cpu);
1273 imbalance /= 2;
1274 maxload = min(rem_load_move, imbalance);
1275 1422
1276 *this_best_prio = cfs_rq_best_prio(this_cfs_rq); 1423 moved_load *= aggregate(tg, sd)->load;
1277#else 1424 moved_load /= aggregate(tg, sd)->rq_weight + 1;
1278# define maxload rem_load_move
1279#endif
1280 /*
1281 * pass busy_cfs_rq argument into
1282 * load_balance_[start|next]_fair iterators
1283 */
1284 cfs_rq_iterator.arg = busy_cfs_rq;
1285 rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
1286 maxload, sd, idle, all_pinned,
1287 this_best_prio,
1288 &cfs_rq_iterator);
1289 1425
1290 if (rem_load_move <= 0) 1426 rem_load_move -= moved_load;
1427 if (rem_load_move < 0)
1291 break; 1428 break;
1292 } 1429 }
1430 rcu_read_unlock();
1293 1431
1294 return max_load_move - rem_load_move; 1432 return max_load_move - rem_load_move;
1295} 1433}
1434#else
1435static unsigned long
1436load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1437 unsigned long max_load_move,
1438 struct sched_domain *sd, enum cpu_idle_type idle,
1439 int *all_pinned, int *this_best_prio)
1440{
1441 return __load_balance_fair(this_rq, this_cpu, busiest,
1442 max_load_move, sd, idle, all_pinned,
1443 this_best_prio, &busiest->cfs);
1444}
1445#endif
1296 1446
1297static int 1447static int
1298move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1448move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
@@ -1461,16 +1611,40 @@ static const struct sched_class fair_sched_class = {
1461}; 1611};
1462 1612
1463#ifdef CONFIG_SCHED_DEBUG 1613#ifdef CONFIG_SCHED_DEBUG
1614static void
1615print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth)
1616{
1617 struct sched_entity *se;
1618
1619 if (!cfs_rq)
1620 return;
1621
1622 list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) {
1623 int i;
1624
1625 for (i = depth; i; i--)
1626 seq_puts(m, " ");
1627
1628 seq_printf(m, "%lu %s %lu\n",
1629 se->load.weight,
1630 entity_is_task(se) ? "T" : "G",
1631 calc_delta_weight(SCHED_LOAD_SCALE, se)
1632 );
1633 if (!entity_is_task(se))
1634 print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1);
1635 }
1636}
1637
1464static void print_cfs_stats(struct seq_file *m, int cpu) 1638static void print_cfs_stats(struct seq_file *m, int cpu)
1465{ 1639{
1466 struct cfs_rq *cfs_rq; 1640 struct cfs_rq *cfs_rq;
1467 1641
1468#ifdef CONFIG_FAIR_GROUP_SCHED
1469 print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
1470#endif
1471 rcu_read_lock(); 1642 rcu_read_lock();
1472 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) 1643 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
1473 print_cfs_rq(m, cpu, cfs_rq); 1644 print_cfs_rq(m, cpu, cfs_rq);
1645
1646 seq_printf(m, "\nWeight tree:\n");
1647 print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1);
1474 rcu_read_unlock(); 1648 rcu_read_unlock();
1475} 1649}
1476#endif 1650#endif
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
new file mode 100644
index 000000000000..1c7283cb9581
--- /dev/null
+++ b/kernel/sched_features.h
@@ -0,0 +1,10 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
2SCHED_FEAT(WAKEUP_PREEMPT, 1)
3SCHED_FEAT(START_DEBIT, 1)
4SCHED_FEAT(AFFINE_WAKEUPS, 1)
5SCHED_FEAT(CACHE_HOT_BUDDY, 1)
6SCHED_FEAT(SYNC_WAKEUPS, 1)
7SCHED_FEAT(HRTICK, 1)
8SCHED_FEAT(DOUBLE_TICK, 0)
9SCHED_FEAT(NORMALIZED_SLEEPER, 1)
10SCHED_FEAT(DEADLINE, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 0a6d2e516420..c2730a5a4f05 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -62,7 +62,12 @@ static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
62 if (!rt_rq->tg) 62 if (!rt_rq->tg)
63 return RUNTIME_INF; 63 return RUNTIME_INF;
64 64
65 return rt_rq->tg->rt_runtime; 65 return rt_rq->rt_runtime;
66}
67
68static inline u64 sched_rt_period(struct rt_rq *rt_rq)
69{
70 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
66} 71}
67 72
68#define for_each_leaf_rt_rq(rt_rq, rq) \ 73#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -127,14 +132,39 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
127 return p->prio != p->normal_prio; 132 return p->prio != p->normal_prio;
128} 133}
129 134
135#ifdef CONFIG_SMP
136static inline cpumask_t sched_rt_period_mask(void)
137{
138 return cpu_rq(smp_processor_id())->rd->span;
139}
140#else
141static inline cpumask_t sched_rt_period_mask(void)
142{
143 return cpu_online_map;
144}
145#endif
146
147static inline
148struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
149{
150 return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
151}
152
153static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
154{
155 return &rt_rq->tg->rt_bandwidth;
156}
157
130#else 158#else
131 159
132static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) 160static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
133{ 161{
134 if (sysctl_sched_rt_runtime == -1) 162 return rt_rq->rt_runtime;
135 return RUNTIME_INF; 163}
136 164
137 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 165static inline u64 sched_rt_period(struct rt_rq *rt_rq)
166{
167 return ktime_to_ns(def_rt_bandwidth.rt_period);
138} 168}
139 169
140#define for_each_leaf_rt_rq(rt_rq, rq) \ 170#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -173,6 +203,102 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq)
173{ 203{
174 return rt_rq->rt_throttled; 204 return rt_rq->rt_throttled;
175} 205}
206
207static inline cpumask_t sched_rt_period_mask(void)
208{
209 return cpu_online_map;
210}
211
212static inline
213struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
214{
215 return &cpu_rq(cpu)->rt;
216}
217
218static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
219{
220 return &def_rt_bandwidth;
221}
222
223#endif
224
225static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
226{
227 int i, idle = 1;
228 cpumask_t span;
229
230 if (rt_b->rt_runtime == RUNTIME_INF)
231 return 1;
232
233 span = sched_rt_period_mask();
234 for_each_cpu_mask(i, span) {
235 int enqueue = 0;
236 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
237 struct rq *rq = rq_of_rt_rq(rt_rq);
238
239 spin_lock(&rq->lock);
240 if (rt_rq->rt_time) {
241 u64 runtime;
242
243 spin_lock(&rt_rq->rt_runtime_lock);
244 runtime = rt_rq->rt_runtime;
245 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
246 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
247 rt_rq->rt_throttled = 0;
248 enqueue = 1;
249 }
250 if (rt_rq->rt_time || rt_rq->rt_nr_running)
251 idle = 0;
252 spin_unlock(&rt_rq->rt_runtime_lock);
253 }
254
255 if (enqueue)
256 sched_rt_rq_enqueue(rt_rq);
257 spin_unlock(&rq->lock);
258 }
259
260 return idle;
261}
262
263#ifdef CONFIG_SMP
264static int balance_runtime(struct rt_rq *rt_rq)
265{
266 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
267 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
268 int i, weight, more = 0;
269 u64 rt_period;
270
271 weight = cpus_weight(rd->span);
272
273 spin_lock(&rt_b->rt_runtime_lock);
274 rt_period = ktime_to_ns(rt_b->rt_period);
275 for_each_cpu_mask(i, rd->span) {
276 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
277 s64 diff;
278
279 if (iter == rt_rq)
280 continue;
281
282 spin_lock(&iter->rt_runtime_lock);
283 diff = iter->rt_runtime - iter->rt_time;
284 if (diff > 0) {
285 do_div(diff, weight);
286 if (rt_rq->rt_runtime + diff > rt_period)
287 diff = rt_period - rt_rq->rt_runtime;
288 iter->rt_runtime -= diff;
289 rt_rq->rt_runtime += diff;
290 more = 1;
291 if (rt_rq->rt_runtime == rt_period) {
292 spin_unlock(&iter->rt_runtime_lock);
293 break;
294 }
295 }
296 spin_unlock(&iter->rt_runtime_lock);
297 }
298 spin_unlock(&rt_b->rt_runtime_lock);
299
300 return more;
301}
176#endif 302#endif
177 303
178static inline int rt_se_prio(struct sched_rt_entity *rt_se) 304static inline int rt_se_prio(struct sched_rt_entity *rt_se)
@@ -197,12 +323,24 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
197 if (rt_rq->rt_throttled) 323 if (rt_rq->rt_throttled)
198 return rt_rq_throttled(rt_rq); 324 return rt_rq_throttled(rt_rq);
199 325
326 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
327 return 0;
328
329#ifdef CONFIG_SMP
200 if (rt_rq->rt_time > runtime) { 330 if (rt_rq->rt_time > runtime) {
201 struct rq *rq = rq_of_rt_rq(rt_rq); 331 int more;
202 332
203 rq->rt_throttled = 1; 333 spin_unlock(&rt_rq->rt_runtime_lock);
204 rt_rq->rt_throttled = 1; 334 more = balance_runtime(rt_rq);
335 spin_lock(&rt_rq->rt_runtime_lock);
205 336
337 if (more)
338 runtime = sched_rt_runtime(rt_rq);
339 }
340#endif
341
342 if (rt_rq->rt_time > runtime) {
343 rt_rq->rt_throttled = 1;
206 if (rt_rq_throttled(rt_rq)) { 344 if (rt_rq_throttled(rt_rq)) {
207 sched_rt_rq_dequeue(rt_rq); 345 sched_rt_rq_dequeue(rt_rq);
208 return 1; 346 return 1;
@@ -212,29 +350,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
212 return 0; 350 return 0;
213} 351}
214 352
215static void update_sched_rt_period(struct rq *rq)
216{
217 struct rt_rq *rt_rq;
218 u64 period;
219
220 while (rq->clock > rq->rt_period_expire) {
221 period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
222 rq->rt_period_expire += period;
223
224 for_each_leaf_rt_rq(rt_rq, rq) {
225 u64 runtime = sched_rt_runtime(rt_rq);
226
227 rt_rq->rt_time -= min(rt_rq->rt_time, runtime);
228 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
229 rt_rq->rt_throttled = 0;
230 sched_rt_rq_enqueue(rt_rq);
231 }
232 }
233
234 rq->rt_throttled = 0;
235 }
236}
237
238/* 353/*
239 * Update the current task's runtime statistics. Skip current tasks that 354 * Update the current task's runtime statistics. Skip current tasks that
240 * are not in our scheduling class. 355 * are not in our scheduling class.
@@ -259,9 +374,15 @@ static void update_curr_rt(struct rq *rq)
259 curr->se.exec_start = rq->clock; 374 curr->se.exec_start = rq->clock;
260 cpuacct_charge(curr, delta_exec); 375 cpuacct_charge(curr, delta_exec);
261 376
262 rt_rq->rt_time += delta_exec; 377 for_each_sched_rt_entity(rt_se) {
263 if (sched_rt_runtime_exceeded(rt_rq)) 378 rt_rq = rt_rq_of_se(rt_se);
264 resched_task(curr); 379
380 spin_lock(&rt_rq->rt_runtime_lock);
381 rt_rq->rt_time += delta_exec;
382 if (sched_rt_runtime_exceeded(rt_rq))
383 resched_task(curr);
384 spin_unlock(&rt_rq->rt_runtime_lock);
385 }
265} 386}
266 387
267static inline 388static inline
@@ -284,6 +405,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
284#ifdef CONFIG_RT_GROUP_SCHED 405#ifdef CONFIG_RT_GROUP_SCHED
285 if (rt_se_boosted(rt_se)) 406 if (rt_se_boosted(rt_se))
286 rt_rq->rt_nr_boosted++; 407 rt_rq->rt_nr_boosted++;
408
409 if (rt_rq->tg)
410 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
411#else
412 start_rt_bandwidth(&def_rt_bandwidth);
287#endif 413#endif
288} 414}
289 415
@@ -353,27 +479,21 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
353/* 479/*
354 * Because the prio of an upper entry depends on the lower 480 * Because the prio of an upper entry depends on the lower
355 * entries, we must remove entries top - down. 481 * entries, we must remove entries top - down.
356 *
357 * XXX: O(1/2 h^2) because we can only walk up, not down the chain.
358 * doesn't matter much for now, as h=2 for GROUP_SCHED.
359 */ 482 */
360static void dequeue_rt_stack(struct task_struct *p) 483static void dequeue_rt_stack(struct task_struct *p)
361{ 484{
362 struct sched_rt_entity *rt_se, *top_se; 485 struct sched_rt_entity *rt_se, *back = NULL;
363 486
364 /* 487 rt_se = &p->rt;
365 * dequeue all, top - down. 488 for_each_sched_rt_entity(rt_se) {
366 */ 489 rt_se->back = back;
367 do { 490 back = rt_se;
368 rt_se = &p->rt; 491 }
369 top_se = NULL; 492
370 for_each_sched_rt_entity(rt_se) { 493 for (rt_se = back; rt_se; rt_se = rt_se->back) {
371 if (on_rt_rq(rt_se)) 494 if (on_rt_rq(rt_se))
372 top_se = rt_se; 495 dequeue_rt_entity(rt_se);
373 } 496 }
374 if (top_se)
375 dequeue_rt_entity(top_se);
376 } while (top_se);
377} 497}
378 498
379/* 499/*
@@ -393,6 +513,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
393 */ 513 */
394 for_each_sched_rt_entity(rt_se) 514 for_each_sched_rt_entity(rt_se)
395 enqueue_rt_entity(rt_se); 515 enqueue_rt_entity(rt_se);
516
517 inc_cpu_load(rq, p->se.load.weight);
396} 518}
397 519
398static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 520static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -412,6 +534,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
412 if (rt_rq && rt_rq->rt_nr_running) 534 if (rt_rq && rt_rq->rt_nr_running)
413 enqueue_rt_entity(rt_se); 535 enqueue_rt_entity(rt_se);
414 } 536 }
537
538 dec_cpu_load(rq, p->se.load.weight);
415} 539}
416 540
417/* 541/*
@@ -1001,7 +1125,8 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1001 return 0; 1125 return 0;
1002} 1126}
1003 1127
1004static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) 1128static void set_cpus_allowed_rt(struct task_struct *p,
1129 const cpumask_t *new_mask)
1005{ 1130{
1006 int weight = cpus_weight(*new_mask); 1131 int weight = cpus_weight(*new_mask);
1007 1132
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 5b32433e7ee5..5bae2e0c3ff2 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -9,6 +9,11 @@
9static int show_schedstat(struct seq_file *seq, void *v) 9static int show_schedstat(struct seq_file *seq, void *v)
10{ 10{
11 int cpu; 11 int cpu;
12 int mask_len = NR_CPUS/32 * 9;
13 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
14
15 if (mask_str == NULL)
16 return -ENOMEM;
12 17
13 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 18 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
14 seq_printf(seq, "timestamp %lu\n", jiffies); 19 seq_printf(seq, "timestamp %lu\n", jiffies);
@@ -36,9 +41,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
36 preempt_disable(); 41 preempt_disable();
37 for_each_domain(cpu, sd) { 42 for_each_domain(cpu, sd) {
38 enum cpu_idle_type itype; 43 enum cpu_idle_type itype;
39 char mask_str[NR_CPUS];
40 44
41 cpumask_scnprintf(mask_str, NR_CPUS, sd->span); 45 cpumask_scnprintf(mask_str, mask_len, sd->span);
42 seq_printf(seq, "domain%d %s", dcount++, mask_str); 46 seq_printf(seq, "domain%d %s", dcount++, mask_str);
43 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; 47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
44 itype++) { 48 itype++) {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 31e9f2a47928..3c44956ee7e2 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -356,7 +356,8 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
356/* Tasklets */ 356/* Tasklets */
357struct tasklet_head 357struct tasklet_head
358{ 358{
359 struct tasklet_struct *list; 359 struct tasklet_struct *head;
360 struct tasklet_struct **tail;
360}; 361};
361 362
362/* Some compilers disobey section attribute on statics when not 363/* Some compilers disobey section attribute on statics when not
@@ -369,8 +370,9 @@ void __tasklet_schedule(struct tasklet_struct *t)
369 unsigned long flags; 370 unsigned long flags;
370 371
371 local_irq_save(flags); 372 local_irq_save(flags);
372 t->next = __get_cpu_var(tasklet_vec).list; 373 t->next = NULL;
373 __get_cpu_var(tasklet_vec).list = t; 374 *__get_cpu_var(tasklet_vec).tail = t;
375 __get_cpu_var(tasklet_vec).tail = &(t->next);
374 raise_softirq_irqoff(TASKLET_SOFTIRQ); 376 raise_softirq_irqoff(TASKLET_SOFTIRQ);
375 local_irq_restore(flags); 377 local_irq_restore(flags);
376} 378}
@@ -382,8 +384,9 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
382 unsigned long flags; 384 unsigned long flags;
383 385
384 local_irq_save(flags); 386 local_irq_save(flags);
385 t->next = __get_cpu_var(tasklet_hi_vec).list; 387 t->next = NULL;
386 __get_cpu_var(tasklet_hi_vec).list = t; 388 *__get_cpu_var(tasklet_hi_vec).tail = t;
389 __get_cpu_var(tasklet_hi_vec).tail = &(t->next);
387 raise_softirq_irqoff(HI_SOFTIRQ); 390 raise_softirq_irqoff(HI_SOFTIRQ);
388 local_irq_restore(flags); 391 local_irq_restore(flags);
389} 392}
@@ -395,8 +398,9 @@ static void tasklet_action(struct softirq_action *a)
395 struct tasklet_struct *list; 398 struct tasklet_struct *list;
396 399
397 local_irq_disable(); 400 local_irq_disable();
398 list = __get_cpu_var(tasklet_vec).list; 401 list = __get_cpu_var(tasklet_vec).head;
399 __get_cpu_var(tasklet_vec).list = NULL; 402 __get_cpu_var(tasklet_vec).head = NULL;
403 __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head;
400 local_irq_enable(); 404 local_irq_enable();
401 405
402 while (list) { 406 while (list) {
@@ -416,8 +420,9 @@ static void tasklet_action(struct softirq_action *a)
416 } 420 }
417 421
418 local_irq_disable(); 422 local_irq_disable();
419 t->next = __get_cpu_var(tasklet_vec).list; 423 t->next = NULL;
420 __get_cpu_var(tasklet_vec).list = t; 424 *__get_cpu_var(tasklet_vec).tail = t;
425 __get_cpu_var(tasklet_vec).tail = &(t->next);
421 __raise_softirq_irqoff(TASKLET_SOFTIRQ); 426 __raise_softirq_irqoff(TASKLET_SOFTIRQ);
422 local_irq_enable(); 427 local_irq_enable();
423 } 428 }
@@ -428,8 +433,9 @@ static void tasklet_hi_action(struct softirq_action *a)
428 struct tasklet_struct *list; 433 struct tasklet_struct *list;
429 434
430 local_irq_disable(); 435 local_irq_disable();
431 list = __get_cpu_var(tasklet_hi_vec).list; 436 list = __get_cpu_var(tasklet_hi_vec).head;
432 __get_cpu_var(tasklet_hi_vec).list = NULL; 437 __get_cpu_var(tasklet_hi_vec).head = NULL;
438 __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head;
433 local_irq_enable(); 439 local_irq_enable();
434 440
435 while (list) { 441 while (list) {
@@ -449,8 +455,9 @@ static void tasklet_hi_action(struct softirq_action *a)
449 } 455 }
450 456
451 local_irq_disable(); 457 local_irq_disable();
452 t->next = __get_cpu_var(tasklet_hi_vec).list; 458 t->next = NULL;
453 __get_cpu_var(tasklet_hi_vec).list = t; 459 *__get_cpu_var(tasklet_hi_vec).tail = t;
460 __get_cpu_var(tasklet_hi_vec).tail = &(t->next);
454 __raise_softirq_irqoff(HI_SOFTIRQ); 461 __raise_softirq_irqoff(HI_SOFTIRQ);
455 local_irq_enable(); 462 local_irq_enable();
456 } 463 }
@@ -487,6 +494,15 @@ EXPORT_SYMBOL(tasklet_kill);
487 494
488void __init softirq_init(void) 495void __init softirq_init(void)
489{ 496{
497 int cpu;
498
499 for_each_possible_cpu(cpu) {
500 per_cpu(tasklet_vec, cpu).tail =
501 &per_cpu(tasklet_vec, cpu).head;
502 per_cpu(tasklet_hi_vec, cpu).tail =
503 &per_cpu(tasklet_hi_vec, cpu).head;
504 }
505
490 open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); 506 open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL);
491 open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); 507 open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
492} 508}
@@ -555,9 +571,12 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
555 return; 571 return;
556 572
557 /* CPU is dead, so no lock needed. */ 573 /* CPU is dead, so no lock needed. */
558 for (i = &per_cpu(tasklet_vec, cpu).list; *i; i = &(*i)->next) { 574 for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) {
559 if (*i == t) { 575 if (*i == t) {
560 *i = t->next; 576 *i = t->next;
577 /* If this was the tail element, move the tail ptr */
578 if (*i == NULL)
579 per_cpu(tasklet_vec, cpu).tail = i;
561 return; 580 return;
562 } 581 }
563 } 582 }
@@ -566,20 +585,20 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
566 585
567static void takeover_tasklets(unsigned int cpu) 586static void takeover_tasklets(unsigned int cpu)
568{ 587{
569 struct tasklet_struct **i;
570
571 /* CPU is dead, so no lock needed. */ 588 /* CPU is dead, so no lock needed. */
572 local_irq_disable(); 589 local_irq_disable();
573 590
574 /* Find end, append list for that CPU. */ 591 /* Find end, append list for that CPU. */
575 for (i = &__get_cpu_var(tasklet_vec).list; *i; i = &(*i)->next); 592 *__get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).head;
576 *i = per_cpu(tasklet_vec, cpu).list; 593 __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail;
577 per_cpu(tasklet_vec, cpu).list = NULL; 594 per_cpu(tasklet_vec, cpu).head = NULL;
595 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
578 raise_softirq_irqoff(TASKLET_SOFTIRQ); 596 raise_softirq_irqoff(TASKLET_SOFTIRQ);
579 597
580 for (i = &__get_cpu_var(tasklet_hi_vec).list; *i; i = &(*i)->next); 598 *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head;
581 *i = per_cpu(tasklet_hi_vec, cpu).list; 599 __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail;
582 per_cpu(tasklet_hi_vec, cpu).list = NULL; 600 per_cpu(tasklet_hi_vec, cpu).head = NULL;
601 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
583 raise_softirq_irqoff(HI_SOFTIRQ); 602 raise_softirq_irqoff(HI_SOFTIRQ);
584 603
585 local_irq_enable(); 604 local_irq_enable();
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 6f4e0e13f70c..e1b2a5b1b105 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,7 +35,7 @@ static int stopmachine(void *cpu)
35 int irqs_disabled = 0; 35 int irqs_disabled = 0;
36 int prepared = 0; 36 int prepared = 0;
37 37
38 set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); 38 set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu));
39 39
40 /* Ack: we are alive */ 40 /* Ack: we are alive */
41 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ 41 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b2a2d6889bab..fd3364827ccf 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -270,17 +270,6 @@ static struct ctl_table kern_table[] = {
270 }, 270 },
271 { 271 {
272 .ctl_name = CTL_UNNUMBERED, 272 .ctl_name = CTL_UNNUMBERED,
273 .procname = "sched_batch_wakeup_granularity_ns",
274 .data = &sysctl_sched_batch_wakeup_granularity,
275 .maxlen = sizeof(unsigned int),
276 .mode = 0644,
277 .proc_handler = &proc_dointvec_minmax,
278 .strategy = &sysctl_intvec,
279 .extra1 = &min_wakeup_granularity_ns,
280 .extra2 = &max_wakeup_granularity_ns,
281 },
282 {
283 .ctl_name = CTL_UNNUMBERED,
284 .procname = "sched_child_runs_first", 273 .procname = "sched_child_runs_first",
285 .data = &sysctl_sched_child_runs_first, 274 .data = &sysctl_sched_child_runs_first,
286 .maxlen = sizeof(unsigned int), 275 .maxlen = sizeof(unsigned int),
@@ -318,7 +307,7 @@ static struct ctl_table kern_table[] = {
318 .data = &sysctl_sched_rt_period, 307 .data = &sysctl_sched_rt_period,
319 .maxlen = sizeof(unsigned int), 308 .maxlen = sizeof(unsigned int),
320 .mode = 0644, 309 .mode = 0644,
321 .proc_handler = &proc_dointvec, 310 .proc_handler = &sched_rt_handler,
322 }, 311 },
323 { 312 {
324 .ctl_name = CTL_UNNUMBERED, 313 .ctl_name = CTL_UNNUMBERED,
@@ -326,7 +315,7 @@ static struct ctl_table kern_table[] = {
326 .data = &sysctl_sched_rt_runtime, 315 .data = &sysctl_sched_rt_runtime,
327 .maxlen = sizeof(int), 316 .maxlen = sizeof(int),
328 .mode = 0644, 317 .mode = 0644,
329 .proc_handler = &proc_dointvec, 318 .proc_handler = &sched_rt_handler,
330 }, 319 },
331 { 320 {
332 .ctl_name = CTL_UNNUMBERED, 321 .ctl_name = CTL_UNNUMBERED,
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 69dba0c71727..d358d4e3a958 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -191,7 +191,6 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
191void tick_nohz_stop_sched_tick(void) 191void tick_nohz_stop_sched_tick(void)
192{ 192{
193 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 193 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
194 unsigned long rt_jiffies;
195 struct tick_sched *ts; 194 struct tick_sched *ts;
196 ktime_t last_update, expires, now; 195 ktime_t last_update, expires, now;
197 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 196 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -243,10 +242,6 @@ void tick_nohz_stop_sched_tick(void)
243 next_jiffies = get_next_timer_interrupt(last_jiffies); 242 next_jiffies = get_next_timer_interrupt(last_jiffies);
244 delta_jiffies = next_jiffies - last_jiffies; 243 delta_jiffies = next_jiffies - last_jiffies;
245 244
246 rt_jiffies = rt_needs_cpu(cpu);
247 if (rt_jiffies && rt_jiffies < delta_jiffies)
248 delta_jiffies = rt_jiffies;
249
250 if (rcu_needs_cpu(cpu)) 245 if (rcu_needs_cpu(cpu))
251 delta_jiffies = 1; 246 delta_jiffies = 1;
252 /* 247 /*
diff --git a/kernel/user.c b/kernel/user.c
index 7132022a040c..debce602bfdd 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -101,7 +101,7 @@ static int sched_create_user(struct user_struct *up)
101{ 101{
102 int rc = 0; 102 int rc = 0;
103 103
104 up->tg = sched_create_group(); 104 up->tg = sched_create_group(&root_task_group);
105 if (IS_ERR(up->tg)) 105 if (IS_ERR(up->tg))
106 rc = -ENOMEM; 106 rc = -ENOMEM;
107 107
@@ -193,6 +193,33 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
193 193
194static struct kobj_attribute cpu_rt_runtime_attr = 194static struct kobj_attribute cpu_rt_runtime_attr =
195 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); 195 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
196
197static ssize_t cpu_rt_period_show(struct kobject *kobj,
198 struct kobj_attribute *attr,
199 char *buf)
200{
201 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
202
203 return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
204}
205
206static ssize_t cpu_rt_period_store(struct kobject *kobj,
207 struct kobj_attribute *attr,
208 const char *buf, size_t size)
209{
210 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
211 unsigned long rt_period;
212 int rc;
213
214 sscanf(buf, "%lu", &rt_period);
215
216 rc = sched_group_set_rt_period(up->tg, rt_period);
217
218 return (rc ? rc : size);
219}
220
221static struct kobj_attribute cpu_rt_period_attr =
222 __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
196#endif 223#endif
197 224
198/* default attributes per uid directory */ 225/* default attributes per uid directory */
@@ -202,6 +229,7 @@ static struct attribute *uids_attributes[] = {
202#endif 229#endif
203#ifdef CONFIG_RT_GROUP_SCHED 230#ifdef CONFIG_RT_GROUP_SCHED
204 &cpu_rt_runtime_attr.attr, 231 &cpu_rt_runtime_attr.attr,
232 &cpu_rt_period_attr.attr,
205#endif 233#endif
206 NULL 234 NULL
207}; 235};
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 2c9242e3fed0..a6939e18d7bb 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -316,6 +316,22 @@ int bitmap_scnprintf(char *buf, unsigned int buflen,
316EXPORT_SYMBOL(bitmap_scnprintf); 316EXPORT_SYMBOL(bitmap_scnprintf);
317 317
318/** 318/**
319 * bitmap_scnprintf_len - return buffer length needed to convert
320 * bitmap to an ASCII hex string.
321 * @len: number of bits to be converted
322 */
323int bitmap_scnprintf_len(unsigned int len)
324{
325 /* we need 9 chars per word for 32 bit words (8 hexdigits + sep/null) */
326 int bitslen = ALIGN(len, CHUNKSZ);
327 int wordlen = CHUNKSZ / 4;
328 int buflen = (bitslen / wordlen) * (wordlen + 1) * sizeof(char);
329
330 return buflen;
331}
332EXPORT_SYMBOL(bitmap_scnprintf_len);
333
334/**
319 * __bitmap_parse - convert an ASCII hex string into a bitmap. 335 * __bitmap_parse - convert an ASCII hex string into a bitmap.
320 * @buf: pointer to buffer containing string. 336 * @buf: pointer to buffer containing string.
321 * @buflen: buffer size in bytes. If string is smaller than this 337 * @buflen: buffer size in bytes. If string is smaller than this
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index b0012e27fea8..f4026bae6eed 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -82,9 +82,10 @@ EXPORT_SYMBOL_GPL(percpu_populate);
82int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, 82int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
83 cpumask_t *mask) 83 cpumask_t *mask)
84{ 84{
85 cpumask_t populated = CPU_MASK_NONE; 85 cpumask_t populated;
86 int cpu; 86 int cpu;
87 87
88 cpus_clear(populated);
88 for_each_cpu_mask(cpu, *mask) 89 for_each_cpu_mask(cpu, *mask)
89 if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { 90 if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
90 __percpu_depopulate_mask(__pdata, &populated); 91 __percpu_depopulate_mask(__pdata, &populated);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 402a504f1228..32e796af12a1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2029,6 +2029,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
2029 int n, val; 2029 int n, val;
2030 int min_val = INT_MAX; 2030 int min_val = INT_MAX;
2031 int best_node = -1; 2031 int best_node = -1;
2032 node_to_cpumask_ptr(tmp, 0);
2032 2033
2033 /* Use the local node if we haven't already */ 2034 /* Use the local node if we haven't already */
2034 if (!node_isset(node, *used_node_mask)) { 2035 if (!node_isset(node, *used_node_mask)) {
@@ -2037,7 +2038,6 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
2037 } 2038 }
2038 2039
2039 for_each_node_state(n, N_HIGH_MEMORY) { 2040 for_each_node_state(n, N_HIGH_MEMORY) {
2040 cpumask_t tmp;
2041 2041
2042 /* Don't want a node to appear more than once */ 2042 /* Don't want a node to appear more than once */
2043 if (node_isset(n, *used_node_mask)) 2043 if (node_isset(n, *used_node_mask))
@@ -2050,8 +2050,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
2050 val += (n < node); 2050 val += (n < node);
2051 2051
2052 /* Give preference to headless and unused nodes */ 2052 /* Give preference to headless and unused nodes */
2053 tmp = node_to_cpumask(n); 2053 node_to_cpumask_ptr_next(tmp, n);
2054 if (!cpus_empty(tmp)) 2054 if (!cpus_empty(*tmp))
2055 val += PENALTY_FOR_NODE_WITH_CPUS; 2055 val += PENALTY_FOR_NODE_WITH_CPUS;
2056 2056
2057 /* Slight preference for less loaded node */ 2057 /* Slight preference for less loaded node */
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 8f6ee073c0e3..0ceacff56457 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -187,8 +187,8 @@ static int pdflush(void *dummy)
187 * This is needed as pdflush's are dynamically created and destroyed. 187 * This is needed as pdflush's are dynamically created and destroyed.
188 * The boottime pdflush's are easily placed w/o these 2 lines. 188 * The boottime pdflush's are easily placed w/o these 2 lines.
189 */ 189 */
190 cpus_allowed = cpuset_cpus_allowed(current); 190 cpuset_cpus_allowed(current, &cpus_allowed);
191 set_cpus_allowed(current, cpus_allowed); 191 set_cpus_allowed_ptr(current, &cpus_allowed);
192 192
193 return __pdflush(&my_work); 193 return __pdflush(&my_work);
194} 194}
diff --git a/mm/slab.c b/mm/slab.c
index 04b308c3bc54..03927cb5ec9e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1160,14 +1160,13 @@ static void __cpuinit cpuup_canceled(long cpu)
1160 struct kmem_cache *cachep; 1160 struct kmem_cache *cachep;
1161 struct kmem_list3 *l3 = NULL; 1161 struct kmem_list3 *l3 = NULL;
1162 int node = cpu_to_node(cpu); 1162 int node = cpu_to_node(cpu);
1163 node_to_cpumask_ptr(mask, node);
1163 1164
1164 list_for_each_entry(cachep, &cache_chain, next) { 1165 list_for_each_entry(cachep, &cache_chain, next) {
1165 struct array_cache *nc; 1166 struct array_cache *nc;
1166 struct array_cache *shared; 1167 struct array_cache *shared;
1167 struct array_cache **alien; 1168 struct array_cache **alien;
1168 cpumask_t mask;
1169 1169
1170 mask = node_to_cpumask(node);
1171 /* cpu is dead; no one can alloc from it. */ 1170 /* cpu is dead; no one can alloc from it. */
1172 nc = cachep->array[cpu]; 1171 nc = cachep->array[cpu];
1173 cachep->array[cpu] = NULL; 1172 cachep->array[cpu] = NULL;
@@ -1183,7 +1182,7 @@ static void __cpuinit cpuup_canceled(long cpu)
1183 if (nc) 1182 if (nc)
1184 free_block(cachep, nc->entry, nc->avail, node); 1183 free_block(cachep, nc->entry, nc->avail, node);
1185 1184
1186 if (!cpus_empty(mask)) { 1185 if (!cpus_empty(*mask)) {
1187 spin_unlock_irq(&l3->list_lock); 1186 spin_unlock_irq(&l3->list_lock);
1188 goto free_array_cache; 1187 goto free_array_cache;
1189 } 1188 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4046434046e6..f80a5b7c057f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1647,11 +1647,10 @@ static int kswapd(void *p)
1647 struct reclaim_state reclaim_state = { 1647 struct reclaim_state reclaim_state = {
1648 .reclaimed_slab = 0, 1648 .reclaimed_slab = 0,
1649 }; 1649 };
1650 cpumask_t cpumask; 1650 node_to_cpumask_ptr(cpumask, pgdat->node_id);
1651 1651
1652 cpumask = node_to_cpumask(pgdat->node_id); 1652 if (!cpus_empty(*cpumask))
1653 if (!cpus_empty(cpumask)) 1653 set_cpus_allowed_ptr(tsk, cpumask);
1654 set_cpus_allowed(tsk, cpumask);
1655 current->reclaim_state = &reclaim_state; 1654 current->reclaim_state = &reclaim_state;
1656 1655
1657 /* 1656 /*
@@ -1880,17 +1879,16 @@ out:
1880static int __devinit cpu_callback(struct notifier_block *nfb, 1879static int __devinit cpu_callback(struct notifier_block *nfb,
1881 unsigned long action, void *hcpu) 1880 unsigned long action, void *hcpu)
1882{ 1881{
1883 pg_data_t *pgdat;
1884 cpumask_t mask;
1885 int nid; 1882 int nid;
1886 1883
1887 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { 1884 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
1888 for_each_node_state(nid, N_HIGH_MEMORY) { 1885 for_each_node_state(nid, N_HIGH_MEMORY) {
1889 pgdat = NODE_DATA(nid); 1886 pg_data_t *pgdat = NODE_DATA(nid);
1890 mask = node_to_cpumask(pgdat->node_id); 1887 node_to_cpumask_ptr(mask, pgdat->node_id);
1891 if (any_online_cpu(mask) != NR_CPUS) 1888
1889 if (any_online_cpu(*mask) < nr_cpu_ids)
1892 /* One of our CPUs online: restore mask */ 1890 /* One of our CPUs online: restore mask */
1893 set_cpus_allowed(pgdat->kswapd, mask); 1891 set_cpus_allowed_ptr(pgdat->kswapd, mask);
1894 } 1892 }
1895 } 1893 }
1896 return NOTIFY_OK; 1894 return NOTIFY_OK;
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index a290e1523297..090af78d68b5 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -301,7 +301,6 @@ static inline int
301svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask) 301svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask)
302{ 302{
303 struct svc_pool_map *m = &svc_pool_map; 303 struct svc_pool_map *m = &svc_pool_map;
304 unsigned int node; /* or cpu */
305 304
306 /* 305 /*
307 * The caller checks for sv_nrpools > 1, which 306 * The caller checks for sv_nrpools > 1, which
@@ -314,16 +313,23 @@ svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask)
314 default: 313 default:
315 return 0; 314 return 0;
316 case SVC_POOL_PERCPU: 315 case SVC_POOL_PERCPU:
317 node = m->pool_to[pidx]; 316 {
317 unsigned int cpu = m->pool_to[pidx];
318
318 *oldmask = current->cpus_allowed; 319 *oldmask = current->cpus_allowed;
319 set_cpus_allowed(current, cpumask_of_cpu(node)); 320 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
320 return 1; 321 return 1;
322 }
321 case SVC_POOL_PERNODE: 323 case SVC_POOL_PERNODE:
322 node = m->pool_to[pidx]; 324 {
325 unsigned int node = m->pool_to[pidx];
326 node_to_cpumask_ptr(nodecpumask, node);
327
323 *oldmask = current->cpus_allowed; 328 *oldmask = current->cpus_allowed;
324 set_cpus_allowed(current, node_to_cpumask(node)); 329 set_cpus_allowed_ptr(current, nodecpumask);
325 return 1; 330 return 1;
326 } 331 }
332 }
327} 333}
328 334
329/* 335/*