author     Linus Torvalds <torvalds@linux-foundation.org>   2012-05-22 21:27:32 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-05-22 21:27:32 -0400
commit     d79ee93de909dfb252279b9a95978bbda9a814a9 (patch)
tree       bfccca60fd36259ff4bcc5e78a2c272fbd680065 /arch/x86
parent     2ff2b289a695807e291e1ed9f639d8a3ba5f4254 (diff)
parent     1c2927f18576d65631d8e0ddd19e1d023183222e (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
"The biggest change is the cleanup/simplification of the load-balancer:
instead of the current practice of architectures twiddling scheduler
internal data structures and providing the scheduler domains in
colorfully inconsistent ways, we now have generic scheduler code in
kernel/sched/core.c:sched_init_numa() that looks at the architecture's
node_distance() parameters and (while not fully trusting it) deduces a
NUMA topology from it.
This inevitably changes balancing behavior - hopefully for the better.
There are various smaller optimizations, cleanups and fixlets as well"
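As a rough illustration of that approach, here is a user-space sketch only: the distance matrix below is made up, and the real logic lives in kernel/sched/core.c:sched_init_numa(), which reads distances via node_distance(). The idea is to collect the distinct pairwise distances and treat each one as a scheduling level.

/*
 * Hedged sketch: deduce NUMA "levels" from a SLIT-style distance table.
 * Everything here (the matrix, NR_NODES) is invented for the example.
 */
#include <stdio.h>
#include <stdlib.h>

#define NR_NODES 4

static const int node_dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

static int cmp_int(const void *a, const void *b)
{
	return *(const int *)a - *(const int *)b;
}

int main(void)
{
	int dist[NR_NODES * NR_NODES];
	int i, j, n = 0, levels = 0;

	/* Gather every pairwise distance... */
	for (i = 0; i < NR_NODES; i++)
		for (j = 0; j < NR_NODES; j++)
			dist[n++] = node_dist[i][j];

	/*
	 * ...sort them and count the distinct values: each distinct
	 * distance becomes one NUMA scheduling level (domain span).
	 */
	qsort(dist, n, sizeof(int), cmp_int);
	for (i = 0; i < n; i++) {
		if (i == 0 || dist[i] != dist[i - 1]) {
			levels++;
			printf("level %d: distance <= %d\n", levels, dist[i]);
		}
	}
	printf("deduced %d NUMA distance levels\n", levels);
	return 0;
}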
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched: Taint kernel with TAINT_WARN after sleep-in-atomic bug
sched: Remove stale power aware scheduling remnants and dysfunctional knobs
sched/debug: Fix printing large integers on 32-bit platforms
sched/fair: Improve the ->group_imb logic
sched/nohz: Fix rq->cpu_load[] calculations
sched/numa: Don't scale the imbalance
sched/fair: Revert sched-domain iteration breakage
sched/x86: Rewrite set_cpu_sibling_map()
sched/numa: Fix the new NUMA topology bits
sched/numa: Rewrite the CONFIG_NUMA sched domain support
sched/fair: Propagate 'struct lb_env' usage into find_busiest_group
sched/fair: Add some serialization to the sched_domain load-balance walk
sched/fair: Let minimally loaded cpu balance the group
sched: Change rq->nr_running to unsigned int
x86/numa: Check for nonsensical topologies on real hw as well
x86/numa: Hard partition cpu topology masks on node boundaries
x86/numa: Allow specifying node_distance() for numa=fake
x86/sched: Make mwait_usable() heed to "idle=" kernel parameters properly
sched: Update documentation and comments
sched_rt: Avoid unnecessary dequeue and enqueue of pushable tasks in set_cpus_allowed_rt()
Diffstat (limited to 'arch/x86')
-rw-r--r--   arch/x86/include/asm/topology.h  |  38
-rw-r--r--   arch/x86/kernel/process.c        |   8
-rw-r--r--   arch/x86/kernel/smpboot.c        | 108
-rw-r--r--   arch/x86/mm/numa_emulation.c     |   8
4 files changed, 83 insertions, 79 deletions
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index b9676ae37ada..095b21507b6a 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -92,44 +92,6 @@ extern void setup_node_to_cpumask_map(void);
 
 #define pcibus_to_node(bus) __pcibus_to_node(bus)
 
-#ifdef CONFIG_X86_32
-# define SD_CACHE_NICE_TRIES 1
-# define SD_IDLE_IDX 1
-#else
-# define SD_CACHE_NICE_TRIES 2
-# define SD_IDLE_IDX 2
-#endif
-
-/* sched_domains SD_NODE_INIT for NUMA machines */
-#define SD_NODE_INIT (struct sched_domain) {		\
-	.min_interval		= 8,			\
-	.max_interval		= 32,			\
-	.busy_factor		= 32,			\
-	.imbalance_pct		= 125,			\
-	.cache_nice_tries	= SD_CACHE_NICE_TRIES,	\
-	.busy_idx		= 3,			\
-	.idle_idx		= SD_IDLE_IDX,		\
-	.newidle_idx		= 0,			\
-	.wake_idx		= 0,			\
-	.forkexec_idx		= 0,			\
-							\
-	.flags			= 1*SD_LOAD_BALANCE	\
-				| 1*SD_BALANCE_NEWIDLE	\
-				| 1*SD_BALANCE_EXEC	\
-				| 1*SD_BALANCE_FORK	\
-				| 0*SD_BALANCE_WAKE	\
-				| 1*SD_WAKE_AFFINE	\
-				| 0*SD_PREFER_LOCAL	\
-				| 0*SD_SHARE_CPUPOWER	\
-				| 0*SD_POWERSAVINGS_BALANCE \
-				| 0*SD_SHARE_PKG_RESOURCES \
-				| 1*SD_SERIALIZE	\
-				| 0*SD_PREFER_SIBLING	\
-				,			\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 1,			\
-}
-
 extern int __node_distance(int, int);
 #define node_distance(a, b) __node_distance(a, b)
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index dc8ca8ea78c4..8040b752ee4f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -582,9 +582,17 @@ int mwait_usable(const struct cpuinfo_x86 *c)
 {
 	u32 eax, ebx, ecx, edx;
 
+	/* Use mwait if idle=mwait boot option is given */
 	if (boot_option_idle_override == IDLE_FORCE_MWAIT)
 		return 1;
 
+	/*
+	 * Any idle= boot option other than idle=mwait means that we must not
+	 * use mwait. Eg: idle=halt or idle=poll or idle=nomwait
+	 */
+	if (boot_option_idle_override != IDLE_NO_OVERRIDE)
+		return 0;
+
 	if (c->cpuid_level < MWAIT_INFO)
 		return 0;
 
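For readers skimming the diff, the decision order mwait_usable() ends up with can be modelled in isolation. This is a hedged user-space sketch, not the kernel function: the enum values and the cpu_advertises_mwait() stand-in are invented for the example, while the ordering (idle=mwait forces MWAIT, any other idle= override forbids it, otherwise CPUID decides) mirrors the hunk above.

/* Hedged sketch of the mwait_usable() decision order after this change. */
#include <stdbool.h>
#include <stdio.h>

enum idle_boot_override { IDLE_NO_OVERRIDE, IDLE_HALT, IDLE_NOMWAIT,
			  IDLE_POLL, IDLE_FORCE_MWAIT };

static bool cpu_advertises_mwait(void)
{
	return true;	/* pretend the CPUID probe says MWAIT is available */
}

static bool mwait_ok(enum idle_boot_override idle)
{
	if (idle == IDLE_FORCE_MWAIT)	/* idle=mwait: always use it */
		return true;
	if (idle != IDLE_NO_OVERRIDE)	/* idle=halt/poll/nomwait: never */
		return false;
	return cpu_advertises_mwait();	/* otherwise ask the hardware */
}

int main(void)
{
	printf("idle=mwait  -> %d\n", mwait_ok(IDLE_FORCE_MWAIT));
	printf("idle=halt   -> %d\n", mwait_ok(IDLE_HALT));
	printf("no override -> %d\n", mwait_ok(IDLE_NO_OVERRIDE));
	return 0;
}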
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 3acaf51dfddb..433529e29be4 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -299,59 +299,90 @@ void __cpuinit smp_store_cpu_info(int id)
 	identify_secondary_cpu(c);
 }
 
-static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
+static bool __cpuinit
+topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
 {
-	cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
-	cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
-	cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
-	cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
-	cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
-	cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
+	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+	return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2),
+		"sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
+		"[node: %d != %d]. Ignoring dependency.\n",
+		cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
 }
 
+#define link_mask(_m, c1, c2)						\
+do {									\
+	cpumask_set_cpu((c1), cpu_##_m##_mask(c2));			\
+	cpumask_set_cpu((c2), cpu_##_m##_mask(c1));			\
+} while (0)
+
+static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+		int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+		if (c->phys_proc_id == o->phys_proc_id &&
+		    per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) &&
+		    c->compute_unit_id == o->compute_unit_id)
+			return topology_sane(c, o, "smt");
+
+	} else if (c->phys_proc_id == o->phys_proc_id &&
+		   c->cpu_core_id == o->cpu_core_id) {
+		return topology_sane(c, o, "smt");
+	}
+
+	return false;
+}
+
+static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+	if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
+	    per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
+		return topology_sane(c, o, "llc");
+
+	return false;
+}
+
+static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	if (c->phys_proc_id == o->phys_proc_id)
+		return topology_sane(c, o, "mc");
+
+	return false;
+}
 
 void __cpuinit set_cpu_sibling_map(int cpu)
 {
-	int i;
+	bool has_mc = boot_cpu_data.x86_max_cores > 1;
+	bool has_smt = smp_num_siblings > 1;
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	struct cpuinfo_x86 *o;
+	int i;
 
 	cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
 
-	if (smp_num_siblings > 1) {
-		for_each_cpu(i, cpu_sibling_setup_mask) {
-			struct cpuinfo_x86 *o = &cpu_data(i);
-
-			if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
-				if (c->phys_proc_id == o->phys_proc_id &&
-				    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
-				    c->compute_unit_id == o->compute_unit_id)
-					link_thread_siblings(cpu, i);
-			} else if (c->phys_proc_id == o->phys_proc_id &&
-				   c->cpu_core_id == o->cpu_core_id) {
-				link_thread_siblings(cpu, i);
-			}
-		}
-	} else {
+	if (!has_smt && !has_mc) {
 		cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
-	}
-
-	cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
-
-	if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
-		cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
+		cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+		cpumask_set_cpu(cpu, cpu_core_mask(cpu));
 		c->booted_cores = 1;
 		return;
 	}
 
 	for_each_cpu(i, cpu_sibling_setup_mask) {
-		if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
-		    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
-			cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
-			cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
-		}
-		if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
-			cpumask_set_cpu(i, cpu_core_mask(cpu));
-			cpumask_set_cpu(cpu, cpu_core_mask(i));
+		o = &cpu_data(i);
+
+		if ((i == cpu) || (has_smt && match_smt(c, o)))
+			link_mask(sibling, cpu, i);
+
+		if ((i == cpu) || (has_mc && match_llc(c, o)))
+			link_mask(llc_shared, cpu, i);
+
+		if ((i == cpu) || (has_mc && match_mc(c, o))) {
+			link_mask(core, cpu, i);
+
 			/*
 			 * Does this new cpu bringup a new core?
 			 */
@@ -382,8 +413,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
 	 * For perf, we return last level cache shared map.
 	 * And for power savings, we return cpu_core_map
 	 */
-	if ((sched_mc_power_savings || sched_smt_power_savings) &&
-	    !(cpu_has(c, X86_FEATURE_AMD_DCM)))
+	if (!(cpu_has(c, X86_FEATURE_AMD_DCM)))
 		return cpu_core_mask(cpu);
 	else
 		return cpu_llc_shared_mask(cpu);
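The new link_mask() helper above relies on token pasting to pick the right per-CPU mask accessor and to set it symmetrically for both CPUs. A minimal user-space sketch shows the expansion; the mask accessors below are simplified stand-ins invented for the example, not the kernel's cpumask API.

/* Hedged sketch: what link_mask(sibling, ...) / link_mask(core, ...) expand to. */
#include <stdio.h>

#define NR_CPUS 8

static unsigned long sibling_mask[NR_CPUS];
static unsigned long core_mask[NR_CPUS];

static unsigned long *cpu_sibling_mask(int cpu) { return &sibling_mask[cpu]; }
static unsigned long *cpu_core_mask(int cpu)    { return &core_mask[cpu]; }

static void cpumask_set_cpu(int cpu, unsigned long *mask)
{
	*mask |= 1UL << cpu;
}

/* Same shape as the kernel macro: paste cpu_<_m>_mask and link both directions. */
#define link_mask(_m, c1, c2)					\
do {								\
	cpumask_set_cpu((c1), cpu_##_m##_mask(c2));		\
	cpumask_set_cpu((c2), cpu_##_m##_mask(c1));		\
} while (0)

int main(void)
{
	link_mask(sibling, 0, 1);	/* CPUs 0 and 1 are SMT siblings */
	link_mask(core, 0, 2);		/* CPUs 0 and 2 share a package  */

	printf("sibling_mask[0]=%#lx sibling_mask[1]=%#lx\n",
	       sibling_mask[0], sibling_mask[1]);
	printf("core_mask[0]=%#lx core_mask[2]=%#lx\n",
	       core_mask[0], core_mask[2]);
	return 0;
}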
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 53489ff6bf82..871dd8868170 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -339,9 +339,11 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 	} else {
 		unsigned long n;
 
-		n = simple_strtoul(emu_cmdline, NULL, 0);
+		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
 		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
 	}
+	if (*emu_cmdline == ':')
+		emu_cmdline++;
 
 	if (ret < 0)
 		goto no_emu;
@@ -418,7 +420,9 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 			int physj = emu_nid_to_phys[j];
 			int dist;
 
-			if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
+			if (get_option(&emu_cmdline, &dist) == 2)
+				;
+			else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
 				dist = physi == physj ?
 					LOCAL_DISTANCE : REMOTE_DISTANCE;
 			else
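The two numa_emulation() hunks let the emulation option carry an optional distance table after the node count. A hedged user-space sketch of that parsing flow follows: the example string and the strtoul()/strtol() stand-ins are illustrative only; the kernel uses simple_strtoul() and get_option(), and the authoritative numa=fake syntax is whatever the kernel documentation for this change specifies.

/* Hedged sketch: parse "<nodes>[:<dist>,<dist>,...]" the way the hunks above allow. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	char cmdline[] = "4:10,20,20,10";	/* hypothetical option value */
	char *p = cmdline;
	unsigned long nodes;

	nodes = strtoul(p, &p, 0);		/* number of emulated nodes */
	if (*p == ':')				/* optional distance table follows */
		p++;

	printf("emulating %lu nodes\n", nodes);
	while (*p) {
		long dist = strtol(p, &p, 0);	/* next node_distance() entry */
		printf("distance %ld\n", dist);
		if (*p == ',')
			p++;
	}
	return 0;
}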